{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6675567423230975, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 196.6666717529297, "epoch": 0.0004450378282153983, "grad_norm": 0.7149407267570496, "kl": 0.00043882918544113636, "learning_rate": 2.2222222222222224e-08, "loss": 0.0, "reward": -0.312666654586792, "reward_std": 0.39005160331726074, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.312666654586792, "step": 1 }, { "completion_length": 179.0, "epoch": 0.0008900756564307966, "grad_norm": 1.2034916877746582, "kl": 0.00043298103264532983, "learning_rate": 4.444444444444445e-08, "loss": 0.0, "reward": -0.2953333556652069, "reward_std": 0.3479785621166229, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2953333556652069, "step": 2 }, { "completion_length": 200.0, "epoch": 0.0013351134846461949, "grad_norm": 0.00107863440643996, "kl": 0.0003771593910641968, "learning_rate": 6.666666666666668e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 3 }, { "completion_length": 200.0, "epoch": 0.0017801513128615932, "grad_norm": 0.7310553789138794, "kl": 0.00044770282693207264, "learning_rate": 8.88888888888889e-08, "loss": 0.0, "reward": -0.085999995470047, "reward_std": 0.3784494996070862, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.085999995470047, "step": 4 }, { "completion_length": 174.83334350585938, "epoch": 0.0022251891410769915, "grad_norm": 0.861896276473999, "kl": 0.00042549317004159093, "learning_rate": 1.1111111111111112e-07, "loss": 0.0, "reward": -0.09316667169332504, "reward_std": 0.2892870306968689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09316667169332504, "step": 5 }, { "completion_length": 200.0, "epoch": 0.0026702269692923898, "grad_norm": 0.0017526125302538276, "kl": 0.0004182992852292955, "learning_rate": 1.3333333333333336e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 6 }, { "completion_length": 166.5, "epoch": 0.003115264797507788, "grad_norm": 0.9585131406784058, "kl": 0.00047177166561596096, "learning_rate": 1.5555555555555556e-07, "loss": 0.0, "reward": -0.33766669034957886, "reward_std": 0.2008279412984848, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3376666307449341, "step": 7 }, { "completion_length": 200.0, "epoch": 0.0035603026257231864, "grad_norm": 0.6275889873504639, "kl": 0.0004587940056808293, "learning_rate": 1.777777777777778e-07, "loss": 0.0, "reward": -0.4321666657924652, "reward_std": 0.2741382122039795, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4321666657924652, "step": 8 }, { "completion_length": 152.5, "epoch": 0.004005340453938585, "grad_norm": 0.7820535898208618, "kl": 0.0003722615947481245, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": -0.24449998140335083, "reward_std": 0.2453835904598236, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24449998140335083, "step": 9 }, { "completion_length": 200.0, "epoch": 0.004450378282153983, "grad_norm": 0.6632036566734314, "kl": 0.00043898209696635604, "learning_rate": 2.2222222222222224e-07, "loss": 0.0, "reward": -0.006833334919065237, "reward_std": 0.3229244351387024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.006833334919065237, "step": 10 }, { "completion_length": 200.0, "epoch": 0.004895416110369382, "grad_norm": 0.7768407464027405, "kl": 0.00042121915612369776, "learning_rate": 2.444444444444445e-07, "loss": 0.0, "reward": -0.5506666898727417, "reward_std": 0.06373277306556702, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5506666898727417, "step": 11 }, { "completion_length": 199.6666717529297, "epoch": 0.0053404539385847796, "grad_norm": 0.7278463244438171, "kl": 0.0004068611597176641, "learning_rate": 2.666666666666667e-07, "loss": 0.0, "reward": -0.2864999771118164, "reward_std": 0.334197998046875, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2864999771118164, "step": 12 }, { "completion_length": 200.0, "epoch": 0.005785491766800178, "grad_norm": 0.6500892043113708, "kl": 0.0003313750494271517, "learning_rate": 2.888888888888889e-07, "loss": 0.0, "reward": -0.32199999690055847, "reward_std": 0.34629470109939575, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32199999690055847, "step": 13 }, { "completion_length": 200.0, "epoch": 0.006230529595015576, "grad_norm": 0.7193189859390259, "kl": 0.0003894752007909119, "learning_rate": 3.111111111111111e-07, "loss": 0.0, "reward": -0.08683334290981293, "reward_std": 0.32831722497940063, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08683334290981293, "step": 14 }, { "completion_length": 200.0, "epoch": 0.006675567423230975, "grad_norm": 0.6580405235290527, "kl": 0.00035931920865550637, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "reward": 0.006666670553386211, "reward_std": 0.28985628485679626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.006666670553386211, "step": 15 }, { "completion_length": 200.0, "epoch": 0.007120605251446373, "grad_norm": 0.7829955220222473, "kl": 0.0004430452245287597, "learning_rate": 3.555555555555556e-07, "loss": 0.0, "reward": -0.4845000207424164, "reward_std": 0.30278095602989197, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4845000207424164, "step": 16 }, { "completion_length": 199.6666717529297, "epoch": 0.0075656430796617715, "grad_norm": 0.7301868796348572, "kl": 0.0004354792181402445, "learning_rate": 3.777777777777778e-07, "loss": 0.0, "reward": -0.5509999990463257, "reward_std": 0.09809383004903793, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5509999990463257, "step": 17 }, { "completion_length": 195.33334350585938, "epoch": 0.00801068090787717, "grad_norm": 0.627457320690155, "kl": 0.0004219269612804055, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": -0.382999986410141, "reward_std": 0.258131742477417, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.382999986410141, "step": 18 }, { "completion_length": 198.5, "epoch": 0.008455718736092568, "grad_norm": 0.6126062273979187, "kl": 0.00037288008024916053, "learning_rate": 4.2222222222222226e-07, "loss": 0.0, "reward": -0.5198333263397217, "reward_std": 0.06735106557607651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5198333859443665, "step": 19 }, { "completion_length": 200.0, "epoch": 0.008900756564307966, "grad_norm": 0.7298072576522827, "kl": 0.00042569750803522766, "learning_rate": 4.444444444444445e-07, "loss": 0.0, "reward": -0.12316666543483734, "reward_std": 0.38789814710617065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12316666543483734, "step": 20 }, { "completion_length": 200.0, "epoch": 0.009345794392523364, "grad_norm": 0.7508992552757263, "kl": 0.0003784544242080301, "learning_rate": 4.666666666666667e-07, "loss": 0.0, "reward": 0.007999996654689312, "reward_std": 0.2865903079509735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007999996654689312, "step": 21 }, { "completion_length": 200.0, "epoch": 0.009790832220738763, "grad_norm": 0.7774270176887512, "kl": 0.0004591545439325273, "learning_rate": 4.88888888888889e-07, "loss": 0.0, "reward": -0.12583334743976593, "reward_std": 0.3894747495651245, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12583334743976593, "step": 22 }, { "completion_length": 178.6666717529297, "epoch": 0.010235870048954161, "grad_norm": 0.9822273254394531, "kl": 0.0003580343909561634, "learning_rate": 5.111111111111112e-07, "loss": 0.0, "reward": -0.40816670656204224, "reward_std": 0.2941179871559143, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.40816670656204224, "step": 23 }, { "completion_length": 195.6666717529297, "epoch": 0.010680907877169559, "grad_norm": 0.7280099987983704, "kl": 0.0003752989578060806, "learning_rate": 5.333333333333335e-07, "loss": 0.0, "reward": -0.5036666393280029, "reward_std": 0.06944254040718079, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5036666393280029, "step": 24 }, { "completion_length": 194.1666717529297, "epoch": 0.011125945705384957, "grad_norm": 0.7669874429702759, "kl": 0.0003995794686488807, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "reward": -0.1613333374261856, "reward_std": 0.3338997960090637, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1613333374261856, "step": 25 }, { "completion_length": 200.0, "epoch": 0.011570983533600357, "grad_norm": 0.0017349894624203444, "kl": 0.00044584478018805385, "learning_rate": 5.777777777777778e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 26 }, { "completion_length": 200.0, "epoch": 0.012016021361815754, "grad_norm": 0.7146498560905457, "kl": 0.0004711821093223989, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.004333337303251028, "reward_std": 0.2955717444419861, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.004333337303251028, "step": 27 }, { "completion_length": 194.0, "epoch": 0.012461059190031152, "grad_norm": 0.7583353519439697, "kl": 0.0003412925580050796, "learning_rate": 6.222222222222223e-07, "loss": 0.0, "reward": -0.32066667079925537, "reward_std": 0.34830141067504883, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32066667079925537, "step": 28 }, { "completion_length": 200.0, "epoch": 0.01290609701824655, "grad_norm": 0.6495766639709473, "kl": 0.0003407001495361328, "learning_rate": 6.444444444444445e-07, "loss": 0.0, "reward": 0.013833334669470787, "reward_std": 0.2723016142845154, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013833334669470787, "step": 29 }, { "completion_length": 200.0, "epoch": 0.01335113484646195, "grad_norm": 0.6830177307128906, "kl": 0.00040841297595761716, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "reward": -0.38466668128967285, "reward_std": 0.3953356444835663, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.38466668128967285, "step": 30 }, { "completion_length": 197.0, "epoch": 0.013796172674677348, "grad_norm": 0.6479209661483765, "kl": 0.00035427496186457574, "learning_rate": 6.88888888888889e-07, "loss": 0.0, "reward": -0.41850003600120544, "reward_std": 0.27061542868614197, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.41850003600120544, "step": 31 }, { "completion_length": 200.0, "epoch": 0.014241210502892745, "grad_norm": 0.001505751977674663, "kl": 0.00040443878970108926, "learning_rate": 7.111111111111112e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 32 }, { "completion_length": 194.83334350585938, "epoch": 0.014686248331108143, "grad_norm": 0.8729995489120483, "kl": 0.00045898579992353916, "learning_rate": 7.333333333333334e-07, "loss": 0.0, "reward": -0.4713333547115326, "reward_std": 0.0908794030547142, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4713333547115326, "step": 33 }, { "completion_length": 200.0, "epoch": 0.015131286159323543, "grad_norm": 0.83029705286026, "kl": 0.0004323194734752178, "learning_rate": 7.555555555555556e-07, "loss": 0.0, "reward": -0.3368333578109741, "reward_std": 0.35846197605133057, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33683332800865173, "step": 34 }, { "completion_length": 200.0, "epoch": 0.01557632398753894, "grad_norm": 0.678088903427124, "kl": 0.0004401813494041562, "learning_rate": 7.777777777777779e-07, "loss": 0.0, "reward": -0.22800001502037048, "reward_std": 0.38724154233932495, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22800001502037048, "step": 35 }, { "completion_length": 200.0, "epoch": 0.01602136181575434, "grad_norm": 0.7426117658615112, "kl": 0.0003528599627315998, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": -0.19583332538604736, "reward_std": 0.3525362014770508, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19583332538604736, "step": 36 }, { "completion_length": 200.0, "epoch": 0.016466399643969738, "grad_norm": 0.0014730676775798202, "kl": 0.000418979674577713, "learning_rate": 8.222222222222223e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 37 }, { "completion_length": 200.0, "epoch": 0.016911437472185136, "grad_norm": 0.6773275136947632, "kl": 0.0003312948392704129, "learning_rate": 8.444444444444445e-07, "loss": 0.0, "reward": 0.016833335161209106, "reward_std": 0.2649531364440918, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016833335161209106, "step": 38 }, { "completion_length": 200.0, "epoch": 0.017356475300400534, "grad_norm": 0.4805554449558258, "kl": 0.00037392970989458263, "learning_rate": 8.666666666666668e-07, "loss": 0.0, "reward": -0.4266667068004608, "reward_std": 0.2752167582511902, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4266667068004608, "step": 39 }, { "completion_length": 188.33334350585938, "epoch": 0.017801513128615932, "grad_norm": 0.8252691626548767, "kl": 0.00034736632369458675, "learning_rate": 8.88888888888889e-07, "loss": 0.0, "reward": -0.43516668677330017, "reward_std": 0.10999348759651184, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43516668677330017, "step": 40 }, { "completion_length": 195.83334350585938, "epoch": 0.01824655095683133, "grad_norm": 0.6390711069107056, "kl": 0.00039921089773997664, "learning_rate": 9.111111111111113e-07, "loss": 0.0, "reward": -0.44216665625572205, "reward_std": 0.1345755159854889, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44216665625572205, "step": 41 }, { "completion_length": 200.0, "epoch": 0.018691588785046728, "grad_norm": 0.584726095199585, "kl": 0.000410672917496413, "learning_rate": 9.333333333333334e-07, "loss": 0.0, "reward": -0.08250001072883606, "reward_std": 0.3218010365962982, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08250000327825546, "step": 42 }, { "completion_length": 200.0, "epoch": 0.01913662661326213, "grad_norm": 0.7584457993507385, "kl": 0.0004279993590898812, "learning_rate": 9.555555555555556e-07, "loss": 0.0, "reward": -0.1798333376646042, "reward_std": 0.4852597117424011, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1798333376646042, "step": 43 }, { "completion_length": 200.0, "epoch": 0.019581664441477527, "grad_norm": 0.6866025328636169, "kl": 0.0004159708914812654, "learning_rate": 9.77777777777778e-07, "loss": 0.0, "reward": 0.022333335131406784, "reward_std": 0.25148093700408936, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022333335131406784, "step": 44 }, { "completion_length": 200.0, "epoch": 0.020026702269692925, "grad_norm": 0.6524875164031982, "kl": 0.0004553616454359144, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": -0.30799999833106995, "reward_std": 0.33663925528526306, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.30799999833106995, "step": 45 }, { "completion_length": 197.1666717529297, "epoch": 0.020471740097908322, "grad_norm": 1.1161606311798096, "kl": 0.0004416520241647959, "learning_rate": 1.0222222222222223e-06, "loss": 0.0, "reward": -0.609333336353302, "reward_std": 0.2037426233291626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.609333336353302, "step": 46 }, { "completion_length": 200.0, "epoch": 0.02091677792612372, "grad_norm": 0.0014684926718473434, "kl": 0.0004343845648691058, "learning_rate": 1.0444444444444445e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 47 }, { "completion_length": 200.0, "epoch": 0.021361815754339118, "grad_norm": 0.7850491404533386, "kl": 0.0004720585129689425, "learning_rate": 1.066666666666667e-06, "loss": 0.0, "reward": -0.24550001323223114, "reward_std": 0.4167482554912567, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24550001323223114, "step": 48 }, { "completion_length": 174.0, "epoch": 0.021806853582554516, "grad_norm": 0.6916747093200684, "kl": 0.0003871291410177946, "learning_rate": 1.0888888888888889e-06, "loss": 0.0, "reward": -0.3348333239555359, "reward_std": 0.15262427926063538, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3348333537578583, "step": 49 }, { "completion_length": 200.0, "epoch": 0.022251891410769914, "grad_norm": 0.6415843367576599, "kl": 0.0004480450297705829, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "reward": -0.5586666464805603, "reward_std": 0.04939500615000725, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5586667060852051, "step": 50 }, { "completion_length": 200.0, "epoch": 0.022696929238985315, "grad_norm": 0.7905615568161011, "kl": 0.00043076984002254903, "learning_rate": 1.1333333333333334e-06, "loss": 0.0, "reward": -0.5353333353996277, "reward_std": 0.01973491534590721, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5353333353996277, "step": 51 }, { "completion_length": 195.33334350585938, "epoch": 0.023141967067200713, "grad_norm": 0.6714709401130676, "kl": 0.000462042837170884, "learning_rate": 1.1555555555555556e-06, "loss": 0.0, "reward": -0.45649999380111694, "reward_std": 0.0799018144607544, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45650002360343933, "step": 52 }, { "completion_length": 200.0, "epoch": 0.02358700489541611, "grad_norm": 0.002091527683660388, "kl": 0.0004879847401753068, "learning_rate": 1.1777777777777778e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 53 }, { "completion_length": 200.0, "epoch": 0.02403204272363151, "grad_norm": 0.7136774659156799, "kl": 0.00044702840386889875, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.019333332777023315, "reward_std": 0.25882941484451294, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019333332777023315, "step": 54 }, { "completion_length": 191.6666717529297, "epoch": 0.024477080551846907, "grad_norm": 1.0719506740570068, "kl": 0.00041414215229451656, "learning_rate": 1.2222222222222223e-06, "loss": 0.0, "reward": 0.0806666687130928, "reward_std": 0.10859404504299164, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0806666687130928, "step": 55 }, { "completion_length": 200.0, "epoch": 0.024922118380062305, "grad_norm": 0.002473922213539481, "kl": 0.0005147390766069293, "learning_rate": 1.2444444444444445e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 56 }, { "completion_length": 200.0, "epoch": 0.025367156208277702, "grad_norm": 0.7581794857978821, "kl": 0.0004257292894180864, "learning_rate": 1.2666666666666669e-06, "loss": 0.0, "reward": -0.09583333134651184, "reward_std": 0.34220486879348755, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09583333134651184, "step": 57 }, { "completion_length": 200.0, "epoch": 0.0258121940364931, "grad_norm": 0.6923669576644897, "kl": 0.0004169998865108937, "learning_rate": 1.288888888888889e-06, "loss": 0.0, "reward": -0.3475000262260437, "reward_std": 0.3798324763774872, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3475000262260437, "step": 58 }, { "completion_length": 109.0, "epoch": 0.0262572318647085, "grad_norm": 1.0712668895721436, "kl": 0.00041128776501864195, "learning_rate": 1.3111111111111112e-06, "loss": 0.0, "reward": -0.06133333593606949, "reward_std": 0.20354819297790527, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06133333966135979, "step": 59 }, { "completion_length": 200.0, "epoch": 0.0267022696929239, "grad_norm": 0.6178426742553711, "kl": 0.0004559112712740898, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "reward": 0.023666664958000183, "reward_std": 0.2482149600982666, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023666664958000183, "step": 60 }, { "completion_length": 200.0, "epoch": 0.027147307521139297, "grad_norm": 0.6320557594299316, "kl": 0.0005056136287748814, "learning_rate": 1.3555555555555558e-06, "loss": 0.0, "reward": -0.4624999761581421, "reward_std": 0.29319530725479126, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4624999761581421, "step": 61 }, { "completion_length": 196.83334350585938, "epoch": 0.027592345349354695, "grad_norm": 0.7537997364997864, "kl": 0.0004290025099180639, "learning_rate": 1.377777777777778e-06, "loss": 0.0, "reward": -0.48366665840148926, "reward_std": 0.12520170211791992, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.48366665840148926, "step": 62 }, { "completion_length": 200.0, "epoch": 0.028037383177570093, "grad_norm": 0.0018171067349612713, "kl": 0.0004468559636734426, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 63 }, { "completion_length": 200.0, "epoch": 0.02848242100578549, "grad_norm": 0.6882118582725525, "kl": 0.0004662362043745816, "learning_rate": 1.4222222222222223e-06, "loss": 0.0, "reward": 0.01850000023841858, "reward_std": 0.26087066531181335, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01850000023841858, "step": 64 }, { "completion_length": 195.33334350585938, "epoch": 0.02892745883400089, "grad_norm": 0.7403289675712585, "kl": 0.00043322655255906284, "learning_rate": 1.4444444444444445e-06, "loss": 0.0, "reward": -0.47933337092399597, "reward_std": 0.09026111662387848, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.47933337092399597, "step": 65 }, { "completion_length": 200.0, "epoch": 0.029372496662216287, "grad_norm": 0.0017793107545003295, "kl": 0.00043216149788349867, "learning_rate": 1.4666666666666669e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 66 }, { "completion_length": 199.0, "epoch": 0.029817534490431688, "grad_norm": 0.6182725429534912, "kl": 0.00037733028875663877, "learning_rate": 1.4888888888888888e-06, "loss": 0.0, "reward": -0.07999999821186066, "reward_std": 0.32039913535118103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08000000566244125, "step": 67 }, { "completion_length": 130.6666717529297, "epoch": 0.030262572318647086, "grad_norm": 1.1008906364440918, "kl": 0.0003855052054859698, "learning_rate": 1.5111111111111112e-06, "loss": 0.0, "reward": -0.04516666755080223, "reward_std": 0.15019378066062927, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04516666755080223, "step": 68 }, { "completion_length": 200.0, "epoch": 0.030707610146862484, "grad_norm": 0.6984473466873169, "kl": 0.00044030696153640747, "learning_rate": 1.5333333333333334e-06, "loss": 0.0, "reward": -0.34150001406669617, "reward_std": 0.364574134349823, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34150001406669617, "step": 69 }, { "completion_length": 195.33334350585938, "epoch": 0.03115264797507788, "grad_norm": 0.6155555248260498, "kl": 0.00039145484333857894, "learning_rate": 1.5555555555555558e-06, "loss": 0.0, "reward": -0.3800000548362732, "reward_std": 0.2700370252132416, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3799999952316284, "step": 70 }, { "completion_length": 200.0, "epoch": 0.03159768580329328, "grad_norm": 0.5811487436294556, "kl": 0.0003899557632394135, "learning_rate": 1.5777777777777778e-06, "loss": 0.0, "reward": 0.014166663400828838, "reward_std": 0.2714851498603821, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.014166663400828838, "step": 71 }, { "completion_length": 200.0, "epoch": 0.03204272363150868, "grad_norm": 0.6491996049880981, "kl": 0.0004053341690450907, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 0.01066666841506958, "reward_std": 0.2800583243370056, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01066666841506958, "step": 72 }, { "completion_length": 192.6666717529297, "epoch": 0.032487761459724075, "grad_norm": 0.7621211409568787, "kl": 0.0005313883302733302, "learning_rate": 1.6222222222222223e-06, "loss": 0.0, "reward": -0.4951666593551636, "reward_std": 0.05715736746788025, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4951666593551636, "step": 73 }, { "completion_length": 200.0, "epoch": 0.032932799287939477, "grad_norm": 0.7010623216629028, "kl": 0.00042472081258893013, "learning_rate": 1.6444444444444447e-06, "loss": 0.0, "reward": -0.0989999994635582, "reward_std": 0.34915727376937866, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0989999994635582, "step": 74 }, { "completion_length": 200.0, "epoch": 0.03337783711615487, "grad_norm": 0.7583237886428833, "kl": 0.00046256266068667173, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": -0.09333333373069763, "reward_std": 0.33824294805526733, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09333333373069763, "step": 75 }, { "completion_length": 200.0, "epoch": 0.03382287494437027, "grad_norm": 0.7639651894569397, "kl": 0.00039581506280228496, "learning_rate": 1.688888888888889e-06, "loss": 0.0, "reward": -0.2276666760444641, "reward_std": 0.3896222710609436, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2276666760444641, "step": 76 }, { "completion_length": 164.1666717529297, "epoch": 0.03426791277258567, "grad_norm": 0.8515862822532654, "kl": 0.0004627097805496305, "learning_rate": 1.7111111111111112e-06, "loss": 0.0, "reward": -0.3383333683013916, "reward_std": 0.19050845503807068, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3383333683013916, "step": 77 }, { "completion_length": 200.0, "epoch": 0.03471295060080107, "grad_norm": 0.6962697505950928, "kl": 0.000421873846789822, "learning_rate": 1.7333333333333336e-06, "loss": 0.0, "reward": 0.013333330862224102, "reward_std": 0.2735263705253601, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013333330862224102, "step": 78 }, { "completion_length": 200.0, "epoch": 0.03515798842901647, "grad_norm": 0.6623450517654419, "kl": 0.0004432189743965864, "learning_rate": 1.7555555555555556e-06, "loss": 0.0, "reward": -0.0011666715145111084, "reward_std": 0.3090439736843109, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0011666715145111084, "step": 79 }, { "completion_length": 145.5, "epoch": 0.035603026257231864, "grad_norm": 0.9290313720703125, "kl": 0.0004976950585842133, "learning_rate": 1.777777777777778e-06, "loss": 0.0, "reward": -0.1913333386182785, "reward_std": 0.20369061827659607, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1913333535194397, "step": 80 }, { "completion_length": 200.0, "epoch": 0.036048064085447265, "grad_norm": 0.00226940237917006, "kl": 0.00044577824883162975, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 81 }, { "completion_length": 200.0, "epoch": 0.03649310191366266, "grad_norm": 0.7341117262840271, "kl": 0.0003961599140893668, "learning_rate": 1.8222222222222225e-06, "loss": 0.0, "reward": 0.012833337299525738, "reward_std": 0.27475112676620483, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012833337299525738, "step": 82 }, { "completion_length": 176.83334350585938, "epoch": 0.03693813974187806, "grad_norm": 1.1084420680999756, "kl": 0.0004944322863593698, "learning_rate": 1.8444444444444445e-06, "loss": 0.0, "reward": -0.4231666922569275, "reward_std": 0.38362035155296326, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4231666922569275, "step": 83 }, { "completion_length": 200.0, "epoch": 0.037383177570093455, "grad_norm": 0.5538046956062317, "kl": 0.0004035194288007915, "learning_rate": 1.8666666666666669e-06, "loss": 0.0, "reward": 0.020999997854232788, "reward_std": 0.2547469735145569, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020999997854232788, "step": 84 }, { "completion_length": 199.6666717529297, "epoch": 0.037828215398308856, "grad_norm": 0.6329994797706604, "kl": 0.00045469467295333743, "learning_rate": 1.888888888888889e-06, "loss": 0.0, "reward": -0.1913333386182785, "reward_std": 0.3470242917537689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1913333386182785, "step": 85 }, { "completion_length": 200.0, "epoch": 0.03827325322652426, "grad_norm": 0.7051041722297668, "kl": 0.00048318642075173557, "learning_rate": 1.9111111111111112e-06, "loss": 0.0, "reward": 0.012499998323619366, "reward_std": 0.27556759119033813, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012499998323619366, "step": 86 }, { "completion_length": 200.0, "epoch": 0.03871829105473965, "grad_norm": 0.9833978414535522, "kl": 0.0004773414693772793, "learning_rate": 1.9333333333333336e-06, "loss": 0.0, "reward": -0.37816667556762695, "reward_std": 0.35067784786224365, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.37816667556762695, "step": 87 }, { "completion_length": 200.0, "epoch": 0.039163328882955054, "grad_norm": 0.7608519196510315, "kl": 0.0004284613241907209, "learning_rate": 1.955555555555556e-06, "loss": 0.0, "reward": -0.4285000264644623, "reward_std": 0.27370041608810425, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4285000264644623, "step": 88 }, { "completion_length": 200.0, "epoch": 0.03960836671117045, "grad_norm": 0.8064699769020081, "kl": 0.00046465283958241343, "learning_rate": 1.977777777777778e-06, "loss": 0.0, "reward": -0.6141666769981384, "reward_std": 0.05563242360949516, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6141666769981384, "step": 89 }, { "completion_length": 200.0, "epoch": 0.04005340453938585, "grad_norm": 0.7693816423416138, "kl": 0.0004991068853996694, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": -0.21366667747497559, "reward_std": 0.42142030596733093, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21366667747497559, "step": 90 }, { "completion_length": 170.83334350585938, "epoch": 0.040498442367601244, "grad_norm": 0.8475862145423889, "kl": 0.0004466826212592423, "learning_rate": 2.0222222222222223e-06, "loss": 0.0, "reward": -0.25600001215934753, "reward_std": 0.3458733558654785, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25600001215934753, "step": 91 }, { "completion_length": 200.0, "epoch": 0.040943480195816645, "grad_norm": 0.001588360988534987, "kl": 0.00045517744729295373, "learning_rate": 2.0444444444444447e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 92 }, { "completion_length": 168.83334350585938, "epoch": 0.04138851802403204, "grad_norm": 0.7196416854858398, "kl": 0.0005219130543991923, "learning_rate": 2.0666666666666666e-06, "loss": 0.0, "reward": -0.28700000047683716, "reward_std": 0.23125484585762024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28700003027915955, "step": 93 }, { "completion_length": 200.0, "epoch": 0.04183355585224744, "grad_norm": 0.0016818700823932886, "kl": 0.0004399843164719641, "learning_rate": 2.088888888888889e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 94 }, { "completion_length": 187.33334350585938, "epoch": 0.04227859368046284, "grad_norm": 0.662451446056366, "kl": 0.0005664011696353555, "learning_rate": 2.1111111111111114e-06, "loss": 0.0, "reward": -0.07083334028720856, "reward_std": 0.4042125344276428, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07083334028720856, "step": 95 }, { "completion_length": 200.0, "epoch": 0.042723631508678236, "grad_norm": 0.8778610229492188, "kl": 0.0005331834545359015, "learning_rate": 2.133333333333334e-06, "loss": 0.0, "reward": 0.013499995693564415, "reward_std": 0.27311810851097107, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013499995693564415, "step": 96 }, { "completion_length": 200.0, "epoch": 0.04316866933689364, "grad_norm": 0.7499815821647644, "kl": 0.0005722627975046635, "learning_rate": 2.1555555555555558e-06, "loss": 0.0, "reward": -0.36016663908958435, "reward_std": 0.37692034244537354, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36016666889190674, "step": 97 }, { "completion_length": 199.6666717529297, "epoch": 0.04361370716510903, "grad_norm": 0.6992683410644531, "kl": 0.0006015513790771365, "learning_rate": 2.1777777777777777e-06, "loss": 0.0, "reward": -0.3303333520889282, "reward_std": 0.3531128168106079, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3303333520889282, "step": 98 }, { "completion_length": 200.0, "epoch": 0.044058744993324434, "grad_norm": 0.7034227848052979, "kl": 0.0005352528532966971, "learning_rate": 2.2e-06, "loss": 0.0, "reward": -0.5301666855812073, "reward_std": 0.03528975695371628, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5301666855812073, "step": 99 }, { "completion_length": 200.0, "epoch": 0.04450378282153983, "grad_norm": 0.7136642932891846, "kl": 0.0005618830909952521, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "reward": 0.023666664958000183, "reward_std": 0.2482149600982666, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023666664958000183, "step": 100 }, { "completion_length": 200.0, "epoch": 0.04494882064975523, "grad_norm": 0.9584755301475525, "kl": 0.0008303936338052154, "learning_rate": 2.2444444444444445e-06, "loss": 0.0, "reward": -0.34933334589004517, "reward_std": 0.369188129901886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34933334589004517, "step": 101 }, { "completion_length": 200.0, "epoch": 0.04539385847797063, "grad_norm": 0.810990035533905, "kl": 0.0007378787267953157, "learning_rate": 2.266666666666667e-06, "loss": 0.0, "reward": -0.5558333396911621, "reward_std": 0.02642286941409111, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5558333396911621, "step": 102 }, { "completion_length": 200.0, "epoch": 0.045838896306186025, "grad_norm": 0.8462039232254028, "kl": 0.0007519976934418082, "learning_rate": 2.2888888888888892e-06, "loss": 0.0, "reward": -0.5571666955947876, "reward_std": 0.042873844504356384, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5571666955947876, "step": 103 }, { "completion_length": 152.33334350585938, "epoch": 0.046283934134401426, "grad_norm": 0.8582832217216492, "kl": 0.0006616115570068359, "learning_rate": 2.311111111111111e-06, "loss": 0.0, "reward": -0.20516666769981384, "reward_std": 0.3065997064113617, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20516668260097504, "step": 104 }, { "completion_length": 200.0, "epoch": 0.04672897196261682, "grad_norm": 0.751700758934021, "kl": 0.0007129245204851031, "learning_rate": 2.3333333333333336e-06, "loss": 0.0, "reward": -0.24383334815502167, "reward_std": 0.40426644682884216, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24383334815502167, "step": 105 }, { "completion_length": 200.0, "epoch": 0.04717400979083222, "grad_norm": 0.7948376536369324, "kl": 0.0009101564064621925, "learning_rate": 2.3555555555555555e-06, "loss": 0.0, "reward": -0.26483333110809326, "reward_std": 0.44175583124160767, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26483333110809326, "step": 106 }, { "completion_length": 200.0, "epoch": 0.047619047619047616, "grad_norm": 0.6845387816429138, "kl": 0.0005481390398927033, "learning_rate": 2.377777777777778e-06, "loss": 0.0, "reward": -0.08950000256299973, "reward_std": 0.3333369195461273, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08950000256299973, "step": 107 }, { "completion_length": 174.33334350585938, "epoch": 0.04806408544726302, "grad_norm": 0.9041478633880615, "kl": 0.0010797386057674885, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": -0.3813333511352539, "reward_std": 0.1361376941204071, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3813333511352539, "step": 108 }, { "completion_length": 200.0, "epoch": 0.04850912327547842, "grad_norm": 0.8546884059906006, "kl": 0.0009791944175958633, "learning_rate": 2.4222222222222223e-06, "loss": 0.0, "reward": -0.17666666209697723, "reward_std": 0.3318732678890228, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17666666209697723, "step": 109 }, { "completion_length": 200.0, "epoch": 0.04895416110369381, "grad_norm": 0.002719539450481534, "kl": 0.0005873936461284757, "learning_rate": 2.4444444444444447e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 110 }, { "completion_length": 200.0, "epoch": 0.049399198931909215, "grad_norm": 0.7770993113517761, "kl": 0.0009914538823068142, "learning_rate": 2.466666666666667e-06, "loss": 0.0, "reward": -0.3736667037010193, "reward_std": 0.3873085379600525, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3736667037010193, "step": 111 }, { "completion_length": 200.0, "epoch": 0.04984423676012461, "grad_norm": 0.6850268244743347, "kl": 0.0007792222313582897, "learning_rate": 2.488888888888889e-06, "loss": 0.0, "reward": -0.5395000576972961, "reward_std": 0.025041967630386353, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5395000576972961, "step": 112 }, { "completion_length": 198.33334350585938, "epoch": 0.05028927458834001, "grad_norm": 0.7239099144935608, "kl": 0.001012115739285946, "learning_rate": 2.5111111111111114e-06, "loss": 0.0, "reward": -0.42133331298828125, "reward_std": 0.27339982986450195, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42133331298828125, "step": 113 }, { "completion_length": 200.0, "epoch": 0.050734312416555405, "grad_norm": 0.5771994590759277, "kl": 0.0015950507950037718, "learning_rate": 2.5333333333333338e-06, "loss": 0.0001, "reward": -0.621666669845581, "reward_std": 0.031443070620298386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.621666669845581, "step": 114 }, { "completion_length": 187.6666717529297, "epoch": 0.051179350244770806, "grad_norm": 0.8059555292129517, "kl": 0.0011016380740329623, "learning_rate": 2.5555555555555557e-06, "loss": 0.0, "reward": -0.31283336877822876, "reward_std": 0.25179630517959595, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31283336877822876, "step": 115 }, { "completion_length": 200.0, "epoch": 0.0516243880729862, "grad_norm": 0.7699070572853088, "kl": 0.000991647131741047, "learning_rate": 2.577777777777778e-06, "loss": 0.0, "reward": -0.4051666855812073, "reward_std": 0.26059579849243164, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4051666855812073, "step": 116 }, { "completion_length": 200.0, "epoch": 0.0520694259012016, "grad_norm": 0.6547927856445312, "kl": 0.0008666824433021247, "learning_rate": 2.6e-06, "loss": 0.0, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 117 }, { "completion_length": 198.1666717529297, "epoch": 0.052514463729417, "grad_norm": 0.8903563022613525, "kl": 0.0012721801176667213, "learning_rate": 2.6222222222222225e-06, "loss": 0.0001, "reward": -0.5, "reward_std": 0.10164842009544373, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5, "step": 118 }, { "completion_length": 200.0, "epoch": 0.0529595015576324, "grad_norm": 0.7130769491195679, "kl": 0.0009466246119700372, "learning_rate": 2.6444444444444444e-06, "loss": 0.0, "reward": -0.07766667008399963, "reward_std": 0.36592990159988403, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07766667008399963, "step": 119 }, { "completion_length": 200.0, "epoch": 0.0534045393858478, "grad_norm": 0.010055916383862495, "kl": 0.0016336208209395409, "learning_rate": 2.666666666666667e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 120 }, { "completion_length": 200.0, "epoch": 0.05384957721406319, "grad_norm": 0.7626153826713562, "kl": 0.0019612584728747606, "learning_rate": 2.6888888888888892e-06, "loss": 0.0001, "reward": -0.44099998474121094, "reward_std": 0.27870485186576843, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4410000443458557, "step": 121 }, { "completion_length": 200.0, "epoch": 0.054294615042278595, "grad_norm": 0.7897221446037292, "kl": 0.001940418384037912, "learning_rate": 2.7111111111111116e-06, "loss": 0.0001, "reward": -0.20816665887832642, "reward_std": 0.36510735750198364, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20816665887832642, "step": 122 }, { "completion_length": 200.0, "epoch": 0.05473965287049399, "grad_norm": 0.004020801745355129, "kl": 0.0008703676867298782, "learning_rate": 2.7333333333333336e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 123 }, { "completion_length": 200.0, "epoch": 0.05518469069870939, "grad_norm": 0.005421373061835766, "kl": 0.0008717196760699153, "learning_rate": 2.755555555555556e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 124 }, { "completion_length": 178.33334350585938, "epoch": 0.05562972852692479, "grad_norm": 0.8171259164810181, "kl": 0.0014178266283124685, "learning_rate": 2.7777777777777783e-06, "loss": 0.0001, "reward": -0.16316668689250946, "reward_std": 0.3424929082393646, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16316668689250946, "step": 125 }, { "completion_length": 200.0, "epoch": 0.056074766355140186, "grad_norm": 0.811120331287384, "kl": 0.00203678198158741, "learning_rate": 2.8000000000000003e-06, "loss": 0.0001, "reward": -0.4819999933242798, "reward_std": 0.3060758113861084, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4819999933242798, "step": 126 }, { "completion_length": 200.0, "epoch": 0.05651980418335559, "grad_norm": 0.004288077354431152, "kl": 0.00076559919398278, "learning_rate": 2.8222222222222223e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 127 }, { "completion_length": 200.0, "epoch": 0.05696484201157098, "grad_norm": 0.7662628293037415, "kl": 0.0013816291466355324, "learning_rate": 2.8444444444444446e-06, "loss": 0.0001, "reward": -0.4233333468437195, "reward_std": 0.2736119031906128, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4233333468437195, "step": 128 }, { "completion_length": 200.0, "epoch": 0.05740987983978638, "grad_norm": 0.7000979781150818, "kl": 0.0022010253742337227, "learning_rate": 2.866666666666667e-06, "loss": 0.0001, "reward": -0.43416666984558105, "reward_std": 0.2743176221847534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43416666984558105, "step": 129 }, { "completion_length": 189.5, "epoch": 0.05785491766800178, "grad_norm": 0.6776527762413025, "kl": 0.0026975106447935104, "learning_rate": 2.888888888888889e-06, "loss": 0.0001, "reward": -0.035999998450279236, "reward_std": 0.27313145995140076, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03599999472498894, "step": 130 }, { "completion_length": 200.0, "epoch": 0.05829995549621718, "grad_norm": 0.7056086659431458, "kl": 0.001560688717290759, "learning_rate": 2.9111111111111114e-06, "loss": 0.0001, "reward": -0.31833332777023315, "reward_std": 0.3454664647579193, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31833332777023315, "step": 131 }, { "completion_length": 172.5, "epoch": 0.05874499332443257, "grad_norm": 1.0837831497192383, "kl": 0.0017425650730729103, "learning_rate": 2.9333333333333338e-06, "loss": 0.0001, "reward": -0.3088333308696747, "reward_std": 0.3001648783683777, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3088333308696747, "step": 132 }, { "completion_length": 200.0, "epoch": 0.059190031152647975, "grad_norm": 0.7270302176475525, "kl": 0.0014150540810078382, "learning_rate": 2.955555555555556e-06, "loss": 0.0001, "reward": -0.53083336353302, "reward_std": 0.02674633078277111, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.53083336353302, "step": 133 }, { "completion_length": 181.5, "epoch": 0.059635068980863376, "grad_norm": 0.6710302233695984, "kl": 0.0028289398178458214, "learning_rate": 2.9777777777777777e-06, "loss": 0.0001, "reward": -0.15183335542678833, "reward_std": 0.3105810284614563, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15183335542678833, "step": 134 }, { "completion_length": 200.0, "epoch": 0.06008010680907877, "grad_norm": 0.8479624390602112, "kl": 0.0018316200003027916, "learning_rate": 3e-06, "loss": 0.0001, "reward": 0.02383333444595337, "reward_std": 0.24780671298503876, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02383333444595337, "step": 135 }, { "completion_length": 200.0, "epoch": 0.06052514463729417, "grad_norm": 0.8236755728721619, "kl": 0.002227437449619174, "learning_rate": 3.0222222222222225e-06, "loss": 0.0001, "reward": -0.3340000510215759, "reward_std": 0.35908883810043335, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3340000510215759, "step": 136 }, { "completion_length": 200.0, "epoch": 0.060970182465509566, "grad_norm": 0.011570720933377743, "kl": 0.0022140974178910255, "learning_rate": 3.044444444444445e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 137 }, { "completion_length": 200.0, "epoch": 0.06141522029372497, "grad_norm": 0.7572616338729858, "kl": 0.003316813614219427, "learning_rate": 3.066666666666667e-06, "loss": 0.0001, "reward": -0.18433332443237305, "reward_std": 0.340387225151062, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18433332443237305, "step": 138 }, { "completion_length": 200.0, "epoch": 0.06186025812194036, "grad_norm": 0.00493138050660491, "kl": 0.0011617927812039852, "learning_rate": 3.088888888888889e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 139 }, { "completion_length": 200.0, "epoch": 0.06230529595015576, "grad_norm": 0.6994268894195557, "kl": 0.003730988595634699, "learning_rate": 3.1111111111111116e-06, "loss": 0.0001, "reward": -0.10350000858306885, "reward_std": 0.35402247309684753, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10350000858306885, "step": 140 }, { "completion_length": 188.6666717529297, "epoch": 0.06275033377837116, "grad_norm": 0.7533054351806641, "kl": 0.007150403223931789, "learning_rate": 3.133333333333334e-06, "loss": 0.0003, "reward": -0.21416668593883514, "reward_std": 0.36423152685165405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21416667103767395, "step": 141 }, { "completion_length": 200.0, "epoch": 0.06319537160658656, "grad_norm": 0.853130042552948, "kl": 0.0037856251001358032, "learning_rate": 3.1555555555555555e-06, "loss": 0.0002, "reward": -0.45483335852622986, "reward_std": 0.23466865718364716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45483335852622986, "step": 142 }, { "completion_length": 184.5, "epoch": 0.06364040943480195, "grad_norm": 0.6948954463005066, "kl": 0.003319720271974802, "learning_rate": 3.177777777777778e-06, "loss": 0.0001, "reward": -0.33100003004074097, "reward_std": 0.12057197093963623, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33100003004074097, "step": 143 }, { "completion_length": 200.0, "epoch": 0.06408544726301736, "grad_norm": 0.6649389266967773, "kl": 0.004403320141136646, "learning_rate": 3.2000000000000003e-06, "loss": 0.0002, "reward": -0.33233335614204407, "reward_std": 0.3574983775615692, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3323333263397217, "step": 144 }, { "completion_length": 200.0, "epoch": 0.06453048509123276, "grad_norm": 0.008188747800886631, "kl": 0.0016727236798033118, "learning_rate": 3.2222222222222227e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 145 }, { "completion_length": 197.0, "epoch": 0.06497552291944815, "grad_norm": 0.7850582599639893, "kl": 0.004564880859106779, "learning_rate": 3.2444444444444446e-06, "loss": 0.0002, "reward": -0.18733333051204681, "reward_std": 0.36280059814453125, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18733333051204681, "step": 146 }, { "completion_length": 200.0, "epoch": 0.06542056074766354, "grad_norm": 0.8551555275917053, "kl": 0.005018829368054867, "learning_rate": 3.266666666666667e-06, "loss": 0.0002, "reward": -0.22599999606609344, "reward_std": 0.3867609202861786, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22599999606609344, "step": 147 }, { "completion_length": 200.0, "epoch": 0.06586559857587895, "grad_norm": 0.011043811216950417, "kl": 0.0033470559865236282, "learning_rate": 3.2888888888888894e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 148 }, { "completion_length": 187.6666717529297, "epoch": 0.06631063640409435, "grad_norm": 0.7791420221328735, "kl": 0.007582447957247496, "learning_rate": 3.3111111111111118e-06, "loss": 0.0003, "reward": -0.24450001120567322, "reward_std": 0.23151740431785583, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24450001120567322, "step": 149 }, { "completion_length": 200.0, "epoch": 0.06675567423230974, "grad_norm": 0.7961557507514954, "kl": 0.005281232297420502, "learning_rate": 3.3333333333333333e-06, "loss": 0.0002, "reward": 0.025166666135191917, "reward_std": 0.2445407211780548, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025166666135191917, "step": 150 }, { "completion_length": 191.1666717529297, "epoch": 0.06720071206052515, "grad_norm": 0.7541393041610718, "kl": 0.008166976273059845, "learning_rate": 3.3555555555555557e-06, "loss": 0.0003, "reward": -0.26350000500679016, "reward_std": 0.3062899112701416, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26350000500679016, "step": 151 }, { "completion_length": 138.5, "epoch": 0.06764574988874054, "grad_norm": 1.3180961608886719, "kl": 0.0019082196522504091, "learning_rate": 3.377777777777778e-06, "loss": 0.0001, "reward": -0.19033333659172058, "reward_std": 0.2720842957496643, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19033333659172058, "step": 152 }, { "completion_length": 200.0, "epoch": 0.06809078771695594, "grad_norm": 0.014968675561249256, "kl": 0.0043890452943742275, "learning_rate": 3.4000000000000005e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 153 }, { "completion_length": 200.0, "epoch": 0.06853582554517133, "grad_norm": 1.0916509628295898, "kl": 0.006353219039738178, "learning_rate": 3.4222222222222224e-06, "loss": 0.0003, "reward": -0.014833331108093262, "reward_std": 0.4068416953086853, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.014833331108093262, "step": 154 }, { "completion_length": 200.0, "epoch": 0.06898086337338674, "grad_norm": 0.7793182730674744, "kl": 0.00863957405090332, "learning_rate": 3.444444444444445e-06, "loss": 0.0003, "reward": -0.10183333605527878, "reward_std": 0.4037016034126282, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10183333605527878, "step": 155 }, { "completion_length": 200.0, "epoch": 0.06942590120160214, "grad_norm": 0.012789091095328331, "kl": 0.0035248759668320417, "learning_rate": 3.4666666666666672e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 156 }, { "completion_length": 199.83334350585938, "epoch": 0.06987093902981753, "grad_norm": 0.8552259802818298, "kl": 0.007797297090291977, "learning_rate": 3.4888888888888896e-06, "loss": 0.0003, "reward": -0.32883334159851074, "reward_std": 0.3524216115474701, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32883334159851074, "step": 157 }, { "completion_length": 200.0, "epoch": 0.07031597685803294, "grad_norm": 0.7439639568328857, "kl": 0.008432665839791298, "learning_rate": 3.511111111111111e-06, "loss": 0.0003, "reward": -0.3968333601951599, "reward_std": 0.25783050060272217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3968333601951599, "step": 158 }, { "completion_length": 200.0, "epoch": 0.07076101468624833, "grad_norm": 0.8293086290359497, "kl": 0.015361580066382885, "learning_rate": 3.5333333333333335e-06, "loss": 0.0006, "reward": -0.09950000047683716, "reward_std": 0.34847769141197205, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09950000047683716, "step": 159 }, { "completion_length": 200.0, "epoch": 0.07120605251446373, "grad_norm": 0.02794010564684868, "kl": 0.011896876618266106, "learning_rate": 3.555555555555556e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 160 }, { "completion_length": 168.5, "epoch": 0.07165109034267912, "grad_norm": 1.1416176557540894, "kl": 0.01758456416428089, "learning_rate": 3.577777777777778e-06, "loss": 0.0007, "reward": -0.21533334255218506, "reward_std": 0.36748045682907104, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21533334255218506, "step": 161 }, { "completion_length": 200.0, "epoch": 0.07209612817089453, "grad_norm": 0.8033695220947266, "kl": 0.016339905560016632, "learning_rate": 3.6000000000000003e-06, "loss": 0.0007, "reward": 0.022333335131406784, "reward_std": 0.25148093700408936, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022333335131406784, "step": 162 }, { "completion_length": 200.0, "epoch": 0.07254116599910992, "grad_norm": 0.013050896115601063, "kl": 0.005038695875555277, "learning_rate": 3.6222222222222226e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 163 }, { "completion_length": 200.0, "epoch": 0.07298620382732532, "grad_norm": 0.8717074990272522, "kl": 0.01940556988120079, "learning_rate": 3.644444444444445e-06, "loss": 0.0008, "reward": -0.44333335757255554, "reward_std": 0.27983972430229187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44333332777023315, "step": 164 }, { "completion_length": 200.0, "epoch": 0.07343124165554073, "grad_norm": 0.7769482135772705, "kl": 0.008692685514688492, "learning_rate": 3.6666666666666666e-06, "loss": 0.0003, "reward": -0.1525000035762787, "reward_std": 0.4397725462913513, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1525000035762787, "step": 165 }, { "completion_length": 200.0, "epoch": 0.07387627948375612, "grad_norm": 0.009532845579087734, "kl": 0.003257167525589466, "learning_rate": 3.688888888888889e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 166 }, { "completion_length": 200.0, "epoch": 0.07432131731197152, "grad_norm": 0.766445517539978, "kl": 0.01774817332625389, "learning_rate": 3.7111111111111113e-06, "loss": 0.0007, "reward": 0.01966666243970394, "reward_std": 0.25801295042037964, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01966666243970394, "step": 167 }, { "completion_length": 192.83334350585938, "epoch": 0.07476635514018691, "grad_norm": 0.7344871759414673, "kl": 0.019676849246025085, "learning_rate": 3.7333333333333337e-06, "loss": 0.0008, "reward": -0.3711666762828827, "reward_std": 0.27725324034690857, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3711666762828827, "step": 168 }, { "completion_length": 200.0, "epoch": 0.07521139296840232, "grad_norm": 0.708225429058075, "kl": 0.01603546366095543, "learning_rate": 3.7555555555555557e-06, "loss": 0.0006, "reward": -0.007833331823348999, "reward_std": 0.32537388801574707, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007833331823348999, "step": 169 }, { "completion_length": 200.0, "epoch": 0.07565643079661771, "grad_norm": 0.012713441625237465, "kl": 0.005252152215689421, "learning_rate": 3.777777777777778e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 170 }, { "completion_length": 200.0, "epoch": 0.07610146862483311, "grad_norm": 0.013440362177789211, "kl": 0.006324666552245617, "learning_rate": 3.8000000000000005e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 171 }, { "completion_length": 200.0, "epoch": 0.07654650645304852, "grad_norm": 0.018537871539592743, "kl": 0.0071646831929683685, "learning_rate": 3.8222222222222224e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 172 }, { "completion_length": 200.0, "epoch": 0.07699154428126391, "grad_norm": 0.01842048391699791, "kl": 0.008270082995295525, "learning_rate": 3.844444444444445e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 173 }, { "completion_length": 200.0, "epoch": 0.0774365821094793, "grad_norm": 0.7180718779563904, "kl": 0.023133087903261185, "learning_rate": 3.866666666666667e-06, "loss": 0.0009, "reward": -0.32500001788139343, "reward_std": 0.35587525367736816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32500001788139343, "step": 174 }, { "completion_length": 200.0, "epoch": 0.0778816199376947, "grad_norm": 0.8401187062263489, "kl": 0.013387969695031643, "learning_rate": 3.88888888888889e-06, "loss": 0.0005, "reward": 0.029499998316168785, "reward_std": 0.2993685007095337, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.029499998316168785, "step": 175 }, { "completion_length": 200.0, "epoch": 0.07832665776591011, "grad_norm": 0.0299467034637928, "kl": 0.015433305874466896, "learning_rate": 3.911111111111112e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 176 }, { "completion_length": 200.0, "epoch": 0.0787716955941255, "grad_norm": 0.01781369000673294, "kl": 0.00665412237867713, "learning_rate": 3.9333333333333335e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 177 }, { "completion_length": 200.0, "epoch": 0.0792167334223409, "grad_norm": 0.029804501682519913, "kl": 0.015158621594309807, "learning_rate": 3.955555555555556e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 178 }, { "completion_length": 200.0, "epoch": 0.0796617712505563, "grad_norm": 0.020057376474142075, "kl": 0.005600334610790014, "learning_rate": 3.977777777777778e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 179 }, { "completion_length": 200.0, "epoch": 0.0801068090787717, "grad_norm": 0.6821267008781433, "kl": 0.01735800690948963, "learning_rate": 4.000000000000001e-06, "loss": 0.0007, "reward": -0.2288333624601364, "reward_std": 0.3883763253688812, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2288333624601364, "step": 180 }, { "completion_length": 200.0, "epoch": 0.08055184690698709, "grad_norm": 0.8316364884376526, "kl": 0.03854802995920181, "learning_rate": 4.022222222222222e-06, "loss": 0.0015, "reward": -0.11349999904632568, "reward_std": 0.37036940455436707, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11349999904632568, "step": 181 }, { "completion_length": 200.0, "epoch": 0.08099688473520249, "grad_norm": 0.8583576679229736, "kl": 0.014207671396434307, "learning_rate": 4.044444444444445e-06, "loss": 0.0006, "reward": 0.002499997615814209, "reward_std": 0.30006250739097595, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.002499997615814209, "step": 182 }, { "completion_length": 200.0, "epoch": 0.0814419225634179, "grad_norm": 0.0153994495049119, "kl": 0.007102414965629578, "learning_rate": 4.066666666666667e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 183 }, { "completion_length": 200.0, "epoch": 0.08188696039163329, "grad_norm": 0.8327325582504272, "kl": 0.022632954642176628, "learning_rate": 4.088888888888889e-06, "loss": 0.0009, "reward": 0.014666667208075523, "reward_std": 0.27026039361953735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.014666667208075523, "step": 184 }, { "completion_length": 200.0, "epoch": 0.08233199821984868, "grad_norm": 0.010787020437419415, "kl": 0.004412154667079449, "learning_rate": 4.111111111111111e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 185 }, { "completion_length": 200.0, "epoch": 0.08277703604806408, "grad_norm": 0.029312826693058014, "kl": 0.01270313374698162, "learning_rate": 4.133333333333333e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 186 }, { "completion_length": 175.1666717529297, "epoch": 0.08322207387627949, "grad_norm": 0.8411778211593628, "kl": 0.028145212680101395, "learning_rate": 4.155555555555556e-06, "loss": 0.0011, "reward": -0.01066666841506958, "reward_std": 0.17476919293403625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.01066666841506958, "step": 187 }, { "completion_length": 200.0, "epoch": 0.08366711170449488, "grad_norm": 0.023354342207312584, "kl": 0.00851635355502367, "learning_rate": 4.177777777777778e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 188 }, { "completion_length": 192.5, "epoch": 0.08411214953271028, "grad_norm": 0.8554210662841797, "kl": 0.01744459569454193, "learning_rate": 4.2000000000000004e-06, "loss": 0.0007, "reward": -0.04233333468437195, "reward_std": 0.2719578444957733, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04233333468437195, "step": 189 }, { "completion_length": 177.33334350585938, "epoch": 0.08455718736092568, "grad_norm": 0.8024124503135681, "kl": 0.015208413824439049, "learning_rate": 4.222222222222223e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 190 }, { "completion_length": 200.0, "epoch": 0.08500222518914108, "grad_norm": 0.6993169188499451, "kl": 0.014287407509982586, "learning_rate": 4.244444444444445e-06, "loss": 0.0006, "reward": -0.1301666796207428, "reward_std": 0.39577287435531616, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1301666796207428, "step": 191 }, { "completion_length": 200.0, "epoch": 0.08544726301735647, "grad_norm": 0.6678306460380554, "kl": 0.013930333778262138, "learning_rate": 4.266666666666668e-06, "loss": 0.0006, "reward": 0.043666668236255646, "reward_std": 0.26521819829940796, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.043666668236255646, "step": 192 }, { "completion_length": 200.0, "epoch": 0.08589230084557187, "grad_norm": 0.028331147506833076, "kl": 0.017971333116292953, "learning_rate": 4.288888888888889e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 193 }, { "completion_length": 200.0, "epoch": 0.08633733867378728, "grad_norm": 0.027453351765871048, "kl": 0.009431993588805199, "learning_rate": 4.3111111111111115e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 194 }, { "completion_length": 200.0, "epoch": 0.08678237650200267, "grad_norm": 0.773445725440979, "kl": 0.023339729756116867, "learning_rate": 4.333333333333334e-06, "loss": 0.0009, "reward": -0.10633333772420883, "reward_std": 0.35838064551353455, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10633333772420883, "step": 195 }, { "completion_length": 200.0, "epoch": 0.08722741433021806, "grad_norm": 0.014650012366473675, "kl": 0.007890871725976467, "learning_rate": 4.3555555555555555e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 196 }, { "completion_length": 200.0, "epoch": 0.08767245215843347, "grad_norm": 0.025273794308304787, "kl": 0.01200440526008606, "learning_rate": 4.377777777777778e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 197 }, { "completion_length": 192.33334350585938, "epoch": 0.08811748998664887, "grad_norm": 0.7459567785263062, "kl": 0.01451034378260374, "learning_rate": 4.4e-06, "loss": 0.0006, "reward": -0.26500001549720764, "reward_std": 0.33120569586753845, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26500001549720764, "step": 198 }, { "completion_length": 200.0, "epoch": 0.08856252781486426, "grad_norm": 0.010312313213944435, "kl": 0.0038611218333244324, "learning_rate": 4.422222222222223e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 199 }, { "completion_length": 200.0, "epoch": 0.08900756564307966, "grad_norm": 0.6166565418243408, "kl": 0.009738167747855186, "learning_rate": 4.444444444444444e-06, "loss": 0.0004, "reward": 0.0035000047646462917, "reward_std": 0.2976129949092865, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0035000047646462917, "step": 200 }, { "completion_length": 200.0, "epoch": 0.08945260347129506, "grad_norm": 0.008906065486371517, "kl": 0.0037507950328290462, "learning_rate": 4.4666666666666665e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 201 }, { "completion_length": 198.1666717529297, "epoch": 0.08989764129951046, "grad_norm": 0.9748014807701111, "kl": 0.009199577383697033, "learning_rate": 4.488888888888889e-06, "loss": 0.0004, "reward": 0.04983333498239517, "reward_std": 0.18411998450756073, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04983333498239517, "step": 202 }, { "completion_length": 200.0, "epoch": 0.09034267912772585, "grad_norm": 0.04424808546900749, "kl": 0.01891401596367359, "learning_rate": 4.511111111111111e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 203 }, { "completion_length": 200.0, "epoch": 0.09078771695594126, "grad_norm": 0.008351677097380161, "kl": 0.003573081223294139, "learning_rate": 4.533333333333334e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 204 }, { "completion_length": 200.0, "epoch": 0.09123275478415666, "grad_norm": 0.7445893883705139, "kl": 0.014485888183116913, "learning_rate": 4.555555555555556e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 205 }, { "completion_length": 200.0, "epoch": 0.09167779261237205, "grad_norm": 0.6814647316932678, "kl": 0.017133373767137527, "learning_rate": 4.5777777777777785e-06, "loss": 0.0007, "reward": -0.08250000327825546, "reward_std": 0.37661266326904297, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08250000327825546, "step": 206 }, { "completion_length": 200.0, "epoch": 0.09212283044058744, "grad_norm": 0.013401404023170471, "kl": 0.0045086476020514965, "learning_rate": 4.600000000000001e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 207 }, { "completion_length": 200.0, "epoch": 0.09256786826880285, "grad_norm": 0.015258646570146084, "kl": 0.005985723342746496, "learning_rate": 4.622222222222222e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 208 }, { "completion_length": 200.0, "epoch": 0.09301290609701825, "grad_norm": 0.017956389114260674, "kl": 0.0076329647563397884, "learning_rate": 4.644444444444445e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 209 }, { "completion_length": 200.0, "epoch": 0.09345794392523364, "grad_norm": 0.6878367066383362, "kl": 0.00822862982749939, "learning_rate": 4.666666666666667e-06, "loss": 0.0003, "reward": 0.025333335623145103, "reward_std": 0.24413248896598816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025333335623145103, "step": 210 }, { "completion_length": 185.6666717529297, "epoch": 0.09390298175344905, "grad_norm": 0.7083204388618469, "kl": 0.008970928378403187, "learning_rate": 4.6888888888888895e-06, "loss": 0.0004, "reward": -0.10883334279060364, "reward_std": 0.31478020548820496, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10883334279060364, "step": 211 }, { "completion_length": 200.0, "epoch": 0.09434801958166444, "grad_norm": 0.010259282775223255, "kl": 0.005621683783829212, "learning_rate": 4.711111111111111e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 212 }, { "completion_length": 200.0, "epoch": 0.09479305740987984, "grad_norm": 0.63719642162323, "kl": 0.017996463924646378, "learning_rate": 4.7333333333333335e-06, "loss": 0.0007, "reward": -0.2148333489894867, "reward_std": 0.37843701243400574, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2148333489894867, "step": 213 }, { "completion_length": 200.0, "epoch": 0.09523809523809523, "grad_norm": 0.7535067796707153, "kl": 0.009188219904899597, "learning_rate": 4.755555555555556e-06, "loss": 0.0004, "reward": 0.025833334773778915, "reward_std": 0.24290774762630463, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025833334773778915, "step": 214 }, { "completion_length": 200.0, "epoch": 0.09568313306631064, "grad_norm": 0.03754749149084091, "kl": 0.023878587409853935, "learning_rate": 4.777777777777778e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 215 }, { "completion_length": 195.1666717529297, "epoch": 0.09612817089452604, "grad_norm": 0.7495411038398743, "kl": 0.020265337079763412, "learning_rate": 4.800000000000001e-06, "loss": 0.0008, "reward": -0.1301666796207428, "reward_std": 0.2998015582561493, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1301666796207428, "step": 216 }, { "completion_length": 200.0, "epoch": 0.09657320872274143, "grad_norm": 0.8328439593315125, "kl": 0.012733696028590202, "learning_rate": 4.822222222222222e-06, "loss": 0.0005, "reward": -0.10983332991600037, "reward_std": 0.36434730887413025, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10983332991600037, "step": 217 }, { "completion_length": 200.0, "epoch": 0.09701824655095684, "grad_norm": 0.014833502471446991, "kl": 0.005706641357392073, "learning_rate": 4.8444444444444446e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 218 }, { "completion_length": 200.0, "epoch": 0.09746328437917223, "grad_norm": 0.01487264595925808, "kl": 0.007111959159374237, "learning_rate": 4.866666666666667e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 219 }, { "completion_length": 191.5, "epoch": 0.09790832220738763, "grad_norm": 0.6546903252601624, "kl": 0.03114478290081024, "learning_rate": 4.888888888888889e-06, "loss": 0.0012, "reward": -0.02316666767001152, "reward_std": 0.3049527406692505, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02316666767001152, "step": 220 }, { "completion_length": 200.0, "epoch": 0.09835336003560302, "grad_norm": 0.012690722942352295, "kl": 0.004840874578803778, "learning_rate": 4.911111111111112e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 221 }, { "completion_length": 200.0, "epoch": 0.09879839786381843, "grad_norm": 0.8420057892799377, "kl": 0.01995580643415451, "learning_rate": 4.933333333333334e-06, "loss": 0.0008, "reward": -0.11483334004878998, "reward_std": 0.37201637029647827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11483334004878998, "step": 222 }, { "completion_length": 162.6666717529297, "epoch": 0.09924343569203382, "grad_norm": 0.7063876390457153, "kl": 0.018741153180599213, "learning_rate": 4.9555555555555565e-06, "loss": 0.0007, "reward": -0.15850001573562622, "reward_std": 0.2223670333623886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15850001573562622, "step": 223 }, { "completion_length": 142.1666717529297, "epoch": 0.09968847352024922, "grad_norm": 1.3013415336608887, "kl": 0.0186506025493145, "learning_rate": 4.977777777777778e-06, "loss": 0.0007, "reward": 0.08866667002439499, "reward_std": 0.11559354513883591, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08866667002439499, "step": 224 }, { "completion_length": 200.0, "epoch": 0.10013351134846461, "grad_norm": 0.8148112297058105, "kl": 0.014559872448444366, "learning_rate": 5e-06, "loss": 0.0006, "reward": 0.011333337053656578, "reward_std": 0.27842533588409424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011333337053656578, "step": 225 }, { "completion_length": 200.0, "epoch": 0.10057854917668002, "grad_norm": 0.7223935127258301, "kl": 0.01846488006412983, "learning_rate": 4.999996982499377e-06, "loss": 0.0007, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 226 }, { "completion_length": 176.83334350585938, "epoch": 0.10102358700489542, "grad_norm": 0.855940580368042, "kl": 0.023898562416434288, "learning_rate": 4.9999879300047904e-06, "loss": 0.001, "reward": -0.06350000202655792, "reward_std": 0.24804334342479706, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06350000202655792, "step": 227 }, { "completion_length": 178.1666717529297, "epoch": 0.10146862483311081, "grad_norm": 1.3559764623641968, "kl": 0.012699100188910961, "learning_rate": 4.999972842538094e-06, "loss": 0.0005, "reward": 0.0833333358168602, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 228 }, { "completion_length": 200.0, "epoch": 0.10191366266132622, "grad_norm": 0.11357201635837555, "kl": 0.02485606074333191, "learning_rate": 4.999951720135707e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 229 }, { "completion_length": 193.5, "epoch": 0.10235870048954161, "grad_norm": 1.0450315475463867, "kl": 0.022413700819015503, "learning_rate": 4.999924562848623e-06, "loss": 0.0009, "reward": -0.27666670083999634, "reward_std": 0.32778817415237427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.27666670083999634, "step": 230 }, { "completion_length": 200.0, "epoch": 0.102803738317757, "grad_norm": 0.022118445485830307, "kl": 0.011586411856114864, "learning_rate": 4.999891370742395e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 231 }, { "completion_length": 200.0, "epoch": 0.1032487761459724, "grad_norm": 0.032451968640089035, "kl": 0.01942615956068039, "learning_rate": 4.999852143897152e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 232 }, { "completion_length": 187.33334350585938, "epoch": 0.10369381397418781, "grad_norm": 1.0973916053771973, "kl": 0.028928395360708237, "learning_rate": 4.999806882407586e-06, "loss": 0.0012, "reward": 0.09416666626930237, "reward_std": 0.07552593946456909, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09416666626930237, "step": 233 }, { "completion_length": 200.0, "epoch": 0.1041388518024032, "grad_norm": 0.8492021560668945, "kl": 0.020361032336950302, "learning_rate": 4.9997555863829584e-06, "loss": 0.0008, "reward": -0.0728333443403244, "reward_std": 0.3074478507041931, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0728333443403244, "step": 234 }, { "completion_length": 200.0, "epoch": 0.1045838896306186, "grad_norm": 0.7689980268478394, "kl": 0.010467594489455223, "learning_rate": 4.999698255947099e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 235 }, { "completion_length": 200.0, "epoch": 0.105028927458834, "grad_norm": 0.022514864802360535, "kl": 0.016714762896299362, "learning_rate": 4.9996348912384025e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 236 }, { "completion_length": 200.0, "epoch": 0.1054739652870494, "grad_norm": 0.7497798800468445, "kl": 0.035260215401649475, "learning_rate": 4.999565492409831e-06, "loss": 0.0014, "reward": -0.07899999618530273, "reward_std": 0.3160455822944641, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07899999618530273, "step": 237 }, { "completion_length": 180.33334350585938, "epoch": 0.1059190031152648, "grad_norm": 1.1552839279174805, "kl": 0.020859047770500183, "learning_rate": 4.999490059628914e-06, "loss": 0.0008, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 238 }, { "completion_length": 200.0, "epoch": 0.10636404094348019, "grad_norm": 0.014450768008828163, "kl": 0.00946769304573536, "learning_rate": 4.999408593077747e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 239 }, { "completion_length": 200.0, "epoch": 0.1068090787716956, "grad_norm": 0.6490687131881714, "kl": 0.013119819574058056, "learning_rate": 4.999321092952989e-06, "loss": 0.0005, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 240 }, { "completion_length": 198.1666717529297, "epoch": 0.10725411659991099, "grad_norm": 0.8787567615509033, "kl": 0.027725404128432274, "learning_rate": 4.999227559465865e-06, "loss": 0.0011, "reward": -0.060833342373371124, "reward_std": 0.2964027523994446, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.060833342373371124, "step": 241 }, { "completion_length": 200.0, "epoch": 0.10769915442812639, "grad_norm": 0.6240781545639038, "kl": 0.011513415724039078, "learning_rate": 4.999127992842167e-06, "loss": 0.0005, "reward": 0.019833337515592575, "reward_std": 0.2576046586036682, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019833337515592575, "step": 242 }, { "completion_length": 200.0, "epoch": 0.1081441922563418, "grad_norm": 0.021726641803979874, "kl": 0.011281192302703857, "learning_rate": 4.999022393322246e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 243 }, { "completion_length": 200.0, "epoch": 0.10858923008455719, "grad_norm": 0.01203103642910719, "kl": 0.007502372842282057, "learning_rate": 4.998910761161022e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 244 }, { "completion_length": 200.0, "epoch": 0.10903426791277258, "grad_norm": 0.03352230042219162, "kl": 0.015283550135791302, "learning_rate": 4.998793096627973e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 245 }, { "completion_length": 180.0, "epoch": 0.10947930574098798, "grad_norm": 0.6931204199790955, "kl": 0.025455031543970108, "learning_rate": 4.998669400007142e-06, "loss": 0.001, "reward": -0.0663333386182785, "reward_std": 0.25824224948883057, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0663333386182785, "step": 246 }, { "completion_length": 179.33334350585938, "epoch": 0.10992434356920339, "grad_norm": 0.7052227258682251, "kl": 0.01645379140973091, "learning_rate": 4.998539671597134e-06, "loss": 0.0007, "reward": -0.1290000081062317, "reward_std": 0.243107408285141, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1290000081062317, "step": 247 }, { "completion_length": 200.0, "epoch": 0.11036938139741878, "grad_norm": 0.6620118021965027, "kl": 0.01992572657763958, "learning_rate": 4.998403911711112e-06, "loss": 0.0008, "reward": 0.02500000223517418, "reward_std": 0.24494896829128265, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02500000223517418, "step": 248 }, { "completion_length": 200.0, "epoch": 0.11081441922563418, "grad_norm": 0.012199520133435726, "kl": 0.01109787356108427, "learning_rate": 4.9982621206768e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 249 }, { "completion_length": 200.0, "epoch": 0.11125945705384958, "grad_norm": 0.6646403074264526, "kl": 0.017368197441101074, "learning_rate": 4.998114298836483e-06, "loss": 0.0007, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 250 }, { "completion_length": 200.0, "epoch": 0.11170449488206498, "grad_norm": 0.64476478099823, "kl": 0.018091298639774323, "learning_rate": 4.997960446547002e-06, "loss": 0.0007, "reward": 0.007833331823348999, "reward_std": 0.28699856996536255, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007833331823348999, "step": 251 }, { "completion_length": 200.0, "epoch": 0.11214953271028037, "grad_norm": 0.01579858362674713, "kl": 0.009715499356389046, "learning_rate": 4.997800564179758e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 252 }, { "completion_length": 200.0, "epoch": 0.11259457053849577, "grad_norm": 0.7200601696968079, "kl": 0.02384258806705475, "learning_rate": 4.997634652120704e-06, "loss": 0.001, "reward": 0.016166668385267258, "reward_std": 0.2665861248970032, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016166668385267258, "step": 253 }, { "completion_length": 193.33334350585938, "epoch": 0.11303960836671118, "grad_norm": 0.7668142318725586, "kl": 0.029656527563929558, "learning_rate": 4.997462710770356e-06, "loss": 0.0012, "reward": -0.13450001180171967, "reward_std": 0.29981911182403564, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13450001180171967, "step": 254 }, { "completion_length": 200.0, "epoch": 0.11348464619492657, "grad_norm": 0.02032412961125374, "kl": 0.014308085665106773, "learning_rate": 4.997284740543776e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 255 }, { "completion_length": 198.1666717529297, "epoch": 0.11392968402314196, "grad_norm": 0.025216558948159218, "kl": 0.022948317229747772, "learning_rate": 4.997100741870587e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 256 }, { "completion_length": 200.0, "epoch": 0.11437472185135737, "grad_norm": 0.0239882729947567, "kl": 0.01751275360584259, "learning_rate": 4.996910715194963e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 257 }, { "completion_length": 200.0, "epoch": 0.11481975967957277, "grad_norm": 0.028994852676987648, "kl": 0.01953146606683731, "learning_rate": 4.9967146609756254e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 258 }, { "completion_length": 200.0, "epoch": 0.11526479750778816, "grad_norm": 0.7920787334442139, "kl": 0.024792861193418503, "learning_rate": 4.996512579685851e-06, "loss": 0.001, "reward": 0.034333329647779465, "reward_std": 0.28770241141319275, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.034333329647779465, "step": 259 }, { "completion_length": 200.0, "epoch": 0.11570983533600356, "grad_norm": 0.027261994779109955, "kl": 0.01596745476126671, "learning_rate": 4.996304471813464e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 260 }, { "completion_length": 199.6666717529297, "epoch": 0.11615487316421896, "grad_norm": 0.7512991428375244, "kl": 0.020810922607779503, "learning_rate": 4.996090337860836e-06, "loss": 0.0008, "reward": 0.03583333641290665, "reward_std": 0.218412846326828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03583333641290665, "step": 261 }, { "completion_length": 200.0, "epoch": 0.11659991099243436, "grad_norm": 0.01981634460389614, "kl": 0.02145499363541603, "learning_rate": 4.995870178344888e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 262 }, { "completion_length": 200.0, "epoch": 0.11704494882064975, "grad_norm": 0.015385876409709454, "kl": 0.010222600772976875, "learning_rate": 4.995643993797084e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 263 }, { "completion_length": 200.0, "epoch": 0.11748998664886515, "grad_norm": 0.010453257709741592, "kl": 0.004751277156174183, "learning_rate": 4.995411784763434e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 264 }, { "completion_length": 200.0, "epoch": 0.11793502447708056, "grad_norm": 0.02353510819375515, "kl": 0.01597435772418976, "learning_rate": 4.995173551804491e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 265 }, { "completion_length": 200.0, "epoch": 0.11838006230529595, "grad_norm": 0.6973554491996765, "kl": 0.013702675700187683, "learning_rate": 4.9949292954953486e-06, "loss": 0.0005, "reward": 0.020166665315628052, "reward_std": 0.2567881941795349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020166665315628052, "step": 266 }, { "completion_length": 200.0, "epoch": 0.11882510013351134, "grad_norm": 0.7307512760162354, "kl": 0.02334902063012123, "learning_rate": 4.994679016425642e-06, "loss": 0.0009, "reward": 0.007999996654689312, "reward_std": 0.2865903079509735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007999996654689312, "step": 267 }, { "completion_length": 196.5, "epoch": 0.11927013796172675, "grad_norm": 0.7821613550186157, "kl": 0.023126548156142235, "learning_rate": 4.994422715199546e-06, "loss": 0.0009, "reward": -0.35100001096725464, "reward_std": 0.24427853524684906, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35100001096725464, "step": 268 }, { "completion_length": 200.0, "epoch": 0.11971517578994215, "grad_norm": 0.014924556948244572, "kl": 0.010245325975120068, "learning_rate": 4.99416039243577e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 269 }, { "completion_length": 200.0, "epoch": 0.12016021361815754, "grad_norm": 0.011302834376692772, "kl": 0.0074605681002140045, "learning_rate": 4.993892048767563e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 270 }, { "completion_length": 196.83334350585938, "epoch": 0.12060525144637294, "grad_norm": 0.7507510185241699, "kl": 0.01992712914943695, "learning_rate": 4.993617684842707e-06, "loss": 0.0008, "reward": 0.0560000017285347, "reward_std": 0.1690147966146469, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0560000017285347, "step": 271 }, { "completion_length": 186.33334350585938, "epoch": 0.12105028927458834, "grad_norm": 0.642690896987915, "kl": 0.03827090188860893, "learning_rate": 4.9933373013235156e-06, "loss": 0.0015, "reward": -0.01666666567325592, "reward_std": 0.2657710909843445, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.01666666567325592, "step": 272 }, { "completion_length": 200.0, "epoch": 0.12149532710280374, "grad_norm": 0.016800519078969955, "kl": 0.007222745567560196, "learning_rate": 4.993050898886833e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 273 }, { "completion_length": 200.0, "epoch": 0.12194036493101913, "grad_norm": 0.758976936340332, "kl": 0.018932368606328964, "learning_rate": 4.992758478224039e-06, "loss": 0.0008, "reward": -0.24283334612846375, "reward_std": 0.41019290685653687, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24283334612846375, "step": 274 }, { "completion_length": 200.0, "epoch": 0.12238540275923454, "grad_norm": 0.020172713324427605, "kl": 0.012656650505959988, "learning_rate": 4.992460040041034e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 275 }, { "completion_length": 200.0, "epoch": 0.12283044058744993, "grad_norm": 0.01721290498971939, "kl": 0.008657376281917095, "learning_rate": 4.992155585058248e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 276 }, { "completion_length": 186.83334350585938, "epoch": 0.12327547841566533, "grad_norm": 0.6902337074279785, "kl": 0.06522956490516663, "learning_rate": 4.991845114010638e-06, "loss": 0.0026, "reward": -0.09683333337306976, "reward_std": 0.308468759059906, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09683333337306976, "step": 277 }, { "completion_length": 200.0, "epoch": 0.12372051624388072, "grad_norm": 0.013619703240692616, "kl": 0.009151134639978409, "learning_rate": 4.99152862764768e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 278 }, { "completion_length": 200.0, "epoch": 0.12416555407209613, "grad_norm": 0.01365516148507595, "kl": 0.005762772168964148, "learning_rate": 4.99120612673337e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 279 }, { "completion_length": 200.0, "epoch": 0.12461059190031153, "grad_norm": 0.03844517469406128, "kl": 0.009146707132458687, "learning_rate": 4.990877612046228e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 280 }, { "completion_length": 200.0, "epoch": 0.12505562972852693, "grad_norm": 0.010963845066726208, "kl": 0.004802503623068333, "learning_rate": 4.9905430843792886e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 281 }, { "completion_length": 200.0, "epoch": 0.12550066755674233, "grad_norm": 0.018975911661982536, "kl": 0.00843195803463459, "learning_rate": 4.9902025445401e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 282 }, { "completion_length": 191.33334350585938, "epoch": 0.12594570538495772, "grad_norm": 0.9152283072471619, "kl": 0.01359584927558899, "learning_rate": 4.989855993350728e-06, "loss": 0.0005, "reward": 0.08433333039283752, "reward_std": 0.09961257874965668, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08433333039283752, "step": 283 }, { "completion_length": 199.33334350585938, "epoch": 0.12639074321317312, "grad_norm": 0.6327641606330872, "kl": 0.011363311670720577, "learning_rate": 4.989503431647744e-06, "loss": 0.0005, "reward": 0.05283333361148834, "reward_std": 0.17677150666713715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05283333361148834, "step": 284 }, { "completion_length": 200.0, "epoch": 0.1268357810413885, "grad_norm": 0.015086417086422443, "kl": 0.006470312364399433, "learning_rate": 4.9891448602822355e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 285 }, { "completion_length": 200.0, "epoch": 0.1272808188696039, "grad_norm": 0.6536492705345154, "kl": 0.015819404274225235, "learning_rate": 4.988780280119792e-06, "loss": 0.0006, "reward": -0.09083332866430283, "reward_std": 0.33507877588272095, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09083332866430283, "step": 286 }, { "completion_length": 200.0, "epoch": 0.1277258566978193, "grad_norm": 0.008064402267336845, "kl": 0.002744730096310377, "learning_rate": 4.988409692040511e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 287 }, { "completion_length": 180.83334350585938, "epoch": 0.12817089452603472, "grad_norm": 0.819420337677002, "kl": 0.06618692725896835, "learning_rate": 4.988033096938991e-06, "loss": 0.0026, "reward": 0.021166665479540825, "reward_std": 0.24596862494945526, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021166665479540825, "step": 288 }, { "completion_length": 200.0, "epoch": 0.12861593235425012, "grad_norm": 0.017804304137825966, "kl": 0.0107121542096138, "learning_rate": 4.9876504957243345e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 289 }, { "completion_length": 200.0, "epoch": 0.1290609701824655, "grad_norm": 0.009461048990488052, "kl": 0.005491574760526419, "learning_rate": 4.987261889320141e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 290 }, { "completion_length": 200.0, "epoch": 0.1295060080106809, "grad_norm": 0.020793907344341278, "kl": 0.006585664115846157, "learning_rate": 4.986867278664505e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 291 }, { "completion_length": 200.0, "epoch": 0.1299510458388963, "grad_norm": 0.01623818278312683, "kl": 0.014776560477912426, "learning_rate": 4.9864666647100176e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 292 }, { "completion_length": 200.0, "epoch": 0.1303960836671117, "grad_norm": 0.015048404224216938, "kl": 0.006943022832274437, "learning_rate": 4.986060048423761e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 293 }, { "completion_length": 200.0, "epoch": 0.1308411214953271, "grad_norm": 0.10332320630550385, "kl": 0.018736328929662704, "learning_rate": 4.985647430787308e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 294 }, { "completion_length": 200.0, "epoch": 0.1312861593235425, "grad_norm": 0.7041745185852051, "kl": 0.013492944650352001, "learning_rate": 4.985228812796717e-06, "loss": 0.0005, "reward": 0.0403333380818367, "reward_std": 0.20739012956619263, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0403333380818367, "step": 295 }, { "completion_length": 180.33334350585938, "epoch": 0.1317311971517579, "grad_norm": 1.2074172496795654, "kl": 0.017207711935043335, "learning_rate": 4.984804195462532e-06, "loss": 0.0007, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 296 }, { "completion_length": 200.0, "epoch": 0.1321762349799733, "grad_norm": 0.007344384212046862, "kl": 0.0033924030140042305, "learning_rate": 4.984373579809778e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 297 }, { "completion_length": 200.0, "epoch": 0.1326212728081887, "grad_norm": 0.6581267714500427, "kl": 0.004817100241780281, "learning_rate": 4.983936966877964e-06, "loss": 0.0002, "reward": 0.02033333107829094, "reward_std": 0.2563799321651459, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02033333107829094, "step": 298 }, { "completion_length": 200.0, "epoch": 0.1330663106364041, "grad_norm": 0.6918825507164001, "kl": 0.008517519570887089, "learning_rate": 4.983494357721074e-06, "loss": 0.0003, "reward": -0.029333334416151047, "reward_std": 0.3780378997325897, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.029333334416151047, "step": 299 }, { "completion_length": 200.0, "epoch": 0.13351134846461948, "grad_norm": 0.6355785727500916, "kl": 0.007732154801487923, "learning_rate": 4.983045753407564e-06, "loss": 0.0003, "reward": -0.12200000137090683, "reward_std": 0.3381112515926361, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12200000137090683, "step": 300 }, { "completion_length": 200.0, "epoch": 0.13395638629283488, "grad_norm": 0.00788823515176773, "kl": 0.004913420882076025, "learning_rate": 4.982591155020367e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 301 }, { "completion_length": 200.0, "epoch": 0.1344014241210503, "grad_norm": 0.019300375133752823, "kl": 0.0115005848929286, "learning_rate": 4.9821305636568835e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 302 }, { "completion_length": 200.0, "epoch": 0.1348464619492657, "grad_norm": 0.5956396460533142, "kl": 0.014617225155234337, "learning_rate": 4.981663980428981e-06, "loss": 0.0006, "reward": 0.019499998539686203, "reward_std": 0.2584211826324463, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019499998539686203, "step": 303 }, { "completion_length": 200.0, "epoch": 0.1352914997774811, "grad_norm": 0.013532106764614582, "kl": 0.007755351718515158, "learning_rate": 4.981191406462991e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 304 }, { "completion_length": 200.0, "epoch": 0.13573653760569648, "grad_norm": 0.6766030788421631, "kl": 0.019166380167007446, "learning_rate": 4.9807128428997085e-06, "loss": 0.0008, "reward": 0.020166665315628052, "reward_std": 0.2567881941795349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020166665315628052, "step": 305 }, { "completion_length": 200.0, "epoch": 0.13618157543391188, "grad_norm": 0.0559423454105854, "kl": 0.023686787113547325, "learning_rate": 4.980228290894386e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 306 }, { "completion_length": 200.0, "epoch": 0.13662661326212727, "grad_norm": 0.03602616861462593, "kl": 0.021832283586263657, "learning_rate": 4.979737751616732e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 307 }, { "completion_length": 200.0, "epoch": 0.13707165109034267, "grad_norm": 1.0004466772079468, "kl": 0.01577741652727127, "learning_rate": 4.979241226250908e-06, "loss": 0.0006, "reward": 0.010333329439163208, "reward_std": 0.2808748185634613, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.010333329439163208, "step": 308 }, { "completion_length": 200.0, "epoch": 0.1375166889185581, "grad_norm": 0.01729634776711464, "kl": 0.007570373825728893, "learning_rate": 4.9787387159955265e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 309 }, { "completion_length": 200.0, "epoch": 0.13796172674677348, "grad_norm": 0.0166346225887537, "kl": 0.007312558591365814, "learning_rate": 4.978230222063649e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 310 }, { "completion_length": 200.0, "epoch": 0.13840676457498888, "grad_norm": 0.018261654302477837, "kl": 0.009910644963383675, "learning_rate": 4.9777157456827785e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 311 }, { "completion_length": 200.0, "epoch": 0.13885180240320427, "grad_norm": 0.02585846185684204, "kl": 0.011723216623067856, "learning_rate": 4.977195288094863e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 312 }, { "completion_length": 200.0, "epoch": 0.13929684023141967, "grad_norm": 0.02551482431590557, "kl": 0.011289785616099834, "learning_rate": 4.976668850556284e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 313 }, { "completion_length": 200.0, "epoch": 0.13974187805963506, "grad_norm": 0.022060496732592583, "kl": 0.01366241555660963, "learning_rate": 4.976136434337866e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 314 }, { "completion_length": 200.0, "epoch": 0.14018691588785046, "grad_norm": 0.5639723539352417, "kl": 0.006856884807348251, "learning_rate": 4.97559804072486e-06, "loss": 0.0003, "reward": 0.022499999031424522, "reward_std": 0.2510727047920227, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022499999031424522, "step": 315 }, { "completion_length": 196.0, "epoch": 0.14063195371606588, "grad_norm": 0.8882031440734863, "kl": 0.012949245050549507, "learning_rate": 4.9750536710169485e-06, "loss": 0.0005, "reward": 0.061000000685453415, "reward_std": 0.1567673534154892, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.061000000685453415, "step": 316 }, { "completion_length": 200.0, "epoch": 0.14107699154428127, "grad_norm": 0.738691508769989, "kl": 0.018215883523225784, "learning_rate": 4.97450332652824e-06, "loss": 0.0007, "reward": -0.19816666841506958, "reward_std": 0.35409173369407654, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19816666841506958, "step": 317 }, { "completion_length": 200.0, "epoch": 0.14152202937249667, "grad_norm": 0.008259186521172523, "kl": 0.004001545254141092, "learning_rate": 4.973947008587268e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 318 }, { "completion_length": 183.0, "epoch": 0.14196706720071206, "grad_norm": 0.8659139275550842, "kl": 0.02181245945394039, "learning_rate": 4.973384718536982e-06, "loss": 0.0009, "reward": 0.03533333167433739, "reward_std": 0.18040475249290466, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03533333167433739, "step": 319 }, { "completion_length": 200.0, "epoch": 0.14241210502892745, "grad_norm": 0.013459905050694942, "kl": 0.008506972342729568, "learning_rate": 4.972816457734752e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 320 }, { "completion_length": 200.0, "epoch": 0.14285714285714285, "grad_norm": 0.6455613970756531, "kl": 0.04512510821223259, "learning_rate": 4.972242227552358e-06, "loss": 0.0018, "reward": 0.008666664361953735, "reward_std": 0.2849573493003845, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.008666664361953735, "step": 321 }, { "completion_length": 182.33334350585938, "epoch": 0.14330218068535824, "grad_norm": 0.7501981854438782, "kl": 0.01761593297123909, "learning_rate": 4.971662029375995e-06, "loss": 0.0007, "reward": -0.17916667461395264, "reward_std": 0.331107497215271, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17916667461395264, "step": 322 }, { "completion_length": 200.0, "epoch": 0.14374721851357367, "grad_norm": 0.008645527996122837, "kl": 0.003186706220731139, "learning_rate": 4.97107586460626e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 323 }, { "completion_length": 175.83334350585938, "epoch": 0.14419225634178906, "grad_norm": 0.7112619876861572, "kl": 0.030575290322303772, "learning_rate": 4.970483734658154e-06, "loss": 0.0012, "reward": 0.08583333343267441, "reward_std": 0.12314936518669128, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08583333343267441, "step": 324 }, { "completion_length": 200.0, "epoch": 0.14463729417000445, "grad_norm": 0.035192981362342834, "kl": 0.008340008556842804, "learning_rate": 4.969885640961081e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 325 }, { "completion_length": 190.0, "epoch": 0.14508233199821985, "grad_norm": 0.7466467618942261, "kl": 0.0285557322204113, "learning_rate": 4.969281584958838e-06, "loss": 0.0011, "reward": 0.015166670083999634, "reward_std": 0.2690356373786926, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.015166670083999634, "step": 326 }, { "completion_length": 191.1666717529297, "epoch": 0.14552736982643524, "grad_norm": 0.802043616771698, "kl": 0.025279924273490906, "learning_rate": 4.968671568109617e-06, "loss": 0.001, "reward": 0.0898333415389061, "reward_std": 0.21741704642772675, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0898333415389061, "step": 327 }, { "completion_length": 167.5, "epoch": 0.14597240765465064, "grad_norm": 0.812122642993927, "kl": 0.01720350608229637, "learning_rate": 4.968055591885999e-06, "loss": 0.0007, "reward": -0.06016666814684868, "reward_std": 0.24665558338165283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06016666814684868, "step": 328 }, { "completion_length": 200.0, "epoch": 0.14641744548286603, "grad_norm": 0.7395609021186829, "kl": 0.05103464424610138, "learning_rate": 4.967433657774952e-06, "loss": 0.002, "reward": -0.07100000232458115, "reward_std": 0.3038104772567749, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07100000232458115, "step": 329 }, { "completion_length": 179.83334350585938, "epoch": 0.14686248331108145, "grad_norm": 0.7787045836448669, "kl": 0.04071980342268944, "learning_rate": 4.9668057672778225e-06, "loss": 0.0016, "reward": 0.047833334654569626, "reward_std": 0.21287593245506287, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.047833334654569626, "step": 330 }, { "completion_length": 200.0, "epoch": 0.14730752113929685, "grad_norm": 0.6824626326560974, "kl": 0.008245850913226604, "learning_rate": 4.966171921910341e-06, "loss": 0.0003, "reward": -0.09216667711734772, "reward_std": 0.34250280261039734, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09216667711734772, "step": 331 }, { "completion_length": 200.0, "epoch": 0.14775255896751224, "grad_norm": 0.01006829272955656, "kl": 0.004723408259451389, "learning_rate": 4.96553212320261e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 332 }, { "completion_length": 200.0, "epoch": 0.14819759679572764, "grad_norm": 0.7323021292686462, "kl": 0.010274862870573997, "learning_rate": 4.9648863726991035e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 333 }, { "completion_length": 200.0, "epoch": 0.14864263462394303, "grad_norm": 0.024399923160672188, "kl": 0.013128578662872314, "learning_rate": 4.964234671958663e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 334 }, { "completion_length": 200.0, "epoch": 0.14908767245215843, "grad_norm": 0.010077173821628094, "kl": 0.004746645223349333, "learning_rate": 4.963577022554496e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 335 }, { "completion_length": 200.0, "epoch": 0.14953271028037382, "grad_norm": 0.011270418763160706, "kl": 0.005329379811882973, "learning_rate": 4.962913426074166e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 336 }, { "completion_length": 200.0, "epoch": 0.14997774810858924, "grad_norm": 0.012212223373353481, "kl": 0.006592839956283569, "learning_rate": 4.9622438841195986e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 337 }, { "completion_length": 200.0, "epoch": 0.15042278593680464, "grad_norm": 0.007356896065175533, "kl": 0.003410342149436474, "learning_rate": 4.961568398307065e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 338 }, { "completion_length": 199.5, "epoch": 0.15086782376502003, "grad_norm": 0.6488639116287231, "kl": 0.01651693880558014, "learning_rate": 4.960886970267191e-06, "loss": 0.0007, "reward": 0.05366666615009308, "reward_std": 0.17473027110099792, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05366666615009308, "step": 339 }, { "completion_length": 200.0, "epoch": 0.15131286159323543, "grad_norm": 0.024112718179821968, "kl": 0.018834218382835388, "learning_rate": 4.960199601644943e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 340 }, { "completion_length": 200.0, "epoch": 0.15175789942145082, "grad_norm": 0.013589066453278065, "kl": 0.01101887971162796, "learning_rate": 4.959506294099629e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 341 }, { "completion_length": 200.0, "epoch": 0.15220293724966621, "grad_norm": 0.010311473160982132, "kl": 0.007442638278007507, "learning_rate": 4.958807049304893e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 342 }, { "completion_length": 171.1666717529297, "epoch": 0.1526479750778816, "grad_norm": 5.103739261627197, "kl": 0.3760009706020355, "learning_rate": 4.958101868948715e-06, "loss": 0.015, "reward": 0.1758333444595337, "reward_std": 0.12451573461294174, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1758333444595337, "step": 343 }, { "completion_length": 200.0, "epoch": 0.15309301290609703, "grad_norm": 0.013698318973183632, "kl": 0.011166570708155632, "learning_rate": 4.957390754733398e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 344 }, { "completion_length": 200.0, "epoch": 0.15353805073431243, "grad_norm": 0.014360117726027966, "kl": 0.010611571371555328, "learning_rate": 4.956673708375574e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 345 }, { "completion_length": 189.5, "epoch": 0.15398308856252782, "grad_norm": 0.7445936799049377, "kl": 0.028817251324653625, "learning_rate": 4.955950731606192e-06, "loss": 0.0012, "reward": 0.07383333891630173, "reward_std": 0.12533222138881683, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07383333891630173, "step": 346 }, { "completion_length": 184.0, "epoch": 0.15442812639074321, "grad_norm": 0.7008059024810791, "kl": 0.05760132521390915, "learning_rate": 4.9552218261705185e-06, "loss": 0.0023, "reward": -0.22599999606609344, "reward_std": 0.35765790939331055, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22599999606609344, "step": 347 }, { "completion_length": 200.0, "epoch": 0.1548731642189586, "grad_norm": 0.6954376697540283, "kl": 0.007197665050625801, "learning_rate": 4.954486993828132e-06, "loss": 0.0003, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 348 }, { "completion_length": 200.0, "epoch": 0.155318202047174, "grad_norm": 0.009056415408849716, "kl": 0.007077607326209545, "learning_rate": 4.953746236352917e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 349 }, { "completion_length": 200.0, "epoch": 0.1557632398753894, "grad_norm": 0.01317316759377718, "kl": 0.01229158602654934, "learning_rate": 4.952999555533065e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 350 }, { "completion_length": 195.5, "epoch": 0.15620827770360482, "grad_norm": 0.6882670521736145, "kl": 0.024929411709308624, "learning_rate": 4.952246953171062e-06, "loss": 0.001, "reward": -0.04200000315904617, "reward_std": 0.33165282011032104, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04200000315904617, "step": 351 }, { "completion_length": 195.6666717529297, "epoch": 0.15665331553182021, "grad_norm": 0.676509439945221, "kl": 0.04691646993160248, "learning_rate": 4.951488431083689e-06, "loss": 0.0019, "reward": -0.13633333146572113, "reward_std": 0.29477566480636597, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13633333146572113, "step": 352 }, { "completion_length": 200.0, "epoch": 0.1570983533600356, "grad_norm": 0.6416396498680115, "kl": 0.01036627497524023, "learning_rate": 4.950723991102022e-06, "loss": 0.0004, "reward": -0.014666667208075523, "reward_std": 0.34211206436157227, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.014666667208075523, "step": 353 }, { "completion_length": 200.0, "epoch": 0.157543391188251, "grad_norm": 0.6665915846824646, "kl": 0.03278065845370293, "learning_rate": 4.949953635071417e-06, "loss": 0.0013, "reward": -0.08783333748579025, "reward_std": 0.3303600549697876, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08783333748579025, "step": 354 }, { "completion_length": 191.5, "epoch": 0.1579884290164664, "grad_norm": 0.9799753427505493, "kl": 0.01656440459191799, "learning_rate": 4.949177364851515e-06, "loss": 0.0007, "reward": 0.0429999977350235, "reward_std": 0.20085816085338593, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0429999977350235, "step": 355 }, { "completion_length": 200.0, "epoch": 0.1584334668446818, "grad_norm": 0.016740066930651665, "kl": 0.01629803143441677, "learning_rate": 4.9483951823162326e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 356 }, { "completion_length": 200.0, "epoch": 0.1588785046728972, "grad_norm": 0.014137927442789078, "kl": 0.007187969516962767, "learning_rate": 4.947607089353758e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 357 }, { "completion_length": 200.0, "epoch": 0.1593235425011126, "grad_norm": 0.6014176607131958, "kl": 0.018046831712126732, "learning_rate": 4.946813087866549e-06, "loss": 0.0007, "reward": 0.007333338260650635, "reward_std": 0.2882232666015625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007333338260650635, "step": 358 }, { "completion_length": 200.0, "epoch": 0.159768580329328, "grad_norm": 0.013204401358962059, "kl": 0.013151012361049652, "learning_rate": 4.946013179771325e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 359 }, { "completion_length": 199.5, "epoch": 0.1602136181575434, "grad_norm": 0.6359446048736572, "kl": 0.02717038244009018, "learning_rate": 4.9452073669990656e-06, "loss": 0.0011, "reward": 0.03733333572745323, "reward_std": 0.2147386074066162, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03733333572745323, "step": 360 }, { "completion_length": 200.0, "epoch": 0.1606586559857588, "grad_norm": 0.013449462130665779, "kl": 0.006884987931698561, "learning_rate": 4.944395651495002e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 361 }, { "completion_length": 187.6666717529297, "epoch": 0.16110369381397419, "grad_norm": 0.818776547908783, "kl": 0.0218803733587265, "learning_rate": 4.9435780352186154e-06, "loss": 0.0009, "reward": -0.00916666816920042, "reward_std": 0.21735171973705292, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.009166665375232697, "step": 362 }, { "completion_length": 200.0, "epoch": 0.16154873164218958, "grad_norm": 0.013731640763580799, "kl": 0.0065148696303367615, "learning_rate": 4.942754520143634e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 363 }, { "completion_length": 200.0, "epoch": 0.16199376947040497, "grad_norm": 0.009971830062568188, "kl": 0.012283856980502605, "learning_rate": 4.9419251082580216e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 364 }, { "completion_length": 200.0, "epoch": 0.16243880729862037, "grad_norm": 0.6981242895126343, "kl": 0.022827019914984703, "learning_rate": 4.94108980156398e-06, "loss": 0.0009, "reward": -0.02850000187754631, "reward_std": 0.3187059760093689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02850000187754631, "step": 365 }, { "completion_length": 186.1666717529297, "epoch": 0.1628838451268358, "grad_norm": 0.7663524746894836, "kl": 0.02265150099992752, "learning_rate": 4.940248602077939e-06, "loss": 0.0009, "reward": 0.006000000052154064, "reward_std": 0.2016005963087082, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.006000000052154064, "step": 366 }, { "completion_length": 200.0, "epoch": 0.16332888295505119, "grad_norm": 0.01284782588481903, "kl": 0.009670613333582878, "learning_rate": 4.939401511830556e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 367 }, { "completion_length": 200.0, "epoch": 0.16377392078326658, "grad_norm": 0.01885703206062317, "kl": 0.009269410744309425, "learning_rate": 4.938548532866706e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 368 }, { "completion_length": 200.0, "epoch": 0.16421895861148197, "grad_norm": 0.6328880190849304, "kl": 0.01867607608437538, "learning_rate": 4.937689667245481e-06, "loss": 0.0007, "reward": 0.021500002592802048, "reward_std": 0.2535221576690674, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021500002592802048, "step": 369 }, { "completion_length": 200.0, "epoch": 0.16466399643969737, "grad_norm": 0.010342692025005817, "kl": 0.0059143840335309505, "learning_rate": 4.936824917040184e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 370 }, { "completion_length": 200.0, "epoch": 0.16510903426791276, "grad_norm": 0.6799391508102417, "kl": 0.01873181015253067, "learning_rate": 4.935954284338321e-06, "loss": 0.0007, "reward": 0.005833338014781475, "reward_std": 0.2918975353240967, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005833338014781475, "step": 371 }, { "completion_length": 200.0, "epoch": 0.16555407209612816, "grad_norm": 0.017115725204348564, "kl": 0.010264448821544647, "learning_rate": 4.9350777712415995e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 372 }, { "completion_length": 200.0, "epoch": 0.16599910992434358, "grad_norm": 0.010852769017219543, "kl": 0.007528107613325119, "learning_rate": 4.934195379865925e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 373 }, { "completion_length": 168.83334350585938, "epoch": 0.16644414775255897, "grad_norm": 1.29231595993042, "kl": 0.18531188368797302, "learning_rate": 4.933307112341388e-06, "loss": 0.0074, "reward": 0.1628333330154419, "reward_std": 0.13582256436347961, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1628333330154419, "step": 374 }, { "completion_length": 200.0, "epoch": 0.16688918558077437, "grad_norm": 0.009757625870406628, "kl": 0.01133672520518303, "learning_rate": 4.932412970812269e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 375 }, { "completion_length": 199.5, "epoch": 0.16733422340898976, "grad_norm": 0.01278277114033699, "kl": 0.009305099956691265, "learning_rate": 4.931512957437024e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 376 }, { "completion_length": 200.0, "epoch": 0.16777926123720516, "grad_norm": 0.013093134388327599, "kl": 0.007520940154790878, "learning_rate": 4.930607074388287e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 377 }, { "completion_length": 200.0, "epoch": 0.16822429906542055, "grad_norm": 0.011228191666305065, "kl": 0.008690441027283669, "learning_rate": 4.92969532385286e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 378 }, { "completion_length": 185.5, "epoch": 0.16866933689363595, "grad_norm": 0.7297086119651794, "kl": 0.026256537064909935, "learning_rate": 4.928777708031709e-06, "loss": 0.0011, "reward": 0.016166668385267258, "reward_std": 0.16865399479866028, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016166668385267258, "step": 379 }, { "completion_length": 200.0, "epoch": 0.16911437472185137, "grad_norm": 0.015303281135857105, "kl": 0.01288022380322218, "learning_rate": 4.927854229139959e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 380 }, { "completion_length": 200.0, "epoch": 0.16955941255006676, "grad_norm": 0.011454589664936066, "kl": 0.006882285233587027, "learning_rate": 4.9269248894068886e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 381 }, { "completion_length": 200.0, "epoch": 0.17000445037828216, "grad_norm": 0.014687180519104004, "kl": 0.014141609892249107, "learning_rate": 4.9259896910759246e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 382 }, { "completion_length": 200.0, "epoch": 0.17044948820649755, "grad_norm": 0.016531087458133698, "kl": 0.00648513063788414, "learning_rate": 4.925048636404635e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 383 }, { "completion_length": 200.0, "epoch": 0.17089452603471295, "grad_norm": 0.00957447849214077, "kl": 0.0061697340570390224, "learning_rate": 4.9241017276647295e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 384 }, { "completion_length": 200.0, "epoch": 0.17133956386292834, "grad_norm": 0.01072862558066845, "kl": 0.005429576151072979, "learning_rate": 4.923148967142043e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 385 }, { "completion_length": 200.0, "epoch": 0.17178460169114373, "grad_norm": 0.024660624563694, "kl": 0.009238356724381447, "learning_rate": 4.9221903571365406e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 386 }, { "completion_length": 200.0, "epoch": 0.17222963951935916, "grad_norm": 0.7495405673980713, "kl": 0.0185337346047163, "learning_rate": 4.921225899962308e-06, "loss": 0.0007, "reward": -0.13200001418590546, "reward_std": 0.3997148871421814, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13200001418590546, "step": 387 }, { "completion_length": 200.0, "epoch": 0.17267467734757455, "grad_norm": 0.010481936857104301, "kl": 0.00567130371928215, "learning_rate": 4.920255597947545e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 388 }, { "completion_length": 200.0, "epoch": 0.17311971517578995, "grad_norm": 0.007845093496143818, "kl": 0.003956751897931099, "learning_rate": 4.919279453434561e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 389 }, { "completion_length": 181.0, "epoch": 0.17356475300400534, "grad_norm": 0.6615269184112549, "kl": 0.030248617753386497, "learning_rate": 4.918297468779771e-06, "loss": 0.0012, "reward": -0.13499999046325684, "reward_std": 0.35328683257102966, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13499999046325684, "step": 390 }, { "completion_length": 200.0, "epoch": 0.17400979083222073, "grad_norm": 0.691724419593811, "kl": 0.02659853920340538, "learning_rate": 4.917309646353682e-06, "loss": 0.0011, "reward": -0.09699999541044235, "reward_std": 0.34400463104248047, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09699999541044235, "step": 391 }, { "completion_length": 184.5, "epoch": 0.17445482866043613, "grad_norm": 0.6306662559509277, "kl": 0.0601261667907238, "learning_rate": 4.916315988540903e-06, "loss": 0.0024, "reward": -0.09049999713897705, "reward_std": 0.25501197576522827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09049999713897705, "step": 392 }, { "completion_length": 200.0, "epoch": 0.17489986648865152, "grad_norm": 0.012944119051098824, "kl": 0.01151751633733511, "learning_rate": 4.9153164977401215e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 393 }, { "completion_length": 200.0, "epoch": 0.17534490431686695, "grad_norm": 0.7263959646224976, "kl": 0.011329833418130875, "learning_rate": 4.914311176364109e-06, "loss": 0.0005, "reward": 0.021500002592802048, "reward_std": 0.2535221576690674, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021500002592802048, "step": 394 }, { "completion_length": 200.0, "epoch": 0.17578994214508234, "grad_norm": 0.00732870027422905, "kl": 0.004891596734523773, "learning_rate": 4.913300026839714e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 395 }, { "completion_length": 200.0, "epoch": 0.17623497997329773, "grad_norm": 0.7324749827384949, "kl": 0.01928497850894928, "learning_rate": 4.912283051607849e-06, "loss": 0.0008, "reward": 0.02199999988079071, "reward_std": 0.25229746103286743, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02199999988079071, "step": 396 }, { "completion_length": 200.0, "epoch": 0.17668001780151313, "grad_norm": 0.8718001842498779, "kl": 0.13316845893859863, "learning_rate": 4.911260253123494e-06, "loss": 0.0053, "reward": -0.1536666750907898, "reward_std": 0.30690693855285645, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1536666750907898, "step": 397 }, { "completion_length": 200.0, "epoch": 0.17712505562972852, "grad_norm": 0.7277849912643433, "kl": 0.01658281497657299, "learning_rate": 4.9102316338556844e-06, "loss": 0.0007, "reward": -0.0003333290515001863, "reward_std": 0.3070027232170105, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0003333290515001863, "step": 398 }, { "completion_length": 200.0, "epoch": 0.17757009345794392, "grad_norm": 0.6445873975753784, "kl": 0.02959112823009491, "learning_rate": 4.909197196287509e-06, "loss": 0.0012, "reward": -0.1274999976158142, "reward_std": 0.44477805495262146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1274999976158142, "step": 399 }, { "completion_length": 172.33334350585938, "epoch": 0.1780151312861593, "grad_norm": 0.7523168921470642, "kl": 0.0322515144944191, "learning_rate": 4.908156942916101e-06, "loss": 0.0013, "reward": -0.03199999779462814, "reward_std": 0.2558077275753021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03199999779462814, "step": 400 }, { "completion_length": 167.0, "epoch": 0.17846016911437473, "grad_norm": 0.8686386942863464, "kl": 0.056089848279953, "learning_rate": 4.90711087625263e-06, "loss": 0.0022, "reward": 0.07116666436195374, "reward_std": 0.14513224363327026, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07116666436195374, "step": 401 }, { "completion_length": 200.0, "epoch": 0.17890520694259013, "grad_norm": 0.00890452042222023, "kl": 0.005404898431152105, "learning_rate": 4.906058998822303e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 402 }, { "completion_length": 200.0, "epoch": 0.17935024477080552, "grad_norm": 0.9637120962142944, "kl": 0.02167494222521782, "learning_rate": 4.905001313164353e-06, "loss": 0.0009, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 403 }, { "completion_length": 200.0, "epoch": 0.17979528259902092, "grad_norm": 0.007997624576091766, "kl": 0.004169671796262264, "learning_rate": 4.9039378218320325e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 404 }, { "completion_length": 200.0, "epoch": 0.1802403204272363, "grad_norm": 0.016796903684735298, "kl": 0.011683585122227669, "learning_rate": 4.902868527392612e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 405 }, { "completion_length": 200.0, "epoch": 0.1806853582554517, "grad_norm": 0.015744149684906006, "kl": 0.012497194111347198, "learning_rate": 4.9017934324273655e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 406 }, { "completion_length": 199.1666717529297, "epoch": 0.1811303960836671, "grad_norm": 0.5119321346282959, "kl": 0.028500860556960106, "learning_rate": 4.900712539531577e-06, "loss": 0.0011, "reward": 0.03933333605527878, "reward_std": 0.2756495475769043, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03933333605527878, "step": 407 }, { "completion_length": 193.6666717529297, "epoch": 0.18157543391188252, "grad_norm": 0.009107636287808418, "kl": 0.009039022959768772, "learning_rate": 4.89962585131452e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 408 }, { "completion_length": 200.0, "epoch": 0.18202047174009792, "grad_norm": 0.013876304030418396, "kl": 0.009923950769007206, "learning_rate": 4.898533370399459e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 409 }, { "completion_length": 200.0, "epoch": 0.1824655095683133, "grad_norm": 0.03001904860138893, "kl": 0.01629229262471199, "learning_rate": 4.897435099423647e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 410 }, { "completion_length": 200.0, "epoch": 0.1829105473965287, "grad_norm": 0.020497044548392296, "kl": 0.0096663823351264, "learning_rate": 4.896331041038309e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 411 }, { "completion_length": 200.0, "epoch": 0.1833555852247441, "grad_norm": 0.9498317241668701, "kl": 0.01545047014951706, "learning_rate": 4.895221197908643e-06, "loss": 0.0006, "reward": 0.001833329675719142, "reward_std": 0.3016955256462097, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.001833329675719142, "step": 412 }, { "completion_length": 200.0, "epoch": 0.1838006230529595, "grad_norm": 0.006678743753582239, "kl": 0.003321468597277999, "learning_rate": 4.89410557271381e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 413 }, { "completion_length": 200.0, "epoch": 0.1842456608811749, "grad_norm": 0.009199023246765137, "kl": 0.005622576922178268, "learning_rate": 4.8929841681469295e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 414 }, { "completion_length": 200.0, "epoch": 0.1846906987093903, "grad_norm": 0.7114378213882446, "kl": 0.02522313967347145, "learning_rate": 4.891856986915073e-06, "loss": 0.001, "reward": -0.10500000417232513, "reward_std": 0.31319066882133484, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10500000417232513, "step": 415 }, { "completion_length": 200.0, "epoch": 0.1851357365376057, "grad_norm": 0.011489290744066238, "kl": 0.008540185168385506, "learning_rate": 4.8907240317392565e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 416 }, { "completion_length": 200.0, "epoch": 0.1855807743658211, "grad_norm": 0.6549782156944275, "kl": 0.023001134395599365, "learning_rate": 4.889585305354436e-06, "loss": 0.0009, "reward": 0.05899999663233757, "reward_std": 0.30181917548179626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05899999663233757, "step": 417 }, { "completion_length": 200.0, "epoch": 0.1860258121940365, "grad_norm": 0.6743258833885193, "kl": 0.014783745631575584, "learning_rate": 4.888440810509496e-06, "loss": 0.0006, "reward": -0.007166664116084576, "reward_std": 0.267223060131073, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007166664116084576, "step": 418 }, { "completion_length": 200.0, "epoch": 0.1864708500222519, "grad_norm": 0.011519741266965866, "kl": 0.006074823439121246, "learning_rate": 4.887290549967247e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 419 }, { "completion_length": 183.1666717529297, "epoch": 0.18691588785046728, "grad_norm": 0.8046799302101135, "kl": 0.018539931625127792, "learning_rate": 4.886134526504421e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.07905694097280502, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 420 }, { "completion_length": 200.0, "epoch": 0.18736092567868268, "grad_norm": 0.02228482812643051, "kl": 0.022595927119255066, "learning_rate": 4.884972742911656e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 421 }, { "completion_length": 170.0, "epoch": 0.1878059635068981, "grad_norm": 1.0265932083129883, "kl": 0.03965630382299423, "learning_rate": 4.8838052019935005e-06, "loss": 0.0016, "reward": -0.19983333349227905, "reward_std": 0.36561259627342224, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19983333349227905, "step": 422 }, { "completion_length": 200.0, "epoch": 0.1882510013351135, "grad_norm": 0.008268559351563454, "kl": 0.006413072347640991, "learning_rate": 4.882631906568398e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 423 }, { "completion_length": 200.0, "epoch": 0.1886960391633289, "grad_norm": 0.011764715425670147, "kl": 0.00969620794057846, "learning_rate": 4.881452859468685e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 424 }, { "completion_length": 200.0, "epoch": 0.18914107699154428, "grad_norm": 0.013174543157219887, "kl": 0.012654460966587067, "learning_rate": 4.880268063540581e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 425 }, { "completion_length": 200.0, "epoch": 0.18958611481975968, "grad_norm": 0.009335462003946304, "kl": 0.011132560670375824, "learning_rate": 4.8790775216441835e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 426 }, { "completion_length": 141.1666717529297, "epoch": 0.19003115264797507, "grad_norm": 0.8435110449790955, "kl": 0.023837899789214134, "learning_rate": 4.877881236653463e-06, "loss": 0.001, "reward": -0.03983333706855774, "reward_std": 0.24120646715164185, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03983333706855774, "step": 427 }, { "completion_length": 199.6666717529297, "epoch": 0.19047619047619047, "grad_norm": 0.6499157547950745, "kl": 0.01160873007029295, "learning_rate": 4.8766792114562495e-06, "loss": 0.0005, "reward": -0.09033333510160446, "reward_std": 0.351275771856308, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09033333510160446, "step": 428 }, { "completion_length": 197.83334350585938, "epoch": 0.1909212283044059, "grad_norm": 0.015310406684875488, "kl": 0.010684727691113949, "learning_rate": 4.875471448954234e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 429 }, { "completion_length": 200.0, "epoch": 0.19136626613262128, "grad_norm": 0.6541863679885864, "kl": 0.01754167675971985, "learning_rate": 4.874257952062957e-06, "loss": 0.0007, "reward": 0.00533333420753479, "reward_std": 0.2931222915649414, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00533333420753479, "step": 430 }, { "completion_length": 200.0, "epoch": 0.19181130396083668, "grad_norm": 0.009647169150412083, "kl": 0.005298088304698467, "learning_rate": 4.873038723711798e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 431 }, { "completion_length": 200.0, "epoch": 0.19225634178905207, "grad_norm": 0.010216983035206795, "kl": 0.007101266644895077, "learning_rate": 4.871813766843977e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 432 }, { "completion_length": 200.0, "epoch": 0.19270137961726747, "grad_norm": 0.007988468743860722, "kl": 0.003836569143459201, "learning_rate": 4.870583084416539e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 433 }, { "completion_length": 200.0, "epoch": 0.19314641744548286, "grad_norm": 0.7281708717346191, "kl": 0.016352718695998192, "learning_rate": 4.869346679400353e-06, "loss": 0.0007, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 434 }, { "completion_length": 200.0, "epoch": 0.19359145527369825, "grad_norm": 0.009215595200657845, "kl": 0.006540496833622456, "learning_rate": 4.868104554780101e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 435 }, { "completion_length": 178.0, "epoch": 0.19403649310191368, "grad_norm": 0.6501026749610901, "kl": 0.030550524592399597, "learning_rate": 4.866856713554271e-06, "loss": 0.0012, "reward": 0.04383333772420883, "reward_std": 0.15619271993637085, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04383333772420883, "step": 436 }, { "completion_length": 200.0, "epoch": 0.19448153093012907, "grad_norm": 0.010590564459562302, "kl": 0.006376064382493496, "learning_rate": 4.865603158735155e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 437 }, { "completion_length": 199.6666717529297, "epoch": 0.19492656875834447, "grad_norm": 0.5724078416824341, "kl": 0.015208952128887177, "learning_rate": 4.864343893348834e-06, "loss": 0.0006, "reward": 0.0690000057220459, "reward_std": 0.20461182296276093, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0690000057220459, "step": 438 }, { "completion_length": 200.0, "epoch": 0.19537160658655986, "grad_norm": 0.5766968727111816, "kl": 0.011872484348714352, "learning_rate": 4.863078920435173e-06, "loss": 0.0005, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 439 }, { "completion_length": 200.0, "epoch": 0.19581664441477525, "grad_norm": 0.0435682088136673, "kl": 0.017930179834365845, "learning_rate": 4.861808243047822e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 440 }, { "completion_length": 200.0, "epoch": 0.19626168224299065, "grad_norm": 0.012740753591060638, "kl": 0.005490332376211882, "learning_rate": 4.860531864254192e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 441 }, { "completion_length": 200.0, "epoch": 0.19670672007120604, "grad_norm": 0.007875868119299412, "kl": 0.003752867691218853, "learning_rate": 4.8592497871354646e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 442 }, { "completion_length": 200.0, "epoch": 0.19715175789942144, "grad_norm": 0.00968019850552082, "kl": 0.0069991848431527615, "learning_rate": 4.857962014786575e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 443 }, { "completion_length": 200.0, "epoch": 0.19759679572763686, "grad_norm": 0.6327012777328491, "kl": 0.013870742172002792, "learning_rate": 4.856668550316203e-06, "loss": 0.0006, "reward": 0.008500000461935997, "reward_std": 0.2853655517101288, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.008500000461935997, "step": 444 }, { "completion_length": 200.0, "epoch": 0.19804183355585225, "grad_norm": 0.6365529298782349, "kl": 0.01527687069028616, "learning_rate": 4.855369396846778e-06, "loss": 0.0006, "reward": 0.0004999985685572028, "reward_std": 0.248800128698349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0004999985685572028, "step": 445 }, { "completion_length": 200.0, "epoch": 0.19848687138406765, "grad_norm": 0.013388743624091148, "kl": 0.0054934462532401085, "learning_rate": 4.854064557514452e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 446 }, { "completion_length": 147.33334350585938, "epoch": 0.19893190921228304, "grad_norm": 0.80791836977005, "kl": 0.06670716404914856, "learning_rate": 4.8527540354691095e-06, "loss": 0.0027, "reward": 0.14483334124088287, "reward_std": 0.055607251822948456, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14483334124088287, "step": 447 }, { "completion_length": 200.0, "epoch": 0.19937694704049844, "grad_norm": 0.008610348217189312, "kl": 0.0053973570466041565, "learning_rate": 4.8514378338743525e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 448 }, { "completion_length": 200.0, "epoch": 0.19982198486871383, "grad_norm": 0.6462050676345825, "kl": 0.022491535171866417, "learning_rate": 4.850115955907491e-06, "loss": 0.0009, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 449 }, { "completion_length": 200.0, "epoch": 0.20026702269692923, "grad_norm": 0.013753926381468773, "kl": 0.003668756689876318, "learning_rate": 4.8487884047595395e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 450 }, { "completion_length": 200.0, "epoch": 0.20071206052514465, "grad_norm": 0.01476313453167677, "kl": 0.009049910120666027, "learning_rate": 4.847455183635207e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 451 }, { "completion_length": 194.5, "epoch": 0.20115709835336004, "grad_norm": 0.7963857650756836, "kl": 0.024605944752693176, "learning_rate": 4.846116295752891e-06, "loss": 0.001, "reward": 0.08583333343267441, "reward_std": 0.1649368703365326, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08583333343267441, "step": 452 }, { "completion_length": 197.33334350585938, "epoch": 0.20160213618157544, "grad_norm": 0.008257864974439144, "kl": 0.00925783533602953, "learning_rate": 4.844771744344666e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 453 }, { "completion_length": 169.5, "epoch": 0.20204717400979083, "grad_norm": 0.7012379169464111, "kl": 0.032544177025556564, "learning_rate": 4.843421532656281e-06, "loss": 0.0013, "reward": 0.028833335265517235, "reward_std": 0.24847166240215302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.028833335265517235, "step": 454 }, { "completion_length": 133.5, "epoch": 0.20249221183800623, "grad_norm": 1.1525821685791016, "kl": 0.030363596975803375, "learning_rate": 4.8420656639471466e-06, "loss": 0.0012, "reward": -0.10766666382551193, "reward_std": 0.2986413836479187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10766667127609253, "step": 455 }, { "completion_length": 189.1666717529297, "epoch": 0.20293724966622162, "grad_norm": 0.01882368139922619, "kl": 0.018681103363633156, "learning_rate": 4.84070414149033e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 456 }, { "completion_length": 192.33334350585938, "epoch": 0.20338228749443701, "grad_norm": 0.907418429851532, "kl": 0.016113460063934326, "learning_rate": 4.83933696857255e-06, "loss": 0.0006, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 457 }, { "completion_length": 200.0, "epoch": 0.20382732532265244, "grad_norm": 0.02320541813969612, "kl": 0.012556570582091808, "learning_rate": 4.83796414849416e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 458 }, { "completion_length": 189.1666717529297, "epoch": 0.20427236315086783, "grad_norm": 0.8501640558242798, "kl": 0.023711485788226128, "learning_rate": 4.836585684569148e-06, "loss": 0.0009, "reward": 0.05000000447034836, "reward_std": 0.18371173739433289, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05000000447034836, "step": 459 }, { "completion_length": 200.0, "epoch": 0.20471740097908322, "grad_norm": 0.0207956675440073, "kl": 0.008212253451347351, "learning_rate": 4.83520158012513e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 460 }, { "completion_length": 199.33334350585938, "epoch": 0.20516243880729862, "grad_norm": 0.01620439812541008, "kl": 0.011511600576341152, "learning_rate": 4.833811838503331e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 461 }, { "completion_length": 200.0, "epoch": 0.205607476635514, "grad_norm": 0.8750971555709839, "kl": 0.007969997823238373, "learning_rate": 4.83241646305859e-06, "loss": 0.0003, "reward": -0.0003333290515001863, "reward_std": 0.3070027232170105, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0003333290515001863, "step": 462 }, { "completion_length": 200.0, "epoch": 0.2060525144637294, "grad_norm": 0.012503387406468391, "kl": 0.005554807838052511, "learning_rate": 4.8310154571593435e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 463 }, { "completion_length": 200.0, "epoch": 0.2064975522919448, "grad_norm": 0.01813841424882412, "kl": 0.00543005159124732, "learning_rate": 4.829608824187621e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 464 }, { "completion_length": 200.0, "epoch": 0.20694259012016022, "grad_norm": 0.007825582288205624, "kl": 0.007653496228158474, "learning_rate": 4.828196567539034e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 465 }, { "completion_length": 200.0, "epoch": 0.20738762794837562, "grad_norm": 0.03445260971784592, "kl": 0.011151362210512161, "learning_rate": 4.826778690622772e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 466 }, { "completion_length": 192.6666717529297, "epoch": 0.207832665776591, "grad_norm": 0.6254943609237671, "kl": 0.0196918286383152, "learning_rate": 4.82535519686159e-06, "loss": 0.0008, "reward": -0.05883334204554558, "reward_std": 0.3185080885887146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05883333832025528, "step": 467 }, { "completion_length": 183.33334350585938, "epoch": 0.2082777036048064, "grad_norm": 0.7328706383705139, "kl": 0.045631419867277145, "learning_rate": 4.823926089691803e-06, "loss": 0.0018, "reward": -0.11100000143051147, "reward_std": 0.37771734595298767, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11100000143051147, "step": 468 }, { "completion_length": 200.0, "epoch": 0.2087227414330218, "grad_norm": 0.013255412690341473, "kl": 0.011069796979427338, "learning_rate": 4.822491372563276e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 469 }, { "completion_length": 200.0, "epoch": 0.2091677792612372, "grad_norm": 0.022944483906030655, "kl": 0.013812100514769554, "learning_rate": 4.821051048939416e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 470 }, { "completion_length": 200.0, "epoch": 0.2096128170894526, "grad_norm": 0.009076782502233982, "kl": 0.0035323970951139927, "learning_rate": 4.819605122297167e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 471 }, { "completion_length": 200.0, "epoch": 0.210057854917668, "grad_norm": 0.047525037080049515, "kl": 0.008770107291638851, "learning_rate": 4.818153596126995e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 472 }, { "completion_length": 200.0, "epoch": 0.2105028927458834, "grad_norm": 0.7111804485321045, "kl": 0.02760786935687065, "learning_rate": 4.816696473932886e-06, "loss": 0.0011, "reward": -0.012666663154959679, "reward_std": 0.33721309900283813, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.012666663154959679, "step": 473 }, { "completion_length": 70.66667175292969, "epoch": 0.2109479305740988, "grad_norm": 1.4313840866088867, "kl": 0.02774934470653534, "learning_rate": 4.815233759232333e-06, "loss": 0.0011, "reward": 0.01666666753590107, "reward_std": 0.054006174206733704, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01666666753590107, "step": 474 }, { "completion_length": 200.0, "epoch": 0.2113929684023142, "grad_norm": 0.008336659520864487, "kl": 0.003154546720907092, "learning_rate": 4.8137654555563305e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 475 }, { "completion_length": 200.0, "epoch": 0.2118380062305296, "grad_norm": 0.6730583310127258, "kl": 0.02612406387925148, "learning_rate": 4.812291566449363e-06, "loss": 0.001, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 476 }, { "completion_length": 200.0, "epoch": 0.21228304405874499, "grad_norm": 0.6641086935997009, "kl": 0.013326774351298809, "learning_rate": 4.810812095469401e-06, "loss": 0.0005, "reward": 0.011333337053656578, "reward_std": 0.27842533588409424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011333337053656578, "step": 477 }, { "completion_length": 196.0, "epoch": 0.21272808188696038, "grad_norm": 0.02167276106774807, "kl": 0.016343414783477783, "learning_rate": 4.809327046187888e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 478 }, { "completion_length": 200.0, "epoch": 0.2131731197151758, "grad_norm": 0.6960393786430359, "kl": 0.00471863616257906, "learning_rate": 4.807836422189733e-06, "loss": 0.0002, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 479 }, { "completion_length": 200.0, "epoch": 0.2136181575433912, "grad_norm": 0.03306792676448822, "kl": 0.022989537566900253, "learning_rate": 4.806340227073304e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 480 }, { "completion_length": 186.1666717529297, "epoch": 0.2140631953716066, "grad_norm": 0.7957750558853149, "kl": 0.0064355782233178616, "learning_rate": 4.8048384644504165e-06, "loss": 0.0003, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 481 }, { "completion_length": 190.0, "epoch": 0.21450823319982198, "grad_norm": 0.7082236409187317, "kl": 0.05344567820429802, "learning_rate": 4.8033311379463255e-06, "loss": 0.0021, "reward": -0.0898333415389061, "reward_std": 0.3882114887237549, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0898333415389061, "step": 482 }, { "completion_length": 200.0, "epoch": 0.21495327102803738, "grad_norm": 0.019929101690649986, "kl": 0.008283906616270542, "learning_rate": 4.801818251199718e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 483 }, { "completion_length": 185.5, "epoch": 0.21539830885625277, "grad_norm": 0.6174662113189697, "kl": 0.02444113790988922, "learning_rate": 4.800299807862705e-06, "loss": 0.001, "reward": 0.00916666816920042, "reward_std": 0.22120074927806854, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00916666816920042, "step": 484 }, { "completion_length": 200.0, "epoch": 0.21584334668446817, "grad_norm": 0.6985958814620972, "kl": 0.047310732305049896, "learning_rate": 4.798775811600807e-06, "loss": 0.0019, "reward": -0.09299999475479126, "reward_std": 0.3469115197658539, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09299999475479126, "step": 485 }, { "completion_length": 200.0, "epoch": 0.2162883845126836, "grad_norm": 0.008483149111270905, "kl": 0.004622921347618103, "learning_rate": 4.7972462660929546e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 486 }, { "completion_length": 190.6666717529297, "epoch": 0.21673342234089898, "grad_norm": 0.8138934969902039, "kl": 0.022382408380508423, "learning_rate": 4.795711175031467e-06, "loss": 0.0009, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 487 }, { "completion_length": 200.0, "epoch": 0.21717846016911438, "grad_norm": 0.010969250462949276, "kl": 0.005018382798880339, "learning_rate": 4.79417054212206e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 488 }, { "completion_length": 200.0, "epoch": 0.21762349799732977, "grad_norm": 0.007726567331701517, "kl": 0.003069926518946886, "learning_rate": 4.792624371083819e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 489 }, { "completion_length": 200.0, "epoch": 0.21806853582554517, "grad_norm": 0.00889444351196289, "kl": 0.004114137962460518, "learning_rate": 4.791072665649203e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 490 }, { "completion_length": 187.0, "epoch": 0.21851357365376056, "grad_norm": 0.9759646654129028, "kl": 0.02656198851764202, "learning_rate": 4.789515429564029e-06, "loss": 0.0011, "reward": 0.13449999690055847, "reward_std": 0.023270150646567345, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13449999690055847, "step": 491 }, { "completion_length": 164.6666717529297, "epoch": 0.21895861148197596, "grad_norm": 0.698348343372345, "kl": 0.03639654442667961, "learning_rate": 4.787952666587465e-06, "loss": 0.0015, "reward": 0.06016666814684868, "reward_std": 0.2878718078136444, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06016666814684868, "step": 492 }, { "completion_length": 181.83334350585938, "epoch": 0.21940364931019138, "grad_norm": 0.6999357342720032, "kl": 0.0428909957408905, "learning_rate": 4.786384380492024e-06, "loss": 0.0017, "reward": 0.16099999845027924, "reward_std": 0.13888844847679138, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16099999845027924, "step": 493 }, { "completion_length": 184.83334350585938, "epoch": 0.21984868713840677, "grad_norm": 0.7373248338699341, "kl": 0.051096897572278976, "learning_rate": 4.784810575063546e-06, "loss": 0.002, "reward": 0.019499998539686203, "reward_std": 0.3235452175140381, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019499998539686203, "step": 494 }, { "completion_length": 198.5, "epoch": 0.22029372496662217, "grad_norm": 0.8084774017333984, "kl": 0.012246180325746536, "learning_rate": 4.783231254101201e-06, "loss": 0.0005, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 495 }, { "completion_length": 200.0, "epoch": 0.22073876279483756, "grad_norm": 0.008205811493098736, "kl": 0.007350212894380093, "learning_rate": 4.781646421417469e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 496 }, { "completion_length": 200.0, "epoch": 0.22118380062305296, "grad_norm": 0.018955878913402557, "kl": 0.005687872879207134, "learning_rate": 4.780056080838138e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 497 }, { "completion_length": 200.0, "epoch": 0.22162883845126835, "grad_norm": 0.17352764308452606, "kl": 0.04136586934328079, "learning_rate": 4.77846023620229e-06, "loss": 0.0017, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 498 }, { "completion_length": 199.0, "epoch": 0.22207387627948375, "grad_norm": 0.014824754558503628, "kl": 0.008487005718052387, "learning_rate": 4.776858891362296e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 499 }, { "completion_length": 200.0, "epoch": 0.22251891410769917, "grad_norm": 0.7017592191696167, "kl": 0.012654486112296581, "learning_rate": 4.775252050183802e-06, "loss": 0.0005, "reward": 0.025499999523162842, "reward_std": 0.24372422695159912, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025499999523162842, "step": 500 }, { "completion_length": 200.0, "epoch": 0.22296395193591456, "grad_norm": 0.8674989342689514, "kl": 0.024426866322755814, "learning_rate": 4.773639716545723e-06, "loss": 0.001, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 501 }, { "completion_length": 200.0, "epoch": 0.22340898976412996, "grad_norm": 0.008451178669929504, "kl": 0.0035606666933745146, "learning_rate": 4.772021894340235e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 502 }, { "completion_length": 200.0, "epoch": 0.22385402759234535, "grad_norm": 0.014713208191096783, "kl": 0.00692252442240715, "learning_rate": 4.77039858747276e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 503 }, { "completion_length": 200.0, "epoch": 0.22429906542056074, "grad_norm": 0.7125733494758606, "kl": 0.031231265515089035, "learning_rate": 4.768769799861962e-06, "loss": 0.0012, "reward": 0.019166668877005577, "reward_std": 0.2592376470565796, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019166668877005577, "step": 504 }, { "completion_length": 199.33334350585938, "epoch": 0.22474410324877614, "grad_norm": 0.6759501099586487, "kl": 0.01907402276992798, "learning_rate": 4.767135535439736e-06, "loss": 0.0008, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 505 }, { "completion_length": 200.0, "epoch": 0.22518914107699153, "grad_norm": 0.008459432050585747, "kl": 0.009604415856301785, "learning_rate": 4.765495798151196e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 506 }, { "completion_length": 191.33334350585938, "epoch": 0.22563417890520696, "grad_norm": 0.9782493710517883, "kl": 0.030173055827617645, "learning_rate": 4.763850591954668e-06, "loss": 0.0012, "reward": 0.1875, "reward_std": 0.1530931293964386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 507 }, { "completion_length": 196.33334350585938, "epoch": 0.22607921673342235, "grad_norm": 0.7100333571434021, "kl": 0.020358001813292503, "learning_rate": 4.762199920821683e-06, "loss": 0.0008, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 508 }, { "completion_length": 199.5, "epoch": 0.22652425456163774, "grad_norm": 0.011824924498796463, "kl": 0.0119178993627429, "learning_rate": 4.760543788736961e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 509 }, { "completion_length": 200.0, "epoch": 0.22696929238985314, "grad_norm": 0.007140854839235544, "kl": 0.005115116946399212, "learning_rate": 4.758882199698405e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 510 }, { "completion_length": 190.1666717529297, "epoch": 0.22741433021806853, "grad_norm": 0.7087447643280029, "kl": 0.018751170486211777, "learning_rate": 4.757215157717091e-06, "loss": 0.0008, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 511 }, { "completion_length": 194.0, "epoch": 0.22785936804628393, "grad_norm": 0.05364219471812248, "kl": 0.0219450481235981, "learning_rate": 4.7555426668172614e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 512 }, { "completion_length": 179.33334350585938, "epoch": 0.22830440587449932, "grad_norm": 0.7024439573287964, "kl": 0.027089372277259827, "learning_rate": 4.753864731036308e-06, "loss": 0.0011, "reward": 0.187666654586792, "reward_std": 0.10494124889373779, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187666654586792, "step": 513 }, { "completion_length": 200.0, "epoch": 0.22874944370271474, "grad_norm": 0.010735830292105675, "kl": 0.008807472884654999, "learning_rate": 4.752181354424769e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 514 }, { "completion_length": 200.0, "epoch": 0.22919448153093014, "grad_norm": 0.008272531442344189, "kl": 0.014129738323390484, "learning_rate": 4.750492541046318e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 515 }, { "completion_length": 200.0, "epoch": 0.22963951935914553, "grad_norm": 0.006092743948101997, "kl": 0.0037111197598278522, "learning_rate": 4.74879829497775e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 516 }, { "completion_length": 200.0, "epoch": 0.23008455718736093, "grad_norm": 0.7442413568496704, "kl": 0.005196265410631895, "learning_rate": 4.747098620308975e-06, "loss": 0.0002, "reward": 0.1041666716337204, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 517 }, { "completion_length": 186.0, "epoch": 0.23052959501557632, "grad_norm": 0.9965373277664185, "kl": 0.03707145154476166, "learning_rate": 4.7453935211430105e-06, "loss": 0.0015, "reward": 0.2083333432674408, "reward_std": 0.15138253569602966, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 518 }, { "completion_length": 200.0, "epoch": 0.23097463284379172, "grad_norm": 0.6923635005950928, "kl": 0.025277286767959595, "learning_rate": 4.743683001595965e-06, "loss": 0.001, "reward": 0.04116666316986084, "reward_std": 0.3334938883781433, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04116666316986084, "step": 519 }, { "completion_length": 200.0, "epoch": 0.2314196706720071, "grad_norm": 0.009316305629909039, "kl": 0.003812011331319809, "learning_rate": 4.741967065797036e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 520 }, { "completion_length": 163.83334350585938, "epoch": 0.23186470850022253, "grad_norm": 1.0490024089813232, "kl": 0.03154284879565239, "learning_rate": 4.740245717888491e-06, "loss": 0.0013, "reward": 0.13366666436195374, "reward_std": 0.11832442134618759, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13366666436195374, "step": 521 }, { "completion_length": 200.0, "epoch": 0.23230974632843793, "grad_norm": 0.5622179508209229, "kl": 0.00910902488976717, "learning_rate": 4.738518962025665e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 522 }, { "completion_length": 199.6666717529297, "epoch": 0.23275478415665332, "grad_norm": 0.6993998885154724, "kl": 0.028999265283346176, "learning_rate": 4.736786802376948e-06, "loss": 0.0012, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 523 }, { "completion_length": 200.0, "epoch": 0.23319982198486872, "grad_norm": 0.011826745234429836, "kl": 0.005504906177520752, "learning_rate": 4.735049243123774e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 524 }, { "completion_length": 200.0, "epoch": 0.2336448598130841, "grad_norm": 0.8087031245231628, "kl": 0.06020812317728996, "learning_rate": 4.7333062884606114e-06, "loss": 0.0024, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 525 }, { "completion_length": 167.33334350585938, "epoch": 0.2340898976412995, "grad_norm": 0.7873777151107788, "kl": 0.03628785163164139, "learning_rate": 4.731557942594956e-06, "loss": 0.0015, "reward": 0.0885000079870224, "reward_std": 0.27989909052848816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0885000079870224, "step": 526 }, { "completion_length": 187.1666717529297, "epoch": 0.2345349354695149, "grad_norm": 0.7097110152244568, "kl": 0.034623414278030396, "learning_rate": 4.729804209747313e-06, "loss": 0.0014, "reward": -0.0033333352766931057, "reward_std": 0.3099939823150635, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0033333352766931057, "step": 527 }, { "completion_length": 200.0, "epoch": 0.2349799732977303, "grad_norm": 0.0095818554982543, "kl": 0.003446843009442091, "learning_rate": 4.728045094151194e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 528 }, { "completion_length": 158.6666717529297, "epoch": 0.23542501112594572, "grad_norm": 1.1593365669250488, "kl": 0.038195449858903885, "learning_rate": 4.726280600053109e-06, "loss": 0.0015, "reward": 0.1041666716337204, "reward_std": 0.0940965861082077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 529 }, { "completion_length": 195.1666717529297, "epoch": 0.2358700489541611, "grad_norm": 0.6258925795555115, "kl": 0.021456394344568253, "learning_rate": 4.724510731712543e-06, "loss": 0.0009, "reward": 0.12516666948795319, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 530 }, { "completion_length": 185.33334350585938, "epoch": 0.2363150867823765, "grad_norm": 0.8283340930938721, "kl": 0.04360618814826012, "learning_rate": 4.722735493401961e-06, "loss": 0.0017, "reward": 0.2291666716337204, "reward_std": 0.14613065123558044, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 531 }, { "completion_length": 198.33334350585938, "epoch": 0.2367601246105919, "grad_norm": 0.6890245676040649, "kl": 0.012334956787526608, "learning_rate": 4.720954889406789e-06, "loss": 0.0005, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 532 }, { "completion_length": 200.0, "epoch": 0.2372051624388073, "grad_norm": 0.006551599130034447, "kl": 0.00498668709769845, "learning_rate": 4.719168924025407e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 533 }, { "completion_length": 190.83334350585938, "epoch": 0.2376502002670227, "grad_norm": 0.7857916355133057, "kl": 0.037860430777072906, "learning_rate": 4.7173776015691345e-06, "loss": 0.0015, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 534 }, { "completion_length": 189.1666717529297, "epoch": 0.23809523809523808, "grad_norm": 0.7188397645950317, "kl": 0.02583630383014679, "learning_rate": 4.715580926362225e-06, "loss": 0.001, "reward": 0.0755000039935112, "reward_std": 0.3398551344871521, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007833331823348999, "step": 535 }, { "completion_length": 200.0, "epoch": 0.2385402759234535, "grad_norm": 0.7328579425811768, "kl": 0.014590962789952755, "learning_rate": 4.713778902741855e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 536 }, { "completion_length": 151.6666717529297, "epoch": 0.2389853137516689, "grad_norm": 0.8885166049003601, "kl": 0.0610353946685791, "learning_rate": 4.7119715350581096e-06, "loss": 0.0024, "reward": 0.250166654586792, "reward_std": 0.15827244520187378, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1668333262205124, "step": 537 }, { "completion_length": 200.0, "epoch": 0.2394303515798843, "grad_norm": 0.6243503093719482, "kl": 0.011630935594439507, "learning_rate": 4.710158827673974e-06, "loss": 0.0005, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 538 }, { "completion_length": 188.1666717529297, "epoch": 0.2398753894080997, "grad_norm": 0.6943646669387817, "kl": 0.02668793499469757, "learning_rate": 4.708340784965326e-06, "loss": 0.0011, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 539 }, { "completion_length": 185.83334350585938, "epoch": 0.24032042723631508, "grad_norm": 0.7564919590950012, "kl": 0.036508020013570786, "learning_rate": 4.7065174113209225e-06, "loss": 0.0015, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 540 }, { "completion_length": 193.1666717529297, "epoch": 0.24076546506453048, "grad_norm": 1.0034822225570679, "kl": 0.038777224719524384, "learning_rate": 4.7046887111423865e-06, "loss": 0.0016, "reward": 0.1041666716337204, "reward_std": 0.0940965861082077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 541 }, { "completion_length": 184.83334350585938, "epoch": 0.24121050289274587, "grad_norm": 0.8872579336166382, "kl": 0.009863987565040588, "learning_rate": 4.702854688844202e-06, "loss": 0.0004, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 542 }, { "completion_length": 112.66667175292969, "epoch": 0.2416555407209613, "grad_norm": 0.8163735866546631, "kl": 0.059337299317121506, "learning_rate": 4.701015348853699e-06, "loss": 0.0024, "reward": 0.2866666913032532, "reward_std": 0.26798635721206665, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03666666895151138, "step": 543 }, { "completion_length": 111.5, "epoch": 0.2421005785491767, "grad_norm": 1.2242677211761475, "kl": 0.08483864367008209, "learning_rate": 4.699170695611047e-06, "loss": 0.0034, "reward": 0.250166654586792, "reward_std": 0.13693124055862427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1668333262205124, "step": 544 }, { "completion_length": 196.6666717529297, "epoch": 0.24254561637739208, "grad_norm": 0.6293683648109436, "kl": 0.014732254669070244, "learning_rate": 4.697320733569238e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 545 }, { "completion_length": 200.0, "epoch": 0.24299065420560748, "grad_norm": 0.015561908483505249, "kl": 0.006728894077241421, "learning_rate": 4.695465467194082e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 546 }, { "completion_length": 199.0, "epoch": 0.24343569203382287, "grad_norm": 0.6494888663291931, "kl": 0.04665082320570946, "learning_rate": 4.693604900964193e-06, "loss": 0.0019, "reward": 0.1875, "reward_std": 0.10458251088857651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 547 }, { "completion_length": 164.1666717529297, "epoch": 0.24388072986203826, "grad_norm": 0.9423882365226746, "kl": 0.06301107257604599, "learning_rate": 4.691739039370979e-06, "loss": 0.0025, "reward": 0.2291666716337204, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 548 }, { "completion_length": 191.5, "epoch": 0.24432576769025366, "grad_norm": 0.7398462891578674, "kl": 0.028107035905122757, "learning_rate": 4.68986788691863e-06, "loss": 0.0011, "reward": 0.1458333432674408, "reward_std": 0.0940965861082077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 549 }, { "completion_length": 200.0, "epoch": 0.24477080551846908, "grad_norm": 0.015271569602191448, "kl": 0.006307273171842098, "learning_rate": 4.68799144812411e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 550 }, { "completion_length": 161.1666717529297, "epoch": 0.24521584334668448, "grad_norm": 0.8253611922264099, "kl": 0.061717789620161057, "learning_rate": 4.686109727517142e-06, "loss": 0.0025, "reward": 0.2916666865348816, "reward_std": 0.23273734748363495, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 551 }, { "completion_length": 149.6666717529297, "epoch": 0.24566088117489987, "grad_norm": 0.9274349808692932, "kl": 0.060279231518507004, "learning_rate": 4.6842227296402025e-06, "loss": 0.0024, "reward": 0.2293333262205124, "reward_std": 0.18415391445159912, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1459999978542328, "step": 552 }, { "completion_length": 180.5, "epoch": 0.24610591900311526, "grad_norm": 0.7610526084899902, "kl": 0.040498070418834686, "learning_rate": 4.6823304590485025e-06, "loss": 0.0016, "reward": 0.1459999978542328, "reward_std": 0.16642116010189056, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1459999978542328, "step": 553 }, { "completion_length": 174.83334350585938, "epoch": 0.24655095683133066, "grad_norm": 0.820443868637085, "kl": 0.07258454710245132, "learning_rate": 4.680432920309986e-06, "loss": 0.0029, "reward": 0.2708333432674408, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 554 }, { "completion_length": 177.33334350585938, "epoch": 0.24699599465954605, "grad_norm": 0.7389416694641113, "kl": 0.04997410625219345, "learning_rate": 4.678530118005313e-06, "loss": 0.002, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 555 }, { "completion_length": 176.83334350585938, "epoch": 0.24744103248776145, "grad_norm": 0.7619123458862305, "kl": 0.0550689622759819, "learning_rate": 4.676622056727848e-06, "loss": 0.0022, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 556 }, { "completion_length": 180.6666717529297, "epoch": 0.24788607031597687, "grad_norm": 0.6843514442443848, "kl": 0.031852152198553085, "learning_rate": 4.674708741083651e-06, "loss": 0.0013, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 557 }, { "completion_length": 95.5, "epoch": 0.24833110814419226, "grad_norm": 1.1454741954803467, "kl": 0.08709007501602173, "learning_rate": 4.6727901756914694e-06, "loss": 0.0035, "reward": 0.375166654586792, "reward_std": 0.15811441838741302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 558 }, { "completion_length": 184.5, "epoch": 0.24877614597240766, "grad_norm": 0.6881827712059021, "kl": 0.05518307909369469, "learning_rate": 4.670866365182719e-06, "loss": 0.0022, "reward": 0.1458333432674408, "reward_std": 0.0940965861082077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 559 }, { "completion_length": 188.1666717529297, "epoch": 0.24922118380062305, "grad_norm": 0.8532994389533997, "kl": 0.03818666934967041, "learning_rate": 4.66893731420148e-06, "loss": 0.0015, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 560 }, { "completion_length": 200.0, "epoch": 0.24966622162883845, "grad_norm": 0.8707519769668579, "kl": 0.02029397338628769, "learning_rate": 4.667003027404483e-06, "loss": 0.0008, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 561 }, { "completion_length": 167.5, "epoch": 0.25011125945705387, "grad_norm": 0.7573645710945129, "kl": 0.08380292356014252, "learning_rate": 4.665063509461098e-06, "loss": 0.0034, "reward": 0.15600000321865082, "reward_std": 0.3755556046962738, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07266666740179062, "step": 562 }, { "completion_length": 200.0, "epoch": 0.25055629728526924, "grad_norm": 0.7199256420135498, "kl": 0.027807530015707016, "learning_rate": 4.663118765053319e-06, "loss": 0.0011, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 563 }, { "completion_length": 129.33334350585938, "epoch": 0.25100133511348466, "grad_norm": 0.9969347715377808, "kl": 0.09646206349134445, "learning_rate": 4.661168798875763e-06, "loss": 0.0039, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 564 }, { "completion_length": 145.5, "epoch": 0.2514463729417, "grad_norm": 0.6874487996101379, "kl": 0.07618147879838943, "learning_rate": 4.6592136156356476e-06, "loss": 0.003, "reward": 0.25, "reward_std": 0.15811388194561005, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 565 }, { "completion_length": 200.0, "epoch": 0.25189141076991545, "grad_norm": 0.8305972814559937, "kl": 0.022951535880565643, "learning_rate": 4.6572532200527875e-06, "loss": 0.0009, "reward": -0.05400000140070915, "reward_std": 0.37837284803390503, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05400000140070915, "step": 566 }, { "completion_length": 153.1666717529297, "epoch": 0.2523364485981308, "grad_norm": 0.9877628684043884, "kl": 0.0713796615600586, "learning_rate": 4.655287616859578e-06, "loss": 0.0029, "reward": 0.187666654586792, "reward_std": 0.06828372180461884, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18766668438911438, "step": 567 }, { "completion_length": 195.33334350585938, "epoch": 0.25278148642634624, "grad_norm": 0.7665106058120728, "kl": 0.04054148495197296, "learning_rate": 4.6533168108009855e-06, "loss": 0.0016, "reward": 0.2291666716337204, "reward_std": 0.14613065123558044, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 568 }, { "completion_length": 170.33334350585938, "epoch": 0.25322652425456166, "grad_norm": 0.8183974623680115, "kl": 0.047635436058044434, "learning_rate": 4.651340806634538e-06, "loss": 0.0019, "reward": 0.3543333411216736, "reward_std": 0.12293358892202377, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187666654586792, "step": 569 }, { "completion_length": 168.6666717529297, "epoch": 0.253671562082777, "grad_norm": 0.7534694075584412, "kl": 0.04617027938365936, "learning_rate": 4.64935960913031e-06, "loss": 0.0018, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 570 }, { "completion_length": 113.16667175292969, "epoch": 0.25411659991099245, "grad_norm": 1.2481725215911865, "kl": 0.07855173945426941, "learning_rate": 4.647373223070913e-06, "loss": 0.0031, "reward": 0.2916666865348816, "reward_std": 0.10206207633018494, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 571 }, { "completion_length": 103.66667175292969, "epoch": 0.2545616377392078, "grad_norm": 1.0237010717391968, "kl": 0.11217594146728516, "learning_rate": 4.645381653251485e-06, "loss": 0.0045, "reward": 0.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 572 }, { "completion_length": 195.5, "epoch": 0.25500667556742324, "grad_norm": 0.886687695980072, "kl": 0.016482416540384293, "learning_rate": 4.643384904479675e-06, "loss": 0.0007, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 573 }, { "completion_length": 114.33333587646484, "epoch": 0.2554517133956386, "grad_norm": 0.04589557647705078, "kl": 0.08041426539421082, "learning_rate": 4.641382981575637e-06, "loss": 0.0032, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 574 }, { "completion_length": 200.0, "epoch": 0.255896751223854, "grad_norm": 0.8335212469100952, "kl": 0.060845568776130676, "learning_rate": 4.639375889372013e-06, "loss": 0.0024, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 575 }, { "completion_length": 192.6666717529297, "epoch": 0.25634178905206945, "grad_norm": 0.8120817542076111, "kl": 0.03659413754940033, "learning_rate": 4.637363632713924e-06, "loss": 0.0015, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 576 }, { "completion_length": 181.6666717529297, "epoch": 0.2567868268802848, "grad_norm": 0.723743736743927, "kl": 0.028544750064611435, "learning_rate": 4.63534621645896e-06, "loss": 0.0011, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 577 }, { "completion_length": 123.0, "epoch": 0.25723186470850024, "grad_norm": 0.8822214007377625, "kl": 0.15481743216514587, "learning_rate": 4.6333236454771644e-06, "loss": 0.0062, "reward": 0.3333333432674408, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 578 }, { "completion_length": 196.0, "epoch": 0.2576769025367156, "grad_norm": 0.8370240926742554, "kl": 0.045977018773555756, "learning_rate": 4.6312959246510245e-06, "loss": 0.0018, "reward": 0.1875, "reward_std": 0.10458251088857651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 579 }, { "completion_length": 150.6666717529297, "epoch": 0.258121940364931, "grad_norm": 0.7318511009216309, "kl": 0.05776175111532211, "learning_rate": 4.629263058875458e-06, "loss": 0.0023, "reward": 0.2291666716337204, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 580 }, { "completion_length": 155.0, "epoch": 0.2585669781931464, "grad_norm": 0.9142718315124512, "kl": 0.069237619638443, "learning_rate": 4.627225053057806e-06, "loss": 0.0028, "reward": 0.3333333432674408, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 581 }, { "completion_length": 165.0, "epoch": 0.2590120160213618, "grad_norm": 0.7616869807243347, "kl": 0.0599093958735466, "learning_rate": 4.6251819121178145e-06, "loss": 0.0024, "reward": 0.3125, "reward_std": 0.1530931293964386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 582 }, { "completion_length": 154.6666717529297, "epoch": 0.25945705384957723, "grad_norm": 0.806352972984314, "kl": 0.08792692422866821, "learning_rate": 4.623133640987628e-06, "loss": 0.0035, "reward": 0.2918333411216736, "reward_std": 0.1881493180990219, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 583 }, { "completion_length": 189.1666717529297, "epoch": 0.2599020916777926, "grad_norm": 0.7354152798652649, "kl": 0.03302110731601715, "learning_rate": 4.621080244611772e-06, "loss": 0.0013, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 584 }, { "completion_length": 197.6666717529297, "epoch": 0.260347129506008, "grad_norm": 0.6742915511131287, "kl": 0.032543033361434937, "learning_rate": 4.619021727947147e-06, "loss": 0.0013, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 585 }, { "completion_length": 151.83334350585938, "epoch": 0.2607921673342234, "grad_norm": 0.9699811935424805, "kl": 0.08444561809301376, "learning_rate": 4.616958095963014e-06, "loss": 0.0034, "reward": 0.1276666671037674, "reward_std": 0.48804986476898193, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1223333403468132, "step": 586 }, { "completion_length": 169.1666717529297, "epoch": 0.2612372051624388, "grad_norm": 1.1708648204803467, "kl": 0.04619307070970535, "learning_rate": 4.6148893536409815e-06, "loss": 0.0018, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 587 }, { "completion_length": 109.33333587646484, "epoch": 0.2616822429906542, "grad_norm": 0.978725016117096, "kl": 0.08659862726926804, "learning_rate": 4.612815505974993e-06, "loss": 0.0035, "reward": 0.3333333432674408, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 588 }, { "completion_length": 185.5, "epoch": 0.2621272808188696, "grad_norm": 0.8235928416252136, "kl": 0.052005793899297714, "learning_rate": 4.610736557971321e-06, "loss": 0.0021, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 589 }, { "completion_length": 174.1666717529297, "epoch": 0.262572318647085, "grad_norm": 0.976612389087677, "kl": 0.045445047318935394, "learning_rate": 4.608652514648544e-06, "loss": 0.0018, "reward": 0.2916666865348816, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 590 }, { "completion_length": 200.0, "epoch": 0.2630173564753004, "grad_norm": 0.014436294324696064, "kl": 0.03448406979441643, "learning_rate": 4.606563381037544e-06, "loss": 0.0014, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 591 }, { "completion_length": 200.0, "epoch": 0.2634623943035158, "grad_norm": 0.8270597457885742, "kl": 0.034238051623106, "learning_rate": 4.604469162181492e-06, "loss": 0.0014, "reward": 0.01300000213086605, "reward_std": 0.2962499260902405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01300000213086605, "step": 592 }, { "completion_length": 148.0, "epoch": 0.2639074321317312, "grad_norm": 0.8803796768188477, "kl": 0.06960055232048035, "learning_rate": 4.6023698631358326e-06, "loss": 0.0028, "reward": 0.437666654586792, "reward_std": 0.10446371138095856, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 593 }, { "completion_length": 200.0, "epoch": 0.2643524699599466, "grad_norm": 0.7200313210487366, "kl": 0.05139869078993797, "learning_rate": 4.6002654889682755e-06, "loss": 0.0021, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 594 }, { "completion_length": 193.6666717529297, "epoch": 0.26479750778816197, "grad_norm": 0.7966943979263306, "kl": 0.040248602628707886, "learning_rate": 4.598156044758779e-06, "loss": 0.0016, "reward": 0.1458333432674408, "reward_std": 0.12289901822805405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 595 }, { "completion_length": 200.0, "epoch": 0.2652425456163774, "grad_norm": 0.755216121673584, "kl": 0.019312169402837753, "learning_rate": 4.5960415355995444e-06, "loss": 0.0008, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 596 }, { "completion_length": 186.1666717529297, "epoch": 0.2656875834445928, "grad_norm": 0.853950023651123, "kl": 0.039613865315914154, "learning_rate": 4.593921966594997e-06, "loss": 0.0016, "reward": 0.026833336800336838, "reward_std": 0.4285675883293152, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.026833336800336838, "step": 597 }, { "completion_length": 191.1666717529297, "epoch": 0.2661326212728082, "grad_norm": 0.7605770826339722, "kl": 0.05703141912817955, "learning_rate": 4.591797342861778e-06, "loss": 0.0023, "reward": 0.1875, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 598 }, { "completion_length": 200.0, "epoch": 0.2665776591010236, "grad_norm": 0.8873202800750732, "kl": 0.03285577893257141, "learning_rate": 4.589667669528729e-06, "loss": 0.0013, "reward": 0.0833333358168602, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 599 }, { "completion_length": 152.5, "epoch": 0.26702269692923897, "grad_norm": 0.7901185154914856, "kl": 0.07483705878257751, "learning_rate": 4.587532951736884e-06, "loss": 0.003, "reward": 0.2916666865348816, "reward_std": 0.1881931722164154, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 600 }, { "completion_length": 147.1666717529297, "epoch": 0.2674677347574544, "grad_norm": 0.9961167573928833, "kl": 0.0769721046090126, "learning_rate": 4.585393194639452e-06, "loss": 0.0031, "reward": 0.3333333432674408, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 601 }, { "completion_length": 200.0, "epoch": 0.26791277258566976, "grad_norm": 0.7630622386932373, "kl": 0.035730935633182526, "learning_rate": 4.583248403401808e-06, "loss": 0.0014, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 602 }, { "completion_length": 160.33334350585938, "epoch": 0.2683578104138852, "grad_norm": 0.896909236907959, "kl": 0.07461149990558624, "learning_rate": 4.581098583201478e-06, "loss": 0.003, "reward": 0.375, "reward_std": 0.1369306445121765, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 603 }, { "completion_length": 116.83333587646484, "epoch": 0.2688028482421006, "grad_norm": 0.9277684688568115, "kl": 0.08133234083652496, "learning_rate": 4.578943739228131e-06, "loss": 0.0033, "reward": 0.24300000071525574, "reward_std": 0.22485819458961487, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15966667234897614, "step": 604 }, { "completion_length": 184.83334350585938, "epoch": 0.26924788607031597, "grad_norm": 0.7694399952888489, "kl": 0.058748748153448105, "learning_rate": 4.576783876683559e-06, "loss": 0.0023, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 605 }, { "completion_length": 136.33334350585938, "epoch": 0.2696929238985314, "grad_norm": 0.9464454054832458, "kl": 0.054710254073143005, "learning_rate": 4.574619000781674e-06, "loss": 0.0022, "reward": 0.250333309173584, "reward_std": 0.1122509092092514, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250333309173584, "step": 606 }, { "completion_length": 179.6666717529297, "epoch": 0.27013796172674676, "grad_norm": 0.8664156198501587, "kl": 0.07143112272024155, "learning_rate": 4.572449116748485e-06, "loss": 0.0029, "reward": 0.3333333432674408, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 607 }, { "completion_length": 200.0, "epoch": 0.2705829995549622, "grad_norm": 0.9034672975540161, "kl": 0.026494070887565613, "learning_rate": 4.570274229822095e-06, "loss": 0.0011, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 608 }, { "completion_length": 200.0, "epoch": 0.27102803738317754, "grad_norm": 0.6816303133964539, "kl": 0.04936773702502251, "learning_rate": 4.5680943452526814e-06, "loss": 0.002, "reward": -0.1028333306312561, "reward_std": 0.36173439025878906, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1028333306312561, "step": 609 }, { "completion_length": 89.5, "epoch": 0.27147307521139297, "grad_norm": 1.2434215545654297, "kl": 0.10808803886175156, "learning_rate": 4.565909468302486e-06, "loss": 0.0043, "reward": 0.382999986410141, "reward_std": 0.14375397562980652, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13300000131130219, "step": 610 }, { "completion_length": 200.0, "epoch": 0.2719181130396084, "grad_norm": 0.03729372099041939, "kl": 0.05879303440451622, "learning_rate": 4.563719604245804e-06, "loss": 0.0024, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 611 }, { "completion_length": 171.83334350585938, "epoch": 0.27236315086782376, "grad_norm": 0.7619086503982544, "kl": 0.14301355183124542, "learning_rate": 4.561524758368968e-06, "loss": 0.0057, "reward": 0.22450000047683716, "reward_std": 0.4011252820491791, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.025499999523162842, "step": 612 }, { "completion_length": 161.83334350585938, "epoch": 0.2728081886960392, "grad_norm": 0.8366971015930176, "kl": 0.07396448403596878, "learning_rate": 4.559324935970337e-06, "loss": 0.003, "reward": 0.3543333411216736, "reward_std": 0.22915641963481903, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021000001579523087, "step": 613 }, { "completion_length": 178.1666717529297, "epoch": 0.27325322652425454, "grad_norm": 0.9588587880134583, "kl": 0.06857272237539291, "learning_rate": 4.5571201423602825e-06, "loss": 0.0027, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 614 }, { "completion_length": 185.83334350585938, "epoch": 0.27369826435246997, "grad_norm": 0.7504873871803284, "kl": 0.07397520542144775, "learning_rate": 4.554910382861178e-06, "loss": 0.003, "reward": 0.2916666865348816, "reward_std": 0.2457980364561081, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 615 }, { "completion_length": 199.33334350585938, "epoch": 0.27414330218068533, "grad_norm": 0.8504765629768372, "kl": 0.06764481961727142, "learning_rate": 4.552695662807385e-06, "loss": 0.0027, "reward": 0.1459999978542328, "reward_std": 0.12306909263134003, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1459999978542328, "step": 616 }, { "completion_length": 91.66667175292969, "epoch": 0.27458834000890076, "grad_norm": 0.24826188385486603, "kl": 0.1672677993774414, "learning_rate": 4.550475987545238e-06, "loss": 0.0067, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 617 }, { "completion_length": 187.0, "epoch": 0.2750333778371162, "grad_norm": 0.9045337438583374, "kl": 0.07695237547159195, "learning_rate": 4.548251362433033e-06, "loss": 0.0031, "reward": 0.1458333432674408, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 618 }, { "completion_length": 106.66667175292969, "epoch": 0.27547841566533154, "grad_norm": 0.9984831213951111, "kl": 0.17150583863258362, "learning_rate": 4.546021792841019e-06, "loss": 0.0069, "reward": 0.3543333411216736, "reward_std": 0.22915641963481903, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021000001579523087, "step": 619 }, { "completion_length": 86.0, "epoch": 0.27592345349354697, "grad_norm": 0.9944882392883301, "kl": 0.17101812362670898, "learning_rate": 4.543787284151374e-06, "loss": 0.0068, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 620 }, { "completion_length": 141.5, "epoch": 0.27636849132176233, "grad_norm": 1.0072808265686035, "kl": 0.11485801637172699, "learning_rate": 4.541547841758207e-06, "loss": 0.0046, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 621 }, { "completion_length": 156.0, "epoch": 0.27681352914997776, "grad_norm": 1.1295337677001953, "kl": 0.121914803981781, "learning_rate": 4.539303471067531e-06, "loss": 0.0049, "reward": 0.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 622 }, { "completion_length": 168.5, "epoch": 0.2772585669781931, "grad_norm": 0.791140615940094, "kl": 0.06966628134250641, "learning_rate": 4.537054177497259e-06, "loss": 0.0028, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 623 }, { "completion_length": 120.0, "epoch": 0.27770360480640854, "grad_norm": 0.9863029718399048, "kl": 0.10272692143917084, "learning_rate": 4.534799966477186e-06, "loss": 0.0041, "reward": 0.3959999978542328, "reward_std": 0.1658191978931427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06266666948795319, "step": 624 }, { "completion_length": 200.0, "epoch": 0.27814864263462397, "grad_norm": 0.8481348752975464, "kl": 0.06225643306970596, "learning_rate": 4.532540843448979e-06, "loss": 0.0025, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 625 }, { "completion_length": 144.0, "epoch": 0.27859368046283933, "grad_norm": 0.9757879376411438, "kl": 0.09962724894285202, "learning_rate": 4.530276813866162e-06, "loss": 0.004, "reward": 0.31316667795181274, "reward_std": 0.0688314288854599, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 626 }, { "completion_length": 194.6666717529297, "epoch": 0.27903871829105475, "grad_norm": 0.931898295879364, "kl": 0.07902925461530685, "learning_rate": 4.528007883194102e-06, "loss": 0.0032, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 627 }, { "completion_length": 190.83334350585938, "epoch": 0.2794837561192701, "grad_norm": 0.831519365310669, "kl": 0.06486264616250992, "learning_rate": 4.525734056910002e-06, "loss": 0.0026, "reward": 0.2918333411216736, "reward_std": 0.2042061984539032, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 628 }, { "completion_length": 111.33333587646484, "epoch": 0.27992879394748554, "grad_norm": 0.9687957167625427, "kl": 0.1136898398399353, "learning_rate": 4.523455340502878e-06, "loss": 0.0045, "reward": 0.375166654586792, "reward_std": 0.15811441838741302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 629 }, { "completion_length": 120.66667175292969, "epoch": 0.2803738317757009, "grad_norm": 0.0987262949347496, "kl": 0.11421214044094086, "learning_rate": 4.521171739473552e-06, "loss": 0.0046, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 630 }, { "completion_length": 120.83333587646484, "epoch": 0.28081886960391633, "grad_norm": 0.8952674269676208, "kl": 0.09326162189245224, "learning_rate": 4.5188832593346386e-06, "loss": 0.0037, "reward": 0.4583333432674408, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 631 }, { "completion_length": 112.33333587646484, "epoch": 0.28126390743213175, "grad_norm": 0.06504479050636292, "kl": 0.12895925343036652, "learning_rate": 4.51658990561053e-06, "loss": 0.0052, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 632 }, { "completion_length": 118.16667175292969, "epoch": 0.2817089452603471, "grad_norm": 1.197392463684082, "kl": 0.39798209071159363, "learning_rate": 4.514291683837383e-06, "loss": 0.0159, "reward": 0.2523333430290222, "reward_std": 0.41331908106803894, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08100000023841858, "step": 633 }, { "completion_length": 141.33334350585938, "epoch": 0.28215398308856254, "grad_norm": 1.023812174797058, "kl": 0.0909874364733696, "learning_rate": 4.511988599563107e-06, "loss": 0.0036, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 634 }, { "completion_length": 107.5, "epoch": 0.2825990209167779, "grad_norm": 1.506011962890625, "kl": 0.10861188173294067, "learning_rate": 4.509680658347347e-06, "loss": 0.0043, "reward": 0.3959999978542328, "reward_std": 0.2002398669719696, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06266666948795319, "step": 635 }, { "completion_length": 150.1666717529297, "epoch": 0.28304405874499333, "grad_norm": 1.0660347938537598, "kl": 0.1432390809059143, "learning_rate": 4.507367865761476e-06, "loss": 0.0057, "reward": 0.3543333411216736, "reward_std": 0.20028147101402283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 636 }, { "completion_length": 190.33334350585938, "epoch": 0.2834890965732087, "grad_norm": 0.9527061581611633, "kl": 0.07011144608259201, "learning_rate": 4.505050227388575e-06, "loss": 0.0028, "reward": 0.1875, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 637 }, { "completion_length": 117.16667175292969, "epoch": 0.2839341344014241, "grad_norm": 1.230230689048767, "kl": 0.09981067478656769, "learning_rate": 4.502727748823425e-06, "loss": 0.004, "reward": 0.2084999978542328, "reward_std": 0.23266606032848358, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04183333367109299, "step": 638 }, { "completion_length": 129.0, "epoch": 0.28437917222963954, "grad_norm": 0.8897629976272583, "kl": 0.09878586232662201, "learning_rate": 4.50040043567249e-06, "loss": 0.004, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 639 }, { "completion_length": 92.83333587646484, "epoch": 0.2848242100578549, "grad_norm": 0.9260250926017761, "kl": 0.08705293387174606, "learning_rate": 4.498068293553906e-06, "loss": 0.0035, "reward": 0.437666654586792, "reward_std": 0.15268486738204956, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021000001579523087, "step": 640 }, { "completion_length": 195.5, "epoch": 0.28526924788607033, "grad_norm": 0.7902094125747681, "kl": 0.07190477102994919, "learning_rate": 4.495731328097464e-06, "loss": 0.0029, "reward": 0.1666666716337204, "reward_std": 0.1881931722164154, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 641 }, { "completion_length": 200.0, "epoch": 0.2857142857142857, "grad_norm": 0.8260470032691956, "kl": 0.0487191379070282, "learning_rate": 4.4933895449446e-06, "loss": 0.0019, "reward": 0.1666666716337204, "reward_std": 0.12909945845603943, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 642 }, { "completion_length": 126.33333587646484, "epoch": 0.2861593235425011, "grad_norm": 0.7838308811187744, "kl": 0.12842848896980286, "learning_rate": 4.491042949748381e-06, "loss": 0.0051, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 643 }, { "completion_length": 154.6666717529297, "epoch": 0.2866043613707165, "grad_norm": 0.820894181728363, "kl": 0.060759272426366806, "learning_rate": 4.488691548173487e-06, "loss": 0.0024, "reward": 0.28716668486595154, "reward_std": 0.21236613392829895, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12049999833106995, "step": 644 }, { "completion_length": 147.33334350585938, "epoch": 0.2870493991989319, "grad_norm": 0.7532824873924255, "kl": 0.0968950018286705, "learning_rate": 4.486335345896204e-06, "loss": 0.0039, "reward": 0.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 645 }, { "completion_length": 179.5, "epoch": 0.28749443702714733, "grad_norm": 0.7337256073951721, "kl": 0.0560641810297966, "learning_rate": 4.483974348604407e-06, "loss": 0.0022, "reward": 0.20866666734218597, "reward_std": 0.12942281365394592, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20866666734218597, "step": 646 }, { "completion_length": 191.33334350585938, "epoch": 0.2879394748553627, "grad_norm": 0.665947675704956, "kl": 0.04197518154978752, "learning_rate": 4.48160856199754e-06, "loss": 0.0017, "reward": 0.187666654586792, "reward_std": 0.20557884871959686, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 647 }, { "completion_length": 168.33334350585938, "epoch": 0.2883845126835781, "grad_norm": 0.7467770576477051, "kl": 0.05759914964437485, "learning_rate": 4.479237991786617e-06, "loss": 0.0023, "reward": 0.10266666859388351, "reward_std": 0.39400848746299744, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019333332777023315, "step": 648 }, { "completion_length": 200.0, "epoch": 0.2888295505117935, "grad_norm": 0.7417876720428467, "kl": 0.05739980190992355, "learning_rate": 4.476862643694194e-06, "loss": 0.0023, "reward": 0.0416666679084301, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 649 }, { "completion_length": 196.83334350585938, "epoch": 0.2892745883400089, "grad_norm": 0.6486422419548035, "kl": 0.05907044932246208, "learning_rate": 4.474482523454363e-06, "loss": 0.0024, "reward": 0.1041666716337204, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 650 }, { "completion_length": 132.1666717529297, "epoch": 0.2897196261682243, "grad_norm": 0.8684423565864563, "kl": 0.05625095218420029, "learning_rate": 4.472097636812736e-06, "loss": 0.0022, "reward": 0.437666654586792, "reward_std": 0.15268486738204956, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021000001579523087, "step": 651 }, { "completion_length": 143.0, "epoch": 0.2901646639964397, "grad_norm": 0.8793902397155762, "kl": 0.07028966397047043, "learning_rate": 4.469707989526429e-06, "loss": 0.0028, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 652 }, { "completion_length": 177.5, "epoch": 0.2906097018246551, "grad_norm": 0.9311726093292236, "kl": 0.05447518453001976, "learning_rate": 4.467313587364053e-06, "loss": 0.0022, "reward": 0.2708333432674408, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 653 }, { "completion_length": 165.33334350585938, "epoch": 0.2910547396528705, "grad_norm": 0.6879425644874573, "kl": 0.06616301834583282, "learning_rate": 4.464914436105695e-06, "loss": 0.0026, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 654 }, { "completion_length": 200.0, "epoch": 0.2914997774810859, "grad_norm": 0.6475021243095398, "kl": 0.039877697825431824, "learning_rate": 4.462510541542909e-06, "loss": 0.0016, "reward": 0.04933333396911621, "reward_std": 0.38236457109451294, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04933333396911621, "step": 655 }, { "completion_length": 143.83334350585938, "epoch": 0.2919448153093013, "grad_norm": 0.9102987051010132, "kl": 0.09255407750606537, "learning_rate": 4.460101909478696e-06, "loss": 0.0037, "reward": 0.14616666734218597, "reward_std": 0.1465870589017868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14616666734218597, "step": 656 }, { "completion_length": 126.16667175292969, "epoch": 0.2923898531375167, "grad_norm": 0.045655757188797, "kl": 0.07757769525051117, "learning_rate": 4.457688545727496e-06, "loss": 0.0031, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 657 }, { "completion_length": 191.6666717529297, "epoch": 0.29283489096573206, "grad_norm": 0.6900054216384888, "kl": 0.1448201835155487, "learning_rate": 4.45527045611517e-06, "loss": 0.0058, "reward": 0.010500004515051842, "reward_std": 0.3959311842918396, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07283332943916321, "step": 658 }, { "completion_length": 119.33333587646484, "epoch": 0.2932799287939475, "grad_norm": 0.9801993370056152, "kl": 0.10018780082464218, "learning_rate": 4.452847646478987e-06, "loss": 0.004, "reward": 0.35483333468437195, "reward_std": 0.0940093994140625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27149999141693115, "step": 659 }, { "completion_length": 193.5, "epoch": 0.2937249666221629, "grad_norm": 0.7721496224403381, "kl": 0.04845193028450012, "learning_rate": 4.4504201226676124e-06, "loss": 0.0019, "reward": 0.2708333432674408, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 660 }, { "completion_length": 164.0, "epoch": 0.2941700044503783, "grad_norm": 0.8990196585655212, "kl": 0.038245782256126404, "learning_rate": 4.4479878905410875e-06, "loss": 0.0015, "reward": 0.3959999978542328, "reward_std": 0.12286578863859177, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1459999978542328, "step": 661 }, { "completion_length": 197.0, "epoch": 0.2946150422785937, "grad_norm": 0.7893727421760559, "kl": 0.04667172580957413, "learning_rate": 4.445550955970823e-06, "loss": 0.0019, "reward": 0.1458333432674408, "reward_std": 0.12289901822805405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 662 }, { "completion_length": 110.33333587646484, "epoch": 0.29506008010680906, "grad_norm": 0.8910045623779297, "kl": 0.08671041578054428, "learning_rate": 4.443109324839581e-06, "loss": 0.0035, "reward": 0.4375, "reward_std": 0.1530931293964386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 663 }, { "completion_length": 193.6666717529297, "epoch": 0.2955051179350245, "grad_norm": 0.7910698056221008, "kl": 0.043277036398649216, "learning_rate": 4.440663003041459e-06, "loss": 0.0017, "reward": 0.2291666716337204, "reward_std": 0.22935599088668823, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 664 }, { "completion_length": 186.33334350585938, "epoch": 0.29595015576323985, "grad_norm": 0.8543537855148315, "kl": 0.06220155954360962, "learning_rate": 4.43821199648188e-06, "loss": 0.0025, "reward": 0.1666666716337204, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 665 }, { "completion_length": 172.1666717529297, "epoch": 0.2963951935914553, "grad_norm": 0.8467397093772888, "kl": 0.05557259917259216, "learning_rate": 4.435756311077573e-06, "loss": 0.0022, "reward": 0.250166654586792, "reward_std": 0.2372765839099884, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08349999785423279, "step": 666 }, { "completion_length": 111.66667175292969, "epoch": 0.2968402314196707, "grad_norm": 0.0691637396812439, "kl": 0.09109967201948166, "learning_rate": 4.4332959527565666e-06, "loss": 0.0036, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 667 }, { "completion_length": 171.5, "epoch": 0.29728526924788606, "grad_norm": 0.6657293438911438, "kl": 0.06025252863764763, "learning_rate": 4.430830927458166e-06, "loss": 0.0024, "reward": 0.3961666524410248, "reward_std": 0.14607451856136322, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14616666734218597, "step": 668 }, { "completion_length": 119.66667175292969, "epoch": 0.2977303070761015, "grad_norm": 0.9089164137840271, "kl": 0.08549092710018158, "learning_rate": 4.428361241132943e-06, "loss": 0.0034, "reward": 0.375166654586792, "reward_std": 0.19339123368263245, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04183333367109299, "step": 669 }, { "completion_length": 200.0, "epoch": 0.29817534490431685, "grad_norm": 0.6968740820884705, "kl": 0.04054586589336395, "learning_rate": 4.425886899742722e-06, "loss": 0.0016, "reward": 0.1458333432674408, "reward_std": 0.12289901822805405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 670 }, { "completion_length": 121.33333587646484, "epoch": 0.2986203827325323, "grad_norm": 1.0022284984588623, "kl": 0.08514213562011719, "learning_rate": 4.423407909260564e-06, "loss": 0.0034, "reward": 0.437666654586792, "reward_std": 0.10446371138095856, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 671 }, { "completion_length": 184.1666717529297, "epoch": 0.29906542056074764, "grad_norm": 0.7887581586837769, "kl": 0.038804031908512115, "learning_rate": 4.420924275670753e-06, "loss": 0.0016, "reward": 0.2083333432674408, "reward_std": 0.17078250646591187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 672 }, { "completion_length": 198.0, "epoch": 0.29951045838896306, "grad_norm": 0.8230785131454468, "kl": 0.04383291304111481, "learning_rate": 4.4184360049687826e-06, "loss": 0.0018, "reward": 0.2293333262205124, "reward_std": 0.12313678115606308, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 673 }, { "completion_length": 200.0, "epoch": 0.2999554962171785, "grad_norm": 0.7423299551010132, "kl": 0.03673369064927101, "learning_rate": 4.41594310316134e-06, "loss": 0.0015, "reward": 0.0625, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 674 }, { "completion_length": 137.0, "epoch": 0.30040053404539385, "grad_norm": 0.8925609588623047, "kl": 0.08815717697143555, "learning_rate": 4.4134455762662895e-06, "loss": 0.0035, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 675 }, { "completion_length": 97.5, "epoch": 0.3008455718736093, "grad_norm": 0.03086089715361595, "kl": 0.06561337411403656, "learning_rate": 4.410943430312663e-06, "loss": 0.0026, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 676 }, { "completion_length": 200.0, "epoch": 0.30129060970182464, "grad_norm": 0.03762960061430931, "kl": 0.025554362684488297, "learning_rate": 4.408436671340643e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 677 }, { "completion_length": 193.6666717529297, "epoch": 0.30173564753004006, "grad_norm": 0.7618845701217651, "kl": 0.03975854814052582, "learning_rate": 4.405925305401547e-06, "loss": 0.0016, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 678 }, { "completion_length": 143.0, "epoch": 0.30218068535825543, "grad_norm": 1.127150535583496, "kl": 0.08482104539871216, "learning_rate": 4.4034093385578125e-06, "loss": 0.0034, "reward": 0.3336666524410248, "reward_std": 0.21874704957008362, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08366666734218597, "step": 679 }, { "completion_length": 144.5, "epoch": 0.30262572318647085, "grad_norm": 0.03559992089867592, "kl": 0.05165047571063042, "learning_rate": 4.400888776882985e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 680 }, { "completion_length": 150.0, "epoch": 0.3030707610146863, "grad_norm": 0.8135696649551392, "kl": 0.07061418145895004, "learning_rate": 4.398363626461702e-06, "loss": 0.0028, "reward": 0.3543333411216736, "reward_std": 0.20028147101402283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 681 }, { "completion_length": 200.0, "epoch": 0.30351579884290164, "grad_norm": 0.7969770431518555, "kl": 0.03894883021712303, "learning_rate": 4.395833893389676e-06, "loss": 0.0016, "reward": 0.12433333694934845, "reward_std": 0.3078192174434662, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12433333694934845, "step": 682 }, { "completion_length": 200.0, "epoch": 0.30396083667111706, "grad_norm": 0.7363507747650146, "kl": 0.03019530698657036, "learning_rate": 4.393299583773688e-06, "loss": 0.0012, "reward": 0.125, "reward_std": 0.11180339753627777, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 683 }, { "completion_length": 195.5, "epoch": 0.30440587449933243, "grad_norm": 0.7232860326766968, "kl": 0.03277706354856491, "learning_rate": 4.390760703731559e-06, "loss": 0.0013, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 684 }, { "completion_length": 198.5, "epoch": 0.30485091232754785, "grad_norm": 0.7945019602775574, "kl": 0.04717344790697098, "learning_rate": 4.388217259392148e-06, "loss": 0.0019, "reward": 0.14266666769981384, "reward_std": 0.3285280168056488, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14266666769981384, "step": 685 }, { "completion_length": 186.0, "epoch": 0.3052959501557632, "grad_norm": 0.7385047078132629, "kl": 0.04403085261583328, "learning_rate": 4.38566925689533e-06, "loss": 0.0018, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 686 }, { "completion_length": 163.6666717529297, "epoch": 0.30574098798397864, "grad_norm": 0.856914758682251, "kl": 0.05494067072868347, "learning_rate": 4.383116702391988e-06, "loss": 0.0022, "reward": 0.35466668009757996, "reward_std": 0.09423092007637024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 687 }, { "completion_length": 196.6666717529297, "epoch": 0.30618602581219406, "grad_norm": 0.8300358057022095, "kl": 0.043124958872795105, "learning_rate": 4.3805596020439845e-06, "loss": 0.0017, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 688 }, { "completion_length": 169.6666717529297, "epoch": 0.30663106364040943, "grad_norm": 0.749025285243988, "kl": 0.05098932236433029, "learning_rate": 4.3779979620241644e-06, "loss": 0.002, "reward": 0.20083332061767578, "reward_std": 0.40986746549606323, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03416666388511658, "step": 689 }, { "completion_length": 163.1666717529297, "epoch": 0.30707610146862485, "grad_norm": 0.9226408004760742, "kl": 0.055235203355550766, "learning_rate": 4.375431788516326e-06, "loss": 0.0022, "reward": 0.39633333683013916, "reward_std": 0.09396524727344513, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22966668009757996, "step": 690 }, { "completion_length": 145.1666717529297, "epoch": 0.3075211392968402, "grad_norm": 0.8971224427223206, "kl": 0.05605591833591461, "learning_rate": 4.372861087715215e-06, "loss": 0.0022, "reward": 0.4586666524410248, "reward_std": 0.06403334438800812, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12533333897590637, "step": 691 }, { "completion_length": 149.5, "epoch": 0.30796617712505564, "grad_norm": 0.8178679943084717, "kl": 0.05255182832479477, "learning_rate": 4.3702858658265044e-06, "loss": 0.0021, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 692 }, { "completion_length": 200.0, "epoch": 0.308411214953271, "grad_norm": 0.8197976350784302, "kl": 0.03835117816925049, "learning_rate": 4.367706129066781e-06, "loss": 0.0015, "reward": 0.1875, "reward_std": 0.10458251088857651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 693 }, { "completion_length": 172.83334350585938, "epoch": 0.30885625278148643, "grad_norm": 0.8643909692764282, "kl": 0.04992937296628952, "learning_rate": 4.36512188366353e-06, "loss": 0.002, "reward": 0.27116668224334717, "reward_std": 0.09453975409269333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27116668224334717, "step": 694 }, { "completion_length": 200.0, "epoch": 0.30930129060970185, "grad_norm": 0.7522907853126526, "kl": 0.04939102381467819, "learning_rate": 4.36253313585512e-06, "loss": 0.002, "reward": 0.02083333395421505, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 695 }, { "completion_length": 197.6666717529297, "epoch": 0.3097463284379172, "grad_norm": 0.7550695538520813, "kl": 0.02722327411174774, "learning_rate": 4.359939891890793e-06, "loss": 0.0011, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 696 }, { "completion_length": 200.0, "epoch": 0.31019136626613264, "grad_norm": 0.023237159475684166, "kl": 0.013823822140693665, "learning_rate": 4.357342158030638e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 697 }, { "completion_length": 200.0, "epoch": 0.310636404094348, "grad_norm": 0.6470344066619873, "kl": 0.028774775564670563, "learning_rate": 4.354739940545587e-06, "loss": 0.0012, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 698 }, { "completion_length": 178.0, "epoch": 0.31108144192256343, "grad_norm": 0.8396903872489929, "kl": 0.036307577043771744, "learning_rate": 4.352133245717393e-06, "loss": 0.0015, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 699 }, { "completion_length": 194.0, "epoch": 0.3115264797507788, "grad_norm": 0.8275584578514099, "kl": 0.03899259492754936, "learning_rate": 4.349522079838622e-06, "loss": 0.0016, "reward": 0.125, "reward_std": 0.07905694097280502, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 700 }, { "completion_length": 185.6666717529297, "epoch": 0.3119715175789942, "grad_norm": 0.043758708983659744, "kl": 0.049132104963064194, "learning_rate": 4.346906449212627e-06, "loss": 0.002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 701 }, { "completion_length": 196.83334350585938, "epoch": 0.31241655540720964, "grad_norm": 0.8897082209587097, "kl": 0.03444843739271164, "learning_rate": 4.344286360153541e-06, "loss": 0.0014, "reward": 0.1875, "reward_std": 0.18957190215587616, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 702 }, { "completion_length": 177.0, "epoch": 0.312861593235425, "grad_norm": 0.8655028343200684, "kl": 0.05428258329629898, "learning_rate": 4.341661818986263e-06, "loss": 0.0022, "reward": 0.3966667056083679, "reward_std": 0.05062280222773552, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 703 }, { "completion_length": 200.0, "epoch": 0.31330663106364043, "grad_norm": 0.7451438903808594, "kl": 0.03979836031794548, "learning_rate": 4.339032832046434e-06, "loss": 0.0016, "reward": 0.187666654586792, "reward_std": 0.13138745725154877, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187666654586792, "step": 704 }, { "completion_length": 197.1666717529297, "epoch": 0.3137516688918558, "grad_norm": 0.7860096096992493, "kl": 0.04280473291873932, "learning_rate": 4.336399405680432e-06, "loss": 0.0017, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 705 }, { "completion_length": 200.0, "epoch": 0.3141967067200712, "grad_norm": 0.9517030715942383, "kl": 0.07379502058029175, "learning_rate": 4.333761546245348e-06, "loss": 0.003, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 706 }, { "completion_length": 180.1666717529297, "epoch": 0.3146417445482866, "grad_norm": 0.8798065185546875, "kl": 0.050519704818725586, "learning_rate": 4.331119260108977e-06, "loss": 0.002, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 707 }, { "completion_length": 190.1666717529297, "epoch": 0.315086782376502, "grad_norm": 0.7868778705596924, "kl": 0.07131218910217285, "learning_rate": 4.328472553649799e-06, "loss": 0.0029, "reward": 0.125, "reward_std": 0.20916502177715302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 708 }, { "completion_length": 200.0, "epoch": 0.31553182020471743, "grad_norm": 0.029699545353651047, "kl": 0.04753357172012329, "learning_rate": 4.325821433256963e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 709 }, { "completion_length": 188.83334350585938, "epoch": 0.3159768580329328, "grad_norm": 0.7864294648170471, "kl": 0.04412949085235596, "learning_rate": 4.323165905330277e-06, "loss": 0.0018, "reward": -0.10866667330265045, "reward_std": 0.38256901502609253, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10866667330265045, "step": 710 }, { "completion_length": 183.6666717529297, "epoch": 0.3164218958611482, "grad_norm": 0.8975881338119507, "kl": 0.04495091363787651, "learning_rate": 4.320505976280186e-06, "loss": 0.0018, "reward": 0.35466668009757996, "reward_std": 0.09423092007637024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 711 }, { "completion_length": 184.0, "epoch": 0.3168669336893636, "grad_norm": 0.7394328117370605, "kl": 0.08109882473945618, "learning_rate": 4.3178416525277586e-06, "loss": 0.0032, "reward": 0.25, "reward_std": 0.22360679507255554, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 712 }, { "completion_length": 197.5, "epoch": 0.317311971517579, "grad_norm": 0.7964750528335571, "kl": 0.0377558134496212, "learning_rate": 4.315172940504677e-06, "loss": 0.0015, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 713 }, { "completion_length": 196.83334350585938, "epoch": 0.3177570093457944, "grad_norm": 1.8717976808547974, "kl": 0.176216721534729, "learning_rate": 4.312499846653211e-06, "loss": 0.007, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 714 }, { "completion_length": 141.0, "epoch": 0.3182020471740098, "grad_norm": 1.7549928426742554, "kl": 0.2996341586112976, "learning_rate": 4.309822377426211e-06, "loss": 0.012, "reward": 0.10500000417232513, "reward_std": 0.5367960333824158, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14500001072883606, "step": 715 }, { "completion_length": 195.1666717529297, "epoch": 0.3186470850022252, "grad_norm": 0.813319206237793, "kl": 0.05743285268545151, "learning_rate": 4.307140539287089e-06, "loss": 0.0023, "reward": 0.1458333432674408, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 716 }, { "completion_length": 186.5, "epoch": 0.3190921228304406, "grad_norm": 0.8374386429786682, "kl": 0.03491507098078728, "learning_rate": 4.304454338709803e-06, "loss": 0.0014, "reward": 0.22966668009757996, "reward_std": 0.1666717231273651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22966668009757996, "step": 717 }, { "completion_length": 192.0, "epoch": 0.319537160658656, "grad_norm": 0.7661421298980713, "kl": 0.04614044725894928, "learning_rate": 4.3017637821788436e-06, "loss": 0.0018, "reward": 0.2916666865348816, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 718 }, { "completion_length": 198.0, "epoch": 0.31998219848687137, "grad_norm": 0.8050372004508972, "kl": 0.018003419041633606, "learning_rate": 4.2990688761892155e-06, "loss": 0.0007, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 719 }, { "completion_length": 162.0, "epoch": 0.3204272363150868, "grad_norm": 0.758633017539978, "kl": 0.06661681085824966, "learning_rate": 4.296369627246422e-06, "loss": 0.0027, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 720 }, { "completion_length": 171.33334350585938, "epoch": 0.32087227414330216, "grad_norm": 0.8165731430053711, "kl": 0.06688942015171051, "learning_rate": 4.293666041866453e-06, "loss": 0.0027, "reward": 0.31283333897590637, "reward_std": 0.20551829040050507, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14616666734218597, "step": 721 }, { "completion_length": 194.33334350585938, "epoch": 0.3213173119715176, "grad_norm": 0.8424144387245178, "kl": 0.04467558115720749, "learning_rate": 4.290958126575764e-06, "loss": 0.0018, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 722 }, { "completion_length": 171.83334350585938, "epoch": 0.32176234979973295, "grad_norm": 0.8838493824005127, "kl": 0.05616327375173569, "learning_rate": 4.2882458879112634e-06, "loss": 0.0022, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 723 }, { "completion_length": 199.0, "epoch": 0.32220738762794837, "grad_norm": 0.7267130017280579, "kl": 0.04860411211848259, "learning_rate": 4.285529332420298e-06, "loss": 0.0019, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 724 }, { "completion_length": 200.0, "epoch": 0.3226524254561638, "grad_norm": 0.8249167203903198, "kl": 0.04377565532922745, "learning_rate": 4.282808466660632e-06, "loss": 0.0018, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 725 }, { "completion_length": 190.1666717529297, "epoch": 0.32309746328437916, "grad_norm": 0.7198425531387329, "kl": 0.04959031194448471, "learning_rate": 4.280083297200439e-06, "loss": 0.002, "reward": 0.2711666524410248, "reward_std": 0.09453975409269333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 726 }, { "completion_length": 192.1666717529297, "epoch": 0.3235425011125946, "grad_norm": 0.7675707340240479, "kl": 0.04044210538268089, "learning_rate": 4.277353830618279e-06, "loss": 0.0016, "reward": 0.187666654586792, "reward_std": 0.10494124889373779, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187666654586792, "step": 727 }, { "completion_length": 200.0, "epoch": 0.32398753894080995, "grad_norm": 0.7638272643089294, "kl": 0.03953151777386665, "learning_rate": 4.274620073503084e-06, "loss": 0.0016, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 728 }, { "completion_length": 152.6666717529297, "epoch": 0.32443257676902537, "grad_norm": 0.05756811052560806, "kl": 0.06373923271894455, "learning_rate": 4.2718820324541475e-06, "loss": 0.0025, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 729 }, { "completion_length": 162.0, "epoch": 0.32487761459724074, "grad_norm": 0.7738885283470154, "kl": 0.06379255652427673, "learning_rate": 4.2691397140811e-06, "loss": 0.0026, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 730 }, { "completion_length": 117.66667175292969, "epoch": 0.32532265242545616, "grad_norm": 0.9680776596069336, "kl": 0.08181880414485931, "learning_rate": 4.2663931250039005e-06, "loss": 0.0033, "reward": 0.3333333432674408, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 731 }, { "completion_length": 170.83334350585938, "epoch": 0.3257676902536716, "grad_norm": 0.8730087280273438, "kl": 0.07460620254278183, "learning_rate": 4.2636422718528155e-06, "loss": 0.003, "reward": 0.18150000274181366, "reward_std": 0.3583281934261322, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18150000274181366, "step": 732 }, { "completion_length": 184.33334350585938, "epoch": 0.32621272808188695, "grad_norm": 0.8382135629653931, "kl": 0.05216635391116142, "learning_rate": 4.2608871612684074e-06, "loss": 0.0021, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 733 }, { "completion_length": 171.1666717529297, "epoch": 0.32665776591010237, "grad_norm": 0.9829415082931519, "kl": 0.050829045474529266, "learning_rate": 4.258127799901512e-06, "loss": 0.002, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 734 }, { "completion_length": 190.5, "epoch": 0.32710280373831774, "grad_norm": 0.7752060890197754, "kl": 0.04926741123199463, "learning_rate": 4.255364194413232e-06, "loss": 0.002, "reward": 0.2084999978542328, "reward_std": 0.1021445021033287, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 735 }, { "completion_length": 200.0, "epoch": 0.32754784156653316, "grad_norm": 0.7153200507164001, "kl": 0.040820520371198654, "learning_rate": 4.25259635147491e-06, "loss": 0.0016, "reward": 0.2083333432674408, "reward_std": 0.10206207633018494, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 736 }, { "completion_length": 200.0, "epoch": 0.3279928793947485, "grad_norm": 0.023354843258857727, "kl": 0.03863299638032913, "learning_rate": 4.249824277768122e-06, "loss": 0.0015, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 737 }, { "completion_length": 172.1666717529297, "epoch": 0.32843791722296395, "grad_norm": 0.796233594417572, "kl": 0.047746941447257996, "learning_rate": 4.2470479799846545e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 738 }, { "completion_length": 200.0, "epoch": 0.32888295505117937, "grad_norm": 0.024252623319625854, "kl": 0.04170938581228256, "learning_rate": 4.2442674648264914e-06, "loss": 0.0017, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 739 }, { "completion_length": 200.0, "epoch": 0.32932799287939474, "grad_norm": 0.8098192811012268, "kl": 0.055160801857709885, "learning_rate": 4.241482739005798e-06, "loss": 0.0022, "reward": 0.11166667193174362, "reward_std": 0.3388460874557495, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11166667193174362, "step": 740 }, { "completion_length": 177.33334350585938, "epoch": 0.32977303070761016, "grad_norm": 0.7267571091651917, "kl": 0.037675365805625916, "learning_rate": 4.238693809244904e-06, "loss": 0.0015, "reward": 0.1899999976158142, "reward_std": 0.26663535833358765, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1066666692495346, "step": 741 }, { "completion_length": 200.0, "epoch": 0.3302180685358255, "grad_norm": 0.03363126143813133, "kl": 0.04696328565478325, "learning_rate": 4.235900682276287e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 742 }, { "completion_length": 194.33334350585938, "epoch": 0.33066310636404095, "grad_norm": 0.6722932457923889, "kl": 0.040012698620557785, "learning_rate": 4.2331033648425565e-06, "loss": 0.0016, "reward": 0.12316666543483734, "reward_std": 0.37579911947250366, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12316666543483734, "step": 743 }, { "completion_length": 200.0, "epoch": 0.3311081441922563, "grad_norm": 0.9077341556549072, "kl": 0.04133099317550659, "learning_rate": 4.230301863696439e-06, "loss": 0.0017, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 744 }, { "completion_length": 156.83334350585938, "epoch": 0.33155318202047174, "grad_norm": 1.1381067037582397, "kl": 0.0650453120470047, "learning_rate": 4.22749618560076e-06, "loss": 0.0026, "reward": 0.27133333683013916, "reward_std": 0.09423092007637024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 745 }, { "completion_length": 200.0, "epoch": 0.33199821984868716, "grad_norm": 0.78874272108078, "kl": 0.02345721237361431, "learning_rate": 4.224686337328426e-06, "loss": 0.0009, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 746 }, { "completion_length": 179.0, "epoch": 0.3324432576769025, "grad_norm": 0.836664617061615, "kl": 0.04164649173617363, "learning_rate": 4.221872325662414e-06, "loss": 0.0017, "reward": 0.25, "reward_std": 0.15811388194561005, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 747 }, { "completion_length": 135.83334350585938, "epoch": 0.33288829550511795, "grad_norm": 0.9000113010406494, "kl": 0.06278669834136963, "learning_rate": 4.219054157395749e-06, "loss": 0.0025, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 748 }, { "completion_length": 191.83334350585938, "epoch": 0.3333333333333333, "grad_norm": 0.8174548745155334, "kl": 0.0394861064851284, "learning_rate": 4.21623183933149e-06, "loss": 0.0016, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 749 }, { "completion_length": 166.0, "epoch": 0.33377837116154874, "grad_norm": 0.7031174302101135, "kl": 0.03650489076972008, "learning_rate": 4.213405378282714e-06, "loss": 0.0015, "reward": 0.25033333897590637, "reward_std": 0.13729628920555115, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25033333897590637, "step": 750 }, { "completion_length": 192.83334350585938, "epoch": 0.3342234089897641, "grad_norm": 0.7927865386009216, "kl": 0.0499715581536293, "learning_rate": 4.210574781072501e-06, "loss": 0.002, "reward": 0.2084999978542328, "reward_std": 0.12935802340507507, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 751 }, { "completion_length": 162.33334350585938, "epoch": 0.3346684468179795, "grad_norm": 0.7981622219085693, "kl": 0.04853060469031334, "learning_rate": 4.207740054533913e-06, "loss": 0.0019, "reward": 0.22949998080730438, "reward_std": 0.14653019607067108, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949998080730438, "step": 752 }, { "completion_length": 200.0, "epoch": 0.33511348464619495, "grad_norm": 0.022546028718352318, "kl": 0.01208839938044548, "learning_rate": 4.204901205509981e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 753 }, { "completion_length": 200.0, "epoch": 0.3355585224744103, "grad_norm": 0.7191113829612732, "kl": 0.031925659626722336, "learning_rate": 4.202058240853689e-06, "loss": 0.0013, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 754 }, { "completion_length": 172.33334350585938, "epoch": 0.33600356030262574, "grad_norm": 0.8460274338722229, "kl": 0.047413043677806854, "learning_rate": 4.199211167427955e-06, "loss": 0.0019, "reward": 0.2121666520833969, "reward_std": 0.18116556107997894, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2121666669845581, "step": 755 }, { "completion_length": 114.66667175292969, "epoch": 0.3364485981308411, "grad_norm": 1.0755983591079712, "kl": 0.054288625717163086, "learning_rate": 4.196359992105614e-06, "loss": 0.0022, "reward": 0.1456666737794876, "reward_std": 0.26882386207580566, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1456666737794876, "step": 756 }, { "completion_length": 192.33334350585938, "epoch": 0.3368936359590565, "grad_norm": 0.7851623296737671, "kl": 0.03890561684966087, "learning_rate": 4.193504721769406e-06, "loss": 0.0016, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 757 }, { "completion_length": 173.83334350585938, "epoch": 0.3373386737872719, "grad_norm": 1.2191767692565918, "kl": 0.03187034651637077, "learning_rate": 4.190645363311955e-06, "loss": 0.0013, "reward": 0.20533333718776703, "reward_std": 0.06943102180957794, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20533333718776703, "step": 758 }, { "completion_length": 164.0, "epoch": 0.3377837116154873, "grad_norm": 0.6636083126068115, "kl": 0.13397148251533508, "learning_rate": 4.187781923635753e-06, "loss": 0.0054, "reward": 0.2254999876022339, "reward_std": 0.2528444528579712, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22550001740455627, "step": 759 }, { "completion_length": 127.66667175292969, "epoch": 0.33822874944370274, "grad_norm": 0.030917134135961533, "kl": 0.06076059490442276, "learning_rate": 4.184914409653147e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 760 }, { "completion_length": 194.0, "epoch": 0.3386737872719181, "grad_norm": 0.723721444606781, "kl": 0.043149642646312714, "learning_rate": 4.182042828286313e-06, "loss": 0.0017, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 761 }, { "completion_length": 193.33334350585938, "epoch": 0.3391188251001335, "grad_norm": 0.7052424550056458, "kl": 0.039253827184438705, "learning_rate": 4.179167186467255e-06, "loss": 0.0016, "reward": 0.250333309173584, "reward_std": 0.1122509092092514, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250333309173584, "step": 762 }, { "completion_length": 116.83333587646484, "epoch": 0.3395638629283489, "grad_norm": 1.4630470275878906, "kl": 0.026879621669650078, "learning_rate": 4.17628749113777e-06, "loss": 0.0011, "reward": 0.24266664683818817, "reward_std": 0.15977442264556885, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24266664683818817, "step": 763 }, { "completion_length": 124.0, "epoch": 0.3400089007565643, "grad_norm": 0.04300786554813385, "kl": 0.05142722651362419, "learning_rate": 4.173403749249444e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 764 }, { "completion_length": 171.6666717529297, "epoch": 0.3404539385847797, "grad_norm": 1.0912110805511475, "kl": 0.04297208786010742, "learning_rate": 4.170515967763633e-06, "loss": 0.0017, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 765 }, { "completion_length": 163.33334350585938, "epoch": 0.3408989764129951, "grad_norm": 0.9113920331001282, "kl": 0.1428011953830719, "learning_rate": 4.167624153651444e-06, "loss": 0.0057, "reward": 0.18649999797344208, "reward_std": 0.34627026319503784, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18649999797344208, "step": 766 }, { "completion_length": 197.5, "epoch": 0.3413440142412105, "grad_norm": 0.7323278188705444, "kl": 0.046782054007053375, "learning_rate": 4.1647283138937144e-06, "loss": 0.0019, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 767 }, { "completion_length": 183.1666717529297, "epoch": 0.3417890520694259, "grad_norm": 0.7725353837013245, "kl": 0.027050405740737915, "learning_rate": 4.1618284554810056e-06, "loss": 0.0011, "reward": 0.22949999570846558, "reward_std": 0.12337382137775421, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949999570846558, "step": 768 }, { "completion_length": 137.0, "epoch": 0.3422340898976413, "grad_norm": 0.03009336069226265, "kl": 0.057053741067647934, "learning_rate": 4.158924585413576e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 769 }, { "completion_length": 197.5, "epoch": 0.3426791277258567, "grad_norm": 0.601902425289154, "kl": 0.027302034199237823, "learning_rate": 4.156016710701369e-06, "loss": 0.0011, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 770 }, { "completion_length": 200.0, "epoch": 0.3431241655540721, "grad_norm": 0.013710107654333115, "kl": 0.03290403261780739, "learning_rate": 4.153104838363997e-06, "loss": 0.0013, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 771 }, { "completion_length": 197.83334350585938, "epoch": 0.34356920338228747, "grad_norm": 0.6330693960189819, "kl": 0.03465873375535011, "learning_rate": 4.15018897543072e-06, "loss": 0.0014, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 772 }, { "completion_length": 167.6666717529297, "epoch": 0.3440142412105029, "grad_norm": 0.7272565364837646, "kl": 0.04871327802538872, "learning_rate": 4.1472691289404335e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 773 }, { "completion_length": 200.0, "epoch": 0.3444592790387183, "grad_norm": 0.025043027475476265, "kl": 0.03786566108465195, "learning_rate": 4.144345305941648e-06, "loss": 0.0015, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 774 }, { "completion_length": 200.0, "epoch": 0.3449043168669337, "grad_norm": 0.03370220214128494, "kl": 0.0471016988158226, "learning_rate": 4.141417513492473e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 775 }, { "completion_length": 200.0, "epoch": 0.3453493546951491, "grad_norm": 0.7220287919044495, "kl": 0.035600695759058, "learning_rate": 4.138485758660602e-06, "loss": 0.0014, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 776 }, { "completion_length": 120.66667175292969, "epoch": 0.34579439252336447, "grad_norm": 1.2347756624221802, "kl": 0.08316612988710403, "learning_rate": 4.135550048523293e-06, "loss": 0.0033, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 777 }, { "completion_length": 170.6666717529297, "epoch": 0.3462394303515799, "grad_norm": 0.7800046801567078, "kl": 0.053269438445568085, "learning_rate": 4.132610390167349e-06, "loss": 0.0021, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 778 }, { "completion_length": 157.1666717529297, "epoch": 0.34668446817979526, "grad_norm": 0.7412069439888, "kl": 0.06059323251247406, "learning_rate": 4.12966679068911e-06, "loss": 0.0024, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 779 }, { "completion_length": 200.0, "epoch": 0.3471295060080107, "grad_norm": 0.7685080766677856, "kl": 0.056760650128126144, "learning_rate": 4.126719257194425e-06, "loss": 0.0023, "reward": 0.12250000238418579, "reward_std": 0.31230995059013367, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12250000238418579, "step": 780 }, { "completion_length": 200.0, "epoch": 0.3475745438362261, "grad_norm": 0.018277641385793686, "kl": 0.03469054400920868, "learning_rate": 4.123767796798641e-06, "loss": 0.0014, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 781 }, { "completion_length": 133.5, "epoch": 0.34801958166444147, "grad_norm": 0.8028903007507324, "kl": 0.06514596194028854, "learning_rate": 4.120812416626586e-06, "loss": 0.0026, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 782 }, { "completion_length": 146.83334350585938, "epoch": 0.3484646194926569, "grad_norm": 1.0154013633728027, "kl": 0.0660717785358429, "learning_rate": 4.117853123812549e-06, "loss": 0.0026, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 783 }, { "completion_length": 161.6666717529297, "epoch": 0.34890965732087226, "grad_norm": 0.76609206199646, "kl": 0.05136913061141968, "learning_rate": 4.1148899255002636e-06, "loss": 0.0021, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 784 }, { "completion_length": 142.5, "epoch": 0.3493546951490877, "grad_norm": 0.03385661169886589, "kl": 0.05575406178832054, "learning_rate": 4.111922828842892e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 785 }, { "completion_length": 187.0, "epoch": 0.34979973297730305, "grad_norm": 0.7177822589874268, "kl": 0.030316343531012535, "learning_rate": 4.108951841003009e-06, "loss": 0.0012, "reward": 0.25050002336502075, "reward_std": 0.13747835159301758, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25050002336502075, "step": 786 }, { "completion_length": 178.5, "epoch": 0.35024477080551847, "grad_norm": 0.7809700965881348, "kl": 0.039071641862392426, "learning_rate": 4.105976969152578e-06, "loss": 0.0016, "reward": 0.31333333253860474, "reward_std": 0.1049412414431572, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 787 }, { "completion_length": 177.1666717529297, "epoch": 0.3506898086337339, "grad_norm": 0.7270780205726624, "kl": 0.042834021151065826, "learning_rate": 4.102998220472943e-06, "loss": 0.0017, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 788 }, { "completion_length": 200.0, "epoch": 0.35113484646194926, "grad_norm": 0.6192082762718201, "kl": 0.04256781190633774, "learning_rate": 4.100015602154802e-06, "loss": 0.0017, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 789 }, { "completion_length": 200.0, "epoch": 0.3515798842901647, "grad_norm": 0.014344203285872936, "kl": 0.03416243940591812, "learning_rate": 4.0970291213982e-06, "loss": 0.0014, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 790 }, { "completion_length": 200.0, "epoch": 0.35202492211838005, "grad_norm": 0.8113481998443604, "kl": 0.05264754593372345, "learning_rate": 4.094038785412504e-06, "loss": 0.0021, "reward": 0.11516666412353516, "reward_std": 0.3302728831768036, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11516666412353516, "step": 791 }, { "completion_length": 138.0, "epoch": 0.35246995994659547, "grad_norm": 1.720563292503357, "kl": 0.03622860834002495, "learning_rate": 4.091044601416383e-06, "loss": 0.0014, "reward": 0.27133333683013916, "reward_std": 0.1465587466955185, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 792 }, { "completion_length": 177.5, "epoch": 0.35291499777481083, "grad_norm": 0.7472648024559021, "kl": 0.04746413230895996, "learning_rate": 4.0880465766378015e-06, "loss": 0.0019, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 793 }, { "completion_length": 129.83334350585938, "epoch": 0.35336003560302626, "grad_norm": 1.1209585666656494, "kl": 0.0549950897693634, "learning_rate": 4.085044718313991e-06, "loss": 0.0022, "reward": 0.31349998712539673, "reward_std": 0.10458250343799591, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31349998712539673, "step": 794 }, { "completion_length": 193.1666717529297, "epoch": 0.3538050734312417, "grad_norm": 0.6405737400054932, "kl": 0.04125973582267761, "learning_rate": 4.08203903369144e-06, "loss": 0.0017, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 795 }, { "completion_length": 185.5, "epoch": 0.35425011125945705, "grad_norm": 0.8062145709991455, "kl": 0.04092112183570862, "learning_rate": 4.079029530025873e-06, "loss": 0.0016, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 796 }, { "completion_length": 142.83334350585938, "epoch": 0.35469514908767247, "grad_norm": 0.0398949459195137, "kl": 0.052285999059677124, "learning_rate": 4.076016214582232e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 797 }, { "completion_length": 154.0, "epoch": 0.35514018691588783, "grad_norm": 1.113144874572754, "kl": 0.07075363397598267, "learning_rate": 4.072999094634663e-06, "loss": 0.0028, "reward": 0.29233333468437195, "reward_std": 0.10222654044628143, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 798 }, { "completion_length": 200.0, "epoch": 0.35558522474410326, "grad_norm": 0.016199104487895966, "kl": 0.03834648057818413, "learning_rate": 4.069978177466495e-06, "loss": 0.0015, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 799 }, { "completion_length": 200.0, "epoch": 0.3560302625723186, "grad_norm": 0.9117387533187866, "kl": 0.18886041641235352, "learning_rate": 4.066953470370223e-06, "loss": 0.0076, "reward": 0.016166668385267258, "reward_std": 0.2665861248970032, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016166668385267258, "step": 800 }, { "completion_length": 178.33334350585938, "epoch": 0.35647530040053405, "grad_norm": 0.6704380512237549, "kl": 0.057386137545108795, "learning_rate": 4.063924980647492e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 801 }, { "completion_length": 168.33334350585938, "epoch": 0.35692033822874947, "grad_norm": 0.767604649066925, "kl": 0.0486370325088501, "learning_rate": 4.060892715609078e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 802 }, { "completion_length": 141.0, "epoch": 0.35736537605696483, "grad_norm": 1.3145936727523804, "kl": 0.04570293426513672, "learning_rate": 4.0578566825748685e-06, "loss": 0.0018, "reward": 0.29233333468437195, "reward_std": 0.15182314813137054, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 803 }, { "completion_length": 126.66667175292969, "epoch": 0.35781041388518026, "grad_norm": 0.6498439311981201, "kl": 0.06144469231367111, "learning_rate": 4.054816888873852e-06, "loss": 0.0025, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 804 }, { "completion_length": 185.6666717529297, "epoch": 0.3582554517133956, "grad_norm": 0.9341973662376404, "kl": 0.05542676895856857, "learning_rate": 4.051773341844088e-06, "loss": 0.0022, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 805 }, { "completion_length": 153.33334350585938, "epoch": 0.35870048954161105, "grad_norm": 0.6858371496200562, "kl": 0.06017071008682251, "learning_rate": 4.048726048832704e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 806 }, { "completion_length": 182.6666717529297, "epoch": 0.3591455273698264, "grad_norm": 0.5970679521560669, "kl": 0.03451145440340042, "learning_rate": 4.045675017195866e-06, "loss": 0.0014, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 807 }, { "completion_length": 162.5, "epoch": 0.35959056519804183, "grad_norm": 0.6962899565696716, "kl": 0.05475949868559837, "learning_rate": 4.042620254298765e-06, "loss": 0.0022, "reward": 0.29233333468437195, "reward_std": 0.15182313323020935, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 808 }, { "completion_length": 183.83334350585938, "epoch": 0.36003560302625726, "grad_norm": 0.7941375970840454, "kl": 0.045727722346782684, "learning_rate": 4.039561767515599e-06, "loss": 0.0018, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 809 }, { "completion_length": 174.1666717529297, "epoch": 0.3604806408544726, "grad_norm": 0.6645864248275757, "kl": 0.052175264805555344, "learning_rate": 4.036499564229559e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 810 }, { "completion_length": 192.0, "epoch": 0.36092567868268804, "grad_norm": 0.7362174391746521, "kl": 0.05001889169216156, "learning_rate": 4.033433651832806e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 811 }, { "completion_length": 183.33334350585938, "epoch": 0.3613707165109034, "grad_norm": 0.700889527797699, "kl": 0.04670108109712601, "learning_rate": 4.0303640377264505e-06, "loss": 0.0019, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 812 }, { "completion_length": 121.66667175292969, "epoch": 0.36181575433911883, "grad_norm": 0.11063014715909958, "kl": 0.08557015657424927, "learning_rate": 4.027290729320545e-06, "loss": 0.0037, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 813 }, { "completion_length": 75.83333587646484, "epoch": 0.3622607921673342, "grad_norm": 1.2718409299850464, "kl": 0.08148349821567535, "learning_rate": 4.024213734034057e-06, "loss": 0.0033, "reward": 0.39666664600372314, "reward_std": 0.05062279850244522, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 814 }, { "completion_length": 186.5, "epoch": 0.3627058299955496, "grad_norm": 0.7700260281562805, "kl": 0.04646776616573334, "learning_rate": 4.021133059294855e-06, "loss": 0.0019, "reward": 0.22949999570846558, "reward_std": 0.12337382137775421, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949998080730438, "step": 815 }, { "completion_length": 147.5, "epoch": 0.36315086782376504, "grad_norm": 1.176891803741455, "kl": 0.05650541931390762, "learning_rate": 4.018048712539689e-06, "loss": 0.0023, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 816 }, { "completion_length": 180.0, "epoch": 0.3635959056519804, "grad_norm": 0.6683288216590881, "kl": 0.06407992541790009, "learning_rate": 4.014960701214173e-06, "loss": 0.0026, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 817 }, { "completion_length": 105.0, "epoch": 0.36404094348019583, "grad_norm": 0.9382845163345337, "kl": 0.06550440192222595, "learning_rate": 4.011869032772769e-06, "loss": 0.0026, "reward": 0.31333333253860474, "reward_std": 0.15350136160850525, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 818 }, { "completion_length": 165.6666717529297, "epoch": 0.3644859813084112, "grad_norm": 0.7827046513557434, "kl": 0.05532263219356537, "learning_rate": 4.008773714678766e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 819 }, { "completion_length": 145.6666717529297, "epoch": 0.3649310191366266, "grad_norm": 0.026940084993839264, "kl": 0.0543852262198925, "learning_rate": 4.005674754404263e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 820 }, { "completion_length": 138.6666717529297, "epoch": 0.365376056964842, "grad_norm": 0.035869237035512924, "kl": 0.06597190350294113, "learning_rate": 4.002572159430151e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 821 }, { "completion_length": 163.83334350585938, "epoch": 0.3658210947930574, "grad_norm": 0.910876989364624, "kl": 0.06616390496492386, "learning_rate": 3.999465937246096e-06, "loss": 0.0026, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 822 }, { "completion_length": 133.1666717529297, "epoch": 0.36626613262127283, "grad_norm": 0.8520764708518982, "kl": 0.08879172801971436, "learning_rate": 3.996356095350522e-06, "loss": 0.0036, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 823 }, { "completion_length": 170.0, "epoch": 0.3667111704494882, "grad_norm": 0.8743280172348022, "kl": 0.061830103397369385, "learning_rate": 3.993242641250586e-06, "loss": 0.0025, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 824 }, { "completion_length": 171.1666717529297, "epoch": 0.3671562082777036, "grad_norm": 0.7217028737068176, "kl": 0.056446269154548645, "learning_rate": 3.990125582462171e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 825 }, { "completion_length": 179.1666717529297, "epoch": 0.367601246105919, "grad_norm": 0.6563734412193298, "kl": 0.060024701058864594, "learning_rate": 3.987004926509854e-06, "loss": 0.0024, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 826 }, { "completion_length": 188.0, "epoch": 0.3680462839341344, "grad_norm": 0.6999355554580688, "kl": 0.045960139483213425, "learning_rate": 3.983880680926904e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 827 }, { "completion_length": 98.16667175292969, "epoch": 0.3684913217623498, "grad_norm": 0.04137030988931656, "kl": 0.08726416528224945, "learning_rate": 3.98075285325525e-06, "loss": 0.0038, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 828 }, { "completion_length": 160.1666717529297, "epoch": 0.3689363595905652, "grad_norm": 0.7065590620040894, "kl": 0.061148129403591156, "learning_rate": 3.977621451045469e-06, "loss": 0.0024, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 829 }, { "completion_length": 192.1666717529297, "epoch": 0.3693813974187806, "grad_norm": 0.7527098059654236, "kl": 0.055032290518283844, "learning_rate": 3.974486481856769e-06, "loss": 0.0022, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 830 }, { "completion_length": 157.6666717529297, "epoch": 0.369826435246996, "grad_norm": 0.8093323111534119, "kl": 0.06528506428003311, "learning_rate": 3.971347953256965e-06, "loss": 0.0026, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 831 }, { "completion_length": 136.5, "epoch": 0.3702714730752114, "grad_norm": 0.04360884055495262, "kl": 0.0801323652267456, "learning_rate": 3.968205872822468e-06, "loss": 0.0035, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 832 }, { "completion_length": 173.83334350585938, "epoch": 0.3707165109034268, "grad_norm": 0.7571378350257874, "kl": 0.05690945312380791, "learning_rate": 3.965060248138263e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 833 }, { "completion_length": 87.66667175292969, "epoch": 0.3711615487316422, "grad_norm": 0.06486710906028748, "kl": 0.10215651988983154, "learning_rate": 3.961911086797886e-06, "loss": 0.0044, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 834 }, { "completion_length": 169.83334350585938, "epoch": 0.37160658655985757, "grad_norm": 0.7843642830848694, "kl": 0.05757517367601395, "learning_rate": 3.958758396403418e-06, "loss": 0.0023, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 835 }, { "completion_length": 149.6666717529297, "epoch": 0.372051624388073, "grad_norm": 0.6219406127929688, "kl": 0.11813263595104218, "learning_rate": 3.955602184565452e-06, "loss": 0.0047, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 836 }, { "completion_length": 154.6666717529297, "epoch": 0.3724966622162884, "grad_norm": 0.699906051158905, "kl": 0.06392105668783188, "learning_rate": 3.952442458903087e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 837 }, { "completion_length": 200.0, "epoch": 0.3729417000445038, "grad_norm": 0.036947038024663925, "kl": 0.05290337651968002, "learning_rate": 3.9492792270439015e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 838 }, { "completion_length": 99.33333587646484, "epoch": 0.3733867378727192, "grad_norm": 1.119400978088379, "kl": 0.06555521488189697, "learning_rate": 3.946112496623939e-06, "loss": 0.0026, "reward": 0.3343333601951599, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33433330059051514, "step": 839 }, { "completion_length": 153.6666717529297, "epoch": 0.37383177570093457, "grad_norm": 0.030132388696074486, "kl": 0.057201653718948364, "learning_rate": 3.942942275287688e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 840 }, { "completion_length": 123.16667175292969, "epoch": 0.37427681352915, "grad_norm": 0.027864990755915642, "kl": 0.06204737722873688, "learning_rate": 3.939768570688064e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 841 }, { "completion_length": 161.0, "epoch": 0.37472185135736535, "grad_norm": 0.6778984069824219, "kl": 0.06228325888514519, "learning_rate": 3.936591390486393e-06, "loss": 0.0025, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 842 }, { "completion_length": 165.83334350585938, "epoch": 0.3751668891855808, "grad_norm": 0.03816313296556473, "kl": 0.06406964361667633, "learning_rate": 3.933410742352388e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 843 }, { "completion_length": 161.6666717529297, "epoch": 0.3756119270137962, "grad_norm": 0.8484237790107727, "kl": 0.09677344560623169, "learning_rate": 3.930226633964137e-06, "loss": 0.0039, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 844 }, { "completion_length": 135.5, "epoch": 0.37605696484201157, "grad_norm": 0.03836163505911827, "kl": 0.06688190996646881, "learning_rate": 3.927039073008077e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 845 }, { "completion_length": 200.0, "epoch": 0.376502002670227, "grad_norm": 0.03299454599618912, "kl": 0.052207015454769135, "learning_rate": 3.9238480671789836e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 846 }, { "completion_length": 189.33334350585938, "epoch": 0.37694704049844235, "grad_norm": 0.8058741092681885, "kl": 0.052587032318115234, "learning_rate": 3.920653624179945e-06, "loss": 0.0021, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 847 }, { "completion_length": 156.0, "epoch": 0.3773920783266578, "grad_norm": 0.9629629254341125, "kl": 0.18556594848632812, "learning_rate": 3.917455751722349e-06, "loss": 0.0074, "reward": 0.28033334016799927, "reward_std": 0.23433451354503632, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28033334016799927, "step": 848 }, { "completion_length": 132.33334350585938, "epoch": 0.37783711615487314, "grad_norm": 0.05315527319908142, "kl": 0.07583107054233551, "learning_rate": 3.914254457525862e-06, "loss": 0.0033, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 849 }, { "completion_length": 187.1666717529297, "epoch": 0.37828215398308856, "grad_norm": 0.7499144673347473, "kl": 0.051162999123334885, "learning_rate": 3.9110497493184084e-06, "loss": 0.002, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 850 }, { "completion_length": 153.33334350585938, "epoch": 0.378727191811304, "grad_norm": 0.6836094856262207, "kl": 0.06891427934169769, "learning_rate": 3.9078416348361555e-06, "loss": 0.0028, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 851 }, { "completion_length": 180.1666717529297, "epoch": 0.37917222963951935, "grad_norm": 0.8249133229255676, "kl": 0.04966040700674057, "learning_rate": 3.904630121823495e-06, "loss": 0.002, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 852 }, { "completion_length": 91.0, "epoch": 0.3796172674677348, "grad_norm": 1.4175060987472534, "kl": 0.15148359537124634, "learning_rate": 3.901415218033019e-06, "loss": 0.0061, "reward": 0.36516666412353516, "reward_std": 0.026536142453551292, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36516663432121277, "step": 853 }, { "completion_length": 146.5, "epoch": 0.38006230529595014, "grad_norm": 0.850635290145874, "kl": 0.06043955311179161, "learning_rate": 3.8981969312255075e-06, "loss": 0.0024, "reward": 0.31333333253860474, "reward_std": 0.10494124889373779, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 854 }, { "completion_length": 134.6666717529297, "epoch": 0.38050734312416556, "grad_norm": 0.028085196390748024, "kl": 0.061860792338848114, "learning_rate": 3.894975269169906e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 855 }, { "completion_length": 188.0, "epoch": 0.38095238095238093, "grad_norm": 1.085335612297058, "kl": 0.16889886558055878, "learning_rate": 3.891750239643309e-06, "loss": 0.0068, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 856 }, { "completion_length": 137.5, "epoch": 0.38139741878059635, "grad_norm": 0.8269402384757996, "kl": 0.06682190299034119, "learning_rate": 3.888521850430939e-06, "loss": 0.0027, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 857 }, { "completion_length": 200.0, "epoch": 0.3818424566088118, "grad_norm": 0.026867447420954704, "kl": 0.04740360379219055, "learning_rate": 3.885290109326131e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 858 }, { "completion_length": 200.0, "epoch": 0.38228749443702714, "grad_norm": 0.023793501779437065, "kl": 0.046479731798172, "learning_rate": 3.882055024130307e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 859 }, { "completion_length": 189.83334350585938, "epoch": 0.38273253226524256, "grad_norm": 0.6452245712280273, "kl": 0.050767965614795685, "learning_rate": 3.878816602652965e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 860 }, { "completion_length": 126.5, "epoch": 0.38317757009345793, "grad_norm": 0.02156522497534752, "kl": 0.062334608286619186, "learning_rate": 3.875574852711656e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 861 }, { "completion_length": 184.33334350585938, "epoch": 0.38362260792167335, "grad_norm": 1.0488053560256958, "kl": 0.050179775804281235, "learning_rate": 3.872329782131967e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 862 }, { "completion_length": 177.0, "epoch": 0.3840676457498887, "grad_norm": 0.7218441367149353, "kl": 0.05524627864360809, "learning_rate": 3.869081398747499e-06, "loss": 0.0022, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 863 }, { "completion_length": 169.5, "epoch": 0.38451268357810414, "grad_norm": 0.6805753707885742, "kl": 0.057348743081092834, "learning_rate": 3.865829710399852e-06, "loss": 0.0023, "reward": 0.3341667056083679, "reward_std": 0.0648086816072464, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 864 }, { "completion_length": 190.1666717529297, "epoch": 0.38495772140631956, "grad_norm": 0.7564582824707031, "kl": 0.06877424567937851, "learning_rate": 3.862574724938602e-06, "loss": 0.0028, "reward": 0.0833333358168602, "reward_std": 0.3685337007045746, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333283662796, "step": 865 }, { "completion_length": 130.6666717529297, "epoch": 0.38540275923453493, "grad_norm": 0.028254088014364243, "kl": 0.05616528540849686, "learning_rate": 3.859316450221286e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 866 }, { "completion_length": 155.33334350585938, "epoch": 0.38584779706275035, "grad_norm": 0.6590713858604431, "kl": 0.05754072964191437, "learning_rate": 3.856054894113382e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 867 }, { "completion_length": 155.5, "epoch": 0.3862928348909657, "grad_norm": 0.11690230667591095, "kl": 0.07130900025367737, "learning_rate": 3.852790064488286e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 868 }, { "completion_length": 193.6666717529297, "epoch": 0.38673787271918114, "grad_norm": 0.7855741381645203, "kl": 0.0497148297727108, "learning_rate": 3.8495219692273e-06, "loss": 0.002, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 869 }, { "completion_length": 149.1666717529297, "epoch": 0.3871829105473965, "grad_norm": 0.749443769454956, "kl": 0.044368281960487366, "learning_rate": 3.846250616219607e-06, "loss": 0.0018, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 870 }, { "completion_length": 147.5, "epoch": 0.38762794837561193, "grad_norm": 0.779934823513031, "kl": 0.09931713342666626, "learning_rate": 3.842976013362255e-06, "loss": 0.004, "reward": 0.2381666600704193, "reward_std": 0.3376213312149048, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2381666600704193, "step": 871 }, { "completion_length": 162.83334350585938, "epoch": 0.38807298620382735, "grad_norm": 0.0258543211966753, "kl": 0.05789042264223099, "learning_rate": 3.839698168560137e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 872 }, { "completion_length": 178.6666717529297, "epoch": 0.3885180240320427, "grad_norm": 0.7851041555404663, "kl": 0.0560050830245018, "learning_rate": 3.8364170897259715e-06, "loss": 0.0022, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 873 }, { "completion_length": 160.6666717529297, "epoch": 0.38896306186025814, "grad_norm": 0.7662412524223328, "kl": 0.07136371731758118, "learning_rate": 3.833132784780284e-06, "loss": 0.0029, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 874 }, { "completion_length": 160.33334350585938, "epoch": 0.3894080996884735, "grad_norm": 0.6216509938240051, "kl": 0.05862082540988922, "learning_rate": 3.82984526165139e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 875 }, { "completion_length": 162.33334350585938, "epoch": 0.38985313751668893, "grad_norm": 0.6714538335800171, "kl": 0.06793516874313354, "learning_rate": 3.8265545282753706e-06, "loss": 0.0027, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 876 }, { "completion_length": 176.0, "epoch": 0.3902981753449043, "grad_norm": 0.6328492760658264, "kl": 0.07553236186504364, "learning_rate": 3.823260592596058e-06, "loss": 0.003, "reward": 0.20350000262260437, "reward_std": 0.36431294679641724, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20350000262260437, "step": 877 }, { "completion_length": 200.0, "epoch": 0.3907432131731197, "grad_norm": 0.6953088045120239, "kl": 0.03606265038251877, "learning_rate": 3.819963462565015e-06, "loss": 0.0014, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 878 }, { "completion_length": 168.6666717529297, "epoch": 0.39118825100133514, "grad_norm": 0.6372416615486145, "kl": 0.06722667813301086, "learning_rate": 3.816663146141514e-06, "loss": 0.0027, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 879 }, { "completion_length": 159.5, "epoch": 0.3916332888295505, "grad_norm": 0.8337939381599426, "kl": 0.06765174865722656, "learning_rate": 3.813359651292522e-06, "loss": 0.0027, "reward": 0.31333333253860474, "reward_std": 0.06864885985851288, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 880 }, { "completion_length": 164.5, "epoch": 0.39207832665776593, "grad_norm": 0.6367321610450745, "kl": 0.049886442720890045, "learning_rate": 3.810052985992677e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 881 }, { "completion_length": 151.1666717529297, "epoch": 0.3925233644859813, "grad_norm": 0.03452099859714508, "kl": 0.0649208277463913, "learning_rate": 3.8067431582242697e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 882 }, { "completion_length": 150.0, "epoch": 0.3929684023141967, "grad_norm": 0.8333684802055359, "kl": 0.061620742082595825, "learning_rate": 3.8034301759772263e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 883 }, { "completion_length": 132.33334350585938, "epoch": 0.3934134401424121, "grad_norm": 0.03911470249295235, "kl": 0.07865100353956223, "learning_rate": 3.8001140472490887e-06, "loss": 0.0034, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 884 }, { "completion_length": 130.1666717529297, "epoch": 0.3938584779706275, "grad_norm": 1.2934985160827637, "kl": 0.05334341526031494, "learning_rate": 3.796794780044992e-06, "loss": 0.0021, "reward": 0.17083333432674408, "reward_std": 0.2419474571943283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17083333432674408, "step": 885 }, { "completion_length": 139.1666717529297, "epoch": 0.3943035157988429, "grad_norm": 0.8030723929405212, "kl": 0.06765462458133698, "learning_rate": 3.7934723823776494e-06, "loss": 0.0027, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 886 }, { "completion_length": 105.5, "epoch": 0.3947485536270583, "grad_norm": 0.02923821657896042, "kl": 0.06622253358364105, "learning_rate": 3.7901468622673303e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 887 }, { "completion_length": 140.6666717529297, "epoch": 0.3951935914552737, "grad_norm": 0.7183970808982849, "kl": 0.0615064799785614, "learning_rate": 3.786818227741842e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 888 }, { "completion_length": 165.33334350585938, "epoch": 0.3956386292834891, "grad_norm": 0.733579158782959, "kl": 0.04961232468485832, "learning_rate": 3.78348648683651e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 889 }, { "completion_length": 171.33334350585938, "epoch": 0.3960836671117045, "grad_norm": 0.7921627163887024, "kl": 0.05507838726043701, "learning_rate": 3.780151647594159e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 890 }, { "completion_length": 171.0, "epoch": 0.3965287049399199, "grad_norm": 0.7230894565582275, "kl": 0.057939350605010986, "learning_rate": 3.7768137180650915e-06, "loss": 0.0023, "reward": 0.29233333468437195, "reward_std": 0.12961584329605103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 891 }, { "completion_length": 155.0, "epoch": 0.3969737427681353, "grad_norm": 0.7460453510284424, "kl": 0.08669416606426239, "learning_rate": 3.773472706307072e-06, "loss": 0.0035, "reward": 0.29216665029525757, "reward_std": 0.2053488940000534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216665029525757, "step": 892 }, { "completion_length": 127.0, "epoch": 0.39741878059635066, "grad_norm": 0.9885587692260742, "kl": 0.23846940696239471, "learning_rate": 3.7701286203853036e-06, "loss": 0.0095, "reward": 0.3103333115577698, "reward_std": 0.16084983944892883, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3103333115577698, "step": 893 }, { "completion_length": 133.1666717529297, "epoch": 0.3978638184245661, "grad_norm": 1.0468348264694214, "kl": 0.241329163312912, "learning_rate": 3.7667814683724126e-06, "loss": 0.0097, "reward": 0.34300002455711365, "reward_std": 0.08083315193653107, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34299999475479126, "step": 894 }, { "completion_length": 140.33334350585938, "epoch": 0.3983088562527815, "grad_norm": 0.965579628944397, "kl": 0.06833823025226593, "learning_rate": 3.7634312583484244e-06, "loss": 0.0027, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 895 }, { "completion_length": 148.5, "epoch": 0.3987538940809969, "grad_norm": 0.7001117467880249, "kl": 0.05449951812624931, "learning_rate": 3.7600779984007485e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 896 }, { "completion_length": 171.5, "epoch": 0.3991989319092123, "grad_norm": 0.678555965423584, "kl": 0.04612820968031883, "learning_rate": 3.756721696624156e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 897 }, { "completion_length": 177.6666717529297, "epoch": 0.39964396973742766, "grad_norm": 0.915947675704956, "kl": 0.05997948348522186, "learning_rate": 3.7533623611207607e-06, "loss": 0.0024, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 898 }, { "completion_length": 200.0, "epoch": 0.4000890075656431, "grad_norm": 0.05163982883095741, "kl": 0.049683578312397, "learning_rate": 3.7500000000000005e-06, "loss": 0.002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 899 }, { "completion_length": 168.6666717529297, "epoch": 0.40053404539385845, "grad_norm": 0.7753753066062927, "kl": 0.05140147730708122, "learning_rate": 3.7466346213786165e-06, "loss": 0.0021, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 900 }, { "completion_length": 176.1666717529297, "epoch": 0.4009790832220739, "grad_norm": 0.027781542390584946, "kl": 0.054599761962890625, "learning_rate": 3.743266233380635e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 901 }, { "completion_length": 83.66667175292969, "epoch": 0.4014241210502893, "grad_norm": 0.02804761379957199, "kl": 0.07853664457798004, "learning_rate": 3.7398948441373454e-06, "loss": 0.0034, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 902 }, { "completion_length": 141.6666717529297, "epoch": 0.40186915887850466, "grad_norm": 0.028074469417333603, "kl": 0.056927796453237534, "learning_rate": 3.7365204617872834e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 903 }, { "completion_length": 177.83334350585938, "epoch": 0.4023141967067201, "grad_norm": 0.8167720437049866, "kl": 0.06986045837402344, "learning_rate": 3.7331430944762105e-06, "loss": 0.0028, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 904 }, { "completion_length": 187.1666717529297, "epoch": 0.40275923453493545, "grad_norm": 0.7425169944763184, "kl": 0.04808403179049492, "learning_rate": 3.729762750357092e-06, "loss": 0.0019, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 905 }, { "completion_length": 139.5, "epoch": 0.4032042723631509, "grad_norm": 0.03980370983481407, "kl": 0.062199126929044724, "learning_rate": 3.7263794375900803e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 906 }, { "completion_length": 145.5, "epoch": 0.40364931019136624, "grad_norm": 0.022532425820827484, "kl": 0.056330885738134384, "learning_rate": 3.7229931643424943e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 907 }, { "completion_length": 198.83334350585938, "epoch": 0.40409434801958166, "grad_norm": 0.625041127204895, "kl": 0.05984826013445854, "learning_rate": 3.7196039387887995e-06, "loss": 0.0024, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 908 }, { "completion_length": 192.5, "epoch": 0.4045393858477971, "grad_norm": 0.7206208109855652, "kl": 0.050445057451725006, "learning_rate": 3.7162117691105894e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 909 }, { "completion_length": 200.0, "epoch": 0.40498442367601245, "grad_norm": 0.028280407190322876, "kl": 0.040611665695905685, "learning_rate": 3.71281666349656e-06, "loss": 0.0016, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 910 }, { "completion_length": 184.83334350585938, "epoch": 0.4054294615042279, "grad_norm": 0.7396180629730225, "kl": 0.04777958244085312, "learning_rate": 3.7094186301425006e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 911 }, { "completion_length": 191.33334350585938, "epoch": 0.40587449933244324, "grad_norm": 1.011993408203125, "kl": 0.04695138335227966, "learning_rate": 3.706017677251266e-06, "loss": 0.0019, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 912 }, { "completion_length": 114.0, "epoch": 0.40631953716065866, "grad_norm": 0.029970509931445122, "kl": 0.06655572354793549, "learning_rate": 3.7026138130327547e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 913 }, { "completion_length": 180.33334350585938, "epoch": 0.40676457498887403, "grad_norm": 0.660056471824646, "kl": 0.041105978190898895, "learning_rate": 3.6992070457038998e-06, "loss": 0.0016, "reward": 0.29233333468437195, "reward_std": 0.10247080028057098, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 914 }, { "completion_length": 183.1666717529297, "epoch": 0.40720961281708945, "grad_norm": 0.7320570945739746, "kl": 0.045362964272499084, "learning_rate": 3.6957973834886387e-06, "loss": 0.0018, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 915 }, { "completion_length": 159.0, "epoch": 0.4076546506453049, "grad_norm": 0.7662878036499023, "kl": 0.05844952166080475, "learning_rate": 3.692384834617897e-06, "loss": 0.0023, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 916 }, { "completion_length": 184.1666717529297, "epoch": 0.40809968847352024, "grad_norm": 0.7281984090805054, "kl": 0.05307254195213318, "learning_rate": 3.688969407329569e-06, "loss": 0.0021, "reward": 0.27133333683013916, "reward_std": 0.12340772151947021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 917 }, { "completion_length": 177.6666717529297, "epoch": 0.40854472630173566, "grad_norm": 0.7347136735916138, "kl": 0.05313348397612572, "learning_rate": 3.6855511098684996e-06, "loss": 0.0021, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 918 }, { "completion_length": 200.0, "epoch": 0.40898976412995103, "grad_norm": 0.7880335450172424, "kl": 0.03796866908669472, "learning_rate": 3.682129950486459e-06, "loss": 0.0015, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 919 }, { "completion_length": 171.83334350585938, "epoch": 0.40943480195816645, "grad_norm": 0.03392661735415459, "kl": 0.05389983206987381, "learning_rate": 3.678705937442128e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 920 }, { "completion_length": 159.33334350585938, "epoch": 0.4098798397863818, "grad_norm": 0.8597182035446167, "kl": 0.0735437273979187, "learning_rate": 3.675279079001077e-06, "loss": 0.0029, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 921 }, { "completion_length": 196.5, "epoch": 0.41032487761459724, "grad_norm": 0.7532240152359009, "kl": 0.053770922124385834, "learning_rate": 3.6718493834357415e-06, "loss": 0.0022, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 922 }, { "completion_length": 153.6666717529297, "epoch": 0.41076991544281266, "grad_norm": 0.7527331709861755, "kl": 0.05144277960062027, "learning_rate": 3.6684168590254103e-06, "loss": 0.0021, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 923 }, { "completion_length": 182.5, "epoch": 0.411214953271028, "grad_norm": 0.5914585590362549, "kl": 0.05143951624631882, "learning_rate": 3.6649815140561995e-06, "loss": 0.0021, "reward": 0.21649998426437378, "reward_std": 0.3328048884868622, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21649998426437378, "step": 924 }, { "completion_length": 185.0, "epoch": 0.41165999109924345, "grad_norm": 0.6755291819572449, "kl": 0.04394887015223503, "learning_rate": 3.6615433568210313e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 925 }, { "completion_length": 194.83334350585938, "epoch": 0.4121050289274588, "grad_norm": 0.8636181354522705, "kl": 0.047952380031347275, "learning_rate": 3.658102395619621e-06, "loss": 0.0019, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 926 }, { "completion_length": 200.0, "epoch": 0.41255006675567424, "grad_norm": 0.7906734347343445, "kl": 0.02828504703938961, "learning_rate": 3.65465863875845e-06, "loss": 0.0011, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 927 }, { "completion_length": 149.1666717529297, "epoch": 0.4129951045838896, "grad_norm": 0.8365933895111084, "kl": 0.08980211615562439, "learning_rate": 3.651212094550748e-06, "loss": 0.0036, "reward": 0.26616665720939636, "reward_std": 0.2690356373786926, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26616665720939636, "step": 928 }, { "completion_length": 199.6666717529297, "epoch": 0.413440142412105, "grad_norm": 0.6435677409172058, "kl": 0.03840624541044235, "learning_rate": 3.6477627713164767e-06, "loss": 0.0015, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 929 }, { "completion_length": 140.1666717529297, "epoch": 0.41388518024032045, "grad_norm": 0.02791665680706501, "kl": 0.0602748803794384, "learning_rate": 3.6443106773823025e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 930 }, { "completion_length": 190.0, "epoch": 0.4143302180685358, "grad_norm": 0.8331409096717834, "kl": 0.05726364254951477, "learning_rate": 3.6408558210815814e-06, "loss": 0.0023, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 931 }, { "completion_length": 193.1666717529297, "epoch": 0.41477525589675124, "grad_norm": 0.7008532285690308, "kl": 0.05072779580950737, "learning_rate": 3.6373982107543398e-06, "loss": 0.002, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 932 }, { "completion_length": 136.33334350585938, "epoch": 0.4152202937249666, "grad_norm": 0.02707936242222786, "kl": 0.05186247080564499, "learning_rate": 3.63393785474725e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 933 }, { "completion_length": 145.5, "epoch": 0.415665331553182, "grad_norm": 0.7188604474067688, "kl": 0.06224457547068596, "learning_rate": 3.6304747614136126e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 934 }, { "completion_length": 117.66667175292969, "epoch": 0.4161103693813974, "grad_norm": 1.1184520721435547, "kl": 0.0940161943435669, "learning_rate": 3.6270089391133378e-06, "loss": 0.0038, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 935 }, { "completion_length": 158.5, "epoch": 0.4165554072096128, "grad_norm": 0.683738648891449, "kl": 0.05458883196115494, "learning_rate": 3.6235403962129218e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 936 }, { "completion_length": 176.5, "epoch": 0.41700044503782824, "grad_norm": 0.7745259404182434, "kl": 0.053183965384960175, "learning_rate": 3.6200691410854284e-06, "loss": 0.0021, "reward": 0.31316667795181274, "reward_std": 0.0688314288854599, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 937 }, { "completion_length": 134.5, "epoch": 0.4174454828660436, "grad_norm": 0.028597315773367882, "kl": 0.06111491844058037, "learning_rate": 3.61659518211047e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 938 }, { "completion_length": 157.0, "epoch": 0.417890520694259, "grad_norm": 0.05513963848352432, "kl": 0.07339125871658325, "learning_rate": 3.6131185276741846e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 939 }, { "completion_length": 173.0, "epoch": 0.4183355585224744, "grad_norm": 0.7153098583221436, "kl": 0.053828366100788116, "learning_rate": 3.6096391861692183e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 940 }, { "completion_length": 193.0, "epoch": 0.4187805963506898, "grad_norm": 0.738511323928833, "kl": 0.05540754646062851, "learning_rate": 3.6061571659947032e-06, "loss": 0.0022, "reward": 0.29233333468437195, "reward_std": 0.12961584329605103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 941 }, { "completion_length": 144.33334350585938, "epoch": 0.4192256341789052, "grad_norm": 0.01766749657690525, "kl": 0.05199790000915527, "learning_rate": 3.602672475556237e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 942 }, { "completion_length": 131.1666717529297, "epoch": 0.4196706720071206, "grad_norm": 0.02660481631755829, "kl": 0.05784265324473381, "learning_rate": 3.5991851232658647e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 943 }, { "completion_length": 191.6666717529297, "epoch": 0.420115709835336, "grad_norm": 0.6994498372077942, "kl": 0.039949461817741394, "learning_rate": 3.595695117542057e-06, "loss": 0.0016, "reward": 0.27116668224334717, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 944 }, { "completion_length": 166.83334350585938, "epoch": 0.4205607476635514, "grad_norm": 0.8509209752082825, "kl": 0.10748060047626495, "learning_rate": 3.5922024668096885e-06, "loss": 0.0043, "reward": 0.29216668009757996, "reward_std": 0.06493817269802094, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 945 }, { "completion_length": 143.1666717529297, "epoch": 0.4210057854917668, "grad_norm": 0.014815707691013813, "kl": 0.04326681047677994, "learning_rate": 3.5887071795000204e-06, "loss": 0.002, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 946 }, { "completion_length": 182.83334350585938, "epoch": 0.4214508233199822, "grad_norm": 0.8161847591400146, "kl": 0.0375329852104187, "learning_rate": 3.585209264050678e-06, "loss": 0.0015, "reward": 0.22949999570846558, "reward_std": 0.12337382137775421, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949998080730438, "step": 947 }, { "completion_length": 171.1666717529297, "epoch": 0.4218958611481976, "grad_norm": 0.7375685572624207, "kl": 0.057841382920742035, "learning_rate": 3.5817087289056305e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 948 }, { "completion_length": 155.0, "epoch": 0.42234089897641297, "grad_norm": 0.02155291475355625, "kl": 0.051909781992435455, "learning_rate": 3.5782055825151722e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 949 }, { "completion_length": 171.33334350585938, "epoch": 0.4227859368046284, "grad_norm": 0.635463297367096, "kl": 0.049717407673597336, "learning_rate": 3.5746998333358994e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 950 }, { "completion_length": 135.83334350585938, "epoch": 0.4232309746328438, "grad_norm": 0.04738845303654671, "kl": 0.07275395095348358, "learning_rate": 3.571191489830693e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 951 }, { "completion_length": 148.5, "epoch": 0.4236760124610592, "grad_norm": 0.734367311000824, "kl": 0.0618259459733963, "learning_rate": 3.567680560468696e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 952 }, { "completion_length": 185.1666717529297, "epoch": 0.4241210502892746, "grad_norm": 0.6942402720451355, "kl": 0.04942780360579491, "learning_rate": 3.564167053725293e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 953 }, { "completion_length": 149.83334350585938, "epoch": 0.42456608811748997, "grad_norm": 0.6157485246658325, "kl": 0.048379119485616684, "learning_rate": 3.560650978082092e-06, "loss": 0.0019, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 954 }, { "completion_length": 181.33334350585938, "epoch": 0.4250111259457054, "grad_norm": 0.6741864085197449, "kl": 0.05080154538154602, "learning_rate": 3.5571323420269e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 955 }, { "completion_length": 164.33334350585938, "epoch": 0.42545616377392076, "grad_norm": 0.6833618879318237, "kl": 0.04868559539318085, "learning_rate": 3.5536111540537076e-06, "loss": 0.0019, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 956 }, { "completion_length": 153.83334350585938, "epoch": 0.4259012016021362, "grad_norm": 0.8367120027542114, "kl": 0.08897934854030609, "learning_rate": 3.5500874226626635e-06, "loss": 0.0036, "reward": 0.2783333361148834, "reward_std": 0.23923347890377045, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27833330631256104, "step": 957 }, { "completion_length": 180.1666717529297, "epoch": 0.4263462394303516, "grad_norm": 0.8038119673728943, "kl": 0.06598465144634247, "learning_rate": 3.546561156360057e-06, "loss": 0.0026, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 958 }, { "completion_length": 167.6666717529297, "epoch": 0.42679127725856697, "grad_norm": 0.8257107734680176, "kl": 0.06047782301902771, "learning_rate": 3.543032363658297e-06, "loss": 0.0024, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 959 }, { "completion_length": 151.1666717529297, "epoch": 0.4272363150867824, "grad_norm": 0.029956277459859848, "kl": 0.05927738547325134, "learning_rate": 3.5395010530758913e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 960 }, { "completion_length": 185.0, "epoch": 0.42768135291499776, "grad_norm": 0.7257969379425049, "kl": 0.043350182473659515, "learning_rate": 3.535967233137424e-06, "loss": 0.0017, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 961 }, { "completion_length": 164.33334350585938, "epoch": 0.4281263907432132, "grad_norm": 0.019633714109659195, "kl": 0.04942721128463745, "learning_rate": 3.5324309123735396e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 962 }, { "completion_length": 200.0, "epoch": 0.42857142857142855, "grad_norm": 0.6552579402923584, "kl": 0.055333852767944336, "learning_rate": 3.5288920993209175e-06, "loss": 0.0022, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 963 }, { "completion_length": 172.0, "epoch": 0.42901646639964397, "grad_norm": 0.8508467078208923, "kl": 0.047447673976421356, "learning_rate": 3.5253508025222545e-06, "loss": 0.0019, "reward": 0.2615000009536743, "reward_std": 0.13879159092903137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2615000009536743, "step": 964 }, { "completion_length": 199.5, "epoch": 0.4294615042278594, "grad_norm": 0.6771219968795776, "kl": 0.0449552908539772, "learning_rate": 3.5218070305262427e-06, "loss": 0.0018, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 965 }, { "completion_length": 149.5, "epoch": 0.42990654205607476, "grad_norm": 0.046515047550201416, "kl": 0.06530429422855377, "learning_rate": 3.5182607918875495e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 966 }, { "completion_length": 180.1666717529297, "epoch": 0.4303515798842902, "grad_norm": 0.645074188709259, "kl": 0.051890529692173004, "learning_rate": 3.514712095166797e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 967 }, { "completion_length": 177.5, "epoch": 0.43079661771250555, "grad_norm": 0.7000871896743774, "kl": 0.05255164951086044, "learning_rate": 3.511160948930539e-06, "loss": 0.0021, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 968 }, { "completion_length": 149.1666717529297, "epoch": 0.43124165554072097, "grad_norm": 0.02208912931382656, "kl": 0.05640026181936264, "learning_rate": 3.507607361751248e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 969 }, { "completion_length": 168.6666717529297, "epoch": 0.43168669336893634, "grad_norm": 0.032338086515665054, "kl": 0.059976160526275635, "learning_rate": 3.504051342207282e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 970 }, { "completion_length": 151.0, "epoch": 0.43213173119715176, "grad_norm": 0.8209261894226074, "kl": 0.064157634973526, "learning_rate": 3.5004928988828748e-06, "loss": 0.0026, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 971 }, { "completion_length": 197.5, "epoch": 0.4325767690253672, "grad_norm": 0.6907802820205688, "kl": 0.05787642300128937, "learning_rate": 3.4969320403681105e-06, "loss": 0.0023, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 972 }, { "completion_length": 199.83334350585938, "epoch": 0.43302180685358255, "grad_norm": 0.802642285823822, "kl": 0.044948406517505646, "learning_rate": 3.493368775258904e-06, "loss": 0.0018, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 973 }, { "completion_length": 168.83334350585938, "epoch": 0.43346684468179797, "grad_norm": 0.6813015341758728, "kl": 0.04564934968948364, "learning_rate": 3.489803112156978e-06, "loss": 0.0018, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 974 }, { "completion_length": 161.33334350585938, "epoch": 0.43391188251001334, "grad_norm": 0.028628792613744736, "kl": 0.05609843134880066, "learning_rate": 3.486235059669846e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 975 }, { "completion_length": 172.83334350585938, "epoch": 0.43435692033822876, "grad_norm": 0.7536810636520386, "kl": 0.07381969690322876, "learning_rate": 3.482664626410787e-06, "loss": 0.003, "reward": 0.3341667056083679, "reward_std": 0.0648086816072464, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 976 }, { "completion_length": 185.5, "epoch": 0.4348019581664441, "grad_norm": 0.7587660551071167, "kl": 0.04436537250876427, "learning_rate": 3.47909182099883e-06, "loss": 0.0018, "reward": 0.250333309173584, "reward_std": 0.1122509092092514, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250333309173584, "step": 977 }, { "completion_length": 174.6666717529297, "epoch": 0.43524699599465955, "grad_norm": 0.7341634035110474, "kl": 0.04806718975305557, "learning_rate": 3.4755166520587297e-06, "loss": 0.0019, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 978 }, { "completion_length": 200.0, "epoch": 0.43569203382287497, "grad_norm": 0.01764507032930851, "kl": 0.046592336148023605, "learning_rate": 3.4719391282209437e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 979 }, { "completion_length": 132.1666717529297, "epoch": 0.43613707165109034, "grad_norm": 0.921298086643219, "kl": 0.15562453866004944, "learning_rate": 3.4683592581216173e-06, "loss": 0.0062, "reward": 0.3048333525657654, "reward_std": 0.1743220090866089, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.304833322763443, "step": 980 }, { "completion_length": 119.16667175292969, "epoch": 0.43658210947930576, "grad_norm": 0.024678191170096397, "kl": 0.06529170274734497, "learning_rate": 3.464777050402559e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 981 }, { "completion_length": 144.1666717529297, "epoch": 0.4370271473075211, "grad_norm": 0.016610290855169296, "kl": 0.04939974471926689, "learning_rate": 3.461192513711219e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 982 }, { "completion_length": 149.83334350585938, "epoch": 0.43747218513573655, "grad_norm": 0.029446417465806007, "kl": 0.05275744944810867, "learning_rate": 3.4576056567006728e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 983 }, { "completion_length": 178.6666717529297, "epoch": 0.4379172229639519, "grad_norm": 0.7193699479103088, "kl": 0.057997625321149826, "learning_rate": 3.454016488029592e-06, "loss": 0.0023, "reward": 0.29216668009757996, "reward_std": 0.10255226492881775, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 984 }, { "completion_length": 132.0, "epoch": 0.43836226079216734, "grad_norm": 0.016964778304100037, "kl": 0.052684418857097626, "learning_rate": 3.4504250163622334e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 985 }, { "completion_length": 191.6666717529297, "epoch": 0.43880729862038276, "grad_norm": 0.7094153761863708, "kl": 0.03705962002277374, "learning_rate": 3.446831250368412e-06, "loss": 0.0015, "reward": 0.27116668224334717, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27116668224334717, "step": 986 }, { "completion_length": 182.5, "epoch": 0.4392523364485981, "grad_norm": 0.7157971858978271, "kl": 0.05741278454661369, "learning_rate": 3.443235198723479e-06, "loss": 0.0023, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 987 }, { "completion_length": 195.0, "epoch": 0.43969737427681355, "grad_norm": 0.7865637540817261, "kl": 0.03509274870157242, "learning_rate": 3.4396368701083073e-06, "loss": 0.0014, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 988 }, { "completion_length": 162.1666717529297, "epoch": 0.4401424121050289, "grad_norm": 1.0707346200942993, "kl": 0.06345522403717041, "learning_rate": 3.436036273209261e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 989 }, { "completion_length": 157.6666717529297, "epoch": 0.44058744993324434, "grad_norm": 0.7505767345428467, "kl": 0.06585465371608734, "learning_rate": 3.432433416718184e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 990 }, { "completion_length": 197.6666717529297, "epoch": 0.4410324877614597, "grad_norm": 0.7079508900642395, "kl": 0.05217638239264488, "learning_rate": 3.428828309332375e-06, "loss": 0.0021, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 991 }, { "completion_length": 125.33333587646484, "epoch": 0.4414775255896751, "grad_norm": 0.022779956459999084, "kl": 0.06337767839431763, "learning_rate": 3.4252209597545634e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 992 }, { "completion_length": 200.0, "epoch": 0.44192256341789055, "grad_norm": 0.02871246449649334, "kl": 0.045304443687200546, "learning_rate": 3.4216113766928926e-06, "loss": 0.0018, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 993 }, { "completion_length": 196.33334350585938, "epoch": 0.4423676012461059, "grad_norm": 0.7327296137809753, "kl": 0.04095136374235153, "learning_rate": 3.4179995688608996e-06, "loss": 0.0016, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 994 }, { "completion_length": 179.0, "epoch": 0.44281263907432133, "grad_norm": 0.6812580823898315, "kl": 0.054273735731840134, "learning_rate": 3.414385544977489e-06, "loss": 0.0022, "reward": 0.24016666412353516, "reward_std": 0.23370617628097534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24016666412353516, "step": 995 }, { "completion_length": 147.0, "epoch": 0.4432576769025367, "grad_norm": 0.029301166534423828, "kl": 0.05664734169840813, "learning_rate": 3.4107693137669167e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 996 }, { "completion_length": 187.83334350585938, "epoch": 0.4437027147307521, "grad_norm": 0.7347012758255005, "kl": 0.0524878203868866, "learning_rate": 3.4071508839587676e-06, "loss": 0.0021, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 997 }, { "completion_length": 175.83334350585938, "epoch": 0.4441477525589675, "grad_norm": 0.7827162742614746, "kl": 0.05347862094640732, "learning_rate": 3.403530264287931e-06, "loss": 0.0021, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 998 }, { "completion_length": 179.0, "epoch": 0.4445927903871829, "grad_norm": 0.0856582522392273, "kl": 0.05322853848338127, "learning_rate": 3.3999074634945854e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 999 }, { "completion_length": 193.0, "epoch": 0.44503782821539833, "grad_norm": 0.7221679091453552, "kl": 0.03924994543194771, "learning_rate": 3.396282490324175e-06, "loss": 0.0016, "reward": 0.2711666524410248, "reward_std": 0.09453975409269333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 1000 }, { "completion_length": 118.0, "epoch": 0.4454828660436137, "grad_norm": 0.025427184998989105, "kl": 0.05986163020133972, "learning_rate": 3.392655353527385e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1001 }, { "completion_length": 160.83334350585938, "epoch": 0.4459279038718291, "grad_norm": 0.8170287013053894, "kl": 0.05823221802711487, "learning_rate": 3.389026061860126e-06, "loss": 0.0023, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1002 }, { "completion_length": 200.0, "epoch": 0.4463729417000445, "grad_norm": 0.030482221394777298, "kl": 0.04448040574789047, "learning_rate": 3.3853946240835113e-06, "loss": 0.0018, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1003 }, { "completion_length": 160.5, "epoch": 0.4468179795282599, "grad_norm": 0.6799116134643555, "kl": 0.0437270812690258, "learning_rate": 3.3817610489638313e-06, "loss": 0.0017, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1004 }, { "completion_length": 183.83334350585938, "epoch": 0.4472630173564753, "grad_norm": 0.7161902785301208, "kl": 0.04126816242933273, "learning_rate": 3.3781253452725395e-06, "loss": 0.0017, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 1005 }, { "completion_length": 198.83334350585938, "epoch": 0.4477080551846907, "grad_norm": 0.7520974278450012, "kl": 0.04543256014585495, "learning_rate": 3.3744875217862266e-06, "loss": 0.0018, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1006 }, { "completion_length": 154.33334350585938, "epoch": 0.4481530930129061, "grad_norm": 0.8436493277549744, "kl": 0.061019204556941986, "learning_rate": 3.3708475872866e-06, "loss": 0.0024, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1007 }, { "completion_length": 200.0, "epoch": 0.4485981308411215, "grad_norm": 0.6720322966575623, "kl": 0.033872686326503754, "learning_rate": 3.3672055505604624e-06, "loss": 0.0014, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 1008 }, { "completion_length": 197.83334350585938, "epoch": 0.4490431686693369, "grad_norm": 0.7746611833572388, "kl": 0.054266974329948425, "learning_rate": 3.3635614203996938e-06, "loss": 0.0022, "reward": 0.27116668224334717, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27116668224334717, "step": 1009 }, { "completion_length": 193.0, "epoch": 0.4494882064975523, "grad_norm": 0.725085973739624, "kl": 0.09663119912147522, "learning_rate": 3.3599152056012246e-06, "loss": 0.0039, "reward": 0.032499998807907104, "reward_std": 0.30418860912323, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.032499998807907104, "step": 1010 }, { "completion_length": 162.83334350585938, "epoch": 0.4499332443257677, "grad_norm": 0.6782302856445312, "kl": 0.051349788904190063, "learning_rate": 3.356266914967021e-06, "loss": 0.0021, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1011 }, { "completion_length": 159.1666717529297, "epoch": 0.45037828215398307, "grad_norm": 0.7623798847198486, "kl": 0.09882908314466476, "learning_rate": 3.352616557304057e-06, "loss": 0.004, "reward": 0.21299999952316284, "reward_std": 0.25778907537460327, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21299999952316284, "step": 1012 }, { "completion_length": 166.5, "epoch": 0.4508233199821985, "grad_norm": 0.7910959720611572, "kl": 0.05198938772082329, "learning_rate": 3.3489641414242986e-06, "loss": 0.0021, "reward": 0.35466668009757996, "reward_std": 0.09423092007637024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 1013 }, { "completion_length": 183.5, "epoch": 0.4512683578104139, "grad_norm": 0.8101857304573059, "kl": 0.05365820974111557, "learning_rate": 3.3453096761446795e-06, "loss": 0.0021, "reward": 0.20000001788139343, "reward_std": 0.25355708599090576, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20000001788139343, "step": 1014 }, { "completion_length": 196.0, "epoch": 0.4517133956386293, "grad_norm": 0.8162935376167297, "kl": 0.04988335818052292, "learning_rate": 3.34165317028708e-06, "loss": 0.002, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1015 }, { "completion_length": 192.0, "epoch": 0.4521584334668447, "grad_norm": 0.7259402871131897, "kl": 0.05293646082282066, "learning_rate": 3.3379946326783074e-06, "loss": 0.0021, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 1016 }, { "completion_length": 166.1666717529297, "epoch": 0.45260347129506007, "grad_norm": 0.6532735228538513, "kl": 0.04412105679512024, "learning_rate": 3.3343340721500743e-06, "loss": 0.0018, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1017 }, { "completion_length": 148.33334350585938, "epoch": 0.4530485091232755, "grad_norm": 0.7425003051757812, "kl": 0.10510390996932983, "learning_rate": 3.3306714975389742e-06, "loss": 0.0042, "reward": 0.2293333262205124, "reward_std": 0.3592585027217865, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 1018 }, { "completion_length": 170.33334350585938, "epoch": 0.45349354695149086, "grad_norm": 0.804882287979126, "kl": 0.04764273762702942, "learning_rate": 3.3270069176864644e-06, "loss": 0.0019, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1019 }, { "completion_length": 128.5, "epoch": 0.4539385847797063, "grad_norm": 0.9536888599395752, "kl": 0.06011280044913292, "learning_rate": 3.3233403414388432e-06, "loss": 0.0024, "reward": 0.3341667056083679, "reward_std": 0.064808689057827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1020 }, { "completion_length": 129.1666717529297, "epoch": 0.4543836226079217, "grad_norm": 0.025884155184030533, "kl": 0.05302686616778374, "learning_rate": 3.319671777647227e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1021 }, { "completion_length": 182.33334350585938, "epoch": 0.45482866043613707, "grad_norm": 0.7451061606407166, "kl": 0.05932309851050377, "learning_rate": 3.3160012351675304e-06, "loss": 0.0024, "reward": 0.057500001043081284, "reward_std": 0.39164361357688904, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.057500001043081284, "step": 1022 }, { "completion_length": 179.33334350585938, "epoch": 0.4552736982643525, "grad_norm": 0.7858273386955261, "kl": 0.05125393718481064, "learning_rate": 3.312328722860445e-06, "loss": 0.0021, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 1023 }, { "completion_length": 133.6666717529297, "epoch": 0.45571873609256786, "grad_norm": 0.837760329246521, "kl": 0.15334957838058472, "learning_rate": 3.3086542495914176e-06, "loss": 0.0061, "reward": 0.2828333377838135, "reward_std": 0.22821079194545746, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2828333377838135, "step": 1024 }, { "completion_length": 169.0, "epoch": 0.4561637739207833, "grad_norm": 0.9212179183959961, "kl": 0.11279284954071045, "learning_rate": 3.304977824230628e-06, "loss": 0.0045, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1025 }, { "completion_length": 152.5, "epoch": 0.45660881174899864, "grad_norm": 0.6558517217636108, "kl": 0.05286305397748947, "learning_rate": 3.301299455652969e-06, "loss": 0.0021, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1026 }, { "completion_length": 197.5, "epoch": 0.45705384957721407, "grad_norm": 0.7066366076469421, "kl": 0.04090268909931183, "learning_rate": 3.297619152738025e-06, "loss": 0.0016, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1027 }, { "completion_length": 185.1666717529297, "epoch": 0.4574988874054295, "grad_norm": 1.0826621055603027, "kl": 0.04789039492607117, "learning_rate": 3.293936924370048e-06, "loss": 0.0019, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1028 }, { "completion_length": 165.1666717529297, "epoch": 0.45794392523364486, "grad_norm": 0.7362165451049805, "kl": 0.05640339106321335, "learning_rate": 3.290252779437939e-06, "loss": 0.0023, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1029 }, { "completion_length": 190.6666717529297, "epoch": 0.4583889630618603, "grad_norm": 0.993109941482544, "kl": 0.06488848477602005, "learning_rate": 3.286566726835227e-06, "loss": 0.0026, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1030 }, { "completion_length": 163.5, "epoch": 0.45883400089007564, "grad_norm": 0.01776033826172352, "kl": 0.051738932728767395, "learning_rate": 3.282878775460044e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1031 }, { "completion_length": 192.33334350585938, "epoch": 0.45927903871829107, "grad_norm": 0.8004707098007202, "kl": 0.043662603944540024, "learning_rate": 3.2791889342151055e-06, "loss": 0.0017, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1032 }, { "completion_length": 158.6666717529297, "epoch": 0.45972407654650643, "grad_norm": 0.837901771068573, "kl": 0.09061974287033081, "learning_rate": 3.2754972120076918e-06, "loss": 0.0036, "reward": 0.2618333399295807, "reward_std": 0.2796500623226166, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2618333101272583, "step": 1033 }, { "completion_length": 200.0, "epoch": 0.46016911437472185, "grad_norm": 0.01697075366973877, "kl": 0.042512111365795135, "learning_rate": 3.2718036177496217e-06, "loss": 0.0017, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1034 }, { "completion_length": 154.83334350585938, "epoch": 0.4606141522029373, "grad_norm": 0.8117488622665405, "kl": 0.07002654671669006, "learning_rate": 3.268108160357233e-06, "loss": 0.0028, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1035 }, { "completion_length": 140.1666717529297, "epoch": 0.46105919003115264, "grad_norm": 0.0247640497982502, "kl": 0.05443256348371506, "learning_rate": 3.2644108487513614e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1036 }, { "completion_length": 197.83334350585938, "epoch": 0.46150422785936807, "grad_norm": 0.6665837168693542, "kl": 0.07370376586914062, "learning_rate": 3.2607116918573207e-06, "loss": 0.0029, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1037 }, { "completion_length": 189.1666717529297, "epoch": 0.46194926568758343, "grad_norm": 0.7120652794837952, "kl": 0.043880194425582886, "learning_rate": 3.2570106986048755e-06, "loss": 0.0018, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1038 }, { "completion_length": 172.33334350585938, "epoch": 0.46239430351579885, "grad_norm": 0.7578598260879517, "kl": 0.0529780313372612, "learning_rate": 3.253307877928227e-06, "loss": 0.0021, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1039 }, { "completion_length": 193.6666717529297, "epoch": 0.4628393413440142, "grad_norm": 0.7513720393180847, "kl": 0.07827489823102951, "learning_rate": 3.249603238765985e-06, "loss": 0.0031, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 1040 }, { "completion_length": 151.5, "epoch": 0.46328437917222964, "grad_norm": 0.02841741219162941, "kl": 0.05697069317102432, "learning_rate": 3.2458967900611504e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1041 }, { "completion_length": 177.1666717529297, "epoch": 0.46372941700044507, "grad_norm": 0.7044739127159119, "kl": 0.06733378767967224, "learning_rate": 3.242188540761092e-06, "loss": 0.0027, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1042 }, { "completion_length": 152.1666717529297, "epoch": 0.46417445482866043, "grad_norm": 0.9891431927680969, "kl": 0.06679210066795349, "learning_rate": 3.2384784998175274e-06, "loss": 0.0027, "reward": 0.30433332920074463, "reward_std": 0.1755467653274536, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.30433332920074463, "step": 1043 }, { "completion_length": 180.0, "epoch": 0.46461949265687585, "grad_norm": 0.7353585362434387, "kl": 0.05097658559679985, "learning_rate": 3.234766676186495e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1044 }, { "completion_length": 145.0, "epoch": 0.4650645304850912, "grad_norm": 0.023157954216003418, "kl": 0.05559340864419937, "learning_rate": 3.23105307882834e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1045 }, { "completion_length": 200.0, "epoch": 0.46550956831330664, "grad_norm": 0.6369867324829102, "kl": 0.06708483397960663, "learning_rate": 3.2273377167076876e-06, "loss": 0.0027, "reward": 0.1081666648387909, "reward_std": 0.3474193215370178, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1081666648387909, "step": 1046 }, { "completion_length": 186.0, "epoch": 0.465954606141522, "grad_norm": 0.8478872179985046, "kl": 0.04902426898479462, "learning_rate": 3.2236205987934237e-06, "loss": 0.002, "reward": 0.22949999570846558, "reward_std": 0.12337382137775421, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949999570846558, "step": 1047 }, { "completion_length": 169.83334350585938, "epoch": 0.46639964396973743, "grad_norm": 0.6993066072463989, "kl": 0.05535086244344711, "learning_rate": 3.219901734058675e-06, "loss": 0.0022, "reward": 0.31316667795181274, "reward_std": 0.0688314288854599, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1048 }, { "completion_length": 194.33334350585938, "epoch": 0.4668446817979528, "grad_norm": 0.8170168995857239, "kl": 0.03591171279549599, "learning_rate": 3.2161811314807794e-06, "loss": 0.0014, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 1049 }, { "completion_length": 127.16667175292969, "epoch": 0.4672897196261682, "grad_norm": 0.022290347144007683, "kl": 0.05225914716720581, "learning_rate": 3.212458800041276e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1050 }, { "completion_length": 195.83334350585938, "epoch": 0.46773475745438364, "grad_norm": 0.820202112197876, "kl": 0.04448423907160759, "learning_rate": 3.2087347487258735e-06, "loss": 0.0018, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 1051 }, { "completion_length": 141.33334350585938, "epoch": 0.468179795282599, "grad_norm": 0.767572283744812, "kl": 0.0574708953499794, "learning_rate": 3.2050089865244345e-06, "loss": 0.0023, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1052 }, { "completion_length": 141.6666717529297, "epoch": 0.46862483311081443, "grad_norm": 0.772443413734436, "kl": 0.05985133349895477, "learning_rate": 3.2012815224309495e-06, "loss": 0.0024, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1053 }, { "completion_length": 183.0, "epoch": 0.4690698709390298, "grad_norm": 0.7392573356628418, "kl": 0.043959807604551315, "learning_rate": 3.1975523654435204e-06, "loss": 0.0018, "reward": 0.27133333683013916, "reward_std": 0.12340773642063141, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 1054 }, { "completion_length": 130.83334350585938, "epoch": 0.4695149087672452, "grad_norm": 0.0171357449144125, "kl": 0.053742073476314545, "learning_rate": 3.1938215245643327e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1055 }, { "completion_length": 197.83334350585938, "epoch": 0.4699599465954606, "grad_norm": 0.751876175403595, "kl": 0.05212129279971123, "learning_rate": 3.190089008799638e-06, "loss": 0.0021, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1056 }, { "completion_length": 186.6666717529297, "epoch": 0.470404984423676, "grad_norm": 0.8305873870849609, "kl": 0.04781080409884453, "learning_rate": 3.1863548271597333e-06, "loss": 0.0019, "reward": 0.29216668009757996, "reward_std": 0.10255226492881775, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 1057 }, { "completion_length": 174.5, "epoch": 0.47085002225189143, "grad_norm": 0.661457896232605, "kl": 0.056177690625190735, "learning_rate": 3.182618988658932e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1058 }, { "completion_length": 185.6666717529297, "epoch": 0.4712950600801068, "grad_norm": 0.7157963514328003, "kl": 0.05063026398420334, "learning_rate": 3.178881502315552e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1059 }, { "completion_length": 96.83333587646484, "epoch": 0.4717400979083222, "grad_norm": 0.028320979326963425, "kl": 0.07067437469959259, "learning_rate": 3.1751423771518876e-06, "loss": 0.0031, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1060 }, { "completion_length": 108.0, "epoch": 0.4721851357365376, "grad_norm": 0.023467140272259712, "kl": 0.06319691985845566, "learning_rate": 3.171401622194187e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1061 }, { "completion_length": 159.5, "epoch": 0.472630173564753, "grad_norm": 0.6847676634788513, "kl": 0.06087712198495865, "learning_rate": 3.1676592464726354e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1062 }, { "completion_length": 115.16667175292969, "epoch": 0.4730752113929684, "grad_norm": 0.7515028715133667, "kl": 0.08603694289922714, "learning_rate": 3.1639152590213294e-06, "loss": 0.0034, "reward": 0.2993333339691162, "reward_std": 0.18779420852661133, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2993333041667938, "step": 1063 }, { "completion_length": 141.83334350585938, "epoch": 0.4735202492211838, "grad_norm": 0.02573162131011486, "kl": 0.05833118408918381, "learning_rate": 3.1601696688782575e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1064 }, { "completion_length": 153.1666717529297, "epoch": 0.4739652870493992, "grad_norm": 0.027337657287716866, "kl": 0.06034916639328003, "learning_rate": 3.1564224850852756e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1065 }, { "completion_length": 195.1666717529297, "epoch": 0.4744103248776146, "grad_norm": 0.6953572630882263, "kl": 0.04970608279109001, "learning_rate": 3.152673716688087e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1066 }, { "completion_length": 165.5, "epoch": 0.47485536270583, "grad_norm": 0.6301455497741699, "kl": 0.06556437909603119, "learning_rate": 3.148923372736221e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1067 }, { "completion_length": 151.1666717529297, "epoch": 0.4753004005340454, "grad_norm": 0.6602085828781128, "kl": 0.0660819336771965, "learning_rate": 3.14517146228301e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1068 }, { "completion_length": 157.1666717529297, "epoch": 0.4757454383622608, "grad_norm": 0.06263867020606995, "kl": 0.06300581991672516, "learning_rate": 3.141417994385566e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1069 }, { "completion_length": 167.33334350585938, "epoch": 0.47619047619047616, "grad_norm": 0.027167366817593575, "kl": 0.060957591980695724, "learning_rate": 3.1376629781047642e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1070 }, { "completion_length": 118.16667175292969, "epoch": 0.4766355140186916, "grad_norm": 0.03084111399948597, "kl": 0.07735933363437653, "learning_rate": 3.1339064225052153e-06, "loss": 0.0034, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1071 }, { "completion_length": 145.6666717529297, "epoch": 0.477080551846907, "grad_norm": 0.04745664820075035, "kl": 0.08085335791110992, "learning_rate": 3.1301483366552455e-06, "loss": 0.0035, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1072 }, { "completion_length": 180.1666717529297, "epoch": 0.4775255896751224, "grad_norm": 0.7254480123519897, "kl": 0.055918075144290924, "learning_rate": 3.1263887296268767e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1073 }, { "completion_length": 192.33334350585938, "epoch": 0.4779706275033378, "grad_norm": 0.6751211285591125, "kl": 0.052202414721250534, "learning_rate": 3.122627610495803e-06, "loss": 0.0021, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1074 }, { "completion_length": 136.83334350585938, "epoch": 0.47841566533155316, "grad_norm": 0.8842310309410095, "kl": 0.05041445419192314, "learning_rate": 3.1188649883413665e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1075 }, { "completion_length": 148.6666717529297, "epoch": 0.4788607031597686, "grad_norm": 0.02766844630241394, "kl": 0.07211090624332428, "learning_rate": 3.1151008722465398e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1076 }, { "completion_length": 200.0, "epoch": 0.47930574098798395, "grad_norm": 0.019513975828886032, "kl": 0.04784202575683594, "learning_rate": 3.1113352712979e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1077 }, { "completion_length": 113.83333587646484, "epoch": 0.4797507788161994, "grad_norm": 0.9397275447845459, "kl": 0.06918388605117798, "learning_rate": 3.10756819458561e-06, "loss": 0.0028, "reward": 0.2759999930858612, "reward_std": 0.24494898319244385, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2759999930858612, "step": 1078 }, { "completion_length": 153.1666717529297, "epoch": 0.4801958166444148, "grad_norm": 0.023615337908267975, "kl": 0.05814233422279358, "learning_rate": 3.1037996512033963e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1079 }, { "completion_length": 200.0, "epoch": 0.48064085447263016, "grad_norm": 0.7462202310562134, "kl": 0.04940864071249962, "learning_rate": 3.1000296502485226e-06, "loss": 0.002, "reward": 0.18850001692771912, "reward_std": 0.15064363181591034, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18850001692771912, "step": 1080 }, { "completion_length": 153.83334350585938, "epoch": 0.4810858923008456, "grad_norm": 0.7381025552749634, "kl": 0.05779881775379181, "learning_rate": 3.096258200821774e-06, "loss": 0.0023, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1081 }, { "completion_length": 176.1666717529297, "epoch": 0.48153093012906095, "grad_norm": 0.7282528877258301, "kl": 0.05618796870112419, "learning_rate": 3.0924853120274313e-06, "loss": 0.0022, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 1082 }, { "completion_length": 159.5, "epoch": 0.4819759679572764, "grad_norm": 0.6911483407020569, "kl": 0.06526792049407959, "learning_rate": 3.0887109929732496e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1083 }, { "completion_length": 165.33334350585938, "epoch": 0.48242100578549174, "grad_norm": 0.8593658208847046, "kl": 0.05354367941617966, "learning_rate": 3.084935252770437e-06, "loss": 0.0021, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 1084 }, { "completion_length": 133.83334350585938, "epoch": 0.48286604361370716, "grad_norm": 0.7801674604415894, "kl": 0.0553756058216095, "learning_rate": 3.081158100533633e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1085 }, { "completion_length": 137.6666717529297, "epoch": 0.4833110814419226, "grad_norm": 0.020206518471240997, "kl": 0.061163198202848434, "learning_rate": 3.0773795453808842e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1086 }, { "completion_length": 177.5, "epoch": 0.48375611927013795, "grad_norm": 0.655252993106842, "kl": 0.05324871465563774, "learning_rate": 3.073599596433624e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1087 }, { "completion_length": 145.33334350585938, "epoch": 0.4842011570983534, "grad_norm": 0.02423451840877533, "kl": 0.0650758221745491, "learning_rate": 3.069818262816653e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1088 }, { "completion_length": 192.6666717529297, "epoch": 0.48464619492656874, "grad_norm": 0.6816695928573608, "kl": 0.05046912655234337, "learning_rate": 3.0660355536581104e-06, "loss": 0.002, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1089 }, { "completion_length": 162.1666717529297, "epoch": 0.48509123275478416, "grad_norm": 0.033472057431936264, "kl": 0.05764755234122276, "learning_rate": 3.0622514780894592e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1090 }, { "completion_length": 121.33333587646484, "epoch": 0.48553627058299953, "grad_norm": 0.028588904067873955, "kl": 0.0573783814907074, "learning_rate": 3.0584660452454596e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1091 }, { "completion_length": 116.33333587646484, "epoch": 0.48598130841121495, "grad_norm": 0.9297807812690735, "kl": 0.061338119208812714, "learning_rate": 3.054679264264148e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1092 }, { "completion_length": 157.5, "epoch": 0.4864263462394304, "grad_norm": 0.01856524497270584, "kl": 0.05327138304710388, "learning_rate": 3.0508911442868155e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1093 }, { "completion_length": 181.1666717529297, "epoch": 0.48687138406764574, "grad_norm": 0.8838992714881897, "kl": 0.06359528750181198, "learning_rate": 3.047101694457987e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1094 }, { "completion_length": 192.33334350585938, "epoch": 0.48731642189586116, "grad_norm": 0.7542251348495483, "kl": 0.0404694564640522, "learning_rate": 3.0433109239253937e-06, "loss": 0.0016, "reward": 0.20866666734218597, "reward_std": 0.12961582839488983, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20866666734218597, "step": 1095 }, { "completion_length": 168.33334350585938, "epoch": 0.48776145972407653, "grad_norm": 0.8963215947151184, "kl": 0.05817902833223343, "learning_rate": 3.0395188418399597e-06, "loss": 0.0023, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1096 }, { "completion_length": 139.5, "epoch": 0.48820649755229195, "grad_norm": 0.02465493232011795, "kl": 0.06517083942890167, "learning_rate": 3.035725457355773e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1097 }, { "completion_length": 164.1666717529297, "epoch": 0.4886515353805073, "grad_norm": 0.7453930974006653, "kl": 0.09313317388296127, "learning_rate": 3.0319307796300634e-06, "loss": 0.0037, "reward": 0.26016664505004883, "reward_std": 0.2837325930595398, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26016664505004883, "step": 1098 }, { "completion_length": 194.1666717529297, "epoch": 0.48909657320872274, "grad_norm": 0.822679340839386, "kl": 0.04898854345083237, "learning_rate": 3.028134817823187e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1099 }, { "completion_length": 116.5, "epoch": 0.48954161103693816, "grad_norm": 0.04699001461267471, "kl": 0.08001024276018143, "learning_rate": 3.024337581098597e-06, "loss": 0.0035, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1100 }, { "completion_length": 108.5, "epoch": 0.48998664886515353, "grad_norm": 0.9832037687301636, "kl": 0.05540832504630089, "learning_rate": 3.0205390786228244e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1101 }, { "completion_length": 157.1666717529297, "epoch": 0.49043168669336895, "grad_norm": 0.7491467595100403, "kl": 0.04887532815337181, "learning_rate": 3.016739319565457e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1102 }, { "completion_length": 194.33334350585938, "epoch": 0.4908767245215843, "grad_norm": 0.6997105479240417, "kl": 0.04166887700557709, "learning_rate": 3.0129383130991142e-06, "loss": 0.0017, "reward": 0.25033333897590637, "reward_std": 0.1122509092092514, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25033333897590637, "step": 1103 }, { "completion_length": 165.33334350585938, "epoch": 0.49132176234979974, "grad_norm": 0.8059443235397339, "kl": 0.12227432429790497, "learning_rate": 3.009136068399427e-06, "loss": 0.0049, "reward": 0.25866666436195374, "reward_std": 0.2874068021774292, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25866663455963135, "step": 1104 }, { "completion_length": 194.33334350585938, "epoch": 0.4917668001780151, "grad_norm": 0.9285522103309631, "kl": 0.05058509111404419, "learning_rate": 3.005332594645018e-06, "loss": 0.002, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1105 }, { "completion_length": 189.1666717529297, "epoch": 0.49221183800623053, "grad_norm": 0.7666235566139221, "kl": 0.07048574090003967, "learning_rate": 3.0015279010174725e-06, "loss": 0.0028, "reward": 0.28333333134651184, "reward_std": 0.17277345061302185, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28333330154418945, "step": 1106 }, { "completion_length": 145.1666717529297, "epoch": 0.49265687583444595, "grad_norm": 0.020632104948163033, "kl": 0.04825424402952194, "learning_rate": 2.9977219967013237e-06, "loss": 0.0022, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1107 }, { "completion_length": 133.5, "epoch": 0.4931019136626613, "grad_norm": 0.04588095471262932, "kl": 0.07726660370826721, "learning_rate": 2.993914890884027e-06, "loss": 0.0034, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1108 }, { "completion_length": 160.33334350585938, "epoch": 0.49354695149087674, "grad_norm": 0.7883959412574768, "kl": 0.0675104409456253, "learning_rate": 2.990106592755937e-06, "loss": 0.0027, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1109 }, { "completion_length": 168.33334350585938, "epoch": 0.4939919893190921, "grad_norm": 0.7340433597564697, "kl": 0.04423590004444122, "learning_rate": 2.9862971115102877e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1110 }, { "completion_length": 146.1666717529297, "epoch": 0.49443702714730753, "grad_norm": 0.04487235099077225, "kl": 0.07285156100988388, "learning_rate": 2.9824864563431684e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1111 }, { "completion_length": 173.0, "epoch": 0.4948820649755229, "grad_norm": 0.7536811232566833, "kl": 0.05195162817835808, "learning_rate": 2.978674636453503e-06, "loss": 0.0021, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1112 }, { "completion_length": 178.33334350585938, "epoch": 0.4953271028037383, "grad_norm": 0.7448456287384033, "kl": 0.054630979895591736, "learning_rate": 2.9748616610430266e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1113 }, { "completion_length": 174.0, "epoch": 0.49577214063195374, "grad_norm": 0.7480930089950562, "kl": 0.07871096581220627, "learning_rate": 2.971047539316263e-06, "loss": 0.0031, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1114 }, { "completion_length": 168.6666717529297, "epoch": 0.4962171784601691, "grad_norm": 0.030191617086529732, "kl": 0.061308782547712326, "learning_rate": 2.9672322804805048e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1115 }, { "completion_length": 200.0, "epoch": 0.49666221628838453, "grad_norm": 0.031045343726873398, "kl": 0.05266506224870682, "learning_rate": 2.9634158937457886e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1116 }, { "completion_length": 141.5, "epoch": 0.4971072541165999, "grad_norm": 0.029410747811198235, "kl": 0.05735943466424942, "learning_rate": 2.9595983883248736e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1117 }, { "completion_length": 139.1666717529297, "epoch": 0.4975522919448153, "grad_norm": 0.7413390874862671, "kl": 0.06353849172592163, "learning_rate": 2.9557797734332196e-06, "loss": 0.0025, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1118 }, { "completion_length": 115.5, "epoch": 0.4979973297730307, "grad_norm": 0.023313088342547417, "kl": 0.0640382319688797, "learning_rate": 2.9519600582889657e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1119 }, { "completion_length": 113.83333587646484, "epoch": 0.4984423676012461, "grad_norm": 0.023131540045142174, "kl": 0.05825777351856232, "learning_rate": 2.9481392521129047e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1120 }, { "completion_length": 183.0, "epoch": 0.49888740542946153, "grad_norm": 0.733018159866333, "kl": 0.0460851714015007, "learning_rate": 2.9443173641284663e-06, "loss": 0.0018, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1121 }, { "completion_length": 186.0, "epoch": 0.4993324432576769, "grad_norm": 0.7658995985984802, "kl": 0.039262425154447556, "learning_rate": 2.9404944035616893e-06, "loss": 0.0016, "reward": 0.27133333683013916, "reward_std": 0.12340772151947021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 1122 }, { "completion_length": 95.33333587646484, "epoch": 0.4997774810858923, "grad_norm": 0.9063270092010498, "kl": 0.05967504531145096, "learning_rate": 2.9366703796412022e-06, "loss": 0.0024, "reward": 0.39666664600372314, "reward_std": 0.05062279850244522, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 1123 }, { "completion_length": 190.6666717529297, "epoch": 0.5002225189141077, "grad_norm": 0.8011205792427063, "kl": 0.07317472994327545, "learning_rate": 2.9328453015982005e-06, "loss": 0.0029, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1124 }, { "completion_length": 193.1666717529297, "epoch": 0.5006675567423231, "grad_norm": 0.7241079211235046, "kl": 0.2021249532699585, "learning_rate": 2.9290191786664253e-06, "loss": 0.0081, "reward": 0.10083332657814026, "reward_std": 0.3284243941307068, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10083332657814026, "step": 1125 }, { "completion_length": 190.33334350585938, "epoch": 0.5011125945705385, "grad_norm": 0.8474998474121094, "kl": 0.04898199066519737, "learning_rate": 2.9251920200821383e-06, "loss": 0.002, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 1126 }, { "completion_length": 183.6666717529297, "epoch": 0.5015576323987538, "grad_norm": 0.686153769493103, "kl": 0.04468465596437454, "learning_rate": 2.9213638350841026e-06, "loss": 0.0018, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1127 }, { "completion_length": 199.0, "epoch": 0.5020026702269693, "grad_norm": 0.7068986296653748, "kl": 0.04341750591993332, "learning_rate": 2.9175346329135596e-06, "loss": 0.0017, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1128 }, { "completion_length": 155.0, "epoch": 0.5024477080551847, "grad_norm": 0.03027581423521042, "kl": 0.05036742985248566, "learning_rate": 2.9137044228142035e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1129 }, { "completion_length": 144.1666717529297, "epoch": 0.5028927458834, "grad_norm": 0.019624266773462296, "kl": 0.05049632117152214, "learning_rate": 2.909873214032165e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1130 }, { "completion_length": 181.6666717529297, "epoch": 0.5033377837116155, "grad_norm": 0.7808805108070374, "kl": 0.11239727586507797, "learning_rate": 2.906041015815983e-06, "loss": 0.0045, "reward": 0.10783332586288452, "reward_std": 0.32281166315078735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10783332586288452, "step": 1131 }, { "completion_length": 164.6666717529297, "epoch": 0.5037828215398309, "grad_norm": 0.7809410691261292, "kl": 0.05724121630191803, "learning_rate": 2.9022078374165863e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1132 }, { "completion_length": 171.6666717529297, "epoch": 0.5042278593680463, "grad_norm": 0.7676877379417419, "kl": 0.07378482818603516, "learning_rate": 2.8983736880872697e-06, "loss": 0.003, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1133 }, { "completion_length": 189.33334350585938, "epoch": 0.5046728971962616, "grad_norm": 0.7471271753311157, "kl": 0.04732378572225571, "learning_rate": 2.894538577083671e-06, "loss": 0.0019, "reward": 0.29216665029525757, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216665029525757, "step": 1134 }, { "completion_length": 138.1666717529297, "epoch": 0.5051179350244771, "grad_norm": 0.7307435274124146, "kl": 0.057880111038684845, "learning_rate": 2.8907025136637505e-06, "loss": 0.0023, "reward": 0.3084999918937683, "reward_std": 0.16534054279327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3084999918937683, "step": 1135 }, { "completion_length": 171.1666717529297, "epoch": 0.5055629728526925, "grad_norm": 0.6487751603126526, "kl": 0.04655180498957634, "learning_rate": 2.8868655070877676e-06, "loss": 0.0019, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1136 }, { "completion_length": 183.0, "epoch": 0.5060080106809078, "grad_norm": 0.9664366841316223, "kl": 0.05927140638232231, "learning_rate": 2.8830275666182566e-06, "loss": 0.0024, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1137 }, { "completion_length": 200.0, "epoch": 0.5064530485091233, "grad_norm": 0.03512910008430481, "kl": 0.05313728377223015, "learning_rate": 2.879188701520009e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1138 }, { "completion_length": 154.1666717529297, "epoch": 0.5068980863373387, "grad_norm": 0.7382206320762634, "kl": 0.051763903349637985, "learning_rate": 2.875348921060047e-06, "loss": 0.0021, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1139 }, { "completion_length": 135.1666717529297, "epoch": 0.507343124165554, "grad_norm": 0.7762537598609924, "kl": 0.07914315164089203, "learning_rate": 2.8715082345076022e-06, "loss": 0.0032, "reward": 0.2789999842643738, "reward_std": 0.23760050535202026, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2789999842643738, "step": 1140 }, { "completion_length": 119.5, "epoch": 0.5077881619937694, "grad_norm": 0.028253188356757164, "kl": 0.0685911476612091, "learning_rate": 2.8676666511340946e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1141 }, { "completion_length": 180.0, "epoch": 0.5082331998219849, "grad_norm": 0.915734052658081, "kl": 0.05604727193713188, "learning_rate": 2.8638241802131068e-06, "loss": 0.0022, "reward": 0.16566666960716248, "reward_std": 0.2729935050010681, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16566666960716248, "step": 1142 }, { "completion_length": 181.83334350585938, "epoch": 0.5086782376502003, "grad_norm": 0.8028380870819092, "kl": 0.05204429477453232, "learning_rate": 2.8599808310203662e-06, "loss": 0.0021, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1143 }, { "completion_length": 106.66667175292969, "epoch": 0.5091232754784156, "grad_norm": 0.026731373742222786, "kl": 0.0580553337931633, "learning_rate": 2.8561366128337213e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1144 }, { "completion_length": 166.83334350585938, "epoch": 0.5095683133066311, "grad_norm": 0.6618978381156921, "kl": 0.04359781742095947, "learning_rate": 2.852291534933114e-06, "loss": 0.0017, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1145 }, { "completion_length": 171.83334350585938, "epoch": 0.5100133511348465, "grad_norm": 0.7761538028717041, "kl": 0.05730967968702316, "learning_rate": 2.848445606600567e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1146 }, { "completion_length": 171.33334350585938, "epoch": 0.5104583889630618, "grad_norm": 8.721156120300293, "kl": 2.40242338180542, "learning_rate": 2.844598837120151e-06, "loss": 0.0961, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1147 }, { "completion_length": 200.0, "epoch": 0.5109034267912772, "grad_norm": 0.7378061413764954, "kl": 0.046708278357982635, "learning_rate": 2.8407512357779703e-06, "loss": 0.0019, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 1148 }, { "completion_length": 162.5, "epoch": 0.5113484646194927, "grad_norm": 0.7555989623069763, "kl": 0.049754880368709564, "learning_rate": 2.836902811862136e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1149 }, { "completion_length": 133.33334350585938, "epoch": 0.511793502447708, "grad_norm": 0.022492585703730583, "kl": 0.06045402213931084, "learning_rate": 2.833053574662747e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1150 }, { "completion_length": 193.0, "epoch": 0.5122385402759234, "grad_norm": 0.8129910230636597, "kl": 0.03598358854651451, "learning_rate": 2.8292035334718617e-06, "loss": 0.0014, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 1151 }, { "completion_length": 128.1666717529297, "epoch": 0.5126835781041389, "grad_norm": 0.8406050801277161, "kl": 0.07259870320558548, "learning_rate": 2.8253526975834824e-06, "loss": 0.0029, "reward": 0.33416664600372314, "reward_std": 0.064808689057827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1152 }, { "completion_length": 182.1666717529297, "epoch": 0.5131286159323543, "grad_norm": 0.7992534041404724, "kl": 0.04329013079404831, "learning_rate": 2.821501076293529e-06, "loss": 0.0017, "reward": 0.27133333683013916, "reward_std": 0.12340772151947021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 1153 }, { "completion_length": 157.5, "epoch": 0.5135736537605696, "grad_norm": 0.8890674710273743, "kl": 0.07445360720157623, "learning_rate": 2.8176486788998168e-06, "loss": 0.003, "reward": 0.31316667795181274, "reward_std": 0.0688314288854599, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1154 }, { "completion_length": 185.0, "epoch": 0.514018691588785, "grad_norm": 0.7826995849609375, "kl": 0.05238564312458038, "learning_rate": 2.813795514702036e-06, "loss": 0.0021, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1155 }, { "completion_length": 194.1666717529297, "epoch": 0.5144637294170005, "grad_norm": 0.6543252468109131, "kl": 0.04138343781232834, "learning_rate": 2.8099415930017254e-06, "loss": 0.0017, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1156 }, { "completion_length": 150.0, "epoch": 0.5149087672452158, "grad_norm": 0.022047659382224083, "kl": 0.054192956537008286, "learning_rate": 2.806086923102255e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1157 }, { "completion_length": 195.6666717529297, "epoch": 0.5153538050734312, "grad_norm": 0.7176312804222107, "kl": 0.036622267216444016, "learning_rate": 2.802231514308799e-06, "loss": 0.0015, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1158 }, { "completion_length": 133.0, "epoch": 0.5157988429016467, "grad_norm": 0.024259619414806366, "kl": 0.0638909861445427, "learning_rate": 2.798375375928318e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1159 }, { "completion_length": 162.5, "epoch": 0.516243880729862, "grad_norm": 0.6893110275268555, "kl": 0.0569378100335598, "learning_rate": 2.7945185172695295e-06, "loss": 0.0023, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1160 }, { "completion_length": 158.6666717529297, "epoch": 0.5166889185580774, "grad_norm": 0.028980251401662827, "kl": 0.06851427257061005, "learning_rate": 2.7906609476428935e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1161 }, { "completion_length": 131.33334350585938, "epoch": 0.5171339563862928, "grad_norm": 1.0463398694992065, "kl": 0.11661326885223389, "learning_rate": 2.7868026763605854e-06, "loss": 0.0047, "reward": 0.32233333587646484, "reward_std": 0.1314559429883957, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.32233333587646484, "step": 1162 }, { "completion_length": 166.1666717529297, "epoch": 0.5175789942145083, "grad_norm": 0.6688762307167053, "kl": 0.05393125116825104, "learning_rate": 2.782943712736473e-06, "loss": 0.0022, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1163 }, { "completion_length": 125.66667175292969, "epoch": 0.5180240320427236, "grad_norm": 0.7415357232093811, "kl": 0.06435568630695343, "learning_rate": 2.7790840660860973e-06, "loss": 0.0026, "reward": 0.29216665029525757, "reward_std": 0.2053488940000534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216665029525757, "step": 1164 }, { "completion_length": 153.6666717529297, "epoch": 0.518469069870939, "grad_norm": 0.7977253198623657, "kl": 0.05409222096204758, "learning_rate": 2.775223745726646e-06, "loss": 0.0022, "reward": 0.33416664600372314, "reward_std": 0.064808689057827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1165 }, { "completion_length": 137.33334350585938, "epoch": 0.5189141076991545, "grad_norm": 0.8917381167411804, "kl": 0.05699780583381653, "learning_rate": 2.7713627609769363e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1166 }, { "completion_length": 127.83333587646484, "epoch": 0.5193591455273698, "grad_norm": 0.026533007621765137, "kl": 0.06946979463100433, "learning_rate": 2.767501121157386e-06, "loss": 0.0031, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1167 }, { "completion_length": 151.6666717529297, "epoch": 0.5198041833555852, "grad_norm": 0.02679111249744892, "kl": 0.058871589601039886, "learning_rate": 2.763638835589995e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1168 }, { "completion_length": 196.83334350585938, "epoch": 0.5202492211838006, "grad_norm": 0.6838176846504211, "kl": 0.0421440526843071, "learning_rate": 2.7597759135983237e-06, "loss": 0.0017, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1169 }, { "completion_length": 152.6666717529297, "epoch": 0.520694259012016, "grad_norm": 0.6762892007827759, "kl": 0.0611237995326519, "learning_rate": 2.755912364507468e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1170 }, { "completion_length": 164.5, "epoch": 0.5211392968402314, "grad_norm": 0.025890666991472244, "kl": 0.056531667709350586, "learning_rate": 2.752048197644036e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1171 }, { "completion_length": 154.83334350585938, "epoch": 0.5215843346684468, "grad_norm": 0.6817211508750916, "kl": 0.0709918960928917, "learning_rate": 2.7481834223361294e-06, "loss": 0.0028, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1172 }, { "completion_length": 153.33334350585938, "epoch": 0.5220293724966623, "grad_norm": 0.8292616605758667, "kl": 0.0703635960817337, "learning_rate": 2.744318047913318e-06, "loss": 0.0028, "reward": 0.21949999034404755, "reward_std": 0.22260615229606628, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21949999034404755, "step": 1173 }, { "completion_length": 143.0, "epoch": 0.5224744103248776, "grad_norm": 0.01640567183494568, "kl": 0.054961517453193665, "learning_rate": 2.7404520837066163e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1174 }, { "completion_length": 191.1666717529297, "epoch": 0.522919448153093, "grad_norm": 0.7560374140739441, "kl": 0.04697653651237488, "learning_rate": 2.7365855390484646e-06, "loss": 0.0019, "reward": 0.27116668224334717, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 1175 }, { "completion_length": 170.83334350585938, "epoch": 0.5233644859813084, "grad_norm": 0.651421070098877, "kl": 0.1001739650964737, "learning_rate": 2.7327184232727037e-06, "loss": 0.004, "reward": 0.2094999998807907, "reward_std": 0.34976324439048767, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2094999998807907, "step": 1176 }, { "completion_length": 177.5, "epoch": 0.5238095238095238, "grad_norm": 0.6764176487922668, "kl": 0.059600621461868286, "learning_rate": 2.728850745714553e-06, "loss": 0.0024, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1177 }, { "completion_length": 147.33334350585938, "epoch": 0.5242545616377392, "grad_norm": 0.7848303318023682, "kl": 0.05990871787071228, "learning_rate": 2.724982515710588e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1178 }, { "completion_length": 164.6666717529297, "epoch": 0.5246995994659546, "grad_norm": 0.6674816012382507, "kl": 0.054695215076208115, "learning_rate": 2.7211137425987178e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1179 }, { "completion_length": 179.83334350585938, "epoch": 0.52514463729417, "grad_norm": 0.6498001217842102, "kl": 0.05996771901845932, "learning_rate": 2.7172444357181628e-06, "loss": 0.0024, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1180 }, { "completion_length": 186.5, "epoch": 0.5255896751223854, "grad_norm": 0.7126318216323853, "kl": 0.06885448098182678, "learning_rate": 2.7133746044094315e-06, "loss": 0.0028, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1181 }, { "completion_length": 165.6666717529297, "epoch": 0.5260347129506008, "grad_norm": 0.8802997469902039, "kl": 0.066841721534729, "learning_rate": 2.7095042580142984e-06, "loss": 0.0027, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1182 }, { "completion_length": 179.1666717529297, "epoch": 0.5264797507788161, "grad_norm": 0.7791231274604797, "kl": 0.04690280556678772, "learning_rate": 2.705633405875782e-06, "loss": 0.0019, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1183 }, { "completion_length": 183.1666717529297, "epoch": 0.5269247886070316, "grad_norm": 0.9081090092658997, "kl": 0.07322704046964645, "learning_rate": 2.701762057338122e-06, "loss": 0.0029, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1184 }, { "completion_length": 116.33333587646484, "epoch": 0.527369826435247, "grad_norm": 0.03892948850989342, "kl": 0.07545986026525497, "learning_rate": 2.697890221746754e-06, "loss": 0.0033, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1185 }, { "completion_length": 150.33334350585938, "epoch": 0.5278148642634624, "grad_norm": 0.03163018450140953, "kl": 0.08749519288539886, "learning_rate": 2.6940179084482927e-06, "loss": 0.0038, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1186 }, { "completion_length": 143.33334350585938, "epoch": 0.5282599020916778, "grad_norm": 0.02014567144215107, "kl": 0.05517182871699333, "learning_rate": 2.690145126790503e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1187 }, { "completion_length": 116.16667175292969, "epoch": 0.5287049399198932, "grad_norm": 0.026103656738996506, "kl": 0.06503650546073914, "learning_rate": 2.686271886122283e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1188 }, { "completion_length": 85.5, "epoch": 0.5291499777481086, "grad_norm": 1.503110647201538, "kl": 0.07808747887611389, "learning_rate": 2.6823981957936363e-06, "loss": 0.0031, "reward": 0.39666664600372314, "reward_std": 0.05062279850244522, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 1189 }, { "completion_length": 183.1666717529297, "epoch": 0.5295950155763239, "grad_norm": 0.7100231647491455, "kl": 0.059634458273649216, "learning_rate": 2.678524065155655e-06, "loss": 0.0024, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1190 }, { "completion_length": 120.0, "epoch": 0.5300400534045394, "grad_norm": 0.03600943461060524, "kl": 0.0854596421122551, "learning_rate": 2.67464950356049e-06, "loss": 0.0037, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1191 }, { "completion_length": 192.1666717529297, "epoch": 0.5304850912327548, "grad_norm": 0.761343777179718, "kl": 0.15199977159500122, "learning_rate": 2.670774520361337e-06, "loss": 0.0061, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1192 }, { "completion_length": 150.0, "epoch": 0.5309301290609701, "grad_norm": 0.7439923286437988, "kl": 0.08151707053184509, "learning_rate": 2.666899124912407e-06, "loss": 0.0033, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1193 }, { "completion_length": 184.6666717529297, "epoch": 0.5313751668891856, "grad_norm": 0.7151275277137756, "kl": 0.08384295552968979, "learning_rate": 2.6630233265689053e-06, "loss": 0.0034, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1194 }, { "completion_length": 157.0, "epoch": 0.531820204717401, "grad_norm": 0.8445621132850647, "kl": 0.07981076091527939, "learning_rate": 2.659147134687013e-06, "loss": 0.0032, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1195 }, { "completion_length": 171.1666717529297, "epoch": 0.5322652425456164, "grad_norm": 0.9164091348648071, "kl": 0.05100735276937485, "learning_rate": 2.6552705586238575e-06, "loss": 0.002, "reward": 0.33416664600372314, "reward_std": 0.064808689057827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1196 }, { "completion_length": 200.0, "epoch": 0.5327102803738317, "grad_norm": 0.7608870267868042, "kl": 0.06349802017211914, "learning_rate": 2.651393607737496e-06, "loss": 0.0025, "reward": 0.12850001454353333, "reward_std": 0.2976129949092865, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12850001454353333, "step": 1197 }, { "completion_length": 182.6666717529297, "epoch": 0.5331553182020472, "grad_norm": 0.8513096570968628, "kl": 0.06429284065961838, "learning_rate": 2.6475162913868903e-06, "loss": 0.0026, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1198 }, { "completion_length": 169.6666717529297, "epoch": 0.5336003560302626, "grad_norm": 0.7032941579818726, "kl": 0.05037330836057663, "learning_rate": 2.643638618931883e-06, "loss": 0.002, "reward": 0.2561666667461395, "reward_std": 0.29353052377700806, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2561666667461395, "step": 1199 }, { "completion_length": 184.33334350585938, "epoch": 0.5340453938584779, "grad_norm": 1.026814341545105, "kl": 0.046553052961826324, "learning_rate": 2.639760599733178e-06, "loss": 0.0019, "reward": 0.15850000083446503, "reward_std": 0.10841356217861176, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15850000083446503, "step": 1200 }, { "completion_length": 150.83334350585938, "epoch": 0.5344904316866934, "grad_norm": 0.8465564846992493, "kl": 0.08807514607906342, "learning_rate": 2.635882243152316e-06, "loss": 0.0035, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1201 }, { "completion_length": 200.0, "epoch": 0.5349354695149088, "grad_norm": 0.026606876403093338, "kl": 0.047713205218315125, "learning_rate": 2.6320035585516513e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1202 }, { "completion_length": 164.0, "epoch": 0.5353805073431241, "grad_norm": 0.8893852233886719, "kl": 0.04501502960920334, "learning_rate": 2.6281245552943297e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1203 }, { "completion_length": 179.1666717529297, "epoch": 0.5358255451713395, "grad_norm": 0.9132094979286194, "kl": 0.0830874890089035, "learning_rate": 2.624245242744268e-06, "loss": 0.0033, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1204 }, { "completion_length": 162.1666717529297, "epoch": 0.536270582999555, "grad_norm": 0.7258564829826355, "kl": 0.050873756408691406, "learning_rate": 2.6203656302661284e-06, "loss": 0.002, "reward": 0.3341667056083679, "reward_std": 0.0648086816072464, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1205 }, { "completion_length": 123.5, "epoch": 0.5367156208277704, "grad_norm": 0.9385852217674255, "kl": 0.06338381767272949, "learning_rate": 2.6164857272252975e-06, "loss": 0.0025, "reward": 0.39666664600372314, "reward_std": 0.05062279850244522, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 1206 }, { "completion_length": 176.1666717529297, "epoch": 0.5371606586559857, "grad_norm": 0.7586855888366699, "kl": 0.05588201805949211, "learning_rate": 2.6126055429878634e-06, "loss": 0.0022, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1207 }, { "completion_length": 139.1666717529297, "epoch": 0.5376056964842012, "grad_norm": 0.026297971606254578, "kl": 0.06350646913051605, "learning_rate": 2.608725086920591e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1208 }, { "completion_length": 127.66667175292969, "epoch": 0.5380507343124166, "grad_norm": 1.102054238319397, "kl": 0.16556766629219055, "learning_rate": 2.6048443683909053e-06, "loss": 0.0066, "reward": 0.3264999985694885, "reward_std": 0.1212497353553772, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3264999985694885, "step": 1209 }, { "completion_length": 152.6666717529297, "epoch": 0.5384957721406319, "grad_norm": 0.7634212374687195, "kl": 0.09940703958272934, "learning_rate": 2.6009633967668625e-06, "loss": 0.004, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1210 }, { "completion_length": 121.66667175292969, "epoch": 0.5389408099688473, "grad_norm": 0.07904371619224548, "kl": 0.09018834680318832, "learning_rate": 2.5970821814171287e-06, "loss": 0.0039, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1211 }, { "completion_length": 150.5, "epoch": 0.5393858477970628, "grad_norm": 0.06901979446411133, "kl": 0.09860502183437347, "learning_rate": 2.5932007317109607e-06, "loss": 0.0042, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1212 }, { "completion_length": 162.6666717529297, "epoch": 0.5398308856252781, "grad_norm": 0.030686013400554657, "kl": 0.07230792939662933, "learning_rate": 2.58931905701818e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1213 }, { "completion_length": 194.5, "epoch": 0.5402759234534935, "grad_norm": 0.6610811352729797, "kl": 0.04911746829748154, "learning_rate": 2.585437166709151e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1214 }, { "completion_length": 137.5, "epoch": 0.540720961281709, "grad_norm": 0.04222337529063225, "kl": 0.08580730855464935, "learning_rate": 2.581555070154759e-06, "loss": 0.0037, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1215 }, { "completion_length": 161.1666717529297, "epoch": 0.5411659991099244, "grad_norm": 0.8334679007530212, "kl": 0.05739726126194, "learning_rate": 2.5776727767263878e-06, "loss": 0.0023, "reward": 0.29233333468437195, "reward_std": 0.12961584329605103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 1216 }, { "completion_length": 134.33334350585938, "epoch": 0.5416110369381397, "grad_norm": 0.7179288864135742, "kl": 0.1268427073955536, "learning_rate": 2.5737902957958928e-06, "loss": 0.0051, "reward": 0.29783332347869873, "reward_std": 0.19146844744682312, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29783332347869873, "step": 1217 }, { "completion_length": 148.33334350585938, "epoch": 0.5420560747663551, "grad_norm": 0.031933125108480453, "kl": 0.06602717936038971, "learning_rate": 2.5699076367355883e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1218 }, { "completion_length": 154.1666717529297, "epoch": 0.5425011125945706, "grad_norm": 0.8263406157493591, "kl": 0.0948314294219017, "learning_rate": 2.5660248089182136e-06, "loss": 0.0038, "reward": 0.3316666781902313, "reward_std": 0.0688670203089714, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3316666781902313, "step": 1219 }, { "completion_length": 161.1666717529297, "epoch": 0.5429461504227859, "grad_norm": 0.6751954555511475, "kl": 0.07369253039360046, "learning_rate": 2.5621418217169177e-06, "loss": 0.0029, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1220 }, { "completion_length": 138.0, "epoch": 0.5433911882510013, "grad_norm": 0.022621888667345047, "kl": 0.06267684698104858, "learning_rate": 2.5582586845052333e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1221 }, { "completion_length": 168.0, "epoch": 0.5438362260792168, "grad_norm": 0.8740278482437134, "kl": 0.05454988032579422, "learning_rate": 2.554375406657054e-06, "loss": 0.0022, "reward": 0.29250001907348633, "reward_std": 0.12935802340507507, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29250001907348633, "step": 1222 }, { "completion_length": 197.33334350585938, "epoch": 0.5442812639074321, "grad_norm": 0.6384437084197998, "kl": 0.06101636588573456, "learning_rate": 2.550491997546617e-06, "loss": 0.0024, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1223 }, { "completion_length": 126.0, "epoch": 0.5447263017356475, "grad_norm": 0.020438499748706818, "kl": 0.06194034963846207, "learning_rate": 2.5466084665484732e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1224 }, { "completion_length": 179.0, "epoch": 0.5451713395638629, "grad_norm": 0.6573781967163086, "kl": 0.0651036947965622, "learning_rate": 2.542724823037467e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1225 }, { "completion_length": 144.83334350585938, "epoch": 0.5456163773920784, "grad_norm": 0.04750434681773186, "kl": 0.08231821656227112, "learning_rate": 2.538841076388717e-06, "loss": 0.0036, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1226 }, { "completion_length": 133.0, "epoch": 0.5460614152202937, "grad_norm": 0.803379237651825, "kl": 0.056590914726257324, "learning_rate": 2.5349572359775894e-06, "loss": 0.0023, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1227 }, { "completion_length": 195.5, "epoch": 0.5465064530485091, "grad_norm": 1.080286979675293, "kl": 0.0482148602604866, "learning_rate": 2.5310733111796765e-06, "loss": 0.0019, "reward": 0.19833332300186157, "reward_std": 0.21086929738521576, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19833330810070038, "step": 1228 }, { "completion_length": 162.0, "epoch": 0.5469514908767246, "grad_norm": 0.02214636653661728, "kl": 0.05421888083219528, "learning_rate": 2.527189311370775e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1229 }, { "completion_length": 182.33334350585938, "epoch": 0.5473965287049399, "grad_norm": 0.7057332992553711, "kl": 0.07187958061695099, "learning_rate": 2.523305245926862e-06, "loss": 0.0029, "reward": 0.27133333683013916, "reward_std": 0.12340772151947021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 1230 }, { "completion_length": 150.0, "epoch": 0.5478415665331553, "grad_norm": 0.019993474707007408, "kl": 0.05282042548060417, "learning_rate": 2.519421124224074e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1231 }, { "completion_length": 131.33334350585938, "epoch": 0.5482866043613707, "grad_norm": 0.9813181757926941, "kl": 0.06754153221845627, "learning_rate": 2.5155369556386825e-06, "loss": 0.0027, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1232 }, { "completion_length": 174.0, "epoch": 0.5487316421895861, "grad_norm": 0.7997798919677734, "kl": 0.08393299579620361, "learning_rate": 2.5116527495470724e-06, "loss": 0.0034, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1233 }, { "completion_length": 188.5, "epoch": 0.5491766800178015, "grad_norm": 0.7237727642059326, "kl": 0.049749068915843964, "learning_rate": 2.5077685153257182e-06, "loss": 0.002, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1234 }, { "completion_length": 161.6666717529297, "epoch": 0.5496217178460169, "grad_norm": 0.9056436419487, "kl": 0.04640985652804375, "learning_rate": 2.503884262351165e-06, "loss": 0.0019, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1235 }, { "completion_length": 143.33334350585938, "epoch": 0.5500667556742324, "grad_norm": 0.8036392331123352, "kl": 0.04953508824110031, "learning_rate": 2.5e-06, "loss": 0.002, "reward": 0.37566667795181274, "reward_std": 0.07905863225460052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 1236 }, { "completion_length": 151.83334350585938, "epoch": 0.5505117935024477, "grad_norm": 0.7205859422683716, "kl": 0.07808684557676315, "learning_rate": 2.4961157376488352e-06, "loss": 0.0031, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1237 }, { "completion_length": 154.6666717529297, "epoch": 0.5509568313306631, "grad_norm": 0.017512843012809753, "kl": 0.04894305393099785, "learning_rate": 2.492231484674282e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1238 }, { "completion_length": 175.0, "epoch": 0.5514018691588785, "grad_norm": 0.6931830644607544, "kl": 0.05162414163351059, "learning_rate": 2.488347250452929e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1239 }, { "completion_length": 181.6666717529297, "epoch": 0.5518469069870939, "grad_norm": 0.6881082653999329, "kl": 0.07192274183034897, "learning_rate": 2.4844630443613183e-06, "loss": 0.0029, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1240 }, { "completion_length": 113.5, "epoch": 0.5522919448153093, "grad_norm": 0.023202555254101753, "kl": 0.06096421927213669, "learning_rate": 2.480578875775927e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1241 }, { "completion_length": 179.0, "epoch": 0.5527369826435247, "grad_norm": 0.7318057417869568, "kl": 0.09055332839488983, "learning_rate": 2.4766947540731385e-06, "loss": 0.0036, "reward": 0.023666661232709885, "reward_std": 0.36290258169174194, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023666661232709885, "step": 1242 }, { "completion_length": 200.0, "epoch": 0.5531820204717401, "grad_norm": 0.6898524165153503, "kl": 0.05797708034515381, "learning_rate": 2.4728106886292257e-06, "loss": 0.0023, "reward": 0.14133334159851074, "reward_std": 0.2661778926849365, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14133334159851074, "step": 1243 }, { "completion_length": 200.0, "epoch": 0.5536270582999555, "grad_norm": 0.7336671352386475, "kl": 0.05232497677206993, "learning_rate": 2.4689266888203243e-06, "loss": 0.0021, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 1244 }, { "completion_length": 200.0, "epoch": 0.5540720961281709, "grad_norm": 0.024343255907297134, "kl": 0.052714623510837555, "learning_rate": 2.4650427640224115e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1245 }, { "completion_length": 171.0, "epoch": 0.5545171339563862, "grad_norm": 0.6648960709571838, "kl": 0.05184723436832428, "learning_rate": 2.4611589236112834e-06, "loss": 0.0021, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1246 }, { "completion_length": 122.33333587646484, "epoch": 0.5549621717846017, "grad_norm": 0.7880151271820068, "kl": 0.06251058727502823, "learning_rate": 2.4572751769625334e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1247 }, { "completion_length": 132.0, "epoch": 0.5554072096128171, "grad_norm": 0.025808745995163918, "kl": 0.06488477438688278, "learning_rate": 2.4533915334515276e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1248 }, { "completion_length": 186.6666717529297, "epoch": 0.5558522474410325, "grad_norm": 0.8226630091667175, "kl": 0.05653652548789978, "learning_rate": 2.4495080024533833e-06, "loss": 0.0023, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 1249 }, { "completion_length": 139.33334350585938, "epoch": 0.5562972852692479, "grad_norm": 0.6765699982643127, "kl": 0.0589500293135643, "learning_rate": 2.4456245933429464e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1250 }, { "completion_length": 182.5, "epoch": 0.5567423230974633, "grad_norm": 0.7177083492279053, "kl": 0.054428085684776306, "learning_rate": 2.441741315494768e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1251 }, { "completion_length": 160.83334350585938, "epoch": 0.5571873609256787, "grad_norm": 0.907528817653656, "kl": 0.054479341953992844, "learning_rate": 2.437858178283083e-06, "loss": 0.0022, "reward": 0.3551666736602783, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35516664385795593, "step": 1252 }, { "completion_length": 165.83334350585938, "epoch": 0.557632398753894, "grad_norm": 0.7852663397789001, "kl": 0.06126983463764191, "learning_rate": 2.4339751910817868e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1253 }, { "completion_length": 154.5, "epoch": 0.5580774365821095, "grad_norm": 0.6789471507072449, "kl": 0.06228331848978996, "learning_rate": 2.430092363264412e-06, "loss": 0.0025, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1254 }, { "completion_length": 195.33334350585938, "epoch": 0.5585224744103249, "grad_norm": 0.697045087814331, "kl": 0.05558575317263603, "learning_rate": 2.4262097042041076e-06, "loss": 0.0022, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1255 }, { "completion_length": 140.33334350585938, "epoch": 0.5589675122385402, "grad_norm": 0.03193692862987518, "kl": 0.06003742665052414, "learning_rate": 2.422327223273614e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1256 }, { "completion_length": 200.0, "epoch": 0.5594125500667557, "grad_norm": 0.6868168115615845, "kl": 0.052136264741420746, "learning_rate": 2.4184449298452417e-06, "loss": 0.0021, "reward": 0.18816666305065155, "reward_std": 0.17143210768699646, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18816666305065155, "step": 1257 }, { "completion_length": 160.5, "epoch": 0.5598575878949711, "grad_norm": 0.7360550165176392, "kl": 0.06848403811454773, "learning_rate": 2.41456283329085e-06, "loss": 0.0027, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1258 }, { "completion_length": 170.5, "epoch": 0.5603026257231865, "grad_norm": 0.7767543196678162, "kl": 0.06690746545791626, "learning_rate": 2.410680942981821e-06, "loss": 0.0027, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1259 }, { "completion_length": 200.0, "epoch": 0.5607476635514018, "grad_norm": 0.7363775372505188, "kl": 0.05513622611761093, "learning_rate": 2.40679926828904e-06, "loss": 0.0022, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 1260 }, { "completion_length": 195.5, "epoch": 0.5611927013796173, "grad_norm": 0.7677130699157715, "kl": 0.0692647248506546, "learning_rate": 2.4029178185828725e-06, "loss": 0.0028, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1261 }, { "completion_length": 160.0, "epoch": 0.5616377392078327, "grad_norm": 0.7026461958885193, "kl": 0.0852893814444542, "learning_rate": 2.3990366032331388e-06, "loss": 0.0034, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1262 }, { "completion_length": 197.1666717529297, "epoch": 0.562082777036048, "grad_norm": 0.9166952967643738, "kl": 0.0652032420039177, "learning_rate": 2.3951556316090955e-06, "loss": 0.0026, "reward": 0.27116668224334717, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27116668224334717, "step": 1263 }, { "completion_length": 138.83334350585938, "epoch": 0.5625278148642635, "grad_norm": 0.02755793184041977, "kl": 0.058620356023311615, "learning_rate": 2.39127491307941e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1264 }, { "completion_length": 196.6666717529297, "epoch": 0.5629728526924789, "grad_norm": 0.7676947712898254, "kl": 0.055359989404678345, "learning_rate": 2.3873944570121383e-06, "loss": 0.0022, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1265 }, { "completion_length": 136.1666717529297, "epoch": 0.5634178905206942, "grad_norm": 0.8367419242858887, "kl": 0.1252003312110901, "learning_rate": 2.3835142727747033e-06, "loss": 0.005, "reward": 0.3186666667461395, "reward_std": 0.14043740928173065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31866663694381714, "step": 1266 }, { "completion_length": 147.33334350585938, "epoch": 0.5638629283489096, "grad_norm": 0.6919192671775818, "kl": 0.06637275218963623, "learning_rate": 2.3796343697338724e-06, "loss": 0.0027, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1267 }, { "completion_length": 115.0, "epoch": 0.5643079661771251, "grad_norm": 0.8813140988349915, "kl": 0.06479702889919281, "learning_rate": 2.375754757255733e-06, "loss": 0.0026, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1268 }, { "completion_length": 176.1666717529297, "epoch": 0.5647530040053405, "grad_norm": 0.7938532829284668, "kl": 0.08625823259353638, "learning_rate": 2.371875444705671e-06, "loss": 0.0035, "reward": 0.20500001311302185, "reward_std": 0.2416965216398239, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20500001311302185, "step": 1269 }, { "completion_length": 145.5, "epoch": 0.5651980418335558, "grad_norm": 0.030022282153367996, "kl": 0.06400606036186218, "learning_rate": 2.3679964414483504e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1270 }, { "completion_length": 159.33334350585938, "epoch": 0.5656430796617713, "grad_norm": 0.6740466952323914, "kl": 0.05308264493942261, "learning_rate": 2.364117756847685e-06, "loss": 0.0021, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1271 }, { "completion_length": 182.6666717529297, "epoch": 0.5660881174899867, "grad_norm": 0.7299838066101074, "kl": 0.06719101965427399, "learning_rate": 2.3602394002668235e-06, "loss": 0.0027, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1272 }, { "completion_length": 164.1666717529297, "epoch": 0.566533155318202, "grad_norm": 0.029490042477846146, "kl": 0.06731471419334412, "learning_rate": 2.3563613810681184e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1273 }, { "completion_length": 139.6666717529297, "epoch": 0.5669781931464174, "grad_norm": 0.03775608167052269, "kl": 0.07644303143024445, "learning_rate": 2.352483708613111e-06, "loss": 0.0034, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1274 }, { "completion_length": 149.6666717529297, "epoch": 0.5674232309746329, "grad_norm": 0.03325009346008301, "kl": 0.06889767944812775, "learning_rate": 2.3486063922625046e-06, "loss": 0.0031, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1275 }, { "completion_length": 166.6666717529297, "epoch": 0.5678682688028482, "grad_norm": 0.6968271136283875, "kl": 0.05943942815065384, "learning_rate": 2.344729441376143e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1276 }, { "completion_length": 153.33334350585938, "epoch": 0.5683133066310636, "grad_norm": 0.6684544682502747, "kl": 0.06784616410732269, "learning_rate": 2.340852865312988e-06, "loss": 0.0027, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1277 }, { "completion_length": 155.83334350585938, "epoch": 0.5687583444592791, "grad_norm": 0.08216793835163116, "kl": 0.0659741461277008, "learning_rate": 2.3369766734310947e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1278 }, { "completion_length": 146.33334350585938, "epoch": 0.5692033822874945, "grad_norm": 0.7616736888885498, "kl": 0.07293137907981873, "learning_rate": 2.3331008750875934e-06, "loss": 0.0029, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1279 }, { "completion_length": 175.33334350585938, "epoch": 0.5696484201157098, "grad_norm": 0.716716468334198, "kl": 0.05208544060587883, "learning_rate": 2.329225479638663e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1280 }, { "completion_length": 181.6666717529297, "epoch": 0.5700934579439252, "grad_norm": 0.804816484451294, "kl": 0.06358016282320023, "learning_rate": 2.32535049643951e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1281 }, { "completion_length": 120.16667175292969, "epoch": 0.5705384957721407, "grad_norm": 0.018244080245494843, "kl": 0.05629762262105942, "learning_rate": 2.3214759348443456e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1282 }, { "completion_length": 183.6666717529297, "epoch": 0.570983533600356, "grad_norm": 0.7986438870429993, "kl": 0.07024630904197693, "learning_rate": 2.3176018042063637e-06, "loss": 0.0028, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1283 }, { "completion_length": 188.83334350585938, "epoch": 0.5714285714285714, "grad_norm": 0.7718678116798401, "kl": 0.04382052272558212, "learning_rate": 2.3137281138777173e-06, "loss": 0.0018, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1284 }, { "completion_length": 147.5, "epoch": 0.5718736092567869, "grad_norm": 0.05693955719470978, "kl": 0.09006796777248383, "learning_rate": 2.3098548732094964e-06, "loss": 0.0039, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1285 }, { "completion_length": 180.83334350585938, "epoch": 0.5723186470850022, "grad_norm": 0.7249354720115662, "kl": 0.09484501928091049, "learning_rate": 2.3059820915517077e-06, "loss": 0.0038, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1286 }, { "completion_length": 150.33334350585938, "epoch": 0.5727636849132176, "grad_norm": 0.039812684059143066, "kl": 0.07181607186794281, "learning_rate": 2.302109778253246e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1287 }, { "completion_length": 161.6666717529297, "epoch": 0.573208722741433, "grad_norm": 0.7592692375183105, "kl": 0.06504487991333008, "learning_rate": 2.298237942661879e-06, "loss": 0.0026, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1288 }, { "completion_length": 142.5, "epoch": 0.5736537605696485, "grad_norm": 0.021916421130299568, "kl": 0.05927393585443497, "learning_rate": 2.294366594124218e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1289 }, { "completion_length": 109.66667175292969, "epoch": 0.5740987983978638, "grad_norm": 0.030946621671319008, "kl": 0.06689797341823578, "learning_rate": 2.2904957419857016e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1290 }, { "completion_length": 200.0, "epoch": 0.5745438362260792, "grad_norm": 0.015107125975191593, "kl": 0.03902910649776459, "learning_rate": 2.2866253955905693e-06, "loss": 0.0016, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1291 }, { "completion_length": 161.83334350585938, "epoch": 0.5749888740542947, "grad_norm": 0.9576203227043152, "kl": 0.06322360038757324, "learning_rate": 2.2827555642818377e-06, "loss": 0.0025, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1292 }, { "completion_length": 176.6666717529297, "epoch": 0.57543391188251, "grad_norm": 0.7536025047302246, "kl": 0.057025518268346786, "learning_rate": 2.278886257401282e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1293 }, { "completion_length": 121.16667175292969, "epoch": 0.5758789497107254, "grad_norm": 0.017764268442988396, "kl": 0.058683812618255615, "learning_rate": 2.2750174842894127e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1294 }, { "completion_length": 113.66667175292969, "epoch": 0.5763239875389408, "grad_norm": 1.1381512880325317, "kl": 0.09195034205913544, "learning_rate": 2.2711492542854475e-06, "loss": 0.0037, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1295 }, { "completion_length": 145.0, "epoch": 0.5767690253671562, "grad_norm": 0.6938797831535339, "kl": 0.05431237071752548, "learning_rate": 2.2672815767272968e-06, "loss": 0.0022, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1296 }, { "completion_length": 158.5, "epoch": 0.5772140631953716, "grad_norm": 0.7317979335784912, "kl": 0.08583417534828186, "learning_rate": 2.2634144609515362e-06, "loss": 0.0034, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1297 }, { "completion_length": 200.0, "epoch": 0.577659101023587, "grad_norm": 0.030720453709363937, "kl": 0.05192447826266289, "learning_rate": 2.2595479162933846e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1298 }, { "completion_length": 176.6666717529297, "epoch": 0.5781041388518025, "grad_norm": 0.6601949334144592, "kl": 0.068632572889328, "learning_rate": 2.255681952086683e-06, "loss": 0.0027, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1299 }, { "completion_length": 173.6666717529297, "epoch": 0.5785491766800178, "grad_norm": 0.688745379447937, "kl": 0.04240253195166588, "learning_rate": 2.2518165776638715e-06, "loss": 0.0017, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 1300 }, { "completion_length": 145.0, "epoch": 0.5789942145082332, "grad_norm": 0.021718116477131844, "kl": 0.0497608482837677, "learning_rate": 2.2479518023559645e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1301 }, { "completion_length": 159.1666717529297, "epoch": 0.5794392523364486, "grad_norm": 0.7606553435325623, "kl": 0.05341101810336113, "learning_rate": 2.2440876354925327e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1302 }, { "completion_length": 137.1666717529297, "epoch": 0.579884290164664, "grad_norm": 0.031692203134298325, "kl": 0.07368038594722748, "learning_rate": 2.240224086401677e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1303 }, { "completion_length": 152.33334350585938, "epoch": 0.5803293279928794, "grad_norm": 0.8328744769096375, "kl": 0.085344098508358, "learning_rate": 2.2363611644100055e-06, "loss": 0.0034, "reward": 0.2528333365917206, "reward_std": 0.18862704932689667, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2528333365917206, "step": 1304 }, { "completion_length": 178.33334350585938, "epoch": 0.5807743658210948, "grad_norm": 0.7432076930999756, "kl": 0.04911906644701958, "learning_rate": 2.232498878842615e-06, "loss": 0.002, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1305 }, { "completion_length": 123.5, "epoch": 0.5812194036493102, "grad_norm": 0.8634981513023376, "kl": 0.09652163088321686, "learning_rate": 2.2286372390230645e-06, "loss": 0.0039, "reward": 0.2605000138282776, "reward_std": 0.1890542209148407, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2605000138282776, "step": 1306 }, { "completion_length": 114.66667175292969, "epoch": 0.5816644414775256, "grad_norm": 0.03657658025622368, "kl": 0.06502784043550491, "learning_rate": 2.2247762542733544e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1307 }, { "completion_length": 197.0, "epoch": 0.582109479305741, "grad_norm": 0.8336066603660583, "kl": 0.06351786851882935, "learning_rate": 2.220915933913903e-06, "loss": 0.0025, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 1308 }, { "completion_length": 112.33333587646484, "epoch": 0.5825545171339563, "grad_norm": 0.882077693939209, "kl": 0.05575979873538017, "learning_rate": 2.217056287263528e-06, "loss": 0.0022, "reward": 0.35483333468437195, "reward_std": 0.12303563207387924, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27149999141693115, "step": 1309 }, { "completion_length": 194.6666717529297, "epoch": 0.5829995549621718, "grad_norm": 0.6482923030853271, "kl": 0.04915091395378113, "learning_rate": 2.2131973236394154e-06, "loss": 0.002, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 1310 }, { "completion_length": 126.83333587646484, "epoch": 0.5834445927903872, "grad_norm": 0.025585202500224113, "kl": 0.0684896856546402, "learning_rate": 2.209339052357107e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1311 }, { "completion_length": 161.83334350585938, "epoch": 0.5838896306186026, "grad_norm": 0.017367461696267128, "kl": 0.04990217089653015, "learning_rate": 2.2054814827304713e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1312 }, { "completion_length": 145.1666717529297, "epoch": 0.584334668446818, "grad_norm": 0.6588000655174255, "kl": 0.06359747052192688, "learning_rate": 2.201624624071683e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1313 }, { "completion_length": 190.33334350585938, "epoch": 0.5847797062750334, "grad_norm": 0.5984198451042175, "kl": 0.05125384032726288, "learning_rate": 2.1977684856912016e-06, "loss": 0.0021, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1314 }, { "completion_length": 166.5, "epoch": 0.5852247441032488, "grad_norm": 0.018499819561839104, "kl": 0.05266015976667404, "learning_rate": 2.1939130768977455e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1315 }, { "completion_length": 151.6666717529297, "epoch": 0.5856697819314641, "grad_norm": 0.6672574281692505, "kl": 0.0633692592382431, "learning_rate": 2.190058406998275e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1316 }, { "completion_length": 200.0, "epoch": 0.5861148197596796, "grad_norm": 0.02820000983774662, "kl": 0.051216401159763336, "learning_rate": 2.1862044852979654e-06, "loss": 0.002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1317 }, { "completion_length": 192.83334350585938, "epoch": 0.586559857587895, "grad_norm": 0.8144605755805969, "kl": 0.05723793804645538, "learning_rate": 2.1823513211001836e-06, "loss": 0.0023, "reward": 0.2711666524410248, "reward_std": 0.09453975409269333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 1318 }, { "completion_length": 155.5, "epoch": 0.5870048954161103, "grad_norm": 0.011627976782619953, "kl": 0.03995111584663391, "learning_rate": 2.1784989237064716e-06, "loss": 0.0019, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1319 }, { "completion_length": 185.33334350585938, "epoch": 0.5874499332443258, "grad_norm": 0.705923318862915, "kl": 0.04204287752509117, "learning_rate": 2.1746473024165185e-06, "loss": 0.0017, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1320 }, { "completion_length": 188.83334350585938, "epoch": 0.5878949710725412, "grad_norm": 0.743707537651062, "kl": 0.05525606870651245, "learning_rate": 2.170796466528139e-06, "loss": 0.0022, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1321 }, { "completion_length": 162.33334350585938, "epoch": 0.5883400089007566, "grad_norm": 0.6818633079528809, "kl": 0.08869786560535431, "learning_rate": 2.166946425337254e-06, "loss": 0.0035, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1322 }, { "completion_length": 178.1666717529297, "epoch": 0.5887850467289719, "grad_norm": 0.8434653878211975, "kl": 0.05428377911448479, "learning_rate": 2.1630971881378644e-06, "loss": 0.0022, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1323 }, { "completion_length": 143.6666717529297, "epoch": 0.5892300845571874, "grad_norm": 0.01933359168469906, "kl": 0.05959298461675644, "learning_rate": 2.1592487642220305e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1324 }, { "completion_length": 172.6666717529297, "epoch": 0.5896751223854028, "grad_norm": 0.8821998238563538, "kl": 0.06853225827217102, "learning_rate": 2.1554011628798495e-06, "loss": 0.0027, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1325 }, { "completion_length": 136.83334350585938, "epoch": 0.5901201602136181, "grad_norm": 0.022718112915754318, "kl": 0.05797460302710533, "learning_rate": 2.1515543933994344e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1326 }, { "completion_length": 187.6666717529297, "epoch": 0.5905651980418336, "grad_norm": 0.6120337247848511, "kl": 0.061057232320308685, "learning_rate": 2.1477084650668863e-06, "loss": 0.0024, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1327 }, { "completion_length": 131.33334350585938, "epoch": 0.591010235870049, "grad_norm": 0.9323371052742004, "kl": 0.05380024015903473, "learning_rate": 2.1438633871662795e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1328 }, { "completion_length": 140.6666717529297, "epoch": 0.5914552736982643, "grad_norm": 0.026489408686757088, "kl": 0.06107024475932121, "learning_rate": 2.140019168979634e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1329 }, { "completion_length": 121.5, "epoch": 0.5919003115264797, "grad_norm": 0.02367200143635273, "kl": 0.056515760719776154, "learning_rate": 2.136175819786894e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1330 }, { "completion_length": 152.0, "epoch": 0.5923453493546952, "grad_norm": 0.7418760657310486, "kl": 0.08832807838916779, "learning_rate": 2.1323333488659063e-06, "loss": 0.0035, "reward": 0.29350000619888306, "reward_std": 0.20208291709423065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29350000619888306, "step": 1331 }, { "completion_length": 190.0, "epoch": 0.5927903871829105, "grad_norm": 0.765781819820404, "kl": 0.05727764964103699, "learning_rate": 2.1284917654923986e-06, "loss": 0.0023, "reward": 0.27116668224334717, "reward_std": 0.09453975409269333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27116668224334717, "step": 1332 }, { "completion_length": 132.33334350585938, "epoch": 0.5932354250111259, "grad_norm": 0.03234122321009636, "kl": 0.07051411271095276, "learning_rate": 2.1246510789399537e-06, "loss": 0.0031, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1333 }, { "completion_length": 156.1666717529297, "epoch": 0.5936804628393414, "grad_norm": 0.7347729802131653, "kl": 0.05940014123916626, "learning_rate": 2.1208112984799913e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1334 }, { "completion_length": 158.1666717529297, "epoch": 0.5941255006675568, "grad_norm": 0.7142928242683411, "kl": 0.07999814301729202, "learning_rate": 2.1169724333817443e-06, "loss": 0.0032, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1335 }, { "completion_length": 151.83334350585938, "epoch": 0.5945705384957721, "grad_norm": 0.0350709892809391, "kl": 0.058371301740407944, "learning_rate": 2.1131344929122336e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1336 }, { "completion_length": 127.33333587646484, "epoch": 0.5950155763239875, "grad_norm": 0.01668655313551426, "kl": 0.054240863770246506, "learning_rate": 2.1092974863362508e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1337 }, { "completion_length": 174.83334350585938, "epoch": 0.595460614152203, "grad_norm": 0.6639335751533508, "kl": 0.048784345388412476, "learning_rate": 2.10546142291633e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1338 }, { "completion_length": 162.83334350585938, "epoch": 0.5959056519804183, "grad_norm": 0.6551691889762878, "kl": 0.11608364433050156, "learning_rate": 2.1016263119127315e-06, "loss": 0.0046, "reward": 0.2409999966621399, "reward_std": 0.2736355662345886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2409999817609787, "step": 1339 }, { "completion_length": 144.33334350585938, "epoch": 0.5963506898086337, "grad_norm": 0.01642470248043537, "kl": 0.0515790730714798, "learning_rate": 2.097792162583415e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1340 }, { "completion_length": 86.83333587646484, "epoch": 0.5967957276368492, "grad_norm": 0.02316073514521122, "kl": 0.06323958933353424, "learning_rate": 2.093958984184018e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1341 }, { "completion_length": 158.1666717529297, "epoch": 0.5972407654650645, "grad_norm": 0.6790428757667542, "kl": 0.06524261832237244, "learning_rate": 2.090126785967836e-06, "loss": 0.0026, "reward": 0.3516666889190674, "reward_std": 0.05044468492269516, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.351666659116745, "step": 1342 }, { "completion_length": 140.6666717529297, "epoch": 0.5976858032932799, "grad_norm": 0.7449166178703308, "kl": 0.13734249770641327, "learning_rate": 2.0862955771857977e-06, "loss": 0.0055, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1343 }, { "completion_length": 152.33334350585938, "epoch": 0.5981308411214953, "grad_norm": 0.8124682903289795, "kl": 0.05861344933509827, "learning_rate": 2.082465367086442e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1344 }, { "completion_length": 171.33334350585938, "epoch": 0.5985758789497108, "grad_norm": 0.6527742743492126, "kl": 0.050010886043310165, "learning_rate": 2.078636164915898e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1345 }, { "completion_length": 143.83334350585938, "epoch": 0.5990209167779261, "grad_norm": 0.7979593276977539, "kl": 0.06243058666586876, "learning_rate": 2.074807979917863e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1346 }, { "completion_length": 103.83333587646484, "epoch": 0.5994659546061415, "grad_norm": 0.020012816414237022, "kl": 0.0676102340221405, "learning_rate": 2.070980821333576e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1347 }, { "completion_length": 200.0, "epoch": 0.599910992434357, "grad_norm": 0.04530826210975647, "kl": 0.054757438600063324, "learning_rate": 2.0671546984018003e-06, "loss": 0.0022, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1348 }, { "completion_length": 175.6666717529297, "epoch": 0.6003560302625723, "grad_norm": 0.8745347857475281, "kl": 0.08706136047840118, "learning_rate": 2.0633296203587994e-06, "loss": 0.0035, "reward": 0.31333333253860474, "reward_std": 0.10494124889373779, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 1349 }, { "completion_length": 179.6666717529297, "epoch": 0.6008010680907877, "grad_norm": 0.719700038433075, "kl": 0.056766610592603683, "learning_rate": 2.059505596438312e-06, "loss": 0.0023, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1350 }, { "completion_length": 177.6666717529297, "epoch": 0.6012461059190031, "grad_norm": 0.6509082317352295, "kl": 0.055832430720329285, "learning_rate": 2.0556826358715345e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1351 }, { "completion_length": 186.6666717529297, "epoch": 0.6016911437472185, "grad_norm": 0.7251918315887451, "kl": 0.08503405749797821, "learning_rate": 2.0518607478870966e-06, "loss": 0.0034, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1352 }, { "completion_length": 170.1666717529297, "epoch": 0.6021361815754339, "grad_norm": 0.8101000189781189, "kl": 0.07102101296186447, "learning_rate": 2.048039941711035e-06, "loss": 0.0028, "reward": 0.2056666612625122, "reward_std": 0.3590574860572815, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2056666612625122, "step": 1353 }, { "completion_length": 190.0, "epoch": 0.6025812194036493, "grad_norm": 0.7210485935211182, "kl": 0.05284285545349121, "learning_rate": 2.044220226566781e-06, "loss": 0.0021, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1354 }, { "completion_length": 179.0, "epoch": 0.6030262572318648, "grad_norm": 0.7402732372283936, "kl": 0.05912027508020401, "learning_rate": 2.0404016116751268e-06, "loss": 0.0024, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1355 }, { "completion_length": 166.33334350585938, "epoch": 0.6034712950600801, "grad_norm": 0.8056141138076782, "kl": 0.06315895915031433, "learning_rate": 2.0365841062542122e-06, "loss": 0.0025, "reward": 0.29216668009757996, "reward_std": 0.10255226492881775, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 1356 }, { "completion_length": 177.1666717529297, "epoch": 0.6039163328882955, "grad_norm": 0.678676187992096, "kl": 0.04996780678629875, "learning_rate": 2.0327677195194956e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1357 }, { "completion_length": 174.1666717529297, "epoch": 0.6043613707165109, "grad_norm": 0.7291713953018188, "kl": 0.04707088693976402, "learning_rate": 2.028952460683737e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1358 }, { "completion_length": 169.5, "epoch": 0.6048064085447263, "grad_norm": 0.8036746382713318, "kl": 0.06635730713605881, "learning_rate": 2.0251383389569743e-06, "loss": 0.0027, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1359 }, { "completion_length": 162.5, "epoch": 0.6052514463729417, "grad_norm": 0.6609824299812317, "kl": 0.05208190530538559, "learning_rate": 2.0213253635464974e-06, "loss": 0.0021, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1360 }, { "completion_length": 135.6666717529297, "epoch": 0.6056964842011571, "grad_norm": 0.7489309906959534, "kl": 0.0696021094918251, "learning_rate": 2.0175135436568315e-06, "loss": 0.0028, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1361 }, { "completion_length": 187.83334350585938, "epoch": 0.6061415220293725, "grad_norm": 0.8159146904945374, "kl": 0.045086465775966644, "learning_rate": 2.013702888489713e-06, "loss": 0.0018, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1362 }, { "completion_length": 175.33334350585938, "epoch": 0.6065865598575879, "grad_norm": 0.7238532304763794, "kl": 0.08709849417209625, "learning_rate": 2.0098934072440636e-06, "loss": 0.0035, "reward": 0.1850000023841858, "reward_std": 0.24054770171642303, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1850000023841858, "step": 1363 }, { "completion_length": 161.6666717529297, "epoch": 0.6070315976858033, "grad_norm": 0.9293068647384644, "kl": 0.06978730857372284, "learning_rate": 2.0060851091159735e-06, "loss": 0.0028, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1364 }, { "completion_length": 136.33334350585938, "epoch": 0.6074766355140186, "grad_norm": 0.020378444343805313, "kl": 0.06543242931365967, "learning_rate": 2.0022780032986767e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1365 }, { "completion_length": 163.6666717529297, "epoch": 0.6079216733422341, "grad_norm": 0.7763041853904724, "kl": 0.07946053147315979, "learning_rate": 1.998472098982528e-06, "loss": 0.0032, "reward": 0.26850003004074097, "reward_std": 0.15288132429122925, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26850003004074097, "step": 1366 }, { "completion_length": 193.5, "epoch": 0.6083667111704495, "grad_norm": 0.826087474822998, "kl": 0.053222887217998505, "learning_rate": 1.9946674053549826e-06, "loss": 0.0021, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1367 }, { "completion_length": 132.0, "epoch": 0.6088117489986649, "grad_norm": 0.9280709624290466, "kl": 0.062463387846946716, "learning_rate": 1.990863931600573e-06, "loss": 0.0025, "reward": 0.3341667056083679, "reward_std": 0.064808689057827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1368 }, { "completion_length": 185.0, "epoch": 0.6092567868268803, "grad_norm": 0.749100923538208, "kl": 0.06078839302062988, "learning_rate": 1.987061686900886e-06, "loss": 0.0024, "reward": 0.29233333468437195, "reward_std": 0.12961584329605103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 1369 }, { "completion_length": 147.83334350585938, "epoch": 0.6097018246550957, "grad_norm": 0.034440264105796814, "kl": 0.06485289335250854, "learning_rate": 1.983260680434543e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1370 }, { "completion_length": 104.5, "epoch": 0.6101468624833111, "grad_norm": 0.01646091602742672, "kl": 0.06164836511015892, "learning_rate": 1.9794609213771756e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1371 }, { "completion_length": 157.6666717529297, "epoch": 0.6105919003115264, "grad_norm": 0.749934196472168, "kl": 0.0504591129720211, "learning_rate": 1.975662418901403e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1372 }, { "completion_length": 163.5, "epoch": 0.6110369381397419, "grad_norm": 0.7931237816810608, "kl": 0.08825130760669708, "learning_rate": 1.9718651821768133e-06, "loss": 0.0035, "reward": 0.25316667556762695, "reward_std": 0.24440494179725647, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25316664576530457, "step": 1373 }, { "completion_length": 160.33334350585938, "epoch": 0.6114819759679573, "grad_norm": 0.034321513026952744, "kl": 0.07206878066062927, "learning_rate": 1.9680692203699374e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1374 }, { "completion_length": 155.6666717529297, "epoch": 0.6119270137961726, "grad_norm": 0.7601866722106934, "kl": 0.07652454823255539, "learning_rate": 1.9642745426442284e-06, "loss": 0.0031, "reward": 0.31333333253860474, "reward_std": 0.15350136160850525, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 1375 }, { "completion_length": 196.83334350585938, "epoch": 0.6123720516243881, "grad_norm": 0.78342205286026, "kl": 0.08061586320400238, "learning_rate": 1.9604811581600415e-06, "loss": 0.0032, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1376 }, { "completion_length": 153.83334350585938, "epoch": 0.6128170894526035, "grad_norm": 0.6514473557472229, "kl": 0.05775187537074089, "learning_rate": 1.956689076074607e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1377 }, { "completion_length": 157.33334350585938, "epoch": 0.6132621272808189, "grad_norm": 0.7332454919815063, "kl": 0.05692437291145325, "learning_rate": 1.9528983055420143e-06, "loss": 0.0023, "reward": 0.3343333601951599, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33433330059051514, "step": 1378 }, { "completion_length": 189.6666717529297, "epoch": 0.6137071651090342, "grad_norm": 0.7741665840148926, "kl": 0.07697466015815735, "learning_rate": 1.949108855713185e-06, "loss": 0.0031, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1379 }, { "completion_length": 126.66667175292969, "epoch": 0.6141522029372497, "grad_norm": 0.021553704515099525, "kl": 0.05840783566236496, "learning_rate": 1.945320735735853e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1380 }, { "completion_length": 156.33334350585938, "epoch": 0.6145972407654651, "grad_norm": 0.6324295997619629, "kl": 0.04569033905863762, "learning_rate": 1.941533954754541e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1381 }, { "completion_length": 89.66667175292969, "epoch": 0.6150422785936804, "grad_norm": 1.134950876235962, "kl": 0.11393502354621887, "learning_rate": 1.9377485219105416e-06, "loss": 0.0046, "reward": 0.312333345413208, "reward_std": 0.15595084428787231, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3123333156108856, "step": 1382 }, { "completion_length": 182.33334350585938, "epoch": 0.6154873164218959, "grad_norm": 0.7212135195732117, "kl": 0.0604676827788353, "learning_rate": 1.93396444634189e-06, "loss": 0.0024, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1383 }, { "completion_length": 156.0, "epoch": 0.6159323542501113, "grad_norm": 0.02910018339753151, "kl": 0.07041066884994507, "learning_rate": 1.930181737183348e-06, "loss": 0.0031, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1384 }, { "completion_length": 181.33334350585938, "epoch": 0.6163773920783266, "grad_norm": 0.7267289161682129, "kl": 0.051712047308683395, "learning_rate": 1.926400403566377e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1385 }, { "completion_length": 95.66667175292969, "epoch": 0.616822429906542, "grad_norm": 0.020063413307070732, "kl": 0.06864848732948303, "learning_rate": 1.922620454619117e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1386 }, { "completion_length": 149.33334350585938, "epoch": 0.6172674677347575, "grad_norm": 0.7748768329620361, "kl": 0.05772966891527176, "learning_rate": 1.9188418994663677e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1387 }, { "completion_length": 190.83334350585938, "epoch": 0.6177125055629729, "grad_norm": 0.8829987645149231, "kl": 0.06954911351203918, "learning_rate": 1.9150647472295635e-06, "loss": 0.0028, "reward": 0.14366666972637177, "reward_std": 0.2381979525089264, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14366665482521057, "step": 1388 }, { "completion_length": 144.6666717529297, "epoch": 0.6181575433911882, "grad_norm": 0.7912724018096924, "kl": 0.053366199135780334, "learning_rate": 1.9112890070267513e-06, "loss": 0.0021, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1389 }, { "completion_length": 197.0, "epoch": 0.6186025812194037, "grad_norm": 0.7742429375648499, "kl": 0.060363225638866425, "learning_rate": 1.907514687972569e-06, "loss": 0.0024, "reward": 0.27116668224334717, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27116668224334717, "step": 1390 }, { "completion_length": 173.6666717529297, "epoch": 0.6190476190476191, "grad_norm": 0.7186877131462097, "kl": 0.07181936502456665, "learning_rate": 1.903741799178227e-06, "loss": 0.0029, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1391 }, { "completion_length": 143.1666717529297, "epoch": 0.6194926568758344, "grad_norm": 0.025034688413143158, "kl": 0.055738165974617004, "learning_rate": 1.8999703497514782e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1392 }, { "completion_length": 180.1666717529297, "epoch": 0.6199376947040498, "grad_norm": 0.7350578308105469, "kl": 0.07966147363185883, "learning_rate": 1.8962003487966044e-06, "loss": 0.0032, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1393 }, { "completion_length": 127.0, "epoch": 0.6203827325322653, "grad_norm": 0.017875896766781807, "kl": 0.054014332592487335, "learning_rate": 1.8924318054143903e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1394 }, { "completion_length": 162.6666717529297, "epoch": 0.6208277703604806, "grad_norm": 0.06926420331001282, "kl": 0.059310950338840485, "learning_rate": 1.888664728702101e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1395 }, { "completion_length": 200.0, "epoch": 0.621272808188696, "grad_norm": 0.647271454334259, "kl": 0.058838214725255966, "learning_rate": 1.8848991277534609e-06, "loss": 0.0024, "reward": 0.2291666716337204, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 1396 }, { "completion_length": 143.33334350585938, "epoch": 0.6217178460169115, "grad_norm": 0.7888451814651489, "kl": 0.1036970466375351, "learning_rate": 1.8811350116586341e-06, "loss": 0.0041, "reward": 0.2548333406448364, "reward_std": 0.2967965006828308, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25483331084251404, "step": 1397 }, { "completion_length": 157.6666717529297, "epoch": 0.6221628838451269, "grad_norm": 0.02223152481019497, "kl": 0.057576995342969894, "learning_rate": 1.8773723895041975e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1398 }, { "completion_length": 193.83334350585938, "epoch": 0.6226079216733422, "grad_norm": 0.6927502751350403, "kl": 0.05820035561919212, "learning_rate": 1.8736112703731235e-06, "loss": 0.0023, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 1399 }, { "completion_length": 130.0, "epoch": 0.6230529595015576, "grad_norm": 0.9275290966033936, "kl": 0.06016760692000389, "learning_rate": 1.869851663344755e-06, "loss": 0.0024, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1400 }, { "completion_length": 135.1666717529297, "epoch": 0.6234979973297731, "grad_norm": 0.028401155024766922, "kl": 0.05090929567813873, "learning_rate": 1.8660935774947857e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1401 }, { "completion_length": 163.83334350585938, "epoch": 0.6239430351579884, "grad_norm": 1.0099029541015625, "kl": 0.067303866147995, "learning_rate": 1.8623370218952368e-06, "loss": 0.0027, "reward": 0.2775000035762787, "reward_std": 0.24127474427223206, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2775000035762787, "step": 1402 }, { "completion_length": 133.83334350585938, "epoch": 0.6243880729862038, "grad_norm": 1.1334388256072998, "kl": 0.06404712796211243, "learning_rate": 1.8585820056144349e-06, "loss": 0.0026, "reward": 0.3551666736602783, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35516664385795593, "step": 1403 }, { "completion_length": 193.0, "epoch": 0.6248331108144193, "grad_norm": 0.7618772387504578, "kl": 0.05527910590171814, "learning_rate": 1.854828537716991e-06, "loss": 0.0022, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1404 }, { "completion_length": 172.1666717529297, "epoch": 0.6252781486426346, "grad_norm": 0.7541539669036865, "kl": 0.05784829705953598, "learning_rate": 1.8510766272637798e-06, "loss": 0.0023, "reward": 0.37566667795181274, "reward_std": 0.07905863225460052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 1405 }, { "completion_length": 182.6666717529297, "epoch": 0.62572318647085, "grad_norm": 0.7231999039649963, "kl": 0.05078327655792236, "learning_rate": 1.8473262833119138e-06, "loss": 0.002, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1406 }, { "completion_length": 179.1666717529297, "epoch": 0.6261682242990654, "grad_norm": 0.7264639139175415, "kl": 0.06620623916387558, "learning_rate": 1.843577514914725e-06, "loss": 0.0026, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1407 }, { "completion_length": 157.1666717529297, "epoch": 0.6266132621272809, "grad_norm": 0.6942959427833557, "kl": 0.05550370737910271, "learning_rate": 1.8398303311217436e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1408 }, { "completion_length": 146.33334350585938, "epoch": 0.6270582999554962, "grad_norm": 0.8330131769180298, "kl": 0.0588110089302063, "learning_rate": 1.8360847409786714e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1409 }, { "completion_length": 187.83334350585938, "epoch": 0.6275033377837116, "grad_norm": 0.6956566572189331, "kl": 0.06108691915869713, "learning_rate": 1.8323407535273658e-06, "loss": 0.0024, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1410 }, { "completion_length": 167.5, "epoch": 0.6279483756119271, "grad_norm": 0.6863263249397278, "kl": 0.061490681022405624, "learning_rate": 1.8285983778058147e-06, "loss": 0.0025, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1411 }, { "completion_length": 134.83334350585938, "epoch": 0.6283934134401424, "grad_norm": 0.7065067291259766, "kl": 0.08277900516986847, "learning_rate": 1.824857622848114e-06, "loss": 0.0033, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1412 }, { "completion_length": 127.83333587646484, "epoch": 0.6288384512683578, "grad_norm": 0.711338460445404, "kl": 0.1219438910484314, "learning_rate": 1.8211184976844487e-06, "loss": 0.0049, "reward": 0.25733333826065063, "reward_std": 0.29067277908325195, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25733333826065063, "step": 1413 }, { "completion_length": 161.6666717529297, "epoch": 0.6292834890965732, "grad_norm": 0.8949511051177979, "kl": 0.07294663786888123, "learning_rate": 1.8173810113410688e-06, "loss": 0.0029, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1414 }, { "completion_length": 113.83333587646484, "epoch": 0.6297285269247886, "grad_norm": 0.03835580497980118, "kl": 0.07534816116094589, "learning_rate": 1.8136451728402682e-06, "loss": 0.0033, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1415 }, { "completion_length": 112.33333587646484, "epoch": 0.630173564753004, "grad_norm": 0.022022180259227753, "kl": 0.057372257113456726, "learning_rate": 1.8099109912003624e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1416 }, { "completion_length": 144.0, "epoch": 0.6306186025812194, "grad_norm": 0.7620473504066467, "kl": 0.06752672046422958, "learning_rate": 1.8061784754356688e-06, "loss": 0.0027, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1417 }, { "completion_length": 171.0, "epoch": 0.6310636404094349, "grad_norm": 0.7567469477653503, "kl": 0.06102651357650757, "learning_rate": 1.8024476345564806e-06, "loss": 0.0024, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1418 }, { "completion_length": 137.83334350585938, "epoch": 0.6315086782376502, "grad_norm": 1.0179473161697388, "kl": 0.050918303430080414, "learning_rate": 1.7987184775690512e-06, "loss": 0.002, "reward": 0.39633333683013916, "reward_std": 0.09396524727344513, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22966668009757996, "step": 1419 }, { "completion_length": 162.6666717529297, "epoch": 0.6319537160658656, "grad_norm": 0.8701192140579224, "kl": 0.17964184284210205, "learning_rate": 1.7949910134755672e-06, "loss": 0.0072, "reward": 0.2861666679382324, "reward_std": 0.22004583477973938, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2861666679382324, "step": 1420 }, { "completion_length": 200.0, "epoch": 0.632398753894081, "grad_norm": 0.17582067847251892, "kl": 0.08828055113554001, "learning_rate": 1.7912652512741273e-06, "loss": 0.0035, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1421 }, { "completion_length": 197.1666717529297, "epoch": 0.6328437917222964, "grad_norm": 0.6580331325531006, "kl": 0.038402341306209564, "learning_rate": 1.7875411999587256e-06, "loss": 0.0015, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 1422 }, { "completion_length": 148.1666717529297, "epoch": 0.6332888295505118, "grad_norm": 0.8078848719596863, "kl": 0.06011554226279259, "learning_rate": 1.7838188685192217e-06, "loss": 0.0024, "reward": 0.3343333601951599, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33433330059051514, "step": 1423 }, { "completion_length": 144.33334350585938, "epoch": 0.6337338673787272, "grad_norm": 0.06990113109350204, "kl": 0.10113313794136047, "learning_rate": 1.7800982659413268e-06, "loss": 0.0043, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1424 }, { "completion_length": 171.5, "epoch": 0.6341789052069426, "grad_norm": 0.7644363641738892, "kl": 0.08610030263662338, "learning_rate": 1.7763794012065771e-06, "loss": 0.0034, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1425 }, { "completion_length": 184.1666717529297, "epoch": 0.634623943035158, "grad_norm": 0.7638565301895142, "kl": 0.052418213337659836, "learning_rate": 1.772662283292314e-06, "loss": 0.0021, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 1426 }, { "completion_length": 144.83334350585938, "epoch": 0.6350689808633734, "grad_norm": 0.7163295745849609, "kl": 0.06099975109100342, "learning_rate": 1.7689469211716614e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1427 }, { "completion_length": 129.33334350585938, "epoch": 0.6355140186915887, "grad_norm": 0.020992541685700417, "kl": 0.06372680515050888, "learning_rate": 1.7652333238135067e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1428 }, { "completion_length": 147.6666717529297, "epoch": 0.6359590565198042, "grad_norm": 0.06478970497846603, "kl": 0.0694139301776886, "learning_rate": 1.761521500182474e-06, "loss": 0.0031, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1429 }, { "completion_length": 165.83334350585938, "epoch": 0.6364040943480196, "grad_norm": 0.8788062930107117, "kl": 0.08362124860286713, "learning_rate": 1.7578114592389085e-06, "loss": 0.0033, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1430 }, { "completion_length": 162.5, "epoch": 0.636849132176235, "grad_norm": 0.7626572251319885, "kl": 0.07676111161708832, "learning_rate": 1.75410320993885e-06, "loss": 0.0031, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1431 }, { "completion_length": 161.83334350585938, "epoch": 0.6372941700044504, "grad_norm": 0.6593140959739685, "kl": 0.05127331614494324, "learning_rate": 1.7503967612340153e-06, "loss": 0.0021, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1432 }, { "completion_length": 136.5, "epoch": 0.6377392078326658, "grad_norm": 0.8421521782875061, "kl": 0.05608572065830231, "learning_rate": 1.7466921220717737e-06, "loss": 0.0022, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 1433 }, { "completion_length": 127.83333587646484, "epoch": 0.6381842456608812, "grad_norm": 0.032311953604221344, "kl": 0.06359237432479858, "learning_rate": 1.7429893013951243e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1434 }, { "completion_length": 141.0, "epoch": 0.6386292834890965, "grad_norm": 0.02765144221484661, "kl": 0.06800812482833862, "learning_rate": 1.7392883081426793e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1435 }, { "completion_length": 156.83334350585938, "epoch": 0.639074321317312, "grad_norm": 0.03926115483045578, "kl": 0.07100366055965424, "learning_rate": 1.7355891512486384e-06, "loss": 0.0031, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1436 }, { "completion_length": 136.6666717529297, "epoch": 0.6395193591455274, "grad_norm": 0.9405933022499084, "kl": 0.07274708151817322, "learning_rate": 1.7318918396427676e-06, "loss": 0.0029, "reward": 0.3551666736602783, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35516664385795593, "step": 1437 }, { "completion_length": 171.5, "epoch": 0.6399643969737427, "grad_norm": 0.6927393674850464, "kl": 0.06484430283308029, "learning_rate": 1.728196382250379e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1438 }, { "completion_length": 185.5, "epoch": 0.6404094348019582, "grad_norm": 0.7905606031417847, "kl": 0.05562688410282135, "learning_rate": 1.7245027879923082e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1439 }, { "completion_length": 121.5, "epoch": 0.6408544726301736, "grad_norm": 0.03622419387102127, "kl": 0.07538799941539764, "learning_rate": 1.7208110657848945e-06, "loss": 0.0033, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1440 }, { "completion_length": 179.6666717529297, "epoch": 0.641299510458389, "grad_norm": 0.7712323069572449, "kl": 0.07858553528785706, "learning_rate": 1.7171212245399572e-06, "loss": 0.0031, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 1441 }, { "completion_length": 200.0, "epoch": 0.6417445482866043, "grad_norm": 0.715948760509491, "kl": 0.06623165309429169, "learning_rate": 1.7134332731647734e-06, "loss": 0.0026, "reward": 0.11950000375509262, "reward_std": 0.31965839862823486, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11950000375509262, "step": 1442 }, { "completion_length": 143.5, "epoch": 0.6421895861148198, "grad_norm": 0.900661289691925, "kl": 0.06195361539721489, "learning_rate": 1.7097472205620607e-06, "loss": 0.0025, "reward": 0.3551666736602783, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35516664385795593, "step": 1443 }, { "completion_length": 120.33333587646484, "epoch": 0.6426346239430352, "grad_norm": 0.01926054246723652, "kl": 0.05608893558382988, "learning_rate": 1.7060630756299529e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1444 }, { "completion_length": 169.1666717529297, "epoch": 0.6430796617712505, "grad_norm": 0.014926317147910595, "kl": 0.050173163414001465, "learning_rate": 1.7023808472619755e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1445 }, { "completion_length": 161.0, "epoch": 0.6435246995994659, "grad_norm": 0.02021980844438076, "kl": 0.05474698543548584, "learning_rate": 1.6987005443470309e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1446 }, { "completion_length": 131.83334350585938, "epoch": 0.6439697374276814, "grad_norm": 0.7747736573219299, "kl": 0.06740984320640564, "learning_rate": 1.6950221757693725e-06, "loss": 0.0027, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1447 }, { "completion_length": 146.33334350585938, "epoch": 0.6444147752558967, "grad_norm": 0.8941154479980469, "kl": 0.06897615641355515, "learning_rate": 1.6913457504085828e-06, "loss": 0.0028, "reward": 0.3581666648387909, "reward_std": 0.11561040580272675, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2748333215713501, "step": 1448 }, { "completion_length": 139.0, "epoch": 0.6448598130841121, "grad_norm": 0.8403373956680298, "kl": 0.06838244199752808, "learning_rate": 1.6876712771395553e-06, "loss": 0.0027, "reward": 0.29233333468437195, "reward_std": 0.12961584329605103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 1449 }, { "completion_length": 177.83334350585938, "epoch": 0.6453048509123276, "grad_norm": 0.6254804730415344, "kl": 0.05985492095351219, "learning_rate": 1.6839987648324702e-06, "loss": 0.0024, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1450 }, { "completion_length": 200.0, "epoch": 0.645749888740543, "grad_norm": 0.017869267612695694, "kl": 0.038964178413152695, "learning_rate": 1.6803282223527737e-06, "loss": 0.0016, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1451 }, { "completion_length": 115.83333587646484, "epoch": 0.6461949265687583, "grad_norm": 0.0266580693423748, "kl": 0.06849268078804016, "learning_rate": 1.6766596585611572e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1452 }, { "completion_length": 200.0, "epoch": 0.6466399643969737, "grad_norm": 0.035756222903728485, "kl": 0.05660893768072128, "learning_rate": 1.672993082313536e-06, "loss": 0.0023, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1453 }, { "completion_length": 126.5, "epoch": 0.6470850022251892, "grad_norm": 0.7655314207077026, "kl": 0.05805585905909538, "learning_rate": 1.6693285024610264e-06, "loss": 0.0023, "reward": 0.3966667056083679, "reward_std": 0.05062280222773552, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 1454 }, { "completion_length": 126.66667175292969, "epoch": 0.6475300400534045, "grad_norm": 0.028626590967178345, "kl": 0.05400149151682854, "learning_rate": 1.6656659278499262e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1455 }, { "completion_length": 179.6666717529297, "epoch": 0.6479750778816199, "grad_norm": 0.7383698225021362, "kl": 0.0543503537774086, "learning_rate": 1.662005367321693e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1456 }, { "completion_length": 184.83334350585938, "epoch": 0.6484201157098354, "grad_norm": 0.7595195770263672, "kl": 0.07299184054136276, "learning_rate": 1.6583468297129207e-06, "loss": 0.0029, "reward": 0.17650000751018524, "reward_std": 0.25984129309654236, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17650000751018524, "step": 1457 }, { "completion_length": 178.83334350585938, "epoch": 0.6488651535380507, "grad_norm": 0.7613358497619629, "kl": 0.05207335948944092, "learning_rate": 1.6546903238553211e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1458 }, { "completion_length": 137.33334350585938, "epoch": 0.6493101913662661, "grad_norm": 0.02273043617606163, "kl": 0.05129929631948471, "learning_rate": 1.6510358585757018e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1459 }, { "completion_length": 200.0, "epoch": 0.6497552291944815, "grad_norm": 0.03293275833129883, "kl": 0.04631292074918747, "learning_rate": 1.6473834426959434e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1460 }, { "completion_length": 191.1666717529297, "epoch": 0.650200267022697, "grad_norm": 0.8067001700401306, "kl": 0.04392020031809807, "learning_rate": 1.6437330850329793e-06, "loss": 0.0018, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 1461 }, { "completion_length": 165.6666717529297, "epoch": 0.6506453048509123, "grad_norm": 0.8360378742218018, "kl": 0.07717983424663544, "learning_rate": 1.6400847943987758e-06, "loss": 0.0031, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1462 }, { "completion_length": 186.6666717529297, "epoch": 0.6510903426791277, "grad_norm": 0.7348068952560425, "kl": 0.05224481225013733, "learning_rate": 1.636438579600307e-06, "loss": 0.0021, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 1463 }, { "completion_length": 155.1666717529297, "epoch": 0.6515353805073432, "grad_norm": 0.6557543277740479, "kl": 0.0562002956867218, "learning_rate": 1.6327944494395387e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1464 }, { "completion_length": 144.0, "epoch": 0.6519804183355585, "grad_norm": 1.051000952720642, "kl": 0.05340130627155304, "learning_rate": 1.6291524127134012e-06, "loss": 0.0021, "reward": 0.3966667056083679, "reward_std": 0.05062280222773552, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 1465 }, { "completion_length": 177.6666717529297, "epoch": 0.6524254561637739, "grad_norm": 0.7785902619361877, "kl": 0.09893165528774261, "learning_rate": 1.6255124782137738e-06, "loss": 0.004, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1466 }, { "completion_length": 168.33334350585938, "epoch": 0.6528704939919893, "grad_norm": 0.7352219223976135, "kl": 0.07749474048614502, "learning_rate": 1.6218746547274612e-06, "loss": 0.0031, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1467 }, { "completion_length": 146.1666717529297, "epoch": 0.6533155318202047, "grad_norm": 0.03056454285979271, "kl": 0.07416104525327682, "learning_rate": 1.618238951036169e-06, "loss": 0.0033, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1468 }, { "completion_length": 162.1666717529297, "epoch": 0.6537605696484201, "grad_norm": 0.7109426259994507, "kl": 0.04952998086810112, "learning_rate": 1.6146053759164895e-06, "loss": 0.002, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1469 }, { "completion_length": 174.83334350585938, "epoch": 0.6542056074766355, "grad_norm": 0.7508170008659363, "kl": 0.06910863518714905, "learning_rate": 1.6109739381398746e-06, "loss": 0.0028, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1470 }, { "completion_length": 105.0, "epoch": 0.654650645304851, "grad_norm": 0.024463405832648277, "kl": 0.06437089294195175, "learning_rate": 1.6073446464726158e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1471 }, { "completion_length": 132.6666717529297, "epoch": 0.6550956831330663, "grad_norm": 0.043828994035720825, "kl": 0.06118755787611008, "learning_rate": 1.6037175096758259e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1472 }, { "completion_length": 150.0, "epoch": 0.6555407209612817, "grad_norm": 0.7977015376091003, "kl": 0.10141167044639587, "learning_rate": 1.6000925365054154e-06, "loss": 0.0041, "reward": 0.2861666679382324, "reward_std": 0.22004583477973938, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2861666679382324, "step": 1473 }, { "completion_length": 156.33334350585938, "epoch": 0.655985758789497, "grad_norm": 0.9616900086402893, "kl": 0.052790567278862, "learning_rate": 1.59646973571207e-06, "loss": 0.0021, "reward": 0.3148333430290222, "reward_std": 0.10149761289358139, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3148333430290222, "step": 1474 }, { "completion_length": 153.0, "epoch": 0.6564307966177125, "grad_norm": 0.7333883047103882, "kl": 0.07777030766010284, "learning_rate": 1.5928491160412335e-06, "loss": 0.0031, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1475 }, { "completion_length": 138.83334350585938, "epoch": 0.6568758344459279, "grad_norm": 0.8540945053100586, "kl": 0.08184421062469482, "learning_rate": 1.5892306862330837e-06, "loss": 0.0033, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1476 }, { "completion_length": 159.1666717529297, "epoch": 0.6573208722741433, "grad_norm": 0.7016175389289856, "kl": 0.05608559399843216, "learning_rate": 1.5856144550225113e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 1477 }, { "completion_length": 145.6666717529297, "epoch": 0.6577659101023587, "grad_norm": 0.01697458140552044, "kl": 0.052540235221385956, "learning_rate": 1.5820004311391006e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1478 }, { "completion_length": 126.0, "epoch": 0.6582109479305741, "grad_norm": 1.1132007837295532, "kl": 0.08661770820617676, "learning_rate": 1.5783886233071078e-06, "loss": 0.0035, "reward": 0.3551666736602783, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35516664385795593, "step": 1479 }, { "completion_length": 153.83334350585938, "epoch": 0.6586559857587895, "grad_norm": 0.5954561829566956, "kl": 0.07186748832464218, "learning_rate": 1.5747790402454377e-06, "loss": 0.0029, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1480 }, { "completion_length": 133.5, "epoch": 0.6591010235870048, "grad_norm": 0.8292099833488464, "kl": 0.04337434098124504, "learning_rate": 1.5711716906676258e-06, "loss": 0.0017, "reward": 0.35499998927116394, "reward_std": 0.12266214936971664, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27166664600372314, "step": 1481 }, { "completion_length": 151.5, "epoch": 0.6595460614152203, "grad_norm": 0.8073120713233948, "kl": 0.059761784970760345, "learning_rate": 1.5675665832818166e-06, "loss": 0.0024, "reward": 0.3551666736602783, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35516664385795593, "step": 1482 }, { "completion_length": 161.5, "epoch": 0.6599910992434357, "grad_norm": 0.7773087620735168, "kl": 0.048801522701978683, "learning_rate": 1.5639637267907399e-06, "loss": 0.002, "reward": 0.3341667056083679, "reward_std": 0.0648086816072464, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1483 }, { "completion_length": 182.0, "epoch": 0.660436137071651, "grad_norm": 0.5983509421348572, "kl": 0.08513455092906952, "learning_rate": 1.5603631298916937e-06, "loss": 0.0034, "reward": 0.17266666889190674, "reward_std": 0.3189123272895813, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17266666889190674, "step": 1484 }, { "completion_length": 156.33334350585938, "epoch": 0.6608811748998665, "grad_norm": 0.030569666996598244, "kl": 0.0781131312251091, "learning_rate": 1.5567648012765213e-06, "loss": 0.0034, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1485 }, { "completion_length": 193.33334350585938, "epoch": 0.6613262127280819, "grad_norm": 0.7831483483314514, "kl": 0.04813634976744652, "learning_rate": 1.5531687496315887e-06, "loss": 0.0019, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1486 }, { "completion_length": 177.6666717529297, "epoch": 0.6617712505562973, "grad_norm": 0.6230365633964539, "kl": 0.04865188151597977, "learning_rate": 1.549574983637767e-06, "loss": 0.0019, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1487 }, { "completion_length": 176.5, "epoch": 0.6622162883845126, "grad_norm": 0.6940677762031555, "kl": 0.07798059284687042, "learning_rate": 1.545983511970409e-06, "loss": 0.0031, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 1488 }, { "completion_length": 171.83334350585938, "epoch": 0.6626613262127281, "grad_norm": 0.6483290195465088, "kl": 0.06438718736171722, "learning_rate": 1.5423943432993287e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1489 }, { "completion_length": 200.0, "epoch": 0.6631063640409435, "grad_norm": 0.019243627786636353, "kl": 0.0421595573425293, "learning_rate": 1.538807486288782e-06, "loss": 0.0017, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1490 }, { "completion_length": 143.6666717529297, "epoch": 0.6635514018691588, "grad_norm": 0.024592779576778412, "kl": 0.052703164517879486, "learning_rate": 1.5352229495974425e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1491 }, { "completion_length": 137.33334350585938, "epoch": 0.6639964396973743, "grad_norm": 0.02066526561975479, "kl": 0.06004131957888603, "learning_rate": 1.5316407418783835e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1492 }, { "completion_length": 144.6666717529297, "epoch": 0.6644414775255897, "grad_norm": 0.8184164762496948, "kl": 0.05917416140437126, "learning_rate": 1.528060871779058e-06, "loss": 0.0024, "reward": 0.33416664600372314, "reward_std": 0.064808689057827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 1493 }, { "completion_length": 166.33334350585938, "epoch": 0.664886515353805, "grad_norm": 0.5798844695091248, "kl": 0.06790559738874435, "learning_rate": 1.5244833479412717e-06, "loss": 0.0027, "reward": 0.21583333611488342, "reward_std": 0.39232656359672546, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21583333611488342, "step": 1494 }, { "completion_length": 163.83334350585938, "epoch": 0.6653315531820204, "grad_norm": 0.0290197916328907, "kl": 0.05775400996208191, "learning_rate": 1.5209081790011704e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1495 }, { "completion_length": 127.66667175292969, "epoch": 0.6657765910102359, "grad_norm": 0.0205070823431015, "kl": 0.060368895530700684, "learning_rate": 1.5173353735892139e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 1496 }, { "completion_length": 156.83334350585938, "epoch": 0.6662216288384513, "grad_norm": 0.6863259673118591, "kl": 0.049120690673589706, "learning_rate": 1.5137649403301551e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1497 }, { "completion_length": 188.0, "epoch": 0.6666666666666666, "grad_norm": 0.6774608492851257, "kl": 0.04528352618217468, "learning_rate": 1.5101968878430229e-06, "loss": 0.0018, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1498 }, { "completion_length": 187.6666717529297, "epoch": 0.6671117044948821, "grad_norm": 0.7849301695823669, "kl": 0.09143184125423431, "learning_rate": 1.5066312247410974e-06, "loss": 0.0037, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 1499 }, { "completion_length": 147.0, "epoch": 0.6675567423230975, "grad_norm": 0.8740885853767395, "kl": 0.06229601427912712, "learning_rate": 1.5030679596318904e-06, "loss": 0.0025, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 1500 } ], "logging_steps": 1, "max_steps": 2247, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }