{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.44503782821539833, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 196.6666717529297, "epoch": 0.0004450378282153983, "grad_norm": 0.7149407267570496, "kl": 0.00043882918544113636, "learning_rate": 2.2222222222222224e-08, "loss": 0.0, "reward": -0.312666654586792, "reward_std": 0.39005160331726074, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.312666654586792, "step": 1 }, { "completion_length": 179.0, "epoch": 0.0008900756564307966, "grad_norm": 1.2034916877746582, "kl": 0.00043298103264532983, "learning_rate": 4.444444444444445e-08, "loss": 0.0, "reward": -0.2953333556652069, "reward_std": 0.3479785621166229, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2953333556652069, "step": 2 }, { "completion_length": 200.0, "epoch": 0.0013351134846461949, "grad_norm": 0.00107863440643996, "kl": 0.0003771593910641968, "learning_rate": 6.666666666666668e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 3 }, { "completion_length": 200.0, "epoch": 0.0017801513128615932, "grad_norm": 0.7310553789138794, "kl": 0.00044770282693207264, "learning_rate": 8.88888888888889e-08, "loss": 0.0, "reward": -0.085999995470047, "reward_std": 0.3784494996070862, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.085999995470047, "step": 4 }, { "completion_length": 174.83334350585938, "epoch": 0.0022251891410769915, "grad_norm": 0.861896276473999, "kl": 0.00042549317004159093, "learning_rate": 1.1111111111111112e-07, "loss": 0.0, "reward": -0.09316667169332504, "reward_std": 0.2892870306968689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09316667169332504, "step": 5 }, { "completion_length": 200.0, "epoch": 0.0026702269692923898, "grad_norm": 0.0017526125302538276, "kl": 0.0004182992852292955, "learning_rate": 1.3333333333333336e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 6 }, { "completion_length": 166.5, "epoch": 0.003115264797507788, "grad_norm": 0.9585131406784058, "kl": 0.00047177166561596096, "learning_rate": 1.5555555555555556e-07, "loss": 0.0, "reward": -0.33766669034957886, "reward_std": 0.2008279412984848, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3376666307449341, "step": 7 }, { "completion_length": 200.0, "epoch": 0.0035603026257231864, "grad_norm": 0.6275889873504639, "kl": 0.0004587940056808293, "learning_rate": 1.777777777777778e-07, "loss": 0.0, "reward": -0.4321666657924652, "reward_std": 0.2741382122039795, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4321666657924652, "step": 8 }, { "completion_length": 152.5, "epoch": 0.004005340453938585, "grad_norm": 0.7820535898208618, "kl": 0.0003722615947481245, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": -0.24449998140335083, "reward_std": 0.2453835904598236, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24449998140335083, "step": 9 }, { "completion_length": 200.0, "epoch": 0.004450378282153983, "grad_norm": 0.6632036566734314, "kl": 0.00043898209696635604, "learning_rate": 2.2222222222222224e-07, "loss": 0.0, "reward": -0.006833334919065237, "reward_std": 0.3229244351387024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.006833334919065237, "step": 10 }, { "completion_length": 200.0, "epoch": 0.004895416110369382, "grad_norm": 0.7768407464027405, "kl": 0.00042121915612369776, "learning_rate": 2.444444444444445e-07, "loss": 0.0, "reward": -0.5506666898727417, "reward_std": 0.06373277306556702, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5506666898727417, "step": 11 }, { "completion_length": 199.6666717529297, "epoch": 0.0053404539385847796, "grad_norm": 0.7278463244438171, "kl": 0.0004068611597176641, "learning_rate": 2.666666666666667e-07, "loss": 0.0, "reward": -0.2864999771118164, "reward_std": 0.334197998046875, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2864999771118164, "step": 12 }, { "completion_length": 200.0, "epoch": 0.005785491766800178, "grad_norm": 0.6500892043113708, "kl": 0.0003313750494271517, "learning_rate": 2.888888888888889e-07, "loss": 0.0, "reward": -0.32199999690055847, "reward_std": 0.34629470109939575, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32199999690055847, "step": 13 }, { "completion_length": 200.0, "epoch": 0.006230529595015576, "grad_norm": 0.7193189859390259, "kl": 0.0003894752007909119, "learning_rate": 3.111111111111111e-07, "loss": 0.0, "reward": -0.08683334290981293, "reward_std": 0.32831722497940063, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08683334290981293, "step": 14 }, { "completion_length": 200.0, "epoch": 0.006675567423230975, "grad_norm": 0.6580405235290527, "kl": 0.00035931920865550637, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "reward": 0.006666670553386211, "reward_std": 0.28985628485679626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.006666670553386211, "step": 15 }, { "completion_length": 200.0, "epoch": 0.007120605251446373, "grad_norm": 0.7829955220222473, "kl": 0.0004430452245287597, "learning_rate": 3.555555555555556e-07, "loss": 0.0, "reward": -0.4845000207424164, "reward_std": 0.30278095602989197, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4845000207424164, "step": 16 }, { "completion_length": 199.6666717529297, "epoch": 0.0075656430796617715, "grad_norm": 0.7301868796348572, "kl": 0.0004354792181402445, "learning_rate": 3.777777777777778e-07, "loss": 0.0, "reward": -0.5509999990463257, "reward_std": 0.09809383004903793, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5509999990463257, "step": 17 }, { "completion_length": 195.33334350585938, "epoch": 0.00801068090787717, "grad_norm": 0.627457320690155, "kl": 0.0004219269612804055, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": -0.382999986410141, "reward_std": 0.258131742477417, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.382999986410141, "step": 18 }, { "completion_length": 198.5, "epoch": 0.008455718736092568, "grad_norm": 0.6126062273979187, "kl": 0.00037288008024916053, "learning_rate": 4.2222222222222226e-07, "loss": 0.0, "reward": -0.5198333263397217, "reward_std": 0.06735106557607651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5198333859443665, "step": 19 }, { "completion_length": 200.0, "epoch": 0.008900756564307966, "grad_norm": 0.7298072576522827, "kl": 0.00042569750803522766, "learning_rate": 4.444444444444445e-07, "loss": 0.0, "reward": -0.12316666543483734, "reward_std": 0.38789814710617065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12316666543483734, "step": 20 }, { "completion_length": 200.0, "epoch": 0.009345794392523364, "grad_norm": 0.7508992552757263, "kl": 0.0003784544242080301, "learning_rate": 4.666666666666667e-07, "loss": 0.0, "reward": 0.007999996654689312, "reward_std": 0.2865903079509735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007999996654689312, "step": 21 }, { "completion_length": 200.0, "epoch": 0.009790832220738763, "grad_norm": 0.7774270176887512, "kl": 0.0004591545439325273, "learning_rate": 4.88888888888889e-07, "loss": 0.0, "reward": -0.12583334743976593, "reward_std": 0.3894747495651245, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12583334743976593, "step": 22 }, { "completion_length": 178.6666717529297, "epoch": 0.010235870048954161, "grad_norm": 0.9822273254394531, "kl": 0.0003580343909561634, "learning_rate": 5.111111111111112e-07, "loss": 0.0, "reward": -0.40816670656204224, "reward_std": 0.2941179871559143, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.40816670656204224, "step": 23 }, { "completion_length": 195.6666717529297, "epoch": 0.010680907877169559, "grad_norm": 0.7280099987983704, "kl": 0.0003752989578060806, "learning_rate": 5.333333333333335e-07, "loss": 0.0, "reward": -0.5036666393280029, "reward_std": 0.06944254040718079, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5036666393280029, "step": 24 }, { "completion_length": 194.1666717529297, "epoch": 0.011125945705384957, "grad_norm": 0.7669874429702759, "kl": 0.0003995794686488807, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "reward": -0.1613333374261856, "reward_std": 0.3338997960090637, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1613333374261856, "step": 25 }, { "completion_length": 200.0, "epoch": 0.011570983533600357, "grad_norm": 0.0017349894624203444, "kl": 0.00044584478018805385, "learning_rate": 5.777777777777778e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 26 }, { "completion_length": 200.0, "epoch": 0.012016021361815754, "grad_norm": 0.7146498560905457, "kl": 0.0004711821093223989, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.004333337303251028, "reward_std": 0.2955717444419861, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.004333337303251028, "step": 27 }, { "completion_length": 194.0, "epoch": 0.012461059190031152, "grad_norm": 0.7583353519439697, "kl": 0.0003412925580050796, "learning_rate": 6.222222222222223e-07, "loss": 0.0, "reward": -0.32066667079925537, "reward_std": 0.34830141067504883, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32066667079925537, "step": 28 }, { "completion_length": 200.0, "epoch": 0.01290609701824655, "grad_norm": 0.6495766639709473, "kl": 0.0003407001495361328, "learning_rate": 6.444444444444445e-07, "loss": 0.0, "reward": 0.013833334669470787, "reward_std": 0.2723016142845154, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013833334669470787, "step": 29 }, { "completion_length": 200.0, "epoch": 0.01335113484646195, "grad_norm": 0.6830177307128906, "kl": 0.00040841297595761716, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "reward": -0.38466668128967285, "reward_std": 0.3953356444835663, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.38466668128967285, "step": 30 }, { "completion_length": 197.0, "epoch": 0.013796172674677348, "grad_norm": 0.6479209661483765, "kl": 0.00035427496186457574, "learning_rate": 6.88888888888889e-07, "loss": 0.0, "reward": -0.41850003600120544, "reward_std": 0.27061542868614197, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.41850003600120544, "step": 31 }, { "completion_length": 200.0, "epoch": 0.014241210502892745, "grad_norm": 0.001505751977674663, "kl": 0.00040443878970108926, "learning_rate": 7.111111111111112e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 32 }, { "completion_length": 194.83334350585938, "epoch": 0.014686248331108143, "grad_norm": 0.8729995489120483, "kl": 0.00045898579992353916, "learning_rate": 7.333333333333334e-07, "loss": 0.0, "reward": -0.4713333547115326, "reward_std": 0.0908794030547142, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4713333547115326, "step": 33 }, { "completion_length": 200.0, "epoch": 0.015131286159323543, "grad_norm": 0.83029705286026, "kl": 0.0004323194734752178, "learning_rate": 7.555555555555556e-07, "loss": 0.0, "reward": -0.3368333578109741, "reward_std": 0.35846197605133057, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33683332800865173, "step": 34 }, { "completion_length": 200.0, "epoch": 0.01557632398753894, "grad_norm": 0.678088903427124, "kl": 0.0004401813494041562, "learning_rate": 7.777777777777779e-07, "loss": 0.0, "reward": -0.22800001502037048, "reward_std": 0.38724154233932495, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22800001502037048, "step": 35 }, { "completion_length": 200.0, "epoch": 0.01602136181575434, "grad_norm": 0.7426117658615112, "kl": 0.0003528599627315998, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": -0.19583332538604736, "reward_std": 0.3525362014770508, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19583332538604736, "step": 36 }, { "completion_length": 200.0, "epoch": 0.016466399643969738, "grad_norm": 0.0014730676775798202, "kl": 0.000418979674577713, "learning_rate": 8.222222222222223e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 37 }, { "completion_length": 200.0, "epoch": 0.016911437472185136, "grad_norm": 0.6773275136947632, "kl": 0.0003312948392704129, "learning_rate": 8.444444444444445e-07, "loss": 0.0, "reward": 0.016833335161209106, "reward_std": 0.2649531364440918, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016833335161209106, "step": 38 }, { "completion_length": 200.0, "epoch": 0.017356475300400534, "grad_norm": 0.4805554449558258, "kl": 0.00037392970989458263, "learning_rate": 8.666666666666668e-07, "loss": 0.0, "reward": -0.4266667068004608, "reward_std": 0.2752167582511902, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4266667068004608, "step": 39 }, { "completion_length": 188.33334350585938, "epoch": 0.017801513128615932, "grad_norm": 0.8252691626548767, "kl": 0.00034736632369458675, "learning_rate": 8.88888888888889e-07, "loss": 0.0, "reward": -0.43516668677330017, "reward_std": 0.10999348759651184, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43516668677330017, "step": 40 }, { "completion_length": 195.83334350585938, "epoch": 0.01824655095683133, "grad_norm": 0.6390711069107056, "kl": 0.00039921089773997664, "learning_rate": 9.111111111111113e-07, "loss": 0.0, "reward": -0.44216665625572205, "reward_std": 0.1345755159854889, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44216665625572205, "step": 41 }, { "completion_length": 200.0, "epoch": 0.018691588785046728, "grad_norm": 0.584726095199585, "kl": 0.000410672917496413, "learning_rate": 9.333333333333334e-07, "loss": 0.0, "reward": -0.08250001072883606, "reward_std": 0.3218010365962982, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08250000327825546, "step": 42 }, { "completion_length": 200.0, "epoch": 0.01913662661326213, "grad_norm": 0.7584457993507385, "kl": 0.0004279993590898812, "learning_rate": 9.555555555555556e-07, "loss": 0.0, "reward": -0.1798333376646042, "reward_std": 0.4852597117424011, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1798333376646042, "step": 43 }, { "completion_length": 200.0, "epoch": 0.019581664441477527, "grad_norm": 0.6866025328636169, "kl": 0.0004159708914812654, "learning_rate": 9.77777777777778e-07, "loss": 0.0, "reward": 0.022333335131406784, "reward_std": 0.25148093700408936, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022333335131406784, "step": 44 }, { "completion_length": 200.0, "epoch": 0.020026702269692925, "grad_norm": 0.6524875164031982, "kl": 0.0004553616454359144, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": -0.30799999833106995, "reward_std": 0.33663925528526306, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.30799999833106995, "step": 45 }, { "completion_length": 197.1666717529297, "epoch": 0.020471740097908322, "grad_norm": 1.1161606311798096, "kl": 0.0004416520241647959, "learning_rate": 1.0222222222222223e-06, "loss": 0.0, "reward": -0.609333336353302, "reward_std": 0.2037426233291626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.609333336353302, "step": 46 }, { "completion_length": 200.0, "epoch": 0.02091677792612372, "grad_norm": 0.0014684926718473434, "kl": 0.0004343845648691058, "learning_rate": 1.0444444444444445e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 47 }, { "completion_length": 200.0, "epoch": 0.021361815754339118, "grad_norm": 0.7850491404533386, "kl": 0.0004720585129689425, "learning_rate": 1.066666666666667e-06, "loss": 0.0, "reward": -0.24550001323223114, "reward_std": 0.4167482554912567, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24550001323223114, "step": 48 }, { "completion_length": 174.0, "epoch": 0.021806853582554516, "grad_norm": 0.6916747093200684, "kl": 0.0003871291410177946, "learning_rate": 1.0888888888888889e-06, "loss": 0.0, "reward": -0.3348333239555359, "reward_std": 0.15262427926063538, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3348333537578583, "step": 49 }, { "completion_length": 200.0, "epoch": 0.022251891410769914, "grad_norm": 0.6415843367576599, "kl": 0.0004480450297705829, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "reward": -0.5586666464805603, "reward_std": 0.04939500615000725, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5586667060852051, "step": 50 }, { "completion_length": 200.0, "epoch": 0.022696929238985315, "grad_norm": 0.7905615568161011, "kl": 0.00043076984002254903, "learning_rate": 1.1333333333333334e-06, "loss": 0.0, "reward": -0.5353333353996277, "reward_std": 0.01973491534590721, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5353333353996277, "step": 51 }, { "completion_length": 195.33334350585938, "epoch": 0.023141967067200713, "grad_norm": 0.6714709401130676, "kl": 0.000462042837170884, "learning_rate": 1.1555555555555556e-06, "loss": 0.0, "reward": -0.45649999380111694, "reward_std": 0.0799018144607544, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45650002360343933, "step": 52 }, { "completion_length": 200.0, "epoch": 0.02358700489541611, "grad_norm": 0.002091527683660388, "kl": 0.0004879847401753068, "learning_rate": 1.1777777777777778e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 53 }, { "completion_length": 200.0, "epoch": 0.02403204272363151, "grad_norm": 0.7136774659156799, "kl": 0.00044702840386889875, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.019333332777023315, "reward_std": 0.25882941484451294, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019333332777023315, "step": 54 }, { "completion_length": 191.6666717529297, "epoch": 0.024477080551846907, "grad_norm": 1.0719506740570068, "kl": 0.00041414215229451656, "learning_rate": 1.2222222222222223e-06, "loss": 0.0, "reward": 0.0806666687130928, "reward_std": 0.10859404504299164, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0806666687130928, "step": 55 }, { "completion_length": 200.0, "epoch": 0.024922118380062305, "grad_norm": 0.002473922213539481, "kl": 0.0005147390766069293, "learning_rate": 1.2444444444444445e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 56 }, { "completion_length": 200.0, "epoch": 0.025367156208277702, "grad_norm": 0.7581794857978821, "kl": 0.0004257292894180864, "learning_rate": 1.2666666666666669e-06, "loss": 0.0, "reward": -0.09583333134651184, "reward_std": 0.34220486879348755, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09583333134651184, "step": 57 }, { "completion_length": 200.0, "epoch": 0.0258121940364931, "grad_norm": 0.6923669576644897, "kl": 0.0004169998865108937, "learning_rate": 1.288888888888889e-06, "loss": 0.0, "reward": -0.3475000262260437, "reward_std": 0.3798324763774872, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3475000262260437, "step": 58 }, { "completion_length": 109.0, "epoch": 0.0262572318647085, "grad_norm": 1.0712668895721436, "kl": 0.00041128776501864195, "learning_rate": 1.3111111111111112e-06, "loss": 0.0, "reward": -0.06133333593606949, "reward_std": 0.20354819297790527, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06133333966135979, "step": 59 }, { "completion_length": 200.0, "epoch": 0.0267022696929239, "grad_norm": 0.6178426742553711, "kl": 0.0004559112712740898, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "reward": 0.023666664958000183, "reward_std": 0.2482149600982666, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023666664958000183, "step": 60 }, { "completion_length": 200.0, "epoch": 0.027147307521139297, "grad_norm": 0.6320557594299316, "kl": 0.0005056136287748814, "learning_rate": 1.3555555555555558e-06, "loss": 0.0, "reward": -0.4624999761581421, "reward_std": 0.29319530725479126, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4624999761581421, "step": 61 }, { "completion_length": 196.83334350585938, "epoch": 0.027592345349354695, "grad_norm": 0.7537997364997864, "kl": 0.0004290025099180639, "learning_rate": 1.377777777777778e-06, "loss": 0.0, "reward": -0.48366665840148926, "reward_std": 0.12520170211791992, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.48366665840148926, "step": 62 }, { "completion_length": 200.0, "epoch": 0.028037383177570093, "grad_norm": 0.0018171067349612713, "kl": 0.0004468559636734426, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 63 }, { "completion_length": 200.0, "epoch": 0.02848242100578549, "grad_norm": 0.6882118582725525, "kl": 0.0004662362043745816, "learning_rate": 1.4222222222222223e-06, "loss": 0.0, "reward": 0.01850000023841858, "reward_std": 0.26087066531181335, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01850000023841858, "step": 64 }, { "completion_length": 195.33334350585938, "epoch": 0.02892745883400089, "grad_norm": 0.7403289675712585, "kl": 0.00043322655255906284, "learning_rate": 1.4444444444444445e-06, "loss": 0.0, "reward": -0.47933337092399597, "reward_std": 0.09026111662387848, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.47933337092399597, "step": 65 }, { "completion_length": 200.0, "epoch": 0.029372496662216287, "grad_norm": 0.0017793107545003295, "kl": 0.00043216149788349867, "learning_rate": 1.4666666666666669e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 66 }, { "completion_length": 199.0, "epoch": 0.029817534490431688, "grad_norm": 0.6182725429534912, "kl": 0.00037733028875663877, "learning_rate": 1.4888888888888888e-06, "loss": 0.0, "reward": -0.07999999821186066, "reward_std": 0.32039913535118103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08000000566244125, "step": 67 }, { "completion_length": 130.6666717529297, "epoch": 0.030262572318647086, "grad_norm": 1.1008906364440918, "kl": 0.0003855052054859698, "learning_rate": 1.5111111111111112e-06, "loss": 0.0, "reward": -0.04516666755080223, "reward_std": 0.15019378066062927, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04516666755080223, "step": 68 }, { "completion_length": 200.0, "epoch": 0.030707610146862484, "grad_norm": 0.6984473466873169, "kl": 0.00044030696153640747, "learning_rate": 1.5333333333333334e-06, "loss": 0.0, "reward": -0.34150001406669617, "reward_std": 0.364574134349823, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34150001406669617, "step": 69 }, { "completion_length": 195.33334350585938, "epoch": 0.03115264797507788, "grad_norm": 0.6155555248260498, "kl": 0.00039145484333857894, "learning_rate": 1.5555555555555558e-06, "loss": 0.0, "reward": -0.3800000548362732, "reward_std": 0.2700370252132416, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3799999952316284, "step": 70 }, { "completion_length": 200.0, "epoch": 0.03159768580329328, "grad_norm": 0.5811487436294556, "kl": 0.0003899557632394135, "learning_rate": 1.5777777777777778e-06, "loss": 0.0, "reward": 0.014166663400828838, "reward_std": 0.2714851498603821, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.014166663400828838, "step": 71 }, { "completion_length": 200.0, "epoch": 0.03204272363150868, "grad_norm": 0.6491996049880981, "kl": 0.0004053341690450907, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 0.01066666841506958, "reward_std": 0.2800583243370056, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01066666841506958, "step": 72 }, { "completion_length": 192.6666717529297, "epoch": 0.032487761459724075, "grad_norm": 0.7621211409568787, "kl": 0.0005313883302733302, "learning_rate": 1.6222222222222223e-06, "loss": 0.0, "reward": -0.4951666593551636, "reward_std": 0.05715736746788025, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4951666593551636, "step": 73 }, { "completion_length": 200.0, "epoch": 0.032932799287939477, "grad_norm": 0.7010623216629028, "kl": 0.00042472081258893013, "learning_rate": 1.6444444444444447e-06, "loss": 0.0, "reward": -0.0989999994635582, "reward_std": 0.34915727376937866, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0989999994635582, "step": 74 }, { "completion_length": 200.0, "epoch": 0.03337783711615487, "grad_norm": 0.7583237886428833, "kl": 0.00046256266068667173, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": -0.09333333373069763, "reward_std": 0.33824294805526733, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09333333373069763, "step": 75 }, { "completion_length": 200.0, "epoch": 0.03382287494437027, "grad_norm": 0.7639651894569397, "kl": 0.00039581506280228496, "learning_rate": 1.688888888888889e-06, "loss": 0.0, "reward": -0.2276666760444641, "reward_std": 0.3896222710609436, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2276666760444641, "step": 76 }, { "completion_length": 164.1666717529297, "epoch": 0.03426791277258567, "grad_norm": 0.8515862822532654, "kl": 0.0004627097805496305, "learning_rate": 1.7111111111111112e-06, "loss": 0.0, "reward": -0.3383333683013916, "reward_std": 0.19050845503807068, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3383333683013916, "step": 77 }, { "completion_length": 200.0, "epoch": 0.03471295060080107, "grad_norm": 0.6962697505950928, "kl": 0.000421873846789822, "learning_rate": 1.7333333333333336e-06, "loss": 0.0, "reward": 0.013333330862224102, "reward_std": 0.2735263705253601, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013333330862224102, "step": 78 }, { "completion_length": 200.0, "epoch": 0.03515798842901647, "grad_norm": 0.6623450517654419, "kl": 0.0004432189743965864, "learning_rate": 1.7555555555555556e-06, "loss": 0.0, "reward": -0.0011666715145111084, "reward_std": 0.3090439736843109, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0011666715145111084, "step": 79 }, { "completion_length": 145.5, "epoch": 0.035603026257231864, "grad_norm": 0.9290313720703125, "kl": 0.0004976950585842133, "learning_rate": 1.777777777777778e-06, "loss": 0.0, "reward": -0.1913333386182785, "reward_std": 0.20369061827659607, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1913333535194397, "step": 80 }, { "completion_length": 200.0, "epoch": 0.036048064085447265, "grad_norm": 0.00226940237917006, "kl": 0.00044577824883162975, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 81 }, { "completion_length": 200.0, "epoch": 0.03649310191366266, "grad_norm": 0.7341117262840271, "kl": 0.0003961599140893668, "learning_rate": 1.8222222222222225e-06, "loss": 0.0, "reward": 0.012833337299525738, "reward_std": 0.27475112676620483, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012833337299525738, "step": 82 }, { "completion_length": 176.83334350585938, "epoch": 0.03693813974187806, "grad_norm": 1.1084420680999756, "kl": 0.0004944322863593698, "learning_rate": 1.8444444444444445e-06, "loss": 0.0, "reward": -0.4231666922569275, "reward_std": 0.38362035155296326, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4231666922569275, "step": 83 }, { "completion_length": 200.0, "epoch": 0.037383177570093455, "grad_norm": 0.5538046956062317, "kl": 0.0004035194288007915, "learning_rate": 1.8666666666666669e-06, "loss": 0.0, "reward": 0.020999997854232788, "reward_std": 0.2547469735145569, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020999997854232788, "step": 84 }, { "completion_length": 199.6666717529297, "epoch": 0.037828215398308856, "grad_norm": 0.6329994797706604, "kl": 0.00045469467295333743, "learning_rate": 1.888888888888889e-06, "loss": 0.0, "reward": -0.1913333386182785, "reward_std": 0.3470242917537689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1913333386182785, "step": 85 }, { "completion_length": 200.0, "epoch": 0.03827325322652426, "grad_norm": 0.7051041722297668, "kl": 0.00048318642075173557, "learning_rate": 1.9111111111111112e-06, "loss": 0.0, "reward": 0.012499998323619366, "reward_std": 0.27556759119033813, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012499998323619366, "step": 86 }, { "completion_length": 200.0, "epoch": 0.03871829105473965, "grad_norm": 0.9833978414535522, "kl": 0.0004773414693772793, "learning_rate": 1.9333333333333336e-06, "loss": 0.0, "reward": -0.37816667556762695, "reward_std": 0.35067784786224365, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.37816667556762695, "step": 87 }, { "completion_length": 200.0, "epoch": 0.039163328882955054, "grad_norm": 0.7608519196510315, "kl": 0.0004284613241907209, "learning_rate": 1.955555555555556e-06, "loss": 0.0, "reward": -0.4285000264644623, "reward_std": 0.27370041608810425, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4285000264644623, "step": 88 }, { "completion_length": 200.0, "epoch": 0.03960836671117045, "grad_norm": 0.8064699769020081, "kl": 0.00046465283958241343, "learning_rate": 1.977777777777778e-06, "loss": 0.0, "reward": -0.6141666769981384, "reward_std": 0.05563242360949516, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6141666769981384, "step": 89 }, { "completion_length": 200.0, "epoch": 0.04005340453938585, "grad_norm": 0.7693816423416138, "kl": 0.0004991068853996694, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": -0.21366667747497559, "reward_std": 0.42142030596733093, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21366667747497559, "step": 90 }, { "completion_length": 170.83334350585938, "epoch": 0.040498442367601244, "grad_norm": 0.8475862145423889, "kl": 0.0004466826212592423, "learning_rate": 2.0222222222222223e-06, "loss": 0.0, "reward": -0.25600001215934753, "reward_std": 0.3458733558654785, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25600001215934753, "step": 91 }, { "completion_length": 200.0, "epoch": 0.040943480195816645, "grad_norm": 0.001588360988534987, "kl": 0.00045517744729295373, "learning_rate": 2.0444444444444447e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 92 }, { "completion_length": 168.83334350585938, "epoch": 0.04138851802403204, "grad_norm": 0.7196416854858398, "kl": 0.0005219130543991923, "learning_rate": 2.0666666666666666e-06, "loss": 0.0, "reward": -0.28700000047683716, "reward_std": 0.23125484585762024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28700003027915955, "step": 93 }, { "completion_length": 200.0, "epoch": 0.04183355585224744, "grad_norm": 0.0016818700823932886, "kl": 0.0004399843164719641, "learning_rate": 2.088888888888889e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 94 }, { "completion_length": 187.33334350585938, "epoch": 0.04227859368046284, "grad_norm": 0.662451446056366, "kl": 0.0005664011696353555, "learning_rate": 2.1111111111111114e-06, "loss": 0.0, "reward": -0.07083334028720856, "reward_std": 0.4042125344276428, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07083334028720856, "step": 95 }, { "completion_length": 200.0, "epoch": 0.042723631508678236, "grad_norm": 0.8778610229492188, "kl": 0.0005331834545359015, "learning_rate": 2.133333333333334e-06, "loss": 0.0, "reward": 0.013499995693564415, "reward_std": 0.27311810851097107, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013499995693564415, "step": 96 }, { "completion_length": 200.0, "epoch": 0.04316866933689364, "grad_norm": 0.7499815821647644, "kl": 0.0005722627975046635, "learning_rate": 2.1555555555555558e-06, "loss": 0.0, "reward": -0.36016663908958435, "reward_std": 0.37692034244537354, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36016666889190674, "step": 97 }, { "completion_length": 199.6666717529297, "epoch": 0.04361370716510903, "grad_norm": 0.6992683410644531, "kl": 0.0006015513790771365, "learning_rate": 2.1777777777777777e-06, "loss": 0.0, "reward": -0.3303333520889282, "reward_std": 0.3531128168106079, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3303333520889282, "step": 98 }, { "completion_length": 200.0, "epoch": 0.044058744993324434, "grad_norm": 0.7034227848052979, "kl": 0.0005352528532966971, "learning_rate": 2.2e-06, "loss": 0.0, "reward": -0.5301666855812073, "reward_std": 0.03528975695371628, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5301666855812073, "step": 99 }, { "completion_length": 200.0, "epoch": 0.04450378282153983, "grad_norm": 0.7136642932891846, "kl": 0.0005618830909952521, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "reward": 0.023666664958000183, "reward_std": 0.2482149600982666, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023666664958000183, "step": 100 }, { "completion_length": 200.0, "epoch": 0.04494882064975523, "grad_norm": 0.9584755301475525, "kl": 0.0008303936338052154, "learning_rate": 2.2444444444444445e-06, "loss": 0.0, "reward": -0.34933334589004517, "reward_std": 0.369188129901886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34933334589004517, "step": 101 }, { "completion_length": 200.0, "epoch": 0.04539385847797063, "grad_norm": 0.810990035533905, "kl": 0.0007378787267953157, "learning_rate": 2.266666666666667e-06, "loss": 0.0, "reward": -0.5558333396911621, "reward_std": 0.02642286941409111, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5558333396911621, "step": 102 }, { "completion_length": 200.0, "epoch": 0.045838896306186025, "grad_norm": 0.8462039232254028, "kl": 0.0007519976934418082, "learning_rate": 2.2888888888888892e-06, "loss": 0.0, "reward": -0.5571666955947876, "reward_std": 0.042873844504356384, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5571666955947876, "step": 103 }, { "completion_length": 152.33334350585938, "epoch": 0.046283934134401426, "grad_norm": 0.8582832217216492, "kl": 0.0006616115570068359, "learning_rate": 2.311111111111111e-06, "loss": 0.0, "reward": -0.20516666769981384, "reward_std": 0.3065997064113617, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20516668260097504, "step": 104 }, { "completion_length": 200.0, "epoch": 0.04672897196261682, "grad_norm": 0.751700758934021, "kl": 0.0007129245204851031, "learning_rate": 2.3333333333333336e-06, "loss": 0.0, "reward": -0.24383334815502167, "reward_std": 0.40426644682884216, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24383334815502167, "step": 105 }, { "completion_length": 200.0, "epoch": 0.04717400979083222, "grad_norm": 0.7948376536369324, "kl": 0.0009101564064621925, "learning_rate": 2.3555555555555555e-06, "loss": 0.0, "reward": -0.26483333110809326, "reward_std": 0.44175583124160767, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26483333110809326, "step": 106 }, { "completion_length": 200.0, "epoch": 0.047619047619047616, "grad_norm": 0.6845387816429138, "kl": 0.0005481390398927033, "learning_rate": 2.377777777777778e-06, "loss": 0.0, "reward": -0.08950000256299973, "reward_std": 0.3333369195461273, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08950000256299973, "step": 107 }, { "completion_length": 174.33334350585938, "epoch": 0.04806408544726302, "grad_norm": 0.9041478633880615, "kl": 0.0010797386057674885, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": -0.3813333511352539, "reward_std": 0.1361376941204071, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3813333511352539, "step": 108 }, { "completion_length": 200.0, "epoch": 0.04850912327547842, "grad_norm": 0.8546884059906006, "kl": 0.0009791944175958633, "learning_rate": 2.4222222222222223e-06, "loss": 0.0, "reward": -0.17666666209697723, "reward_std": 0.3318732678890228, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17666666209697723, "step": 109 }, { "completion_length": 200.0, "epoch": 0.04895416110369381, "grad_norm": 0.002719539450481534, "kl": 0.0005873936461284757, "learning_rate": 2.4444444444444447e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 110 }, { "completion_length": 200.0, "epoch": 0.049399198931909215, "grad_norm": 0.7770993113517761, "kl": 0.0009914538823068142, "learning_rate": 2.466666666666667e-06, "loss": 0.0, "reward": -0.3736667037010193, "reward_std": 0.3873085379600525, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3736667037010193, "step": 111 }, { "completion_length": 200.0, "epoch": 0.04984423676012461, "grad_norm": 0.6850268244743347, "kl": 0.0007792222313582897, "learning_rate": 2.488888888888889e-06, "loss": 0.0, "reward": -0.5395000576972961, "reward_std": 0.025041967630386353, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5395000576972961, "step": 112 }, { "completion_length": 198.33334350585938, "epoch": 0.05028927458834001, "grad_norm": 0.7239099144935608, "kl": 0.001012115739285946, "learning_rate": 2.5111111111111114e-06, "loss": 0.0, "reward": -0.42133331298828125, "reward_std": 0.27339982986450195, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42133331298828125, "step": 113 }, { "completion_length": 200.0, "epoch": 0.050734312416555405, "grad_norm": 0.5771994590759277, "kl": 0.0015950507950037718, "learning_rate": 2.5333333333333338e-06, "loss": 0.0001, "reward": -0.621666669845581, "reward_std": 0.031443070620298386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.621666669845581, "step": 114 }, { "completion_length": 187.6666717529297, "epoch": 0.051179350244770806, "grad_norm": 0.8059555292129517, "kl": 0.0011016380740329623, "learning_rate": 2.5555555555555557e-06, "loss": 0.0, "reward": -0.31283336877822876, "reward_std": 0.25179630517959595, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31283336877822876, "step": 115 }, { "completion_length": 200.0, "epoch": 0.0516243880729862, "grad_norm": 0.7699070572853088, "kl": 0.000991647131741047, "learning_rate": 2.577777777777778e-06, "loss": 0.0, "reward": -0.4051666855812073, "reward_std": 0.26059579849243164, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4051666855812073, "step": 116 }, { "completion_length": 200.0, "epoch": 0.0520694259012016, "grad_norm": 0.6547927856445312, "kl": 0.0008666824433021247, "learning_rate": 2.6e-06, "loss": 0.0, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 117 }, { "completion_length": 198.1666717529297, "epoch": 0.052514463729417, "grad_norm": 0.8903563022613525, "kl": 0.0012721801176667213, "learning_rate": 2.6222222222222225e-06, "loss": 0.0001, "reward": -0.5, "reward_std": 0.10164842009544373, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5, "step": 118 }, { "completion_length": 200.0, "epoch": 0.0529595015576324, "grad_norm": 0.7130769491195679, "kl": 0.0009466246119700372, "learning_rate": 2.6444444444444444e-06, "loss": 0.0, "reward": -0.07766667008399963, "reward_std": 0.36592990159988403, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07766667008399963, "step": 119 }, { "completion_length": 200.0, "epoch": 0.0534045393858478, "grad_norm": 0.010055916383862495, "kl": 0.0016336208209395409, "learning_rate": 2.666666666666667e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 120 }, { "completion_length": 200.0, "epoch": 0.05384957721406319, "grad_norm": 0.7626153826713562, "kl": 0.0019612584728747606, "learning_rate": 2.6888888888888892e-06, "loss": 0.0001, "reward": -0.44099998474121094, "reward_std": 0.27870485186576843, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4410000443458557, "step": 121 }, { "completion_length": 200.0, "epoch": 0.054294615042278595, "grad_norm": 0.7897221446037292, "kl": 0.001940418384037912, "learning_rate": 2.7111111111111116e-06, "loss": 0.0001, "reward": -0.20816665887832642, "reward_std": 0.36510735750198364, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20816665887832642, "step": 122 }, { "completion_length": 200.0, "epoch": 0.05473965287049399, "grad_norm": 0.004020801745355129, "kl": 0.0008703676867298782, "learning_rate": 2.7333333333333336e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 123 }, { "completion_length": 200.0, "epoch": 0.05518469069870939, "grad_norm": 0.005421373061835766, "kl": 0.0008717196760699153, "learning_rate": 2.755555555555556e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 124 }, { "completion_length": 178.33334350585938, "epoch": 0.05562972852692479, "grad_norm": 0.8171259164810181, "kl": 0.0014178266283124685, "learning_rate": 2.7777777777777783e-06, "loss": 0.0001, "reward": -0.16316668689250946, "reward_std": 0.3424929082393646, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16316668689250946, "step": 125 }, { "completion_length": 200.0, "epoch": 0.056074766355140186, "grad_norm": 0.811120331287384, "kl": 0.00203678198158741, "learning_rate": 2.8000000000000003e-06, "loss": 0.0001, "reward": -0.4819999933242798, "reward_std": 0.3060758113861084, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4819999933242798, "step": 126 }, { "completion_length": 200.0, "epoch": 0.05651980418335559, "grad_norm": 0.004288077354431152, "kl": 0.00076559919398278, "learning_rate": 2.8222222222222223e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 127 }, { "completion_length": 200.0, "epoch": 0.05696484201157098, "grad_norm": 0.7662628293037415, "kl": 0.0013816291466355324, "learning_rate": 2.8444444444444446e-06, "loss": 0.0001, "reward": -0.4233333468437195, "reward_std": 0.2736119031906128, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4233333468437195, "step": 128 }, { "completion_length": 200.0, "epoch": 0.05740987983978638, "grad_norm": 0.7000979781150818, "kl": 0.0022010253742337227, "learning_rate": 2.866666666666667e-06, "loss": 0.0001, "reward": -0.43416666984558105, "reward_std": 0.2743176221847534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43416666984558105, "step": 129 }, { "completion_length": 189.5, "epoch": 0.05785491766800178, "grad_norm": 0.6776527762413025, "kl": 0.0026975106447935104, "learning_rate": 2.888888888888889e-06, "loss": 0.0001, "reward": -0.035999998450279236, "reward_std": 0.27313145995140076, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03599999472498894, "step": 130 }, { "completion_length": 200.0, "epoch": 0.05829995549621718, "grad_norm": 0.7056086659431458, "kl": 0.001560688717290759, "learning_rate": 2.9111111111111114e-06, "loss": 0.0001, "reward": -0.31833332777023315, "reward_std": 0.3454664647579193, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31833332777023315, "step": 131 }, { "completion_length": 172.5, "epoch": 0.05874499332443257, "grad_norm": 1.0837831497192383, "kl": 0.0017425650730729103, "learning_rate": 2.9333333333333338e-06, "loss": 0.0001, "reward": -0.3088333308696747, "reward_std": 0.3001648783683777, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3088333308696747, "step": 132 }, { "completion_length": 200.0, "epoch": 0.059190031152647975, "grad_norm": 0.7270302176475525, "kl": 0.0014150540810078382, "learning_rate": 2.955555555555556e-06, "loss": 0.0001, "reward": -0.53083336353302, "reward_std": 0.02674633078277111, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.53083336353302, "step": 133 }, { "completion_length": 181.5, "epoch": 0.059635068980863376, "grad_norm": 0.6710302233695984, "kl": 0.0028289398178458214, "learning_rate": 2.9777777777777777e-06, "loss": 0.0001, "reward": -0.15183335542678833, "reward_std": 0.3105810284614563, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15183335542678833, "step": 134 }, { "completion_length": 200.0, "epoch": 0.06008010680907877, "grad_norm": 0.8479624390602112, "kl": 0.0018316200003027916, "learning_rate": 3e-06, "loss": 0.0001, "reward": 0.02383333444595337, "reward_std": 0.24780671298503876, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02383333444595337, "step": 135 }, { "completion_length": 200.0, "epoch": 0.06052514463729417, "grad_norm": 0.8236755728721619, "kl": 0.002227437449619174, "learning_rate": 3.0222222222222225e-06, "loss": 0.0001, "reward": -0.3340000510215759, "reward_std": 0.35908883810043335, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3340000510215759, "step": 136 }, { "completion_length": 200.0, "epoch": 0.060970182465509566, "grad_norm": 0.011570720933377743, "kl": 0.0022140974178910255, "learning_rate": 3.044444444444445e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 137 }, { "completion_length": 200.0, "epoch": 0.06141522029372497, "grad_norm": 0.7572616338729858, "kl": 0.003316813614219427, "learning_rate": 3.066666666666667e-06, "loss": 0.0001, "reward": -0.18433332443237305, "reward_std": 0.340387225151062, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18433332443237305, "step": 138 }, { "completion_length": 200.0, "epoch": 0.06186025812194036, "grad_norm": 0.00493138050660491, "kl": 0.0011617927812039852, "learning_rate": 3.088888888888889e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 139 }, { "completion_length": 200.0, "epoch": 0.06230529595015576, "grad_norm": 0.6994268894195557, "kl": 0.003730988595634699, "learning_rate": 3.1111111111111116e-06, "loss": 0.0001, "reward": -0.10350000858306885, "reward_std": 0.35402247309684753, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10350000858306885, "step": 140 }, { "completion_length": 188.6666717529297, "epoch": 0.06275033377837116, "grad_norm": 0.7533054351806641, "kl": 0.007150403223931789, "learning_rate": 3.133333333333334e-06, "loss": 0.0003, "reward": -0.21416668593883514, "reward_std": 0.36423152685165405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21416667103767395, "step": 141 }, { "completion_length": 200.0, "epoch": 0.06319537160658656, "grad_norm": 0.853130042552948, "kl": 0.0037856251001358032, "learning_rate": 3.1555555555555555e-06, "loss": 0.0002, "reward": -0.45483335852622986, "reward_std": 0.23466865718364716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45483335852622986, "step": 142 }, { "completion_length": 184.5, "epoch": 0.06364040943480195, "grad_norm": 0.6948954463005066, "kl": 0.003319720271974802, "learning_rate": 3.177777777777778e-06, "loss": 0.0001, "reward": -0.33100003004074097, "reward_std": 0.12057197093963623, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33100003004074097, "step": 143 }, { "completion_length": 200.0, "epoch": 0.06408544726301736, "grad_norm": 0.6649389266967773, "kl": 0.004403320141136646, "learning_rate": 3.2000000000000003e-06, "loss": 0.0002, "reward": -0.33233335614204407, "reward_std": 0.3574983775615692, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3323333263397217, "step": 144 }, { "completion_length": 200.0, "epoch": 0.06453048509123276, "grad_norm": 0.008188747800886631, "kl": 0.0016727236798033118, "learning_rate": 3.2222222222222227e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 145 }, { "completion_length": 197.0, "epoch": 0.06497552291944815, "grad_norm": 0.7850582599639893, "kl": 0.004564880859106779, "learning_rate": 3.2444444444444446e-06, "loss": 0.0002, "reward": -0.18733333051204681, "reward_std": 0.36280059814453125, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18733333051204681, "step": 146 }, { "completion_length": 200.0, "epoch": 0.06542056074766354, "grad_norm": 0.8551555275917053, "kl": 0.005018829368054867, "learning_rate": 3.266666666666667e-06, "loss": 0.0002, "reward": -0.22599999606609344, "reward_std": 0.3867609202861786, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22599999606609344, "step": 147 }, { "completion_length": 200.0, "epoch": 0.06586559857587895, "grad_norm": 0.011043811216950417, "kl": 0.0033470559865236282, "learning_rate": 3.2888888888888894e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 148 }, { "completion_length": 187.6666717529297, "epoch": 0.06631063640409435, "grad_norm": 0.7791420221328735, "kl": 0.007582447957247496, "learning_rate": 3.3111111111111118e-06, "loss": 0.0003, "reward": -0.24450001120567322, "reward_std": 0.23151740431785583, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24450001120567322, "step": 149 }, { "completion_length": 200.0, "epoch": 0.06675567423230974, "grad_norm": 0.7961557507514954, "kl": 0.005281232297420502, "learning_rate": 3.3333333333333333e-06, "loss": 0.0002, "reward": 0.025166666135191917, "reward_std": 0.2445407211780548, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025166666135191917, "step": 150 }, { "completion_length": 191.1666717529297, "epoch": 0.06720071206052515, "grad_norm": 0.7541393041610718, "kl": 0.008166976273059845, "learning_rate": 3.3555555555555557e-06, "loss": 0.0003, "reward": -0.26350000500679016, "reward_std": 0.3062899112701416, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26350000500679016, "step": 151 }, { "completion_length": 138.5, "epoch": 0.06764574988874054, "grad_norm": 1.3180961608886719, "kl": 0.0019082196522504091, "learning_rate": 3.377777777777778e-06, "loss": 0.0001, "reward": -0.19033333659172058, "reward_std": 0.2720842957496643, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19033333659172058, "step": 152 }, { "completion_length": 200.0, "epoch": 0.06809078771695594, "grad_norm": 0.014968675561249256, "kl": 0.0043890452943742275, "learning_rate": 3.4000000000000005e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 153 }, { "completion_length": 200.0, "epoch": 0.06853582554517133, "grad_norm": 1.0916509628295898, "kl": 0.006353219039738178, "learning_rate": 3.4222222222222224e-06, "loss": 0.0003, "reward": -0.014833331108093262, "reward_std": 0.4068416953086853, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.014833331108093262, "step": 154 }, { "completion_length": 200.0, "epoch": 0.06898086337338674, "grad_norm": 0.7793182730674744, "kl": 0.00863957405090332, "learning_rate": 3.444444444444445e-06, "loss": 0.0003, "reward": -0.10183333605527878, "reward_std": 0.4037016034126282, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10183333605527878, "step": 155 }, { "completion_length": 200.0, "epoch": 0.06942590120160214, "grad_norm": 0.012789091095328331, "kl": 0.0035248759668320417, "learning_rate": 3.4666666666666672e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 156 }, { "completion_length": 199.83334350585938, "epoch": 0.06987093902981753, "grad_norm": 0.8552259802818298, "kl": 0.007797297090291977, "learning_rate": 3.4888888888888896e-06, "loss": 0.0003, "reward": -0.32883334159851074, "reward_std": 0.3524216115474701, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32883334159851074, "step": 157 }, { "completion_length": 200.0, "epoch": 0.07031597685803294, "grad_norm": 0.7439639568328857, "kl": 0.008432665839791298, "learning_rate": 3.511111111111111e-06, "loss": 0.0003, "reward": -0.3968333601951599, "reward_std": 0.25783050060272217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3968333601951599, "step": 158 }, { "completion_length": 200.0, "epoch": 0.07076101468624833, "grad_norm": 0.8293086290359497, "kl": 0.015361580066382885, "learning_rate": 3.5333333333333335e-06, "loss": 0.0006, "reward": -0.09950000047683716, "reward_std": 0.34847769141197205, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09950000047683716, "step": 159 }, { "completion_length": 200.0, "epoch": 0.07120605251446373, "grad_norm": 0.02794010564684868, "kl": 0.011896876618266106, "learning_rate": 3.555555555555556e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 160 }, { "completion_length": 168.5, "epoch": 0.07165109034267912, "grad_norm": 1.1416176557540894, "kl": 0.01758456416428089, "learning_rate": 3.577777777777778e-06, "loss": 0.0007, "reward": -0.21533334255218506, "reward_std": 0.36748045682907104, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21533334255218506, "step": 161 }, { "completion_length": 200.0, "epoch": 0.07209612817089453, "grad_norm": 0.8033695220947266, "kl": 0.016339905560016632, "learning_rate": 3.6000000000000003e-06, "loss": 0.0007, "reward": 0.022333335131406784, "reward_std": 0.25148093700408936, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022333335131406784, "step": 162 }, { "completion_length": 200.0, "epoch": 0.07254116599910992, "grad_norm": 0.013050896115601063, "kl": 0.005038695875555277, "learning_rate": 3.6222222222222226e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 163 }, { "completion_length": 200.0, "epoch": 0.07298620382732532, "grad_norm": 0.8717074990272522, "kl": 0.01940556988120079, "learning_rate": 3.644444444444445e-06, "loss": 0.0008, "reward": -0.44333335757255554, "reward_std": 0.27983972430229187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44333332777023315, "step": 164 }, { "completion_length": 200.0, "epoch": 0.07343124165554073, "grad_norm": 0.7769482135772705, "kl": 0.008692685514688492, "learning_rate": 3.6666666666666666e-06, "loss": 0.0003, "reward": -0.1525000035762787, "reward_std": 0.4397725462913513, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1525000035762787, "step": 165 }, { "completion_length": 200.0, "epoch": 0.07387627948375612, "grad_norm": 0.009532845579087734, "kl": 0.003257167525589466, "learning_rate": 3.688888888888889e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 166 }, { "completion_length": 200.0, "epoch": 0.07432131731197152, "grad_norm": 0.766445517539978, "kl": 0.01774817332625389, "learning_rate": 3.7111111111111113e-06, "loss": 0.0007, "reward": 0.01966666243970394, "reward_std": 0.25801295042037964, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01966666243970394, "step": 167 }, { "completion_length": 192.83334350585938, "epoch": 0.07476635514018691, "grad_norm": 0.7344871759414673, "kl": 0.019676849246025085, "learning_rate": 3.7333333333333337e-06, "loss": 0.0008, "reward": -0.3711666762828827, "reward_std": 0.27725324034690857, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3711666762828827, "step": 168 }, { "completion_length": 200.0, "epoch": 0.07521139296840232, "grad_norm": 0.708225429058075, "kl": 0.01603546366095543, "learning_rate": 3.7555555555555557e-06, "loss": 0.0006, "reward": -0.007833331823348999, "reward_std": 0.32537388801574707, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007833331823348999, "step": 169 }, { "completion_length": 200.0, "epoch": 0.07565643079661771, "grad_norm": 0.012713441625237465, "kl": 0.005252152215689421, "learning_rate": 3.777777777777778e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 170 }, { "completion_length": 200.0, "epoch": 0.07610146862483311, "grad_norm": 0.013440362177789211, "kl": 0.006324666552245617, "learning_rate": 3.8000000000000005e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 171 }, { "completion_length": 200.0, "epoch": 0.07654650645304852, "grad_norm": 0.018537871539592743, "kl": 0.0071646831929683685, "learning_rate": 3.8222222222222224e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 172 }, { "completion_length": 200.0, "epoch": 0.07699154428126391, "grad_norm": 0.01842048391699791, "kl": 0.008270082995295525, "learning_rate": 3.844444444444445e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 173 }, { "completion_length": 200.0, "epoch": 0.0774365821094793, "grad_norm": 0.7180718779563904, "kl": 0.023133087903261185, "learning_rate": 3.866666666666667e-06, "loss": 0.0009, "reward": -0.32500001788139343, "reward_std": 0.35587525367736816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32500001788139343, "step": 174 }, { "completion_length": 200.0, "epoch": 0.0778816199376947, "grad_norm": 0.8401187062263489, "kl": 0.013387969695031643, "learning_rate": 3.88888888888889e-06, "loss": 0.0005, "reward": 0.029499998316168785, "reward_std": 0.2993685007095337, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.029499998316168785, "step": 175 }, { "completion_length": 200.0, "epoch": 0.07832665776591011, "grad_norm": 0.0299467034637928, "kl": 0.015433305874466896, "learning_rate": 3.911111111111112e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 176 }, { "completion_length": 200.0, "epoch": 0.0787716955941255, "grad_norm": 0.01781369000673294, "kl": 0.00665412237867713, "learning_rate": 3.9333333333333335e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 177 }, { "completion_length": 200.0, "epoch": 0.0792167334223409, "grad_norm": 0.029804501682519913, "kl": 0.015158621594309807, "learning_rate": 3.955555555555556e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 178 }, { "completion_length": 200.0, "epoch": 0.0796617712505563, "grad_norm": 0.020057376474142075, "kl": 0.005600334610790014, "learning_rate": 3.977777777777778e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 179 }, { "completion_length": 200.0, "epoch": 0.0801068090787717, "grad_norm": 0.6821267008781433, "kl": 0.01735800690948963, "learning_rate": 4.000000000000001e-06, "loss": 0.0007, "reward": -0.2288333624601364, "reward_std": 0.3883763253688812, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2288333624601364, "step": 180 }, { "completion_length": 200.0, "epoch": 0.08055184690698709, "grad_norm": 0.8316364884376526, "kl": 0.03854802995920181, "learning_rate": 4.022222222222222e-06, "loss": 0.0015, "reward": -0.11349999904632568, "reward_std": 0.37036940455436707, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11349999904632568, "step": 181 }, { "completion_length": 200.0, "epoch": 0.08099688473520249, "grad_norm": 0.8583576679229736, "kl": 0.014207671396434307, "learning_rate": 4.044444444444445e-06, "loss": 0.0006, "reward": 0.002499997615814209, "reward_std": 0.30006250739097595, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.002499997615814209, "step": 182 }, { "completion_length": 200.0, "epoch": 0.0814419225634179, "grad_norm": 0.0153994495049119, "kl": 0.007102414965629578, "learning_rate": 4.066666666666667e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 183 }, { "completion_length": 200.0, "epoch": 0.08188696039163329, "grad_norm": 0.8327325582504272, "kl": 0.022632954642176628, "learning_rate": 4.088888888888889e-06, "loss": 0.0009, "reward": 0.014666667208075523, "reward_std": 0.27026039361953735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.014666667208075523, "step": 184 }, { "completion_length": 200.0, "epoch": 0.08233199821984868, "grad_norm": 0.010787020437419415, "kl": 0.004412154667079449, "learning_rate": 4.111111111111111e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 185 }, { "completion_length": 200.0, "epoch": 0.08277703604806408, "grad_norm": 0.029312826693058014, "kl": 0.01270313374698162, "learning_rate": 4.133333333333333e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 186 }, { "completion_length": 175.1666717529297, "epoch": 0.08322207387627949, "grad_norm": 0.8411778211593628, "kl": 0.028145212680101395, "learning_rate": 4.155555555555556e-06, "loss": 0.0011, "reward": -0.01066666841506958, "reward_std": 0.17476919293403625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.01066666841506958, "step": 187 }, { "completion_length": 200.0, "epoch": 0.08366711170449488, "grad_norm": 0.023354342207312584, "kl": 0.00851635355502367, "learning_rate": 4.177777777777778e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 188 }, { "completion_length": 192.5, "epoch": 0.08411214953271028, "grad_norm": 0.8554210662841797, "kl": 0.01744459569454193, "learning_rate": 4.2000000000000004e-06, "loss": 0.0007, "reward": -0.04233333468437195, "reward_std": 0.2719578444957733, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04233333468437195, "step": 189 }, { "completion_length": 177.33334350585938, "epoch": 0.08455718736092568, "grad_norm": 0.8024124503135681, "kl": 0.015208413824439049, "learning_rate": 4.222222222222223e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 190 }, { "completion_length": 200.0, "epoch": 0.08500222518914108, "grad_norm": 0.6993169188499451, "kl": 0.014287407509982586, "learning_rate": 4.244444444444445e-06, "loss": 0.0006, "reward": -0.1301666796207428, "reward_std": 0.39577287435531616, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1301666796207428, "step": 191 }, { "completion_length": 200.0, "epoch": 0.08544726301735647, "grad_norm": 0.6678306460380554, "kl": 0.013930333778262138, "learning_rate": 4.266666666666668e-06, "loss": 0.0006, "reward": 0.043666668236255646, "reward_std": 0.26521819829940796, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.043666668236255646, "step": 192 }, { "completion_length": 200.0, "epoch": 0.08589230084557187, "grad_norm": 0.028331147506833076, "kl": 0.017971333116292953, "learning_rate": 4.288888888888889e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 193 }, { "completion_length": 200.0, "epoch": 0.08633733867378728, "grad_norm": 0.027453351765871048, "kl": 0.009431993588805199, "learning_rate": 4.3111111111111115e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 194 }, { "completion_length": 200.0, "epoch": 0.08678237650200267, "grad_norm": 0.773445725440979, "kl": 0.023339729756116867, "learning_rate": 4.333333333333334e-06, "loss": 0.0009, "reward": -0.10633333772420883, "reward_std": 0.35838064551353455, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10633333772420883, "step": 195 }, { "completion_length": 200.0, "epoch": 0.08722741433021806, "grad_norm": 0.014650012366473675, "kl": 0.007890871725976467, "learning_rate": 4.3555555555555555e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 196 }, { "completion_length": 200.0, "epoch": 0.08767245215843347, "grad_norm": 0.025273794308304787, "kl": 0.01200440526008606, "learning_rate": 4.377777777777778e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 197 }, { "completion_length": 192.33334350585938, "epoch": 0.08811748998664887, "grad_norm": 0.7459567785263062, "kl": 0.01451034378260374, "learning_rate": 4.4e-06, "loss": 0.0006, "reward": -0.26500001549720764, "reward_std": 0.33120569586753845, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26500001549720764, "step": 198 }, { "completion_length": 200.0, "epoch": 0.08856252781486426, "grad_norm": 0.010312313213944435, "kl": 0.0038611218333244324, "learning_rate": 4.422222222222223e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 199 }, { "completion_length": 200.0, "epoch": 0.08900756564307966, "grad_norm": 0.6166565418243408, "kl": 0.009738167747855186, "learning_rate": 4.444444444444444e-06, "loss": 0.0004, "reward": 0.0035000047646462917, "reward_std": 0.2976129949092865, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0035000047646462917, "step": 200 }, { "completion_length": 200.0, "epoch": 0.08945260347129506, "grad_norm": 0.008906065486371517, "kl": 0.0037507950328290462, "learning_rate": 4.4666666666666665e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 201 }, { "completion_length": 198.1666717529297, "epoch": 0.08989764129951046, "grad_norm": 0.9748014807701111, "kl": 0.009199577383697033, "learning_rate": 4.488888888888889e-06, "loss": 0.0004, "reward": 0.04983333498239517, "reward_std": 0.18411998450756073, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04983333498239517, "step": 202 }, { "completion_length": 200.0, "epoch": 0.09034267912772585, "grad_norm": 0.04424808546900749, "kl": 0.01891401596367359, "learning_rate": 4.511111111111111e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 203 }, { "completion_length": 200.0, "epoch": 0.09078771695594126, "grad_norm": 0.008351677097380161, "kl": 0.003573081223294139, "learning_rate": 4.533333333333334e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 204 }, { "completion_length": 200.0, "epoch": 0.09123275478415666, "grad_norm": 0.7445893883705139, "kl": 0.014485888183116913, "learning_rate": 4.555555555555556e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 205 }, { "completion_length": 200.0, "epoch": 0.09167779261237205, "grad_norm": 0.6814647316932678, "kl": 0.017133373767137527, "learning_rate": 4.5777777777777785e-06, "loss": 0.0007, "reward": -0.08250000327825546, "reward_std": 0.37661266326904297, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08250000327825546, "step": 206 }, { "completion_length": 200.0, "epoch": 0.09212283044058744, "grad_norm": 0.013401404023170471, "kl": 0.0045086476020514965, "learning_rate": 4.600000000000001e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 207 }, { "completion_length": 200.0, "epoch": 0.09256786826880285, "grad_norm": 0.015258646570146084, "kl": 0.005985723342746496, "learning_rate": 4.622222222222222e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 208 }, { "completion_length": 200.0, "epoch": 0.09301290609701825, "grad_norm": 0.017956389114260674, "kl": 0.0076329647563397884, "learning_rate": 4.644444444444445e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 209 }, { "completion_length": 200.0, "epoch": 0.09345794392523364, "grad_norm": 0.6878367066383362, "kl": 0.00822862982749939, "learning_rate": 4.666666666666667e-06, "loss": 0.0003, "reward": 0.025333335623145103, "reward_std": 0.24413248896598816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025333335623145103, "step": 210 }, { "completion_length": 185.6666717529297, "epoch": 0.09390298175344905, "grad_norm": 0.7083204388618469, "kl": 0.008970928378403187, "learning_rate": 4.6888888888888895e-06, "loss": 0.0004, "reward": -0.10883334279060364, "reward_std": 0.31478020548820496, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10883334279060364, "step": 211 }, { "completion_length": 200.0, "epoch": 0.09434801958166444, "grad_norm": 0.010259282775223255, "kl": 0.005621683783829212, "learning_rate": 4.711111111111111e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 212 }, { "completion_length": 200.0, "epoch": 0.09479305740987984, "grad_norm": 0.63719642162323, "kl": 0.017996463924646378, "learning_rate": 4.7333333333333335e-06, "loss": 0.0007, "reward": -0.2148333489894867, "reward_std": 0.37843701243400574, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2148333489894867, "step": 213 }, { "completion_length": 200.0, "epoch": 0.09523809523809523, "grad_norm": 0.7535067796707153, "kl": 0.009188219904899597, "learning_rate": 4.755555555555556e-06, "loss": 0.0004, "reward": 0.025833334773778915, "reward_std": 0.24290774762630463, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025833334773778915, "step": 214 }, { "completion_length": 200.0, "epoch": 0.09568313306631064, "grad_norm": 0.03754749149084091, "kl": 0.023878587409853935, "learning_rate": 4.777777777777778e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 215 }, { "completion_length": 195.1666717529297, "epoch": 0.09612817089452604, "grad_norm": 0.7495411038398743, "kl": 0.020265337079763412, "learning_rate": 4.800000000000001e-06, "loss": 0.0008, "reward": -0.1301666796207428, "reward_std": 0.2998015582561493, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1301666796207428, "step": 216 }, { "completion_length": 200.0, "epoch": 0.09657320872274143, "grad_norm": 0.8328439593315125, "kl": 0.012733696028590202, "learning_rate": 4.822222222222222e-06, "loss": 0.0005, "reward": -0.10983332991600037, "reward_std": 0.36434730887413025, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10983332991600037, "step": 217 }, { "completion_length": 200.0, "epoch": 0.09701824655095684, "grad_norm": 0.014833502471446991, "kl": 0.005706641357392073, "learning_rate": 4.8444444444444446e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 218 }, { "completion_length": 200.0, "epoch": 0.09746328437917223, "grad_norm": 0.01487264595925808, "kl": 0.007111959159374237, "learning_rate": 4.866666666666667e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 219 }, { "completion_length": 191.5, "epoch": 0.09790832220738763, "grad_norm": 0.6546903252601624, "kl": 0.03114478290081024, "learning_rate": 4.888888888888889e-06, "loss": 0.0012, "reward": -0.02316666767001152, "reward_std": 0.3049527406692505, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02316666767001152, "step": 220 }, { "completion_length": 200.0, "epoch": 0.09835336003560302, "grad_norm": 0.012690722942352295, "kl": 0.004840874578803778, "learning_rate": 4.911111111111112e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 221 }, { "completion_length": 200.0, "epoch": 0.09879839786381843, "grad_norm": 0.8420057892799377, "kl": 0.01995580643415451, "learning_rate": 4.933333333333334e-06, "loss": 0.0008, "reward": -0.11483334004878998, "reward_std": 0.37201637029647827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11483334004878998, "step": 222 }, { "completion_length": 162.6666717529297, "epoch": 0.09924343569203382, "grad_norm": 0.7063876390457153, "kl": 0.018741153180599213, "learning_rate": 4.9555555555555565e-06, "loss": 0.0007, "reward": -0.15850001573562622, "reward_std": 0.2223670333623886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15850001573562622, "step": 223 }, { "completion_length": 142.1666717529297, "epoch": 0.09968847352024922, "grad_norm": 1.3013415336608887, "kl": 0.0186506025493145, "learning_rate": 4.977777777777778e-06, "loss": 0.0007, "reward": 0.08866667002439499, "reward_std": 0.11559354513883591, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08866667002439499, "step": 224 }, { "completion_length": 200.0, "epoch": 0.10013351134846461, "grad_norm": 0.8148112297058105, "kl": 0.014559872448444366, "learning_rate": 5e-06, "loss": 0.0006, "reward": 0.011333337053656578, "reward_std": 0.27842533588409424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011333337053656578, "step": 225 }, { "completion_length": 200.0, "epoch": 0.10057854917668002, "grad_norm": 0.7223935127258301, "kl": 0.01846488006412983, "learning_rate": 4.999996982499377e-06, "loss": 0.0007, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 226 }, { "completion_length": 176.83334350585938, "epoch": 0.10102358700489542, "grad_norm": 0.855940580368042, "kl": 0.023898562416434288, "learning_rate": 4.9999879300047904e-06, "loss": 0.001, "reward": -0.06350000202655792, "reward_std": 0.24804334342479706, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06350000202655792, "step": 227 }, { "completion_length": 178.1666717529297, "epoch": 0.10146862483311081, "grad_norm": 1.3559764623641968, "kl": 0.012699100188910961, "learning_rate": 4.999972842538094e-06, "loss": 0.0005, "reward": 0.0833333358168602, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 228 }, { "completion_length": 200.0, "epoch": 0.10191366266132622, "grad_norm": 0.11357201635837555, "kl": 0.02485606074333191, "learning_rate": 4.999951720135707e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 229 }, { "completion_length": 193.5, "epoch": 0.10235870048954161, "grad_norm": 1.0450315475463867, "kl": 0.022413700819015503, "learning_rate": 4.999924562848623e-06, "loss": 0.0009, "reward": -0.27666670083999634, "reward_std": 0.32778817415237427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.27666670083999634, "step": 230 }, { "completion_length": 200.0, "epoch": 0.102803738317757, "grad_norm": 0.022118445485830307, "kl": 0.011586411856114864, "learning_rate": 4.999891370742395e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 231 }, { "completion_length": 200.0, "epoch": 0.1032487761459724, "grad_norm": 0.032451968640089035, "kl": 0.01942615956068039, "learning_rate": 4.999852143897152e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 232 }, { "completion_length": 187.33334350585938, "epoch": 0.10369381397418781, "grad_norm": 1.0973916053771973, "kl": 0.028928395360708237, "learning_rate": 4.999806882407586e-06, "loss": 0.0012, "reward": 0.09416666626930237, "reward_std": 0.07552593946456909, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09416666626930237, "step": 233 }, { "completion_length": 200.0, "epoch": 0.1041388518024032, "grad_norm": 0.8492021560668945, "kl": 0.020361032336950302, "learning_rate": 4.9997555863829584e-06, "loss": 0.0008, "reward": -0.0728333443403244, "reward_std": 0.3074478507041931, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0728333443403244, "step": 234 }, { "completion_length": 200.0, "epoch": 0.1045838896306186, "grad_norm": 0.7689980268478394, "kl": 0.010467594489455223, "learning_rate": 4.999698255947099e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 235 }, { "completion_length": 200.0, "epoch": 0.105028927458834, "grad_norm": 0.022514864802360535, "kl": 0.016714762896299362, "learning_rate": 4.9996348912384025e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 236 }, { "completion_length": 200.0, "epoch": 0.1054739652870494, "grad_norm": 0.7497798800468445, "kl": 0.035260215401649475, "learning_rate": 4.999565492409831e-06, "loss": 0.0014, "reward": -0.07899999618530273, "reward_std": 0.3160455822944641, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07899999618530273, "step": 237 }, { "completion_length": 180.33334350585938, "epoch": 0.1059190031152648, "grad_norm": 1.1552839279174805, "kl": 0.020859047770500183, "learning_rate": 4.999490059628914e-06, "loss": 0.0008, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 238 }, { "completion_length": 200.0, "epoch": 0.10636404094348019, "grad_norm": 0.014450768008828163, "kl": 0.00946769304573536, "learning_rate": 4.999408593077747e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 239 }, { "completion_length": 200.0, "epoch": 0.1068090787716956, "grad_norm": 0.6490687131881714, "kl": 0.013119819574058056, "learning_rate": 4.999321092952989e-06, "loss": 0.0005, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 240 }, { "completion_length": 198.1666717529297, "epoch": 0.10725411659991099, "grad_norm": 0.8787567615509033, "kl": 0.027725404128432274, "learning_rate": 4.999227559465865e-06, "loss": 0.0011, "reward": -0.060833342373371124, "reward_std": 0.2964027523994446, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.060833342373371124, "step": 241 }, { "completion_length": 200.0, "epoch": 0.10769915442812639, "grad_norm": 0.6240781545639038, "kl": 0.011513415724039078, "learning_rate": 4.999127992842167e-06, "loss": 0.0005, "reward": 0.019833337515592575, "reward_std": 0.2576046586036682, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019833337515592575, "step": 242 }, { "completion_length": 200.0, "epoch": 0.1081441922563418, "grad_norm": 0.021726641803979874, "kl": 0.011281192302703857, "learning_rate": 4.999022393322246e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 243 }, { "completion_length": 200.0, "epoch": 0.10858923008455719, "grad_norm": 0.01203103642910719, "kl": 0.007502372842282057, "learning_rate": 4.998910761161022e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 244 }, { "completion_length": 200.0, "epoch": 0.10903426791277258, "grad_norm": 0.03352230042219162, "kl": 0.015283550135791302, "learning_rate": 4.998793096627973e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 245 }, { "completion_length": 180.0, "epoch": 0.10947930574098798, "grad_norm": 0.6931204199790955, "kl": 0.025455031543970108, "learning_rate": 4.998669400007142e-06, "loss": 0.001, "reward": -0.0663333386182785, "reward_std": 0.25824224948883057, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0663333386182785, "step": 246 }, { "completion_length": 179.33334350585938, "epoch": 0.10992434356920339, "grad_norm": 0.7052227258682251, "kl": 0.01645379140973091, "learning_rate": 4.998539671597134e-06, "loss": 0.0007, "reward": -0.1290000081062317, "reward_std": 0.243107408285141, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1290000081062317, "step": 247 }, { "completion_length": 200.0, "epoch": 0.11036938139741878, "grad_norm": 0.6620118021965027, "kl": 0.01992572657763958, "learning_rate": 4.998403911711112e-06, "loss": 0.0008, "reward": 0.02500000223517418, "reward_std": 0.24494896829128265, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02500000223517418, "step": 248 }, { "completion_length": 200.0, "epoch": 0.11081441922563418, "grad_norm": 0.012199520133435726, "kl": 0.01109787356108427, "learning_rate": 4.9982621206768e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 249 }, { "completion_length": 200.0, "epoch": 0.11125945705384958, "grad_norm": 0.6646403074264526, "kl": 0.017368197441101074, "learning_rate": 4.998114298836483e-06, "loss": 0.0007, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 250 }, { "completion_length": 200.0, "epoch": 0.11170449488206498, "grad_norm": 0.64476478099823, "kl": 0.018091298639774323, "learning_rate": 4.997960446547002e-06, "loss": 0.0007, "reward": 0.007833331823348999, "reward_std": 0.28699856996536255, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007833331823348999, "step": 251 }, { "completion_length": 200.0, "epoch": 0.11214953271028037, "grad_norm": 0.01579858362674713, "kl": 0.009715499356389046, "learning_rate": 4.997800564179758e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 252 }, { "completion_length": 200.0, "epoch": 0.11259457053849577, "grad_norm": 0.7200601696968079, "kl": 0.02384258806705475, "learning_rate": 4.997634652120704e-06, "loss": 0.001, "reward": 0.016166668385267258, "reward_std": 0.2665861248970032, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016166668385267258, "step": 253 }, { "completion_length": 193.33334350585938, "epoch": 0.11303960836671118, "grad_norm": 0.7668142318725586, "kl": 0.029656527563929558, "learning_rate": 4.997462710770356e-06, "loss": 0.0012, "reward": -0.13450001180171967, "reward_std": 0.29981911182403564, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13450001180171967, "step": 254 }, { "completion_length": 200.0, "epoch": 0.11348464619492657, "grad_norm": 0.02032412961125374, "kl": 0.014308085665106773, "learning_rate": 4.997284740543776e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 255 }, { "completion_length": 198.1666717529297, "epoch": 0.11392968402314196, "grad_norm": 0.025216558948159218, "kl": 0.022948317229747772, "learning_rate": 4.997100741870587e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 256 }, { "completion_length": 200.0, "epoch": 0.11437472185135737, "grad_norm": 0.0239882729947567, "kl": 0.01751275360584259, "learning_rate": 4.996910715194963e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 257 }, { "completion_length": 200.0, "epoch": 0.11481975967957277, "grad_norm": 0.028994852676987648, "kl": 0.01953146606683731, "learning_rate": 4.9967146609756254e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 258 }, { "completion_length": 200.0, "epoch": 0.11526479750778816, "grad_norm": 0.7920787334442139, "kl": 0.024792861193418503, "learning_rate": 4.996512579685851e-06, "loss": 0.001, "reward": 0.034333329647779465, "reward_std": 0.28770241141319275, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.034333329647779465, "step": 259 }, { "completion_length": 200.0, "epoch": 0.11570983533600356, "grad_norm": 0.027261994779109955, "kl": 0.01596745476126671, "learning_rate": 4.996304471813464e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 260 }, { "completion_length": 199.6666717529297, "epoch": 0.11615487316421896, "grad_norm": 0.7512991428375244, "kl": 0.020810922607779503, "learning_rate": 4.996090337860836e-06, "loss": 0.0008, "reward": 0.03583333641290665, "reward_std": 0.218412846326828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03583333641290665, "step": 261 }, { "completion_length": 200.0, "epoch": 0.11659991099243436, "grad_norm": 0.01981634460389614, "kl": 0.02145499363541603, "learning_rate": 4.995870178344888e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 262 }, { "completion_length": 200.0, "epoch": 0.11704494882064975, "grad_norm": 0.015385876409709454, "kl": 0.010222600772976875, "learning_rate": 4.995643993797084e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 263 }, { "completion_length": 200.0, "epoch": 0.11748998664886515, "grad_norm": 0.010453257709741592, "kl": 0.004751277156174183, "learning_rate": 4.995411784763434e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 264 }, { "completion_length": 200.0, "epoch": 0.11793502447708056, "grad_norm": 0.02353510819375515, "kl": 0.01597435772418976, "learning_rate": 4.995173551804491e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 265 }, { "completion_length": 200.0, "epoch": 0.11838006230529595, "grad_norm": 0.6973554491996765, "kl": 0.013702675700187683, "learning_rate": 4.9949292954953486e-06, "loss": 0.0005, "reward": 0.020166665315628052, "reward_std": 0.2567881941795349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020166665315628052, "step": 266 }, { "completion_length": 200.0, "epoch": 0.11882510013351134, "grad_norm": 0.7307512760162354, "kl": 0.02334902063012123, "learning_rate": 4.994679016425642e-06, "loss": 0.0009, "reward": 0.007999996654689312, "reward_std": 0.2865903079509735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007999996654689312, "step": 267 }, { "completion_length": 196.5, "epoch": 0.11927013796172675, "grad_norm": 0.7821613550186157, "kl": 0.023126548156142235, "learning_rate": 4.994422715199546e-06, "loss": 0.0009, "reward": -0.35100001096725464, "reward_std": 0.24427853524684906, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35100001096725464, "step": 268 }, { "completion_length": 200.0, "epoch": 0.11971517578994215, "grad_norm": 0.014924556948244572, "kl": 0.010245325975120068, "learning_rate": 4.99416039243577e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 269 }, { "completion_length": 200.0, "epoch": 0.12016021361815754, "grad_norm": 0.011302834376692772, "kl": 0.0074605681002140045, "learning_rate": 4.993892048767563e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 270 }, { "completion_length": 196.83334350585938, "epoch": 0.12060525144637294, "grad_norm": 0.7507510185241699, "kl": 0.01992712914943695, "learning_rate": 4.993617684842707e-06, "loss": 0.0008, "reward": 0.0560000017285347, "reward_std": 0.1690147966146469, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0560000017285347, "step": 271 }, { "completion_length": 186.33334350585938, "epoch": 0.12105028927458834, "grad_norm": 0.642690896987915, "kl": 0.03827090188860893, "learning_rate": 4.9933373013235156e-06, "loss": 0.0015, "reward": -0.01666666567325592, "reward_std": 0.2657710909843445, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.01666666567325592, "step": 272 }, { "completion_length": 200.0, "epoch": 0.12149532710280374, "grad_norm": 0.016800519078969955, "kl": 0.007222745567560196, "learning_rate": 4.993050898886833e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 273 }, { "completion_length": 200.0, "epoch": 0.12194036493101913, "grad_norm": 0.758976936340332, "kl": 0.018932368606328964, "learning_rate": 4.992758478224039e-06, "loss": 0.0008, "reward": -0.24283334612846375, "reward_std": 0.41019290685653687, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24283334612846375, "step": 274 }, { "completion_length": 200.0, "epoch": 0.12238540275923454, "grad_norm": 0.020172713324427605, "kl": 0.012656650505959988, "learning_rate": 4.992460040041034e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 275 }, { "completion_length": 200.0, "epoch": 0.12283044058744993, "grad_norm": 0.01721290498971939, "kl": 0.008657376281917095, "learning_rate": 4.992155585058248e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 276 }, { "completion_length": 186.83334350585938, "epoch": 0.12327547841566533, "grad_norm": 0.6902337074279785, "kl": 0.06522956490516663, "learning_rate": 4.991845114010638e-06, "loss": 0.0026, "reward": -0.09683333337306976, "reward_std": 0.308468759059906, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09683333337306976, "step": 277 }, { "completion_length": 200.0, "epoch": 0.12372051624388072, "grad_norm": 0.013619703240692616, "kl": 0.009151134639978409, "learning_rate": 4.99152862764768e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 278 }, { "completion_length": 200.0, "epoch": 0.12416555407209613, "grad_norm": 0.01365516148507595, "kl": 0.005762772168964148, "learning_rate": 4.99120612673337e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 279 }, { "completion_length": 200.0, "epoch": 0.12461059190031153, "grad_norm": 0.03844517469406128, "kl": 0.009146707132458687, "learning_rate": 4.990877612046228e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 280 }, { "completion_length": 200.0, "epoch": 0.12505562972852693, "grad_norm": 0.010963845066726208, "kl": 0.004802503623068333, "learning_rate": 4.9905430843792886e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 281 }, { "completion_length": 200.0, "epoch": 0.12550066755674233, "grad_norm": 0.018975911661982536, "kl": 0.00843195803463459, "learning_rate": 4.9902025445401e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 282 }, { "completion_length": 191.33334350585938, "epoch": 0.12594570538495772, "grad_norm": 0.9152283072471619, "kl": 0.01359584927558899, "learning_rate": 4.989855993350728e-06, "loss": 0.0005, "reward": 0.08433333039283752, "reward_std": 0.09961257874965668, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08433333039283752, "step": 283 }, { "completion_length": 199.33334350585938, "epoch": 0.12639074321317312, "grad_norm": 0.6327641606330872, "kl": 0.011363311670720577, "learning_rate": 4.989503431647744e-06, "loss": 0.0005, "reward": 0.05283333361148834, "reward_std": 0.17677150666713715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05283333361148834, "step": 284 }, { "completion_length": 200.0, "epoch": 0.1268357810413885, "grad_norm": 0.015086417086422443, "kl": 0.006470312364399433, "learning_rate": 4.9891448602822355e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 285 }, { "completion_length": 200.0, "epoch": 0.1272808188696039, "grad_norm": 0.6536492705345154, "kl": 0.015819404274225235, "learning_rate": 4.988780280119792e-06, "loss": 0.0006, "reward": -0.09083332866430283, "reward_std": 0.33507877588272095, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09083332866430283, "step": 286 }, { "completion_length": 200.0, "epoch": 0.1277258566978193, "grad_norm": 0.008064402267336845, "kl": 0.002744730096310377, "learning_rate": 4.988409692040511e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 287 }, { "completion_length": 180.83334350585938, "epoch": 0.12817089452603472, "grad_norm": 0.819420337677002, "kl": 0.06618692725896835, "learning_rate": 4.988033096938991e-06, "loss": 0.0026, "reward": 0.021166665479540825, "reward_std": 0.24596862494945526, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021166665479540825, "step": 288 }, { "completion_length": 200.0, "epoch": 0.12861593235425012, "grad_norm": 0.017804304137825966, "kl": 0.0107121542096138, "learning_rate": 4.9876504957243345e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 289 }, { "completion_length": 200.0, "epoch": 0.1290609701824655, "grad_norm": 0.009461048990488052, "kl": 0.005491574760526419, "learning_rate": 4.987261889320141e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 290 }, { "completion_length": 200.0, "epoch": 0.1295060080106809, "grad_norm": 0.020793907344341278, "kl": 0.006585664115846157, "learning_rate": 4.986867278664505e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 291 }, { "completion_length": 200.0, "epoch": 0.1299510458388963, "grad_norm": 0.01623818278312683, "kl": 0.014776560477912426, "learning_rate": 4.9864666647100176e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 292 }, { "completion_length": 200.0, "epoch": 0.1303960836671117, "grad_norm": 0.015048404224216938, "kl": 0.006943022832274437, "learning_rate": 4.986060048423761e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 293 }, { "completion_length": 200.0, "epoch": 0.1308411214953271, "grad_norm": 0.10332320630550385, "kl": 0.018736328929662704, "learning_rate": 4.985647430787308e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 294 }, { "completion_length": 200.0, "epoch": 0.1312861593235425, "grad_norm": 0.7041745185852051, "kl": 0.013492944650352001, "learning_rate": 4.985228812796717e-06, "loss": 0.0005, "reward": 0.0403333380818367, "reward_std": 0.20739012956619263, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0403333380818367, "step": 295 }, { "completion_length": 180.33334350585938, "epoch": 0.1317311971517579, "grad_norm": 1.2074172496795654, "kl": 0.017207711935043335, "learning_rate": 4.984804195462532e-06, "loss": 0.0007, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 296 }, { "completion_length": 200.0, "epoch": 0.1321762349799733, "grad_norm": 0.007344384212046862, "kl": 0.0033924030140042305, "learning_rate": 4.984373579809778e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 297 }, { "completion_length": 200.0, "epoch": 0.1326212728081887, "grad_norm": 0.6581267714500427, "kl": 0.004817100241780281, "learning_rate": 4.983936966877964e-06, "loss": 0.0002, "reward": 0.02033333107829094, "reward_std": 0.2563799321651459, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02033333107829094, "step": 298 }, { "completion_length": 200.0, "epoch": 0.1330663106364041, "grad_norm": 0.6918825507164001, "kl": 0.008517519570887089, "learning_rate": 4.983494357721074e-06, "loss": 0.0003, "reward": -0.029333334416151047, "reward_std": 0.3780378997325897, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.029333334416151047, "step": 299 }, { "completion_length": 200.0, "epoch": 0.13351134846461948, "grad_norm": 0.6355785727500916, "kl": 0.007732154801487923, "learning_rate": 4.983045753407564e-06, "loss": 0.0003, "reward": -0.12200000137090683, "reward_std": 0.3381112515926361, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12200000137090683, "step": 300 }, { "completion_length": 200.0, "epoch": 0.13395638629283488, "grad_norm": 0.00788823515176773, "kl": 0.004913420882076025, "learning_rate": 4.982591155020367e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 301 }, { "completion_length": 200.0, "epoch": 0.1344014241210503, "grad_norm": 0.019300375133752823, "kl": 0.0115005848929286, "learning_rate": 4.9821305636568835e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 302 }, { "completion_length": 200.0, "epoch": 0.1348464619492657, "grad_norm": 0.5956396460533142, "kl": 0.014617225155234337, "learning_rate": 4.981663980428981e-06, "loss": 0.0006, "reward": 0.019499998539686203, "reward_std": 0.2584211826324463, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019499998539686203, "step": 303 }, { "completion_length": 200.0, "epoch": 0.1352914997774811, "grad_norm": 0.013532106764614582, "kl": 0.007755351718515158, "learning_rate": 4.981191406462991e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 304 }, { "completion_length": 200.0, "epoch": 0.13573653760569648, "grad_norm": 0.6766030788421631, "kl": 0.019166380167007446, "learning_rate": 4.9807128428997085e-06, "loss": 0.0008, "reward": 0.020166665315628052, "reward_std": 0.2567881941795349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020166665315628052, "step": 305 }, { "completion_length": 200.0, "epoch": 0.13618157543391188, "grad_norm": 0.0559423454105854, "kl": 0.023686787113547325, "learning_rate": 4.980228290894386e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 306 }, { "completion_length": 200.0, "epoch": 0.13662661326212727, "grad_norm": 0.03602616861462593, "kl": 0.021832283586263657, "learning_rate": 4.979737751616732e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 307 }, { "completion_length": 200.0, "epoch": 0.13707165109034267, "grad_norm": 1.0004466772079468, "kl": 0.01577741652727127, "learning_rate": 4.979241226250908e-06, "loss": 0.0006, "reward": 0.010333329439163208, "reward_std": 0.2808748185634613, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.010333329439163208, "step": 308 }, { "completion_length": 200.0, "epoch": 0.1375166889185581, "grad_norm": 0.01729634776711464, "kl": 0.007570373825728893, "learning_rate": 4.9787387159955265e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 309 }, { "completion_length": 200.0, "epoch": 0.13796172674677348, "grad_norm": 0.0166346225887537, "kl": 0.007312558591365814, "learning_rate": 4.978230222063649e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 310 }, { "completion_length": 200.0, "epoch": 0.13840676457498888, "grad_norm": 0.018261654302477837, "kl": 0.009910644963383675, "learning_rate": 4.9777157456827785e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 311 }, { "completion_length": 200.0, "epoch": 0.13885180240320427, "grad_norm": 0.02585846185684204, "kl": 0.011723216623067856, "learning_rate": 4.977195288094863e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 312 }, { "completion_length": 200.0, "epoch": 0.13929684023141967, "grad_norm": 0.02551482431590557, "kl": 0.011289785616099834, "learning_rate": 4.976668850556284e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 313 }, { "completion_length": 200.0, "epoch": 0.13974187805963506, "grad_norm": 0.022060496732592583, "kl": 0.01366241555660963, "learning_rate": 4.976136434337866e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 314 }, { "completion_length": 200.0, "epoch": 0.14018691588785046, "grad_norm": 0.5639723539352417, "kl": 0.006856884807348251, "learning_rate": 4.97559804072486e-06, "loss": 0.0003, "reward": 0.022499999031424522, "reward_std": 0.2510727047920227, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022499999031424522, "step": 315 }, { "completion_length": 196.0, "epoch": 0.14063195371606588, "grad_norm": 0.8882031440734863, "kl": 0.012949245050549507, "learning_rate": 4.9750536710169485e-06, "loss": 0.0005, "reward": 0.061000000685453415, "reward_std": 0.1567673534154892, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.061000000685453415, "step": 316 }, { "completion_length": 200.0, "epoch": 0.14107699154428127, "grad_norm": 0.738691508769989, "kl": 0.018215883523225784, "learning_rate": 4.97450332652824e-06, "loss": 0.0007, "reward": -0.19816666841506958, "reward_std": 0.35409173369407654, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19816666841506958, "step": 317 }, { "completion_length": 200.0, "epoch": 0.14152202937249667, "grad_norm": 0.008259186521172523, "kl": 0.004001545254141092, "learning_rate": 4.973947008587268e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 318 }, { "completion_length": 183.0, "epoch": 0.14196706720071206, "grad_norm": 0.8659139275550842, "kl": 0.02181245945394039, "learning_rate": 4.973384718536982e-06, "loss": 0.0009, "reward": 0.03533333167433739, "reward_std": 0.18040475249290466, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03533333167433739, "step": 319 }, { "completion_length": 200.0, "epoch": 0.14241210502892745, "grad_norm": 0.013459905050694942, "kl": 0.008506972342729568, "learning_rate": 4.972816457734752e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 320 }, { "completion_length": 200.0, "epoch": 0.14285714285714285, "grad_norm": 0.6455613970756531, "kl": 0.04512510821223259, "learning_rate": 4.972242227552358e-06, "loss": 0.0018, "reward": 0.008666664361953735, "reward_std": 0.2849573493003845, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.008666664361953735, "step": 321 }, { "completion_length": 182.33334350585938, "epoch": 0.14330218068535824, "grad_norm": 0.7501981854438782, "kl": 0.01761593297123909, "learning_rate": 4.971662029375995e-06, "loss": 0.0007, "reward": -0.17916667461395264, "reward_std": 0.331107497215271, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17916667461395264, "step": 322 }, { "completion_length": 200.0, "epoch": 0.14374721851357367, "grad_norm": 0.008645527996122837, "kl": 0.003186706220731139, "learning_rate": 4.97107586460626e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 323 }, { "completion_length": 175.83334350585938, "epoch": 0.14419225634178906, "grad_norm": 0.7112619876861572, "kl": 0.030575290322303772, "learning_rate": 4.970483734658154e-06, "loss": 0.0012, "reward": 0.08583333343267441, "reward_std": 0.12314936518669128, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08583333343267441, "step": 324 }, { "completion_length": 200.0, "epoch": 0.14463729417000445, "grad_norm": 0.035192981362342834, "kl": 0.008340008556842804, "learning_rate": 4.969885640961081e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 325 }, { "completion_length": 190.0, "epoch": 0.14508233199821985, "grad_norm": 0.7466467618942261, "kl": 0.0285557322204113, "learning_rate": 4.969281584958838e-06, "loss": 0.0011, "reward": 0.015166670083999634, "reward_std": 0.2690356373786926, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.015166670083999634, "step": 326 }, { "completion_length": 191.1666717529297, "epoch": 0.14552736982643524, "grad_norm": 0.802043616771698, "kl": 0.025279924273490906, "learning_rate": 4.968671568109617e-06, "loss": 0.001, "reward": 0.0898333415389061, "reward_std": 0.21741704642772675, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0898333415389061, "step": 327 }, { "completion_length": 167.5, "epoch": 0.14597240765465064, "grad_norm": 0.812122642993927, "kl": 0.01720350608229637, "learning_rate": 4.968055591885999e-06, "loss": 0.0007, "reward": -0.06016666814684868, "reward_std": 0.24665558338165283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06016666814684868, "step": 328 }, { "completion_length": 200.0, "epoch": 0.14641744548286603, "grad_norm": 0.7395609021186829, "kl": 0.05103464424610138, "learning_rate": 4.967433657774952e-06, "loss": 0.002, "reward": -0.07100000232458115, "reward_std": 0.3038104772567749, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07100000232458115, "step": 329 }, { "completion_length": 179.83334350585938, "epoch": 0.14686248331108145, "grad_norm": 0.7787045836448669, "kl": 0.04071980342268944, "learning_rate": 4.9668057672778225e-06, "loss": 0.0016, "reward": 0.047833334654569626, "reward_std": 0.21287593245506287, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.047833334654569626, "step": 330 }, { "completion_length": 200.0, "epoch": 0.14730752113929685, "grad_norm": 0.6824626326560974, "kl": 0.008245850913226604, "learning_rate": 4.966171921910341e-06, "loss": 0.0003, "reward": -0.09216667711734772, "reward_std": 0.34250280261039734, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09216667711734772, "step": 331 }, { "completion_length": 200.0, "epoch": 0.14775255896751224, "grad_norm": 0.01006829272955656, "kl": 0.004723408259451389, "learning_rate": 4.96553212320261e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 332 }, { "completion_length": 200.0, "epoch": 0.14819759679572764, "grad_norm": 0.7323021292686462, "kl": 0.010274862870573997, "learning_rate": 4.9648863726991035e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 333 }, { "completion_length": 200.0, "epoch": 0.14864263462394303, "grad_norm": 0.024399923160672188, "kl": 0.013128578662872314, "learning_rate": 4.964234671958663e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 334 }, { "completion_length": 200.0, "epoch": 0.14908767245215843, "grad_norm": 0.010077173821628094, "kl": 0.004746645223349333, "learning_rate": 4.963577022554496e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 335 }, { "completion_length": 200.0, "epoch": 0.14953271028037382, "grad_norm": 0.011270418763160706, "kl": 0.005329379811882973, "learning_rate": 4.962913426074166e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 336 }, { "completion_length": 200.0, "epoch": 0.14997774810858924, "grad_norm": 0.012212223373353481, "kl": 0.006592839956283569, "learning_rate": 4.9622438841195986e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 337 }, { "completion_length": 200.0, "epoch": 0.15042278593680464, "grad_norm": 0.007356896065175533, "kl": 0.003410342149436474, "learning_rate": 4.961568398307065e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 338 }, { "completion_length": 199.5, "epoch": 0.15086782376502003, "grad_norm": 0.6488639116287231, "kl": 0.01651693880558014, "learning_rate": 4.960886970267191e-06, "loss": 0.0007, "reward": 0.05366666615009308, "reward_std": 0.17473027110099792, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05366666615009308, "step": 339 }, { "completion_length": 200.0, "epoch": 0.15131286159323543, "grad_norm": 0.024112718179821968, "kl": 0.018834218382835388, "learning_rate": 4.960199601644943e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 340 }, { "completion_length": 200.0, "epoch": 0.15175789942145082, "grad_norm": 0.013589066453278065, "kl": 0.01101887971162796, "learning_rate": 4.959506294099629e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 341 }, { "completion_length": 200.0, "epoch": 0.15220293724966621, "grad_norm": 0.010311473160982132, "kl": 0.007442638278007507, "learning_rate": 4.958807049304893e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 342 }, { "completion_length": 171.1666717529297, "epoch": 0.1526479750778816, "grad_norm": 5.103739261627197, "kl": 0.3760009706020355, "learning_rate": 4.958101868948715e-06, "loss": 0.015, "reward": 0.1758333444595337, "reward_std": 0.12451573461294174, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1758333444595337, "step": 343 }, { "completion_length": 200.0, "epoch": 0.15309301290609703, "grad_norm": 0.013698318973183632, "kl": 0.011166570708155632, "learning_rate": 4.957390754733398e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 344 }, { "completion_length": 200.0, "epoch": 0.15353805073431243, "grad_norm": 0.014360117726027966, "kl": 0.010611571371555328, "learning_rate": 4.956673708375574e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 345 }, { "completion_length": 189.5, "epoch": 0.15398308856252782, "grad_norm": 0.7445936799049377, "kl": 0.028817251324653625, "learning_rate": 4.955950731606192e-06, "loss": 0.0012, "reward": 0.07383333891630173, "reward_std": 0.12533222138881683, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07383333891630173, "step": 346 }, { "completion_length": 184.0, "epoch": 0.15442812639074321, "grad_norm": 0.7008059024810791, "kl": 0.05760132521390915, "learning_rate": 4.9552218261705185e-06, "loss": 0.0023, "reward": -0.22599999606609344, "reward_std": 0.35765790939331055, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22599999606609344, "step": 347 }, { "completion_length": 200.0, "epoch": 0.1548731642189586, "grad_norm": 0.6954376697540283, "kl": 0.007197665050625801, "learning_rate": 4.954486993828132e-06, "loss": 0.0003, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 348 }, { "completion_length": 200.0, "epoch": 0.155318202047174, "grad_norm": 0.009056415408849716, "kl": 0.007077607326209545, "learning_rate": 4.953746236352917e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 349 }, { "completion_length": 200.0, "epoch": 0.1557632398753894, "grad_norm": 0.01317316759377718, "kl": 0.01229158602654934, "learning_rate": 4.952999555533065e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 350 }, { "completion_length": 195.5, "epoch": 0.15620827770360482, "grad_norm": 0.6882670521736145, "kl": 0.024929411709308624, "learning_rate": 4.952246953171062e-06, "loss": 0.001, "reward": -0.04200000315904617, "reward_std": 0.33165282011032104, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04200000315904617, "step": 351 }, { "completion_length": 195.6666717529297, "epoch": 0.15665331553182021, "grad_norm": 0.676509439945221, "kl": 0.04691646993160248, "learning_rate": 4.951488431083689e-06, "loss": 0.0019, "reward": -0.13633333146572113, "reward_std": 0.29477566480636597, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13633333146572113, "step": 352 }, { "completion_length": 200.0, "epoch": 0.1570983533600356, "grad_norm": 0.6416396498680115, "kl": 0.01036627497524023, "learning_rate": 4.950723991102022e-06, "loss": 0.0004, "reward": -0.014666667208075523, "reward_std": 0.34211206436157227, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.014666667208075523, "step": 353 }, { "completion_length": 200.0, "epoch": 0.157543391188251, "grad_norm": 0.6665915846824646, "kl": 0.03278065845370293, "learning_rate": 4.949953635071417e-06, "loss": 0.0013, "reward": -0.08783333748579025, "reward_std": 0.3303600549697876, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08783333748579025, "step": 354 }, { "completion_length": 191.5, "epoch": 0.1579884290164664, "grad_norm": 0.9799753427505493, "kl": 0.01656440459191799, "learning_rate": 4.949177364851515e-06, "loss": 0.0007, "reward": 0.0429999977350235, "reward_std": 0.20085816085338593, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0429999977350235, "step": 355 }, { "completion_length": 200.0, "epoch": 0.1584334668446818, "grad_norm": 0.016740066930651665, "kl": 0.01629803143441677, "learning_rate": 4.9483951823162326e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 356 }, { "completion_length": 200.0, "epoch": 0.1588785046728972, "grad_norm": 0.014137927442789078, "kl": 0.007187969516962767, "learning_rate": 4.947607089353758e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 357 }, { "completion_length": 200.0, "epoch": 0.1593235425011126, "grad_norm": 0.6014176607131958, "kl": 0.018046831712126732, "learning_rate": 4.946813087866549e-06, "loss": 0.0007, "reward": 0.007333338260650635, "reward_std": 0.2882232666015625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007333338260650635, "step": 358 }, { "completion_length": 200.0, "epoch": 0.159768580329328, "grad_norm": 0.013204401358962059, "kl": 0.013151012361049652, "learning_rate": 4.946013179771325e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 359 }, { "completion_length": 199.5, "epoch": 0.1602136181575434, "grad_norm": 0.6359446048736572, "kl": 0.02717038244009018, "learning_rate": 4.9452073669990656e-06, "loss": 0.0011, "reward": 0.03733333572745323, "reward_std": 0.2147386074066162, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03733333572745323, "step": 360 }, { "completion_length": 200.0, "epoch": 0.1606586559857588, "grad_norm": 0.013449462130665779, "kl": 0.006884987931698561, "learning_rate": 4.944395651495002e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 361 }, { "completion_length": 187.6666717529297, "epoch": 0.16110369381397419, "grad_norm": 0.818776547908783, "kl": 0.0218803733587265, "learning_rate": 4.9435780352186154e-06, "loss": 0.0009, "reward": -0.00916666816920042, "reward_std": 0.21735171973705292, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.009166665375232697, "step": 362 }, { "completion_length": 200.0, "epoch": 0.16154873164218958, "grad_norm": 0.013731640763580799, "kl": 0.0065148696303367615, "learning_rate": 4.942754520143634e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 363 }, { "completion_length": 200.0, "epoch": 0.16199376947040497, "grad_norm": 0.009971830062568188, "kl": 0.012283856980502605, "learning_rate": 4.9419251082580216e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 364 }, { "completion_length": 200.0, "epoch": 0.16243880729862037, "grad_norm": 0.6981242895126343, "kl": 0.022827019914984703, "learning_rate": 4.94108980156398e-06, "loss": 0.0009, "reward": -0.02850000187754631, "reward_std": 0.3187059760093689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02850000187754631, "step": 365 }, { "completion_length": 186.1666717529297, "epoch": 0.1628838451268358, "grad_norm": 0.7663524746894836, "kl": 0.02265150099992752, "learning_rate": 4.940248602077939e-06, "loss": 0.0009, "reward": 0.006000000052154064, "reward_std": 0.2016005963087082, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.006000000052154064, "step": 366 }, { "completion_length": 200.0, "epoch": 0.16332888295505119, "grad_norm": 0.01284782588481903, "kl": 0.009670613333582878, "learning_rate": 4.939401511830556e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 367 }, { "completion_length": 200.0, "epoch": 0.16377392078326658, "grad_norm": 0.01885703206062317, "kl": 0.009269410744309425, "learning_rate": 4.938548532866706e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 368 }, { "completion_length": 200.0, "epoch": 0.16421895861148197, "grad_norm": 0.6328880190849304, "kl": 0.01867607608437538, "learning_rate": 4.937689667245481e-06, "loss": 0.0007, "reward": 0.021500002592802048, "reward_std": 0.2535221576690674, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021500002592802048, "step": 369 }, { "completion_length": 200.0, "epoch": 0.16466399643969737, "grad_norm": 0.010342692025005817, "kl": 0.0059143840335309505, "learning_rate": 4.936824917040184e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 370 }, { "completion_length": 200.0, "epoch": 0.16510903426791276, "grad_norm": 0.6799391508102417, "kl": 0.01873181015253067, "learning_rate": 4.935954284338321e-06, "loss": 0.0007, "reward": 0.005833338014781475, "reward_std": 0.2918975353240967, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005833338014781475, "step": 371 }, { "completion_length": 200.0, "epoch": 0.16555407209612816, "grad_norm": 0.017115725204348564, "kl": 0.010264448821544647, "learning_rate": 4.9350777712415995e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 372 }, { "completion_length": 200.0, "epoch": 0.16599910992434358, "grad_norm": 0.010852769017219543, "kl": 0.007528107613325119, "learning_rate": 4.934195379865925e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 373 }, { "completion_length": 168.83334350585938, "epoch": 0.16644414775255897, "grad_norm": 1.29231595993042, "kl": 0.18531188368797302, "learning_rate": 4.933307112341388e-06, "loss": 0.0074, "reward": 0.1628333330154419, "reward_std": 0.13582256436347961, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1628333330154419, "step": 374 }, { "completion_length": 200.0, "epoch": 0.16688918558077437, "grad_norm": 0.009757625870406628, "kl": 0.01133672520518303, "learning_rate": 4.932412970812269e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 375 }, { "completion_length": 199.5, "epoch": 0.16733422340898976, "grad_norm": 0.01278277114033699, "kl": 0.009305099956691265, "learning_rate": 4.931512957437024e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 376 }, { "completion_length": 200.0, "epoch": 0.16777926123720516, "grad_norm": 0.013093134388327599, "kl": 0.007520940154790878, "learning_rate": 4.930607074388287e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 377 }, { "completion_length": 200.0, "epoch": 0.16822429906542055, "grad_norm": 0.011228191666305065, "kl": 0.008690441027283669, "learning_rate": 4.92969532385286e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 378 }, { "completion_length": 185.5, "epoch": 0.16866933689363595, "grad_norm": 0.7297086119651794, "kl": 0.026256537064909935, "learning_rate": 4.928777708031709e-06, "loss": 0.0011, "reward": 0.016166668385267258, "reward_std": 0.16865399479866028, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016166668385267258, "step": 379 }, { "completion_length": 200.0, "epoch": 0.16911437472185137, "grad_norm": 0.015303281135857105, "kl": 0.01288022380322218, "learning_rate": 4.927854229139959e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 380 }, { "completion_length": 200.0, "epoch": 0.16955941255006676, "grad_norm": 0.011454589664936066, "kl": 0.006882285233587027, "learning_rate": 4.9269248894068886e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 381 }, { "completion_length": 200.0, "epoch": 0.17000445037828216, "grad_norm": 0.014687180519104004, "kl": 0.014141609892249107, "learning_rate": 4.9259896910759246e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 382 }, { "completion_length": 200.0, "epoch": 0.17044948820649755, "grad_norm": 0.016531087458133698, "kl": 0.00648513063788414, "learning_rate": 4.925048636404635e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 383 }, { "completion_length": 200.0, "epoch": 0.17089452603471295, "grad_norm": 0.00957447849214077, "kl": 0.0061697340570390224, "learning_rate": 4.9241017276647295e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 384 }, { "completion_length": 200.0, "epoch": 0.17133956386292834, "grad_norm": 0.01072862558066845, "kl": 0.005429576151072979, "learning_rate": 4.923148967142043e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 385 }, { "completion_length": 200.0, "epoch": 0.17178460169114373, "grad_norm": 0.024660624563694, "kl": 0.009238356724381447, "learning_rate": 4.9221903571365406e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 386 }, { "completion_length": 200.0, "epoch": 0.17222963951935916, "grad_norm": 0.7495405673980713, "kl": 0.0185337346047163, "learning_rate": 4.921225899962308e-06, "loss": 0.0007, "reward": -0.13200001418590546, "reward_std": 0.3997148871421814, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13200001418590546, "step": 387 }, { "completion_length": 200.0, "epoch": 0.17267467734757455, "grad_norm": 0.010481936857104301, "kl": 0.00567130371928215, "learning_rate": 4.920255597947545e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 388 }, { "completion_length": 200.0, "epoch": 0.17311971517578995, "grad_norm": 0.007845093496143818, "kl": 0.003956751897931099, "learning_rate": 4.919279453434561e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 389 }, { "completion_length": 181.0, "epoch": 0.17356475300400534, "grad_norm": 0.6615269184112549, "kl": 0.030248617753386497, "learning_rate": 4.918297468779771e-06, "loss": 0.0012, "reward": -0.13499999046325684, "reward_std": 0.35328683257102966, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13499999046325684, "step": 390 }, { "completion_length": 200.0, "epoch": 0.17400979083222073, "grad_norm": 0.691724419593811, "kl": 0.02659853920340538, "learning_rate": 4.917309646353682e-06, "loss": 0.0011, "reward": -0.09699999541044235, "reward_std": 0.34400463104248047, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09699999541044235, "step": 391 }, { "completion_length": 184.5, "epoch": 0.17445482866043613, "grad_norm": 0.6306662559509277, "kl": 0.0601261667907238, "learning_rate": 4.916315988540903e-06, "loss": 0.0024, "reward": -0.09049999713897705, "reward_std": 0.25501197576522827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09049999713897705, "step": 392 }, { "completion_length": 200.0, "epoch": 0.17489986648865152, "grad_norm": 0.012944119051098824, "kl": 0.01151751633733511, "learning_rate": 4.9153164977401215e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 393 }, { "completion_length": 200.0, "epoch": 0.17534490431686695, "grad_norm": 0.7263959646224976, "kl": 0.011329833418130875, "learning_rate": 4.914311176364109e-06, "loss": 0.0005, "reward": 0.021500002592802048, "reward_std": 0.2535221576690674, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021500002592802048, "step": 394 }, { "completion_length": 200.0, "epoch": 0.17578994214508234, "grad_norm": 0.00732870027422905, "kl": 0.004891596734523773, "learning_rate": 4.913300026839714e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 395 }, { "completion_length": 200.0, "epoch": 0.17623497997329773, "grad_norm": 0.7324749827384949, "kl": 0.01928497850894928, "learning_rate": 4.912283051607849e-06, "loss": 0.0008, "reward": 0.02199999988079071, "reward_std": 0.25229746103286743, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02199999988079071, "step": 396 }, { "completion_length": 200.0, "epoch": 0.17668001780151313, "grad_norm": 0.8718001842498779, "kl": 0.13316845893859863, "learning_rate": 4.911260253123494e-06, "loss": 0.0053, "reward": -0.1536666750907898, "reward_std": 0.30690693855285645, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1536666750907898, "step": 397 }, { "completion_length": 200.0, "epoch": 0.17712505562972852, "grad_norm": 0.7277849912643433, "kl": 0.01658281497657299, "learning_rate": 4.9102316338556844e-06, "loss": 0.0007, "reward": -0.0003333290515001863, "reward_std": 0.3070027232170105, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0003333290515001863, "step": 398 }, { "completion_length": 200.0, "epoch": 0.17757009345794392, "grad_norm": 0.6445873975753784, "kl": 0.02959112823009491, "learning_rate": 4.909197196287509e-06, "loss": 0.0012, "reward": -0.1274999976158142, "reward_std": 0.44477805495262146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1274999976158142, "step": 399 }, { "completion_length": 172.33334350585938, "epoch": 0.1780151312861593, "grad_norm": 0.7523168921470642, "kl": 0.0322515144944191, "learning_rate": 4.908156942916101e-06, "loss": 0.0013, "reward": -0.03199999779462814, "reward_std": 0.2558077275753021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03199999779462814, "step": 400 }, { "completion_length": 167.0, "epoch": 0.17846016911437473, "grad_norm": 0.8686386942863464, "kl": 0.056089848279953, "learning_rate": 4.90711087625263e-06, "loss": 0.0022, "reward": 0.07116666436195374, "reward_std": 0.14513224363327026, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07116666436195374, "step": 401 }, { "completion_length": 200.0, "epoch": 0.17890520694259013, "grad_norm": 0.00890452042222023, "kl": 0.005404898431152105, "learning_rate": 4.906058998822303e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 402 }, { "completion_length": 200.0, "epoch": 0.17935024477080552, "grad_norm": 0.9637120962142944, "kl": 0.02167494222521782, "learning_rate": 4.905001313164353e-06, "loss": 0.0009, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 403 }, { "completion_length": 200.0, "epoch": 0.17979528259902092, "grad_norm": 0.007997624576091766, "kl": 0.004169671796262264, "learning_rate": 4.9039378218320325e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 404 }, { "completion_length": 200.0, "epoch": 0.1802403204272363, "grad_norm": 0.016796903684735298, "kl": 0.011683585122227669, "learning_rate": 4.902868527392612e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 405 }, { "completion_length": 200.0, "epoch": 0.1806853582554517, "grad_norm": 0.015744149684906006, "kl": 0.012497194111347198, "learning_rate": 4.9017934324273655e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 406 }, { "completion_length": 199.1666717529297, "epoch": 0.1811303960836671, "grad_norm": 0.5119321346282959, "kl": 0.028500860556960106, "learning_rate": 4.900712539531577e-06, "loss": 0.0011, "reward": 0.03933333605527878, "reward_std": 0.2756495475769043, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03933333605527878, "step": 407 }, { "completion_length": 193.6666717529297, "epoch": 0.18157543391188252, "grad_norm": 0.009107636287808418, "kl": 0.009039022959768772, "learning_rate": 4.89962585131452e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 408 }, { "completion_length": 200.0, "epoch": 0.18202047174009792, "grad_norm": 0.013876304030418396, "kl": 0.009923950769007206, "learning_rate": 4.898533370399459e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 409 }, { "completion_length": 200.0, "epoch": 0.1824655095683133, "grad_norm": 0.03001904860138893, "kl": 0.01629229262471199, "learning_rate": 4.897435099423647e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 410 }, { "completion_length": 200.0, "epoch": 0.1829105473965287, "grad_norm": 0.020497044548392296, "kl": 0.0096663823351264, "learning_rate": 4.896331041038309e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 411 }, { "completion_length": 200.0, "epoch": 0.1833555852247441, "grad_norm": 0.9498317241668701, "kl": 0.01545047014951706, "learning_rate": 4.895221197908643e-06, "loss": 0.0006, "reward": 0.001833329675719142, "reward_std": 0.3016955256462097, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.001833329675719142, "step": 412 }, { "completion_length": 200.0, "epoch": 0.1838006230529595, "grad_norm": 0.006678743753582239, "kl": 0.003321468597277999, "learning_rate": 4.89410557271381e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 413 }, { "completion_length": 200.0, "epoch": 0.1842456608811749, "grad_norm": 0.009199023246765137, "kl": 0.005622576922178268, "learning_rate": 4.8929841681469295e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 414 }, { "completion_length": 200.0, "epoch": 0.1846906987093903, "grad_norm": 0.7114378213882446, "kl": 0.02522313967347145, "learning_rate": 4.891856986915073e-06, "loss": 0.001, "reward": -0.10500000417232513, "reward_std": 0.31319066882133484, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10500000417232513, "step": 415 }, { "completion_length": 200.0, "epoch": 0.1851357365376057, "grad_norm": 0.011489290744066238, "kl": 0.008540185168385506, "learning_rate": 4.8907240317392565e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 416 }, { "completion_length": 200.0, "epoch": 0.1855807743658211, "grad_norm": 0.6549782156944275, "kl": 0.023001134395599365, "learning_rate": 4.889585305354436e-06, "loss": 0.0009, "reward": 0.05899999663233757, "reward_std": 0.30181917548179626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05899999663233757, "step": 417 }, { "completion_length": 200.0, "epoch": 0.1860258121940365, "grad_norm": 0.6743258833885193, "kl": 0.014783745631575584, "learning_rate": 4.888440810509496e-06, "loss": 0.0006, "reward": -0.007166664116084576, "reward_std": 0.267223060131073, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007166664116084576, "step": 418 }, { "completion_length": 200.0, "epoch": 0.1864708500222519, "grad_norm": 0.011519741266965866, "kl": 0.006074823439121246, "learning_rate": 4.887290549967247e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 419 }, { "completion_length": 183.1666717529297, "epoch": 0.18691588785046728, "grad_norm": 0.8046799302101135, "kl": 0.018539931625127792, "learning_rate": 4.886134526504421e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.07905694097280502, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 420 }, { "completion_length": 200.0, "epoch": 0.18736092567868268, "grad_norm": 0.02228482812643051, "kl": 0.022595927119255066, "learning_rate": 4.884972742911656e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 421 }, { "completion_length": 170.0, "epoch": 0.1878059635068981, "grad_norm": 1.0265932083129883, "kl": 0.03965630382299423, "learning_rate": 4.8838052019935005e-06, "loss": 0.0016, "reward": -0.19983333349227905, "reward_std": 0.36561259627342224, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19983333349227905, "step": 422 }, { "completion_length": 200.0, "epoch": 0.1882510013351135, "grad_norm": 0.008268559351563454, "kl": 0.006413072347640991, "learning_rate": 4.882631906568398e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 423 }, { "completion_length": 200.0, "epoch": 0.1886960391633289, "grad_norm": 0.011764715425670147, "kl": 0.00969620794057846, "learning_rate": 4.881452859468685e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 424 }, { "completion_length": 200.0, "epoch": 0.18914107699154428, "grad_norm": 0.013174543157219887, "kl": 0.012654460966587067, "learning_rate": 4.880268063540581e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 425 }, { "completion_length": 200.0, "epoch": 0.18958611481975968, "grad_norm": 0.009335462003946304, "kl": 0.011132560670375824, "learning_rate": 4.8790775216441835e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 426 }, { "completion_length": 141.1666717529297, "epoch": 0.19003115264797507, "grad_norm": 0.8435110449790955, "kl": 0.023837899789214134, "learning_rate": 4.877881236653463e-06, "loss": 0.001, "reward": -0.03983333706855774, "reward_std": 0.24120646715164185, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03983333706855774, "step": 427 }, { "completion_length": 199.6666717529297, "epoch": 0.19047619047619047, "grad_norm": 0.6499157547950745, "kl": 0.01160873007029295, "learning_rate": 4.8766792114562495e-06, "loss": 0.0005, "reward": -0.09033333510160446, "reward_std": 0.351275771856308, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09033333510160446, "step": 428 }, { "completion_length": 197.83334350585938, "epoch": 0.1909212283044059, "grad_norm": 0.015310406684875488, "kl": 0.010684727691113949, "learning_rate": 4.875471448954234e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 429 }, { "completion_length": 200.0, "epoch": 0.19136626613262128, "grad_norm": 0.6541863679885864, "kl": 0.01754167675971985, "learning_rate": 4.874257952062957e-06, "loss": 0.0007, "reward": 0.00533333420753479, "reward_std": 0.2931222915649414, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00533333420753479, "step": 430 }, { "completion_length": 200.0, "epoch": 0.19181130396083668, "grad_norm": 0.009647169150412083, "kl": 0.005298088304698467, "learning_rate": 4.873038723711798e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 431 }, { "completion_length": 200.0, "epoch": 0.19225634178905207, "grad_norm": 0.010216983035206795, "kl": 0.007101266644895077, "learning_rate": 4.871813766843977e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 432 }, { "completion_length": 200.0, "epoch": 0.19270137961726747, "grad_norm": 0.007988468743860722, "kl": 0.003836569143459201, "learning_rate": 4.870583084416539e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 433 }, { "completion_length": 200.0, "epoch": 0.19314641744548286, "grad_norm": 0.7281708717346191, "kl": 0.016352718695998192, "learning_rate": 4.869346679400353e-06, "loss": 0.0007, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 434 }, { "completion_length": 200.0, "epoch": 0.19359145527369825, "grad_norm": 0.009215595200657845, "kl": 0.006540496833622456, "learning_rate": 4.868104554780101e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 435 }, { "completion_length": 178.0, "epoch": 0.19403649310191368, "grad_norm": 0.6501026749610901, "kl": 0.030550524592399597, "learning_rate": 4.866856713554271e-06, "loss": 0.0012, "reward": 0.04383333772420883, "reward_std": 0.15619271993637085, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04383333772420883, "step": 436 }, { "completion_length": 200.0, "epoch": 0.19448153093012907, "grad_norm": 0.010590564459562302, "kl": 0.006376064382493496, "learning_rate": 4.865603158735155e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 437 }, { "completion_length": 199.6666717529297, "epoch": 0.19492656875834447, "grad_norm": 0.5724078416824341, "kl": 0.015208952128887177, "learning_rate": 4.864343893348834e-06, "loss": 0.0006, "reward": 0.0690000057220459, "reward_std": 0.20461182296276093, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0690000057220459, "step": 438 }, { "completion_length": 200.0, "epoch": 0.19537160658655986, "grad_norm": 0.5766968727111816, "kl": 0.011872484348714352, "learning_rate": 4.863078920435173e-06, "loss": 0.0005, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 439 }, { "completion_length": 200.0, "epoch": 0.19581664441477525, "grad_norm": 0.0435682088136673, "kl": 0.017930179834365845, "learning_rate": 4.861808243047822e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 440 }, { "completion_length": 200.0, "epoch": 0.19626168224299065, "grad_norm": 0.012740753591060638, "kl": 0.005490332376211882, "learning_rate": 4.860531864254192e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 441 }, { "completion_length": 200.0, "epoch": 0.19670672007120604, "grad_norm": 0.007875868119299412, "kl": 0.003752867691218853, "learning_rate": 4.8592497871354646e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 442 }, { "completion_length": 200.0, "epoch": 0.19715175789942144, "grad_norm": 0.00968019850552082, "kl": 0.0069991848431527615, "learning_rate": 4.857962014786575e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 443 }, { "completion_length": 200.0, "epoch": 0.19759679572763686, "grad_norm": 0.6327012777328491, "kl": 0.013870742172002792, "learning_rate": 4.856668550316203e-06, "loss": 0.0006, "reward": 0.008500000461935997, "reward_std": 0.2853655517101288, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.008500000461935997, "step": 444 }, { "completion_length": 200.0, "epoch": 0.19804183355585225, "grad_norm": 0.6365529298782349, "kl": 0.01527687069028616, "learning_rate": 4.855369396846778e-06, "loss": 0.0006, "reward": 0.0004999985685572028, "reward_std": 0.248800128698349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0004999985685572028, "step": 445 }, { "completion_length": 200.0, "epoch": 0.19848687138406765, "grad_norm": 0.013388743624091148, "kl": 0.0054934462532401085, "learning_rate": 4.854064557514452e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 446 }, { "completion_length": 147.33334350585938, "epoch": 0.19893190921228304, "grad_norm": 0.80791836977005, "kl": 0.06670716404914856, "learning_rate": 4.8527540354691095e-06, "loss": 0.0027, "reward": 0.14483334124088287, "reward_std": 0.055607251822948456, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14483334124088287, "step": 447 }, { "completion_length": 200.0, "epoch": 0.19937694704049844, "grad_norm": 0.008610348217189312, "kl": 0.0053973570466041565, "learning_rate": 4.8514378338743525e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 448 }, { "completion_length": 200.0, "epoch": 0.19982198486871383, "grad_norm": 0.6462050676345825, "kl": 0.022491535171866417, "learning_rate": 4.850115955907491e-06, "loss": 0.0009, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 449 }, { "completion_length": 200.0, "epoch": 0.20026702269692923, "grad_norm": 0.013753926381468773, "kl": 0.003668756689876318, "learning_rate": 4.8487884047595395e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 450 }, { "completion_length": 200.0, "epoch": 0.20071206052514465, "grad_norm": 0.01476313453167677, "kl": 0.009049910120666027, "learning_rate": 4.847455183635207e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 451 }, { "completion_length": 194.5, "epoch": 0.20115709835336004, "grad_norm": 0.7963857650756836, "kl": 0.024605944752693176, "learning_rate": 4.846116295752891e-06, "loss": 0.001, "reward": 0.08583333343267441, "reward_std": 0.1649368703365326, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08583333343267441, "step": 452 }, { "completion_length": 197.33334350585938, "epoch": 0.20160213618157544, "grad_norm": 0.008257864974439144, "kl": 0.00925783533602953, "learning_rate": 4.844771744344666e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 453 }, { "completion_length": 169.5, "epoch": 0.20204717400979083, "grad_norm": 0.7012379169464111, "kl": 0.032544177025556564, "learning_rate": 4.843421532656281e-06, "loss": 0.0013, "reward": 0.028833335265517235, "reward_std": 0.24847166240215302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.028833335265517235, "step": 454 }, { "completion_length": 133.5, "epoch": 0.20249221183800623, "grad_norm": 1.1525821685791016, "kl": 0.030363596975803375, "learning_rate": 4.8420656639471466e-06, "loss": 0.0012, "reward": -0.10766666382551193, "reward_std": 0.2986413836479187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10766667127609253, "step": 455 }, { "completion_length": 189.1666717529297, "epoch": 0.20293724966622162, "grad_norm": 0.01882368139922619, "kl": 0.018681103363633156, "learning_rate": 4.84070414149033e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 456 }, { "completion_length": 192.33334350585938, "epoch": 0.20338228749443701, "grad_norm": 0.907418429851532, "kl": 0.016113460063934326, "learning_rate": 4.83933696857255e-06, "loss": 0.0006, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 457 }, { "completion_length": 200.0, "epoch": 0.20382732532265244, "grad_norm": 0.02320541813969612, "kl": 0.012556570582091808, "learning_rate": 4.83796414849416e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 458 }, { "completion_length": 189.1666717529297, "epoch": 0.20427236315086783, "grad_norm": 0.8501640558242798, "kl": 0.023711485788226128, "learning_rate": 4.836585684569148e-06, "loss": 0.0009, "reward": 0.05000000447034836, "reward_std": 0.18371173739433289, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05000000447034836, "step": 459 }, { "completion_length": 200.0, "epoch": 0.20471740097908322, "grad_norm": 0.0207956675440073, "kl": 0.008212253451347351, "learning_rate": 4.83520158012513e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 460 }, { "completion_length": 199.33334350585938, "epoch": 0.20516243880729862, "grad_norm": 0.01620439812541008, "kl": 0.011511600576341152, "learning_rate": 4.833811838503331e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 461 }, { "completion_length": 200.0, "epoch": 0.205607476635514, "grad_norm": 0.8750971555709839, "kl": 0.007969997823238373, "learning_rate": 4.83241646305859e-06, "loss": 0.0003, "reward": -0.0003333290515001863, "reward_std": 0.3070027232170105, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0003333290515001863, "step": 462 }, { "completion_length": 200.0, "epoch": 0.2060525144637294, "grad_norm": 0.012503387406468391, "kl": 0.005554807838052511, "learning_rate": 4.8310154571593435e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 463 }, { "completion_length": 200.0, "epoch": 0.2064975522919448, "grad_norm": 0.01813841424882412, "kl": 0.00543005159124732, "learning_rate": 4.829608824187621e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 464 }, { "completion_length": 200.0, "epoch": 0.20694259012016022, "grad_norm": 0.007825582288205624, "kl": 0.007653496228158474, "learning_rate": 4.828196567539034e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 465 }, { "completion_length": 200.0, "epoch": 0.20738762794837562, "grad_norm": 0.03445260971784592, "kl": 0.011151362210512161, "learning_rate": 4.826778690622772e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 466 }, { "completion_length": 192.6666717529297, "epoch": 0.207832665776591, "grad_norm": 0.6254943609237671, "kl": 0.0196918286383152, "learning_rate": 4.82535519686159e-06, "loss": 0.0008, "reward": -0.05883334204554558, "reward_std": 0.3185080885887146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05883333832025528, "step": 467 }, { "completion_length": 183.33334350585938, "epoch": 0.2082777036048064, "grad_norm": 0.7328706383705139, "kl": 0.045631419867277145, "learning_rate": 4.823926089691803e-06, "loss": 0.0018, "reward": -0.11100000143051147, "reward_std": 0.37771734595298767, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11100000143051147, "step": 468 }, { "completion_length": 200.0, "epoch": 0.2087227414330218, "grad_norm": 0.013255412690341473, "kl": 0.011069796979427338, "learning_rate": 4.822491372563276e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 469 }, { "completion_length": 200.0, "epoch": 0.2091677792612372, "grad_norm": 0.022944483906030655, "kl": 0.013812100514769554, "learning_rate": 4.821051048939416e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 470 }, { "completion_length": 200.0, "epoch": 0.2096128170894526, "grad_norm": 0.009076782502233982, "kl": 0.0035323970951139927, "learning_rate": 4.819605122297167e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 471 }, { "completion_length": 200.0, "epoch": 0.210057854917668, "grad_norm": 0.047525037080049515, "kl": 0.008770107291638851, "learning_rate": 4.818153596126995e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 472 }, { "completion_length": 200.0, "epoch": 0.2105028927458834, "grad_norm": 0.7111804485321045, "kl": 0.02760786935687065, "learning_rate": 4.816696473932886e-06, "loss": 0.0011, "reward": -0.012666663154959679, "reward_std": 0.33721309900283813, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.012666663154959679, "step": 473 }, { "completion_length": 70.66667175292969, "epoch": 0.2109479305740988, "grad_norm": 1.4313840866088867, "kl": 0.02774934470653534, "learning_rate": 4.815233759232333e-06, "loss": 0.0011, "reward": 0.01666666753590107, "reward_std": 0.054006174206733704, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01666666753590107, "step": 474 }, { "completion_length": 200.0, "epoch": 0.2113929684023142, "grad_norm": 0.008336659520864487, "kl": 0.003154546720907092, "learning_rate": 4.8137654555563305e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 475 }, { "completion_length": 200.0, "epoch": 0.2118380062305296, "grad_norm": 0.6730583310127258, "kl": 0.02612406387925148, "learning_rate": 4.812291566449363e-06, "loss": 0.001, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 476 }, { "completion_length": 200.0, "epoch": 0.21228304405874499, "grad_norm": 0.6641086935997009, "kl": 0.013326774351298809, "learning_rate": 4.810812095469401e-06, "loss": 0.0005, "reward": 0.011333337053656578, "reward_std": 0.27842533588409424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011333337053656578, "step": 477 }, { "completion_length": 196.0, "epoch": 0.21272808188696038, "grad_norm": 0.02167276106774807, "kl": 0.016343414783477783, "learning_rate": 4.809327046187888e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 478 }, { "completion_length": 200.0, "epoch": 0.2131731197151758, "grad_norm": 0.6960393786430359, "kl": 0.00471863616257906, "learning_rate": 4.807836422189733e-06, "loss": 0.0002, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 479 }, { "completion_length": 200.0, "epoch": 0.2136181575433912, "grad_norm": 0.03306792676448822, "kl": 0.022989537566900253, "learning_rate": 4.806340227073304e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 480 }, { "completion_length": 186.1666717529297, "epoch": 0.2140631953716066, "grad_norm": 0.7957750558853149, "kl": 0.0064355782233178616, "learning_rate": 4.8048384644504165e-06, "loss": 0.0003, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 481 }, { "completion_length": 190.0, "epoch": 0.21450823319982198, "grad_norm": 0.7082236409187317, "kl": 0.05344567820429802, "learning_rate": 4.8033311379463255e-06, "loss": 0.0021, "reward": -0.0898333415389061, "reward_std": 0.3882114887237549, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0898333415389061, "step": 482 }, { "completion_length": 200.0, "epoch": 0.21495327102803738, "grad_norm": 0.019929101690649986, "kl": 0.008283906616270542, "learning_rate": 4.801818251199718e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 483 }, { "completion_length": 185.5, "epoch": 0.21539830885625277, "grad_norm": 0.6174662113189697, "kl": 0.02444113790988922, "learning_rate": 4.800299807862705e-06, "loss": 0.001, "reward": 0.00916666816920042, "reward_std": 0.22120074927806854, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00916666816920042, "step": 484 }, { "completion_length": 200.0, "epoch": 0.21584334668446817, "grad_norm": 0.6985958814620972, "kl": 0.047310732305049896, "learning_rate": 4.798775811600807e-06, "loss": 0.0019, "reward": -0.09299999475479126, "reward_std": 0.3469115197658539, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09299999475479126, "step": 485 }, { "completion_length": 200.0, "epoch": 0.2162883845126836, "grad_norm": 0.008483149111270905, "kl": 0.004622921347618103, "learning_rate": 4.7972462660929546e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 486 }, { "completion_length": 190.6666717529297, "epoch": 0.21673342234089898, "grad_norm": 0.8138934969902039, "kl": 0.022382408380508423, "learning_rate": 4.795711175031467e-06, "loss": 0.0009, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 487 }, { "completion_length": 200.0, "epoch": 0.21717846016911438, "grad_norm": 0.010969250462949276, "kl": 0.005018382798880339, "learning_rate": 4.79417054212206e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 488 }, { "completion_length": 200.0, "epoch": 0.21762349799732977, "grad_norm": 0.007726567331701517, "kl": 0.003069926518946886, "learning_rate": 4.792624371083819e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 489 }, { "completion_length": 200.0, "epoch": 0.21806853582554517, "grad_norm": 0.00889444351196289, "kl": 0.004114137962460518, "learning_rate": 4.791072665649203e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 490 }, { "completion_length": 187.0, "epoch": 0.21851357365376056, "grad_norm": 0.9759646654129028, "kl": 0.02656198851764202, "learning_rate": 4.789515429564029e-06, "loss": 0.0011, "reward": 0.13449999690055847, "reward_std": 0.023270150646567345, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13449999690055847, "step": 491 }, { "completion_length": 164.6666717529297, "epoch": 0.21895861148197596, "grad_norm": 0.698348343372345, "kl": 0.03639654442667961, "learning_rate": 4.787952666587465e-06, "loss": 0.0015, "reward": 0.06016666814684868, "reward_std": 0.2878718078136444, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06016666814684868, "step": 492 }, { "completion_length": 181.83334350585938, "epoch": 0.21940364931019138, "grad_norm": 0.6999357342720032, "kl": 0.0428909957408905, "learning_rate": 4.786384380492024e-06, "loss": 0.0017, "reward": 0.16099999845027924, "reward_std": 0.13888844847679138, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16099999845027924, "step": 493 }, { "completion_length": 184.83334350585938, "epoch": 0.21984868713840677, "grad_norm": 0.7373248338699341, "kl": 0.051096897572278976, "learning_rate": 4.784810575063546e-06, "loss": 0.002, "reward": 0.019499998539686203, "reward_std": 0.3235452175140381, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019499998539686203, "step": 494 }, { "completion_length": 198.5, "epoch": 0.22029372496662217, "grad_norm": 0.8084774017333984, "kl": 0.012246180325746536, "learning_rate": 4.783231254101201e-06, "loss": 0.0005, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 495 }, { "completion_length": 200.0, "epoch": 0.22073876279483756, "grad_norm": 0.008205811493098736, "kl": 0.007350212894380093, "learning_rate": 4.781646421417469e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 496 }, { "completion_length": 200.0, "epoch": 0.22118380062305296, "grad_norm": 0.018955878913402557, "kl": 0.005687872879207134, "learning_rate": 4.780056080838138e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 497 }, { "completion_length": 200.0, "epoch": 0.22162883845126835, "grad_norm": 0.17352764308452606, "kl": 0.04136586934328079, "learning_rate": 4.77846023620229e-06, "loss": 0.0017, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 498 }, { "completion_length": 199.0, "epoch": 0.22207387627948375, "grad_norm": 0.014824754558503628, "kl": 0.008487005718052387, "learning_rate": 4.776858891362296e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 499 }, { "completion_length": 200.0, "epoch": 0.22251891410769917, "grad_norm": 0.7017592191696167, "kl": 0.012654486112296581, "learning_rate": 4.775252050183802e-06, "loss": 0.0005, "reward": 0.025499999523162842, "reward_std": 0.24372422695159912, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025499999523162842, "step": 500 }, { "completion_length": 200.0, "epoch": 0.22296395193591456, "grad_norm": 0.8674989342689514, "kl": 0.024426866322755814, "learning_rate": 4.773639716545723e-06, "loss": 0.001, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 501 }, { "completion_length": 200.0, "epoch": 0.22340898976412996, "grad_norm": 0.008451178669929504, "kl": 0.0035606666933745146, "learning_rate": 4.772021894340235e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 502 }, { "completion_length": 200.0, "epoch": 0.22385402759234535, "grad_norm": 0.014713208191096783, "kl": 0.00692252442240715, "learning_rate": 4.77039858747276e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 503 }, { "completion_length": 200.0, "epoch": 0.22429906542056074, "grad_norm": 0.7125733494758606, "kl": 0.031231265515089035, "learning_rate": 4.768769799861962e-06, "loss": 0.0012, "reward": 0.019166668877005577, "reward_std": 0.2592376470565796, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019166668877005577, "step": 504 }, { "completion_length": 199.33334350585938, "epoch": 0.22474410324877614, "grad_norm": 0.6759501099586487, "kl": 0.01907402276992798, "learning_rate": 4.767135535439736e-06, "loss": 0.0008, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 505 }, { "completion_length": 200.0, "epoch": 0.22518914107699153, "grad_norm": 0.008459432050585747, "kl": 0.009604415856301785, "learning_rate": 4.765495798151196e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 506 }, { "completion_length": 191.33334350585938, "epoch": 0.22563417890520696, "grad_norm": 0.9782493710517883, "kl": 0.030173055827617645, "learning_rate": 4.763850591954668e-06, "loss": 0.0012, "reward": 0.1875, "reward_std": 0.1530931293964386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 507 }, { "completion_length": 196.33334350585938, "epoch": 0.22607921673342235, "grad_norm": 0.7100333571434021, "kl": 0.020358001813292503, "learning_rate": 4.762199920821683e-06, "loss": 0.0008, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 508 }, { "completion_length": 199.5, "epoch": 0.22652425456163774, "grad_norm": 0.011824924498796463, "kl": 0.0119178993627429, "learning_rate": 4.760543788736961e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 509 }, { "completion_length": 200.0, "epoch": 0.22696929238985314, "grad_norm": 0.007140854839235544, "kl": 0.005115116946399212, "learning_rate": 4.758882199698405e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 510 }, { "completion_length": 190.1666717529297, "epoch": 0.22741433021806853, "grad_norm": 0.7087447643280029, "kl": 0.018751170486211777, "learning_rate": 4.757215157717091e-06, "loss": 0.0008, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 511 }, { "completion_length": 194.0, "epoch": 0.22785936804628393, "grad_norm": 0.05364219471812248, "kl": 0.0219450481235981, "learning_rate": 4.7555426668172614e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 512 }, { "completion_length": 179.33334350585938, "epoch": 0.22830440587449932, "grad_norm": 0.7024439573287964, "kl": 0.027089372277259827, "learning_rate": 4.753864731036308e-06, "loss": 0.0011, "reward": 0.187666654586792, "reward_std": 0.10494124889373779, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187666654586792, "step": 513 }, { "completion_length": 200.0, "epoch": 0.22874944370271474, "grad_norm": 0.010735830292105675, "kl": 0.008807472884654999, "learning_rate": 4.752181354424769e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 514 }, { "completion_length": 200.0, "epoch": 0.22919448153093014, "grad_norm": 0.008272531442344189, "kl": 0.014129738323390484, "learning_rate": 4.750492541046318e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 515 }, { "completion_length": 200.0, "epoch": 0.22963951935914553, "grad_norm": 0.006092743948101997, "kl": 0.0037111197598278522, "learning_rate": 4.74879829497775e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 516 }, { "completion_length": 200.0, "epoch": 0.23008455718736093, "grad_norm": 0.7442413568496704, "kl": 0.005196265410631895, "learning_rate": 4.747098620308975e-06, "loss": 0.0002, "reward": 0.1041666716337204, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 517 }, { "completion_length": 186.0, "epoch": 0.23052959501557632, "grad_norm": 0.9965373277664185, "kl": 0.03707145154476166, "learning_rate": 4.7453935211430105e-06, "loss": 0.0015, "reward": 0.2083333432674408, "reward_std": 0.15138253569602966, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 518 }, { "completion_length": 200.0, "epoch": 0.23097463284379172, "grad_norm": 0.6923635005950928, "kl": 0.025277286767959595, "learning_rate": 4.743683001595965e-06, "loss": 0.001, "reward": 0.04116666316986084, "reward_std": 0.3334938883781433, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04116666316986084, "step": 519 }, { "completion_length": 200.0, "epoch": 0.2314196706720071, "grad_norm": 0.009316305629909039, "kl": 0.003812011331319809, "learning_rate": 4.741967065797036e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 520 }, { "completion_length": 163.83334350585938, "epoch": 0.23186470850022253, "grad_norm": 1.0490024089813232, "kl": 0.03154284879565239, "learning_rate": 4.740245717888491e-06, "loss": 0.0013, "reward": 0.13366666436195374, "reward_std": 0.11832442134618759, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13366666436195374, "step": 521 }, { "completion_length": 200.0, "epoch": 0.23230974632843793, "grad_norm": 0.5622179508209229, "kl": 0.00910902488976717, "learning_rate": 4.738518962025665e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 522 }, { "completion_length": 199.6666717529297, "epoch": 0.23275478415665332, "grad_norm": 0.6993998885154724, "kl": 0.028999265283346176, "learning_rate": 4.736786802376948e-06, "loss": 0.0012, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 523 }, { "completion_length": 200.0, "epoch": 0.23319982198486872, "grad_norm": 0.011826745234429836, "kl": 0.005504906177520752, "learning_rate": 4.735049243123774e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 524 }, { "completion_length": 200.0, "epoch": 0.2336448598130841, "grad_norm": 0.8087031245231628, "kl": 0.06020812317728996, "learning_rate": 4.7333062884606114e-06, "loss": 0.0024, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 525 }, { "completion_length": 167.33334350585938, "epoch": 0.2340898976412995, "grad_norm": 0.7873777151107788, "kl": 0.03628785163164139, "learning_rate": 4.731557942594956e-06, "loss": 0.0015, "reward": 0.0885000079870224, "reward_std": 0.27989909052848816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0885000079870224, "step": 526 }, { "completion_length": 187.1666717529297, "epoch": 0.2345349354695149, "grad_norm": 0.7097110152244568, "kl": 0.034623414278030396, "learning_rate": 4.729804209747313e-06, "loss": 0.0014, "reward": -0.0033333352766931057, "reward_std": 0.3099939823150635, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0033333352766931057, "step": 527 }, { "completion_length": 200.0, "epoch": 0.2349799732977303, "grad_norm": 0.0095818554982543, "kl": 0.003446843009442091, "learning_rate": 4.728045094151194e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 528 }, { "completion_length": 158.6666717529297, "epoch": 0.23542501112594572, "grad_norm": 1.1593365669250488, "kl": 0.038195449858903885, "learning_rate": 4.726280600053109e-06, "loss": 0.0015, "reward": 0.1041666716337204, "reward_std": 0.0940965861082077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 529 }, { "completion_length": 195.1666717529297, "epoch": 0.2358700489541611, "grad_norm": 0.6258925795555115, "kl": 0.021456394344568253, "learning_rate": 4.724510731712543e-06, "loss": 0.0009, "reward": 0.12516666948795319, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 530 }, { "completion_length": 185.33334350585938, "epoch": 0.2363150867823765, "grad_norm": 0.8283340930938721, "kl": 0.04360618814826012, "learning_rate": 4.722735493401961e-06, "loss": 0.0017, "reward": 0.2291666716337204, "reward_std": 0.14613065123558044, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 531 }, { "completion_length": 198.33334350585938, "epoch": 0.2367601246105919, "grad_norm": 0.6890245676040649, "kl": 0.012334956787526608, "learning_rate": 4.720954889406789e-06, "loss": 0.0005, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 532 }, { "completion_length": 200.0, "epoch": 0.2372051624388073, "grad_norm": 0.006551599130034447, "kl": 0.00498668709769845, "learning_rate": 4.719168924025407e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 533 }, { "completion_length": 190.83334350585938, "epoch": 0.2376502002670227, "grad_norm": 0.7857916355133057, "kl": 0.037860430777072906, "learning_rate": 4.7173776015691345e-06, "loss": 0.0015, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 534 }, { "completion_length": 189.1666717529297, "epoch": 0.23809523809523808, "grad_norm": 0.7188397645950317, "kl": 0.02583630383014679, "learning_rate": 4.715580926362225e-06, "loss": 0.001, "reward": 0.0755000039935112, "reward_std": 0.3398551344871521, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007833331823348999, "step": 535 }, { "completion_length": 200.0, "epoch": 0.2385402759234535, "grad_norm": 0.7328579425811768, "kl": 0.014590962789952755, "learning_rate": 4.713778902741855e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 536 }, { "completion_length": 151.6666717529297, "epoch": 0.2389853137516689, "grad_norm": 0.8885166049003601, "kl": 0.0610353946685791, "learning_rate": 4.7119715350581096e-06, "loss": 0.0024, "reward": 0.250166654586792, "reward_std": 0.15827244520187378, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1668333262205124, "step": 537 }, { "completion_length": 200.0, "epoch": 0.2394303515798843, "grad_norm": 0.6243503093719482, "kl": 0.011630935594439507, "learning_rate": 4.710158827673974e-06, "loss": 0.0005, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 538 }, { "completion_length": 188.1666717529297, "epoch": 0.2398753894080997, "grad_norm": 0.6943646669387817, "kl": 0.02668793499469757, "learning_rate": 4.708340784965326e-06, "loss": 0.0011, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 539 }, { "completion_length": 185.83334350585938, "epoch": 0.24032042723631508, "grad_norm": 0.7564919590950012, "kl": 0.036508020013570786, "learning_rate": 4.7065174113209225e-06, "loss": 0.0015, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 540 }, { "completion_length": 193.1666717529297, "epoch": 0.24076546506453048, "grad_norm": 1.0034822225570679, "kl": 0.038777224719524384, "learning_rate": 4.7046887111423865e-06, "loss": 0.0016, "reward": 0.1041666716337204, "reward_std": 0.0940965861082077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 541 }, { "completion_length": 184.83334350585938, "epoch": 0.24121050289274587, "grad_norm": 0.8872579336166382, "kl": 0.009863987565040588, "learning_rate": 4.702854688844202e-06, "loss": 0.0004, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 542 }, { "completion_length": 112.66667175292969, "epoch": 0.2416555407209613, "grad_norm": 0.8163735866546631, "kl": 0.059337299317121506, "learning_rate": 4.701015348853699e-06, "loss": 0.0024, "reward": 0.2866666913032532, "reward_std": 0.26798635721206665, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03666666895151138, "step": 543 }, { "completion_length": 111.5, "epoch": 0.2421005785491767, "grad_norm": 1.2242677211761475, "kl": 0.08483864367008209, "learning_rate": 4.699170695611047e-06, "loss": 0.0034, "reward": 0.250166654586792, "reward_std": 0.13693124055862427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1668333262205124, "step": 544 }, { "completion_length": 196.6666717529297, "epoch": 0.24254561637739208, "grad_norm": 0.6293683648109436, "kl": 0.014732254669070244, "learning_rate": 4.697320733569238e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 545 }, { "completion_length": 200.0, "epoch": 0.24299065420560748, "grad_norm": 0.015561908483505249, "kl": 0.006728894077241421, "learning_rate": 4.695465467194082e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 546 }, { "completion_length": 199.0, "epoch": 0.24343569203382287, "grad_norm": 0.6494888663291931, "kl": 0.04665082320570946, "learning_rate": 4.693604900964193e-06, "loss": 0.0019, "reward": 0.1875, "reward_std": 0.10458251088857651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 547 }, { "completion_length": 164.1666717529297, "epoch": 0.24388072986203826, "grad_norm": 0.9423882365226746, "kl": 0.06301107257604599, "learning_rate": 4.691739039370979e-06, "loss": 0.0025, "reward": 0.2291666716337204, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 548 }, { "completion_length": 191.5, "epoch": 0.24432576769025366, "grad_norm": 0.7398462891578674, "kl": 0.028107035905122757, "learning_rate": 4.68986788691863e-06, "loss": 0.0011, "reward": 0.1458333432674408, "reward_std": 0.0940965861082077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 549 }, { "completion_length": 200.0, "epoch": 0.24477080551846908, "grad_norm": 0.015271569602191448, "kl": 0.006307273171842098, "learning_rate": 4.68799144812411e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 550 }, { "completion_length": 161.1666717529297, "epoch": 0.24521584334668448, "grad_norm": 0.8253611922264099, "kl": 0.061717789620161057, "learning_rate": 4.686109727517142e-06, "loss": 0.0025, "reward": 0.2916666865348816, "reward_std": 0.23273734748363495, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 551 }, { "completion_length": 149.6666717529297, "epoch": 0.24566088117489987, "grad_norm": 0.9274349808692932, "kl": 0.060279231518507004, "learning_rate": 4.6842227296402025e-06, "loss": 0.0024, "reward": 0.2293333262205124, "reward_std": 0.18415391445159912, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1459999978542328, "step": 552 }, { "completion_length": 180.5, "epoch": 0.24610591900311526, "grad_norm": 0.7610526084899902, "kl": 0.040498070418834686, "learning_rate": 4.6823304590485025e-06, "loss": 0.0016, "reward": 0.1459999978542328, "reward_std": 0.16642116010189056, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1459999978542328, "step": 553 }, { "completion_length": 174.83334350585938, "epoch": 0.24655095683133066, "grad_norm": 0.820443868637085, "kl": 0.07258454710245132, "learning_rate": 4.680432920309986e-06, "loss": 0.0029, "reward": 0.2708333432674408, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 554 }, { "completion_length": 177.33334350585938, "epoch": 0.24699599465954605, "grad_norm": 0.7389416694641113, "kl": 0.04997410625219345, "learning_rate": 4.678530118005313e-06, "loss": 0.002, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 555 }, { "completion_length": 176.83334350585938, "epoch": 0.24744103248776145, "grad_norm": 0.7619123458862305, "kl": 0.0550689622759819, "learning_rate": 4.676622056727848e-06, "loss": 0.0022, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 556 }, { "completion_length": 180.6666717529297, "epoch": 0.24788607031597687, "grad_norm": 0.6843514442443848, "kl": 0.031852152198553085, "learning_rate": 4.674708741083651e-06, "loss": 0.0013, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 557 }, { "completion_length": 95.5, "epoch": 0.24833110814419226, "grad_norm": 1.1454741954803467, "kl": 0.08709007501602173, "learning_rate": 4.6727901756914694e-06, "loss": 0.0035, "reward": 0.375166654586792, "reward_std": 0.15811441838741302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 558 }, { "completion_length": 184.5, "epoch": 0.24877614597240766, "grad_norm": 0.6881827712059021, "kl": 0.05518307909369469, "learning_rate": 4.670866365182719e-06, "loss": 0.0022, "reward": 0.1458333432674408, "reward_std": 0.0940965861082077, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 559 }, { "completion_length": 188.1666717529297, "epoch": 0.24922118380062305, "grad_norm": 0.8532994389533997, "kl": 0.03818666934967041, "learning_rate": 4.66893731420148e-06, "loss": 0.0015, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 560 }, { "completion_length": 200.0, "epoch": 0.24966622162883845, "grad_norm": 0.8707519769668579, "kl": 0.02029397338628769, "learning_rate": 4.667003027404483e-06, "loss": 0.0008, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 561 }, { "completion_length": 167.5, "epoch": 0.25011125945705387, "grad_norm": 0.7573645710945129, "kl": 0.08380292356014252, "learning_rate": 4.665063509461098e-06, "loss": 0.0034, "reward": 0.15600000321865082, "reward_std": 0.3755556046962738, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07266666740179062, "step": 562 }, { "completion_length": 200.0, "epoch": 0.25055629728526924, "grad_norm": 0.7199256420135498, "kl": 0.027807530015707016, "learning_rate": 4.663118765053319e-06, "loss": 0.0011, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 563 }, { "completion_length": 129.33334350585938, "epoch": 0.25100133511348466, "grad_norm": 0.9969347715377808, "kl": 0.09646206349134445, "learning_rate": 4.661168798875763e-06, "loss": 0.0039, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 564 }, { "completion_length": 145.5, "epoch": 0.2514463729417, "grad_norm": 0.6874487996101379, "kl": 0.07618147879838943, "learning_rate": 4.6592136156356476e-06, "loss": 0.003, "reward": 0.25, "reward_std": 0.15811388194561005, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 565 }, { "completion_length": 200.0, "epoch": 0.25189141076991545, "grad_norm": 0.8305972814559937, "kl": 0.022951535880565643, "learning_rate": 4.6572532200527875e-06, "loss": 0.0009, "reward": -0.05400000140070915, "reward_std": 0.37837284803390503, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05400000140070915, "step": 566 }, { "completion_length": 153.1666717529297, "epoch": 0.2523364485981308, "grad_norm": 0.9877628684043884, "kl": 0.0713796615600586, "learning_rate": 4.655287616859578e-06, "loss": 0.0029, "reward": 0.187666654586792, "reward_std": 0.06828372180461884, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18766668438911438, "step": 567 }, { "completion_length": 195.33334350585938, "epoch": 0.25278148642634624, "grad_norm": 0.7665106058120728, "kl": 0.04054148495197296, "learning_rate": 4.6533168108009855e-06, "loss": 0.0016, "reward": 0.2291666716337204, "reward_std": 0.14613065123558044, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 568 }, { "completion_length": 170.33334350585938, "epoch": 0.25322652425456166, "grad_norm": 0.8183974623680115, "kl": 0.047635436058044434, "learning_rate": 4.651340806634538e-06, "loss": 0.0019, "reward": 0.3543333411216736, "reward_std": 0.12293358892202377, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187666654586792, "step": 569 }, { "completion_length": 168.6666717529297, "epoch": 0.253671562082777, "grad_norm": 0.7534694075584412, "kl": 0.04617027938365936, "learning_rate": 4.64935960913031e-06, "loss": 0.0018, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 570 }, { "completion_length": 113.16667175292969, "epoch": 0.25411659991099245, "grad_norm": 1.2481725215911865, "kl": 0.07855173945426941, "learning_rate": 4.647373223070913e-06, "loss": 0.0031, "reward": 0.2916666865348816, "reward_std": 0.10206207633018494, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 571 }, { "completion_length": 103.66667175292969, "epoch": 0.2545616377392078, "grad_norm": 1.0237010717391968, "kl": 0.11217594146728516, "learning_rate": 4.645381653251485e-06, "loss": 0.0045, "reward": 0.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 572 }, { "completion_length": 195.5, "epoch": 0.25500667556742324, "grad_norm": 0.886687695980072, "kl": 0.016482416540384293, "learning_rate": 4.643384904479675e-06, "loss": 0.0007, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 573 }, { "completion_length": 114.33333587646484, "epoch": 0.2554517133956386, "grad_norm": 0.04589557647705078, "kl": 0.08041426539421082, "learning_rate": 4.641382981575637e-06, "loss": 0.0032, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 574 }, { "completion_length": 200.0, "epoch": 0.255896751223854, "grad_norm": 0.8335212469100952, "kl": 0.060845568776130676, "learning_rate": 4.639375889372013e-06, "loss": 0.0024, "reward": 0.125, "reward_std": 0.1369306445121765, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 575 }, { "completion_length": 192.6666717529297, "epoch": 0.25634178905206945, "grad_norm": 0.8120817542076111, "kl": 0.03659413754940033, "learning_rate": 4.637363632713924e-06, "loss": 0.0015, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 576 }, { "completion_length": 181.6666717529297, "epoch": 0.2567868268802848, "grad_norm": 0.723743736743927, "kl": 0.028544750064611435, "learning_rate": 4.63534621645896e-06, "loss": 0.0011, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 577 }, { "completion_length": 123.0, "epoch": 0.25723186470850024, "grad_norm": 0.8822214007377625, "kl": 0.15481743216514587, "learning_rate": 4.6333236454771644e-06, "loss": 0.0062, "reward": 0.3333333432674408, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 578 }, { "completion_length": 196.0, "epoch": 0.2576769025367156, "grad_norm": 0.8370240926742554, "kl": 0.045977018773555756, "learning_rate": 4.6312959246510245e-06, "loss": 0.0018, "reward": 0.1875, "reward_std": 0.10458251088857651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 579 }, { "completion_length": 150.6666717529297, "epoch": 0.258121940364931, "grad_norm": 0.7318511009216309, "kl": 0.05776175111532211, "learning_rate": 4.629263058875458e-06, "loss": 0.0023, "reward": 0.2291666716337204, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 580 }, { "completion_length": 155.0, "epoch": 0.2585669781931464, "grad_norm": 0.9142718315124512, "kl": 0.069237619638443, "learning_rate": 4.627225053057806e-06, "loss": 0.0028, "reward": 0.3333333432674408, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 581 }, { "completion_length": 165.0, "epoch": 0.2590120160213618, "grad_norm": 0.7616869807243347, "kl": 0.0599093958735466, "learning_rate": 4.6251819121178145e-06, "loss": 0.0024, "reward": 0.3125, "reward_std": 0.1530931293964386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 582 }, { "completion_length": 154.6666717529297, "epoch": 0.25945705384957723, "grad_norm": 0.806352972984314, "kl": 0.08792692422866821, "learning_rate": 4.623133640987628e-06, "loss": 0.0035, "reward": 0.2918333411216736, "reward_std": 0.1881493180990219, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 583 }, { "completion_length": 189.1666717529297, "epoch": 0.2599020916777926, "grad_norm": 0.7354152798652649, "kl": 0.03302110731601715, "learning_rate": 4.621080244611772e-06, "loss": 0.0013, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 584 }, { "completion_length": 197.6666717529297, "epoch": 0.260347129506008, "grad_norm": 0.6742915511131287, "kl": 0.032543033361434937, "learning_rate": 4.619021727947147e-06, "loss": 0.0013, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 585 }, { "completion_length": 151.83334350585938, "epoch": 0.2607921673342234, "grad_norm": 0.9699811935424805, "kl": 0.08444561809301376, "learning_rate": 4.616958095963014e-06, "loss": 0.0034, "reward": 0.1276666671037674, "reward_std": 0.48804986476898193, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1223333403468132, "step": 586 }, { "completion_length": 169.1666717529297, "epoch": 0.2612372051624388, "grad_norm": 1.1708648204803467, "kl": 0.04619307070970535, "learning_rate": 4.6148893536409815e-06, "loss": 0.0018, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 587 }, { "completion_length": 109.33333587646484, "epoch": 0.2616822429906542, "grad_norm": 0.978725016117096, "kl": 0.08659862726926804, "learning_rate": 4.612815505974993e-06, "loss": 0.0035, "reward": 0.3333333432674408, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 588 }, { "completion_length": 185.5, "epoch": 0.2621272808188696, "grad_norm": 0.8235928416252136, "kl": 0.052005793899297714, "learning_rate": 4.610736557971321e-06, "loss": 0.0021, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 589 }, { "completion_length": 174.1666717529297, "epoch": 0.262572318647085, "grad_norm": 0.976612389087677, "kl": 0.045445047318935394, "learning_rate": 4.608652514648544e-06, "loss": 0.0018, "reward": 0.2916666865348816, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 590 }, { "completion_length": 200.0, "epoch": 0.2630173564753004, "grad_norm": 0.014436294324696064, "kl": 0.03448406979441643, "learning_rate": 4.606563381037544e-06, "loss": 0.0014, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 591 }, { "completion_length": 200.0, "epoch": 0.2634623943035158, "grad_norm": 0.8270597457885742, "kl": 0.034238051623106, "learning_rate": 4.604469162181492e-06, "loss": 0.0014, "reward": 0.01300000213086605, "reward_std": 0.2962499260902405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01300000213086605, "step": 592 }, { "completion_length": 148.0, "epoch": 0.2639074321317312, "grad_norm": 0.8803796768188477, "kl": 0.06960055232048035, "learning_rate": 4.6023698631358326e-06, "loss": 0.0028, "reward": 0.437666654586792, "reward_std": 0.10446371138095856, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 593 }, { "completion_length": 200.0, "epoch": 0.2643524699599466, "grad_norm": 0.7200313210487366, "kl": 0.05139869078993797, "learning_rate": 4.6002654889682755e-06, "loss": 0.0021, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 594 }, { "completion_length": 193.6666717529297, "epoch": 0.26479750778816197, "grad_norm": 0.7966943979263306, "kl": 0.040248602628707886, "learning_rate": 4.598156044758779e-06, "loss": 0.0016, "reward": 0.1458333432674408, "reward_std": 0.12289901822805405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 595 }, { "completion_length": 200.0, "epoch": 0.2652425456163774, "grad_norm": 0.755216121673584, "kl": 0.019312169402837753, "learning_rate": 4.5960415355995444e-06, "loss": 0.0008, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 596 }, { "completion_length": 186.1666717529297, "epoch": 0.2656875834445928, "grad_norm": 0.853950023651123, "kl": 0.039613865315914154, "learning_rate": 4.593921966594997e-06, "loss": 0.0016, "reward": 0.026833336800336838, "reward_std": 0.4285675883293152, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.026833336800336838, "step": 597 }, { "completion_length": 191.1666717529297, "epoch": 0.2661326212728082, "grad_norm": 0.7605770826339722, "kl": 0.05703141912817955, "learning_rate": 4.591797342861778e-06, "loss": 0.0023, "reward": 0.1875, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 598 }, { "completion_length": 200.0, "epoch": 0.2665776591010236, "grad_norm": 0.8873202800750732, "kl": 0.03285577893257141, "learning_rate": 4.589667669528729e-06, "loss": 0.0013, "reward": 0.0833333358168602, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 599 }, { "completion_length": 152.5, "epoch": 0.26702269692923897, "grad_norm": 0.7901185154914856, "kl": 0.07483705878257751, "learning_rate": 4.587532951736884e-06, "loss": 0.003, "reward": 0.2916666865348816, "reward_std": 0.1881931722164154, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 600 }, { "completion_length": 147.1666717529297, "epoch": 0.2674677347574544, "grad_norm": 0.9961167573928833, "kl": 0.0769721046090126, "learning_rate": 4.585393194639452e-06, "loss": 0.0031, "reward": 0.3333333432674408, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 601 }, { "completion_length": 200.0, "epoch": 0.26791277258566976, "grad_norm": 0.7630622386932373, "kl": 0.035730935633182526, "learning_rate": 4.583248403401808e-06, "loss": 0.0014, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 602 }, { "completion_length": 160.33334350585938, "epoch": 0.2683578104138852, "grad_norm": 0.896909236907959, "kl": 0.07461149990558624, "learning_rate": 4.581098583201478e-06, "loss": 0.003, "reward": 0.375, "reward_std": 0.1369306445121765, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 603 }, { "completion_length": 116.83333587646484, "epoch": 0.2688028482421006, "grad_norm": 0.9277684688568115, "kl": 0.08133234083652496, "learning_rate": 4.578943739228131e-06, "loss": 0.0033, "reward": 0.24300000071525574, "reward_std": 0.22485819458961487, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15966667234897614, "step": 604 }, { "completion_length": 184.83334350585938, "epoch": 0.26924788607031597, "grad_norm": 0.7694399952888489, "kl": 0.058748748153448105, "learning_rate": 4.576783876683559e-06, "loss": 0.0023, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 605 }, { "completion_length": 136.33334350585938, "epoch": 0.2696929238985314, "grad_norm": 0.9464454054832458, "kl": 0.054710254073143005, "learning_rate": 4.574619000781674e-06, "loss": 0.0022, "reward": 0.250333309173584, "reward_std": 0.1122509092092514, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250333309173584, "step": 606 }, { "completion_length": 179.6666717529297, "epoch": 0.27013796172674676, "grad_norm": 0.8664156198501587, "kl": 0.07143112272024155, "learning_rate": 4.572449116748485e-06, "loss": 0.0029, "reward": 0.3333333432674408, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 607 }, { "completion_length": 200.0, "epoch": 0.2705829995549622, "grad_norm": 0.9034672975540161, "kl": 0.026494070887565613, "learning_rate": 4.570274229822095e-06, "loss": 0.0011, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 608 }, { "completion_length": 200.0, "epoch": 0.27102803738317754, "grad_norm": 0.6816303133964539, "kl": 0.04936773702502251, "learning_rate": 4.5680943452526814e-06, "loss": 0.002, "reward": -0.1028333306312561, "reward_std": 0.36173439025878906, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1028333306312561, "step": 609 }, { "completion_length": 89.5, "epoch": 0.27147307521139297, "grad_norm": 1.2434215545654297, "kl": 0.10808803886175156, "learning_rate": 4.565909468302486e-06, "loss": 0.0043, "reward": 0.382999986410141, "reward_std": 0.14375397562980652, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13300000131130219, "step": 610 }, { "completion_length": 200.0, "epoch": 0.2719181130396084, "grad_norm": 0.03729372099041939, "kl": 0.05879303440451622, "learning_rate": 4.563719604245804e-06, "loss": 0.0024, "reward": 0.0, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 611 }, { "completion_length": 171.83334350585938, "epoch": 0.27236315086782376, "grad_norm": 0.7619086503982544, "kl": 0.14301355183124542, "learning_rate": 4.561524758368968e-06, "loss": 0.0057, "reward": 0.22450000047683716, "reward_std": 0.4011252820491791, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.025499999523162842, "step": 612 }, { "completion_length": 161.83334350585938, "epoch": 0.2728081886960392, "grad_norm": 0.8366971015930176, "kl": 0.07396448403596878, "learning_rate": 4.559324935970337e-06, "loss": 0.003, "reward": 0.3543333411216736, "reward_std": 0.22915641963481903, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021000001579523087, "step": 613 }, { "completion_length": 178.1666717529297, "epoch": 0.27325322652425454, "grad_norm": 0.9588587880134583, "kl": 0.06857272237539291, "learning_rate": 4.5571201423602825e-06, "loss": 0.0027, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 614 }, { "completion_length": 185.83334350585938, "epoch": 0.27369826435246997, "grad_norm": 0.7504873871803284, "kl": 0.07397520542144775, "learning_rate": 4.554910382861178e-06, "loss": 0.003, "reward": 0.2916666865348816, "reward_std": 0.2457980364561081, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 615 }, { "completion_length": 199.33334350585938, "epoch": 0.27414330218068533, "grad_norm": 0.8504765629768372, "kl": 0.06764481961727142, "learning_rate": 4.552695662807385e-06, "loss": 0.0027, "reward": 0.1459999978542328, "reward_std": 0.12306909263134003, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1459999978542328, "step": 616 }, { "completion_length": 91.66667175292969, "epoch": 0.27458834000890076, "grad_norm": 0.24826188385486603, "kl": 0.1672677993774414, "learning_rate": 4.550475987545238e-06, "loss": 0.0067, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 617 }, { "completion_length": 187.0, "epoch": 0.2750333778371162, "grad_norm": 0.9045337438583374, "kl": 0.07695237547159195, "learning_rate": 4.548251362433033e-06, "loss": 0.0031, "reward": 0.1458333432674408, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 618 }, { "completion_length": 106.66667175292969, "epoch": 0.27547841566533154, "grad_norm": 0.9984831213951111, "kl": 0.17150583863258362, "learning_rate": 4.546021792841019e-06, "loss": 0.0069, "reward": 0.3543333411216736, "reward_std": 0.22915641963481903, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021000001579523087, "step": 619 }, { "completion_length": 86.0, "epoch": 0.27592345349354697, "grad_norm": 0.9944882392883301, "kl": 0.17101812362670898, "learning_rate": 4.543787284151374e-06, "loss": 0.0068, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 620 }, { "completion_length": 141.5, "epoch": 0.27636849132176233, "grad_norm": 1.0072808265686035, "kl": 0.11485801637172699, "learning_rate": 4.541547841758207e-06, "loss": 0.0046, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 621 }, { "completion_length": 156.0, "epoch": 0.27681352914997776, "grad_norm": 1.1295337677001953, "kl": 0.121914803981781, "learning_rate": 4.539303471067531e-06, "loss": 0.0049, "reward": 0.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 622 }, { "completion_length": 168.5, "epoch": 0.2772585669781931, "grad_norm": 0.791140615940094, "kl": 0.06966628134250641, "learning_rate": 4.537054177497259e-06, "loss": 0.0028, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 623 }, { "completion_length": 120.0, "epoch": 0.27770360480640854, "grad_norm": 0.9863029718399048, "kl": 0.10272692143917084, "learning_rate": 4.534799966477186e-06, "loss": 0.0041, "reward": 0.3959999978542328, "reward_std": 0.1658191978931427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06266666948795319, "step": 624 }, { "completion_length": 200.0, "epoch": 0.27814864263462397, "grad_norm": 0.8481348752975464, "kl": 0.06225643306970596, "learning_rate": 4.532540843448979e-06, "loss": 0.0025, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 625 }, { "completion_length": 144.0, "epoch": 0.27859368046283933, "grad_norm": 0.9757879376411438, "kl": 0.09962724894285202, "learning_rate": 4.530276813866162e-06, "loss": 0.004, "reward": 0.31316667795181274, "reward_std": 0.0688314288854599, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 626 }, { "completion_length": 194.6666717529297, "epoch": 0.27903871829105475, "grad_norm": 0.931898295879364, "kl": 0.07902925461530685, "learning_rate": 4.528007883194102e-06, "loss": 0.0032, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 627 }, { "completion_length": 190.83334350585938, "epoch": 0.2794837561192701, "grad_norm": 0.831519365310669, "kl": 0.06486264616250992, "learning_rate": 4.525734056910002e-06, "loss": 0.0026, "reward": 0.2918333411216736, "reward_std": 0.2042061984539032, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 628 }, { "completion_length": 111.33333587646484, "epoch": 0.27992879394748554, "grad_norm": 0.9687957167625427, "kl": 0.1136898398399353, "learning_rate": 4.523455340502878e-06, "loss": 0.0045, "reward": 0.375166654586792, "reward_std": 0.15811441838741302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12516666948795319, "step": 629 }, { "completion_length": 120.66667175292969, "epoch": 0.2803738317757009, "grad_norm": 0.0987262949347496, "kl": 0.11421214044094086, "learning_rate": 4.521171739473552e-06, "loss": 0.0046, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 630 }, { "completion_length": 120.83333587646484, "epoch": 0.28081886960391633, "grad_norm": 0.8952674269676208, "kl": 0.09326162189245224, "learning_rate": 4.5188832593346386e-06, "loss": 0.0037, "reward": 0.4583333432674408, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 631 }, { "completion_length": 112.33333587646484, "epoch": 0.28126390743213175, "grad_norm": 0.06504479050636292, "kl": 0.12895925343036652, "learning_rate": 4.51658990561053e-06, "loss": 0.0052, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 632 }, { "completion_length": 118.16667175292969, "epoch": 0.2817089452603471, "grad_norm": 1.197392463684082, "kl": 0.39798209071159363, "learning_rate": 4.514291683837383e-06, "loss": 0.0159, "reward": 0.2523333430290222, "reward_std": 0.41331908106803894, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08100000023841858, "step": 633 }, { "completion_length": 141.33334350585938, "epoch": 0.28215398308856254, "grad_norm": 1.023812174797058, "kl": 0.0909874364733696, "learning_rate": 4.511988599563107e-06, "loss": 0.0036, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 634 }, { "completion_length": 107.5, "epoch": 0.2825990209167779, "grad_norm": 1.506011962890625, "kl": 0.10861188173294067, "learning_rate": 4.509680658347347e-06, "loss": 0.0043, "reward": 0.3959999978542328, "reward_std": 0.2002398669719696, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06266666948795319, "step": 635 }, { "completion_length": 150.1666717529297, "epoch": 0.28304405874499333, "grad_norm": 1.0660347938537598, "kl": 0.1432390809059143, "learning_rate": 4.507367865761476e-06, "loss": 0.0057, "reward": 0.3543333411216736, "reward_std": 0.20028147101402283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 636 }, { "completion_length": 190.33334350585938, "epoch": 0.2834890965732087, "grad_norm": 0.9527061581611633, "kl": 0.07011144608259201, "learning_rate": 4.505050227388575e-06, "loss": 0.0028, "reward": 0.1875, "reward_std": 0.246855229139328, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 637 }, { "completion_length": 117.16667175292969, "epoch": 0.2839341344014241, "grad_norm": 1.230230689048767, "kl": 0.09981067478656769, "learning_rate": 4.502727748823425e-06, "loss": 0.004, "reward": 0.2084999978542328, "reward_std": 0.23266606032848358, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04183333367109299, "step": 638 }, { "completion_length": 129.0, "epoch": 0.28437917222963954, "grad_norm": 0.8897629976272583, "kl": 0.09878586232662201, "learning_rate": 4.50040043567249e-06, "loss": 0.004, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 639 }, { "completion_length": 92.83333587646484, "epoch": 0.2848242100578549, "grad_norm": 0.9260250926017761, "kl": 0.08705293387174606, "learning_rate": 4.498068293553906e-06, "loss": 0.0035, "reward": 0.437666654586792, "reward_std": 0.15268486738204956, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021000001579523087, "step": 640 }, { "completion_length": 195.5, "epoch": 0.28526924788607033, "grad_norm": 0.7902094125747681, "kl": 0.07190477102994919, "learning_rate": 4.495731328097464e-06, "loss": 0.0029, "reward": 0.1666666716337204, "reward_std": 0.1881931722164154, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 641 }, { "completion_length": 200.0, "epoch": 0.2857142857142857, "grad_norm": 0.8260470032691956, "kl": 0.0487191379070282, "learning_rate": 4.4933895449446e-06, "loss": 0.0019, "reward": 0.1666666716337204, "reward_std": 0.12909945845603943, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 642 }, { "completion_length": 126.33333587646484, "epoch": 0.2861593235425011, "grad_norm": 0.7838308811187744, "kl": 0.12842848896980286, "learning_rate": 4.491042949748381e-06, "loss": 0.0051, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 643 }, { "completion_length": 154.6666717529297, "epoch": 0.2866043613707165, "grad_norm": 0.820894181728363, "kl": 0.060759272426366806, "learning_rate": 4.488691548173487e-06, "loss": 0.0024, "reward": 0.28716668486595154, "reward_std": 0.21236613392829895, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12049999833106995, "step": 644 }, { "completion_length": 147.33334350585938, "epoch": 0.2870493991989319, "grad_norm": 0.7532824873924255, "kl": 0.0968950018286705, "learning_rate": 4.486335345896204e-06, "loss": 0.0039, "reward": 0.4166666865348816, "reward_std": 0.20412415266036987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 645 }, { "completion_length": 179.5, "epoch": 0.28749443702714733, "grad_norm": 0.7337256073951721, "kl": 0.0560641810297966, "learning_rate": 4.483974348604407e-06, "loss": 0.0022, "reward": 0.20866666734218597, "reward_std": 0.12942281365394592, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20866666734218597, "step": 646 }, { "completion_length": 191.33334350585938, "epoch": 0.2879394748553627, "grad_norm": 0.665947675704956, "kl": 0.04197518154978752, "learning_rate": 4.48160856199754e-06, "loss": 0.0017, "reward": 0.187666654586792, "reward_std": 0.20557884871959686, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 647 }, { "completion_length": 168.33334350585938, "epoch": 0.2883845126835781, "grad_norm": 0.7467770576477051, "kl": 0.05759914964437485, "learning_rate": 4.479237991786617e-06, "loss": 0.0023, "reward": 0.10266666859388351, "reward_std": 0.39400848746299744, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019333332777023315, "step": 648 }, { "completion_length": 200.0, "epoch": 0.2888295505117935, "grad_norm": 0.7417876720428467, "kl": 0.05739980190992355, "learning_rate": 4.476862643694194e-06, "loss": 0.0023, "reward": 0.0416666679084301, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 649 }, { "completion_length": 196.83334350585938, "epoch": 0.2892745883400089, "grad_norm": 0.6486422419548035, "kl": 0.05907044932246208, "learning_rate": 4.474482523454363e-06, "loss": 0.0024, "reward": 0.1041666716337204, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 650 }, { "completion_length": 132.1666717529297, "epoch": 0.2897196261682243, "grad_norm": 0.8684423565864563, "kl": 0.05625095218420029, "learning_rate": 4.472097636812736e-06, "loss": 0.0022, "reward": 0.437666654586792, "reward_std": 0.15268486738204956, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021000001579523087, "step": 651 }, { "completion_length": 143.0, "epoch": 0.2901646639964397, "grad_norm": 0.8793902397155762, "kl": 0.07028966397047043, "learning_rate": 4.469707989526429e-06, "loss": 0.0028, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 652 }, { "completion_length": 177.5, "epoch": 0.2906097018246551, "grad_norm": 0.9311726093292236, "kl": 0.05447518453001976, "learning_rate": 4.467313587364053e-06, "loss": 0.0022, "reward": 0.2708333432674408, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 653 }, { "completion_length": 165.33334350585938, "epoch": 0.2910547396528705, "grad_norm": 0.6879425644874573, "kl": 0.06616301834583282, "learning_rate": 4.464914436105695e-06, "loss": 0.0026, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 654 }, { "completion_length": 200.0, "epoch": 0.2914997774810859, "grad_norm": 0.6475021243095398, "kl": 0.039877697825431824, "learning_rate": 4.462510541542909e-06, "loss": 0.0016, "reward": 0.04933333396911621, "reward_std": 0.38236457109451294, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04933333396911621, "step": 655 }, { "completion_length": 143.83334350585938, "epoch": 0.2919448153093013, "grad_norm": 0.9102987051010132, "kl": 0.09255407750606537, "learning_rate": 4.460101909478696e-06, "loss": 0.0037, "reward": 0.14616666734218597, "reward_std": 0.1465870589017868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14616666734218597, "step": 656 }, { "completion_length": 126.16667175292969, "epoch": 0.2923898531375167, "grad_norm": 0.045655757188797, "kl": 0.07757769525051117, "learning_rate": 4.457688545727496e-06, "loss": 0.0031, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 657 }, { "completion_length": 191.6666717529297, "epoch": 0.29283489096573206, "grad_norm": 0.6900054216384888, "kl": 0.1448201835155487, "learning_rate": 4.45527045611517e-06, "loss": 0.0058, "reward": 0.010500004515051842, "reward_std": 0.3959311842918396, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07283332943916321, "step": 658 }, { "completion_length": 119.33333587646484, "epoch": 0.2932799287939475, "grad_norm": 0.9801993370056152, "kl": 0.10018780082464218, "learning_rate": 4.452847646478987e-06, "loss": 0.004, "reward": 0.35483333468437195, "reward_std": 0.0940093994140625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27149999141693115, "step": 659 }, { "completion_length": 193.5, "epoch": 0.2937249666221629, "grad_norm": 0.7721496224403381, "kl": 0.04845193028450012, "learning_rate": 4.4504201226676124e-06, "loss": 0.0019, "reward": 0.2708333432674408, "reward_std": 0.25515520572662354, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 660 }, { "completion_length": 164.0, "epoch": 0.2941700044503783, "grad_norm": 0.8990196585655212, "kl": 0.038245782256126404, "learning_rate": 4.4479878905410875e-06, "loss": 0.0015, "reward": 0.3959999978542328, "reward_std": 0.12286578863859177, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1459999978542328, "step": 661 }, { "completion_length": 197.0, "epoch": 0.2946150422785937, "grad_norm": 0.7893727421760559, "kl": 0.04667172580957413, "learning_rate": 4.445550955970823e-06, "loss": 0.0019, "reward": 0.1458333432674408, "reward_std": 0.12289901822805405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 662 }, { "completion_length": 110.33333587646484, "epoch": 0.29506008010680906, "grad_norm": 0.8910045623779297, "kl": 0.08671041578054428, "learning_rate": 4.443109324839581e-06, "loss": 0.0035, "reward": 0.4375, "reward_std": 0.1530931293964386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 663 }, { "completion_length": 193.6666717529297, "epoch": 0.2955051179350245, "grad_norm": 0.7910698056221008, "kl": 0.043277036398649216, "learning_rate": 4.440663003041459e-06, "loss": 0.0017, "reward": 0.2291666716337204, "reward_std": 0.22935599088668823, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 664 }, { "completion_length": 186.33334350585938, "epoch": 0.29595015576323985, "grad_norm": 0.8543537855148315, "kl": 0.06220155954360962, "learning_rate": 4.43821199648188e-06, "loss": 0.0025, "reward": 0.1666666716337204, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 665 }, { "completion_length": 172.1666717529297, "epoch": 0.2963951935914553, "grad_norm": 0.8467397093772888, "kl": 0.05557259917259216, "learning_rate": 4.435756311077573e-06, "loss": 0.0022, "reward": 0.250166654586792, "reward_std": 0.2372765839099884, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08349999785423279, "step": 666 }, { "completion_length": 111.66667175292969, "epoch": 0.2968402314196707, "grad_norm": 0.0691637396812439, "kl": 0.09109967201948166, "learning_rate": 4.4332959527565666e-06, "loss": 0.0036, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 667 }, { "completion_length": 171.5, "epoch": 0.29728526924788606, "grad_norm": 0.6657293438911438, "kl": 0.06025252863764763, "learning_rate": 4.430830927458166e-06, "loss": 0.0024, "reward": 0.3961666524410248, "reward_std": 0.14607451856136322, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14616666734218597, "step": 668 }, { "completion_length": 119.66667175292969, "epoch": 0.2977303070761015, "grad_norm": 0.9089164137840271, "kl": 0.08549092710018158, "learning_rate": 4.428361241132943e-06, "loss": 0.0034, "reward": 0.375166654586792, "reward_std": 0.19339123368263245, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04183333367109299, "step": 669 }, { "completion_length": 200.0, "epoch": 0.29817534490431685, "grad_norm": 0.6968740820884705, "kl": 0.04054586589336395, "learning_rate": 4.425886899742722e-06, "loss": 0.0016, "reward": 0.1458333432674408, "reward_std": 0.12289901822805405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 670 }, { "completion_length": 121.33333587646484, "epoch": 0.2986203827325323, "grad_norm": 1.0022284984588623, "kl": 0.08514213562011719, "learning_rate": 4.423407909260564e-06, "loss": 0.0034, "reward": 0.437666654586792, "reward_std": 0.10446371138095856, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 671 }, { "completion_length": 184.1666717529297, "epoch": 0.29906542056074764, "grad_norm": 0.7887581586837769, "kl": 0.038804031908512115, "learning_rate": 4.420924275670753e-06, "loss": 0.0016, "reward": 0.2083333432674408, "reward_std": 0.17078250646591187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 672 }, { "completion_length": 198.0, "epoch": 0.29951045838896306, "grad_norm": 0.8230785131454468, "kl": 0.04383291304111481, "learning_rate": 4.4184360049687826e-06, "loss": 0.0018, "reward": 0.2293333262205124, "reward_std": 0.12313678115606308, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 673 }, { "completion_length": 200.0, "epoch": 0.2999554962171785, "grad_norm": 0.7423299551010132, "kl": 0.03673369064927101, "learning_rate": 4.41594310316134e-06, "loss": 0.0015, "reward": 0.0625, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 674 }, { "completion_length": 137.0, "epoch": 0.30040053404539385, "grad_norm": 0.8925609588623047, "kl": 0.08815717697143555, "learning_rate": 4.4134455762662895e-06, "loss": 0.0035, "reward": 0.4166666865348816, "reward_std": 0.20412413775920868, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.4166666865348816, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 675 }, { "completion_length": 97.5, "epoch": 0.3008455718736093, "grad_norm": 0.03086089715361595, "kl": 0.06561337411403656, "learning_rate": 4.410943430312663e-06, "loss": 0.0026, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 676 }, { "completion_length": 200.0, "epoch": 0.30129060970182464, "grad_norm": 0.03762960061430931, "kl": 0.025554362684488297, "learning_rate": 4.408436671340643e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 677 }, { "completion_length": 193.6666717529297, "epoch": 0.30173564753004006, "grad_norm": 0.7618845701217651, "kl": 0.03975854814052582, "learning_rate": 4.405925305401547e-06, "loss": 0.0016, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 678 }, { "completion_length": 143.0, "epoch": 0.30218068535825543, "grad_norm": 1.127150535583496, "kl": 0.08482104539871216, "learning_rate": 4.4034093385578125e-06, "loss": 0.0034, "reward": 0.3336666524410248, "reward_std": 0.21874704957008362, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08366666734218597, "step": 679 }, { "completion_length": 144.5, "epoch": 0.30262572318647085, "grad_norm": 0.03559992089867592, "kl": 0.05165047571063042, "learning_rate": 4.400888776882985e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 680 }, { "completion_length": 150.0, "epoch": 0.3030707610146863, "grad_norm": 0.8135696649551392, "kl": 0.07061418145895004, "learning_rate": 4.398363626461702e-06, "loss": 0.0028, "reward": 0.3543333411216736, "reward_std": 0.20028147101402283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10433333367109299, "step": 681 }, { "completion_length": 200.0, "epoch": 0.30351579884290164, "grad_norm": 0.7969770431518555, "kl": 0.03894883021712303, "learning_rate": 4.395833893389676e-06, "loss": 0.0016, "reward": 0.12433333694934845, "reward_std": 0.3078192174434662, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12433333694934845, "step": 682 }, { "completion_length": 200.0, "epoch": 0.30396083667111706, "grad_norm": 0.7363507747650146, "kl": 0.03019530698657036, "learning_rate": 4.393299583773688e-06, "loss": 0.0012, "reward": 0.125, "reward_std": 0.11180339753627777, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 683 }, { "completion_length": 195.5, "epoch": 0.30440587449933243, "grad_norm": 0.7232860326766968, "kl": 0.03277706354856491, "learning_rate": 4.390760703731559e-06, "loss": 0.0013, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 684 }, { "completion_length": 198.5, "epoch": 0.30485091232754785, "grad_norm": 0.7945019602775574, "kl": 0.04717344790697098, "learning_rate": 4.388217259392148e-06, "loss": 0.0019, "reward": 0.14266666769981384, "reward_std": 0.3285280168056488, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14266666769981384, "step": 685 }, { "completion_length": 186.0, "epoch": 0.3052959501557632, "grad_norm": 0.7385047078132629, "kl": 0.04403085261583328, "learning_rate": 4.38566925689533e-06, "loss": 0.0018, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 686 }, { "completion_length": 163.6666717529297, "epoch": 0.30574098798397864, "grad_norm": 0.856914758682251, "kl": 0.05494067072868347, "learning_rate": 4.383116702391988e-06, "loss": 0.0022, "reward": 0.35466668009757996, "reward_std": 0.09423092007637024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 687 }, { "completion_length": 196.6666717529297, "epoch": 0.30618602581219406, "grad_norm": 0.8300358057022095, "kl": 0.043124958872795105, "learning_rate": 4.3805596020439845e-06, "loss": 0.0017, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 688 }, { "completion_length": 169.6666717529297, "epoch": 0.30663106364040943, "grad_norm": 0.749025285243988, "kl": 0.05098932236433029, "learning_rate": 4.3779979620241644e-06, "loss": 0.002, "reward": 0.20083332061767578, "reward_std": 0.40986746549606323, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03416666388511658, "step": 689 }, { "completion_length": 163.1666717529297, "epoch": 0.30707610146862485, "grad_norm": 0.9226408004760742, "kl": 0.055235203355550766, "learning_rate": 4.375431788516326e-06, "loss": 0.0022, "reward": 0.39633333683013916, "reward_std": 0.09396524727344513, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22966668009757996, "step": 690 }, { "completion_length": 145.1666717529297, "epoch": 0.3075211392968402, "grad_norm": 0.8971224427223206, "kl": 0.05605591833591461, "learning_rate": 4.372861087715215e-06, "loss": 0.0022, "reward": 0.4586666524410248, "reward_std": 0.06403334438800812, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12533333897590637, "step": 691 }, { "completion_length": 149.5, "epoch": 0.30796617712505564, "grad_norm": 0.8178679943084717, "kl": 0.05255182832479477, "learning_rate": 4.3702858658265044e-06, "loss": 0.0021, "reward": 0.3958333432674408, "reward_std": 0.16614501178264618, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 692 }, { "completion_length": 200.0, "epoch": 0.308411214953271, "grad_norm": 0.8197976350784302, "kl": 0.03835117816925049, "learning_rate": 4.367706129066781e-06, "loss": 0.0015, "reward": 0.1875, "reward_std": 0.10458251088857651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 693 }, { "completion_length": 172.83334350585938, "epoch": 0.30885625278148643, "grad_norm": 0.8643909692764282, "kl": 0.04992937296628952, "learning_rate": 4.36512188366353e-06, "loss": 0.002, "reward": 0.27116668224334717, "reward_std": 0.09453975409269333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27116668224334717, "step": 694 }, { "completion_length": 200.0, "epoch": 0.30930129060970185, "grad_norm": 0.7522907853126526, "kl": 0.04939102381467819, "learning_rate": 4.36253313585512e-06, "loss": 0.002, "reward": 0.02083333395421505, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02083333395421505, "step": 695 }, { "completion_length": 197.6666717529297, "epoch": 0.3097463284379172, "grad_norm": 0.7550695538520813, "kl": 0.02722327411174774, "learning_rate": 4.359939891890793e-06, "loss": 0.0011, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 696 }, { "completion_length": 200.0, "epoch": 0.31019136626613264, "grad_norm": 0.023237159475684166, "kl": 0.013823822140693665, "learning_rate": 4.357342158030638e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 697 }, { "completion_length": 200.0, "epoch": 0.310636404094348, "grad_norm": 0.6470344066619873, "kl": 0.028774775564670563, "learning_rate": 4.354739940545587e-06, "loss": 0.0012, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 698 }, { "completion_length": 178.0, "epoch": 0.31108144192256343, "grad_norm": 0.8396903872489929, "kl": 0.036307577043771744, "learning_rate": 4.352133245717393e-06, "loss": 0.0015, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 699 }, { "completion_length": 194.0, "epoch": 0.3115264797507788, "grad_norm": 0.8275584578514099, "kl": 0.03899259492754936, "learning_rate": 4.349522079838622e-06, "loss": 0.0016, "reward": 0.125, "reward_std": 0.07905694097280502, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 700 }, { "completion_length": 185.6666717529297, "epoch": 0.3119715175789942, "grad_norm": 0.043758708983659744, "kl": 0.049132104963064194, "learning_rate": 4.346906449212627e-06, "loss": 0.002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 701 }, { "completion_length": 196.83334350585938, "epoch": 0.31241655540720964, "grad_norm": 0.8897082209587097, "kl": 0.03444843739271164, "learning_rate": 4.344286360153541e-06, "loss": 0.0014, "reward": 0.1875, "reward_std": 0.18957190215587616, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 702 }, { "completion_length": 177.0, "epoch": 0.312861593235425, "grad_norm": 0.8655028343200684, "kl": 0.05428258329629898, "learning_rate": 4.341661818986263e-06, "loss": 0.0022, "reward": 0.3966667056083679, "reward_std": 0.05062280222773552, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 703 }, { "completion_length": 200.0, "epoch": 0.31330663106364043, "grad_norm": 0.7451438903808594, "kl": 0.03979836031794548, "learning_rate": 4.339032832046434e-06, "loss": 0.0016, "reward": 0.187666654586792, "reward_std": 0.13138745725154877, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187666654586792, "step": 704 }, { "completion_length": 197.1666717529297, "epoch": 0.3137516688918558, "grad_norm": 0.7860096096992493, "kl": 0.04280473291873932, "learning_rate": 4.336399405680432e-06, "loss": 0.0017, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 705 }, { "completion_length": 200.0, "epoch": 0.3141967067200712, "grad_norm": 0.9517030715942383, "kl": 0.07379502058029175, "learning_rate": 4.333761546245348e-06, "loss": 0.003, "reward": 0.0416666679084301, "reward_std": 0.10206207633018494, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 706 }, { "completion_length": 180.1666717529297, "epoch": 0.3146417445482866, "grad_norm": 0.8798065185546875, "kl": 0.050519704818725586, "learning_rate": 4.331119260108977e-06, "loss": 0.002, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 707 }, { "completion_length": 190.1666717529297, "epoch": 0.315086782376502, "grad_norm": 0.7868778705596924, "kl": 0.07131218910217285, "learning_rate": 4.328472553649799e-06, "loss": 0.0029, "reward": 0.125, "reward_std": 0.20916502177715302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0416666679084301, "step": 708 }, { "completion_length": 200.0, "epoch": 0.31553182020471743, "grad_norm": 0.029699545353651047, "kl": 0.04753357172012329, "learning_rate": 4.325821433256963e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 709 }, { "completion_length": 188.83334350585938, "epoch": 0.3159768580329328, "grad_norm": 0.7864294648170471, "kl": 0.04412949085235596, "learning_rate": 4.323165905330277e-06, "loss": 0.0018, "reward": -0.10866667330265045, "reward_std": 0.38256901502609253, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10866667330265045, "step": 710 }, { "completion_length": 183.6666717529297, "epoch": 0.3164218958611482, "grad_norm": 0.8975881338119507, "kl": 0.04495091363787651, "learning_rate": 4.320505976280186e-06, "loss": 0.0018, "reward": 0.35466668009757996, "reward_std": 0.09423092007637024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 711 }, { "completion_length": 184.0, "epoch": 0.3168669336893636, "grad_norm": 0.7394328117370605, "kl": 0.08109882473945618, "learning_rate": 4.3178416525277586e-06, "loss": 0.0032, "reward": 0.25, "reward_std": 0.22360679507255554, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 712 }, { "completion_length": 197.5, "epoch": 0.317311971517579, "grad_norm": 0.7964750528335571, "kl": 0.0377558134496212, "learning_rate": 4.315172940504677e-06, "loss": 0.0015, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 713 }, { "completion_length": 196.83334350585938, "epoch": 0.3177570093457944, "grad_norm": 1.8717976808547974, "kl": 0.176216721534729, "learning_rate": 4.312499846653211e-06, "loss": 0.007, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 714 }, { "completion_length": 141.0, "epoch": 0.3182020471740098, "grad_norm": 1.7549928426742554, "kl": 0.2996341586112976, "learning_rate": 4.309822377426211e-06, "loss": 0.012, "reward": 0.10500000417232513, "reward_std": 0.5367960333824158, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.25, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14500001072883606, "step": 715 }, { "completion_length": 195.1666717529297, "epoch": 0.3186470850022252, "grad_norm": 0.813319206237793, "kl": 0.05743285268545151, "learning_rate": 4.307140539287089e-06, "loss": 0.0023, "reward": 0.1458333432674408, "reward_std": 0.2002602517604828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0625, "step": 716 }, { "completion_length": 186.5, "epoch": 0.3190921228304406, "grad_norm": 0.8374386429786682, "kl": 0.03491507098078728, "learning_rate": 4.304454338709803e-06, "loss": 0.0014, "reward": 0.22966668009757996, "reward_std": 0.1666717231273651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22966668009757996, "step": 717 }, { "completion_length": 192.0, "epoch": 0.319537160658656, "grad_norm": 0.7661421298980713, "kl": 0.04614044725894928, "learning_rate": 4.3017637821788436e-06, "loss": 0.0018, "reward": 0.2916666865348816, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 718 }, { "completion_length": 198.0, "epoch": 0.31998219848687137, "grad_norm": 0.8050372004508972, "kl": 0.018003419041633606, "learning_rate": 4.2990688761892155e-06, "loss": 0.0007, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 719 }, { "completion_length": 162.0, "epoch": 0.3204272363150868, "grad_norm": 0.758633017539978, "kl": 0.06661681085824966, "learning_rate": 4.296369627246422e-06, "loss": 0.0027, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 720 }, { "completion_length": 171.33334350585938, "epoch": 0.32087227414330216, "grad_norm": 0.8165731430053711, "kl": 0.06688942015171051, "learning_rate": 4.293666041866453e-06, "loss": 0.0027, "reward": 0.31283333897590637, "reward_std": 0.20551829040050507, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.1666666716337204, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14616666734218597, "step": 721 }, { "completion_length": 194.33334350585938, "epoch": 0.3213173119715176, "grad_norm": 0.8424144387245178, "kl": 0.04467558115720749, "learning_rate": 4.290958126575764e-06, "loss": 0.0018, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 722 }, { "completion_length": 171.83334350585938, "epoch": 0.32176234979973295, "grad_norm": 0.8838493824005127, "kl": 0.05616327375173569, "learning_rate": 4.2882458879112634e-06, "loss": 0.0022, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 723 }, { "completion_length": 199.0, "epoch": 0.32220738762794837, "grad_norm": 0.7267130017280579, "kl": 0.04860411211848259, "learning_rate": 4.285529332420298e-06, "loss": 0.0019, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 724 }, { "completion_length": 200.0, "epoch": 0.3226524254561638, "grad_norm": 0.8249167203903198, "kl": 0.04377565532922745, "learning_rate": 4.282808466660632e-06, "loss": 0.0018, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 725 }, { "completion_length": 190.1666717529297, "epoch": 0.32309746328437916, "grad_norm": 0.7198425531387329, "kl": 0.04959031194448471, "learning_rate": 4.280083297200439e-06, "loss": 0.002, "reward": 0.2711666524410248, "reward_std": 0.09453975409269333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 726 }, { "completion_length": 192.1666717529297, "epoch": 0.3235425011125946, "grad_norm": 0.7675707340240479, "kl": 0.04044210538268089, "learning_rate": 4.277353830618279e-06, "loss": 0.0016, "reward": 0.187666654586792, "reward_std": 0.10494124889373779, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.187666654586792, "step": 727 }, { "completion_length": 200.0, "epoch": 0.32398753894080995, "grad_norm": 0.7638272643089294, "kl": 0.03953151777386665, "learning_rate": 4.274620073503084e-06, "loss": 0.0016, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 728 }, { "completion_length": 152.6666717529297, "epoch": 0.32443257676902537, "grad_norm": 0.05756811052560806, "kl": 0.06373923271894455, "learning_rate": 4.2718820324541475e-06, "loss": 0.0025, "reward": 0.5, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.5, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 729 }, { "completion_length": 162.0, "epoch": 0.32487761459724074, "grad_norm": 0.7738885283470154, "kl": 0.06379255652427673, "learning_rate": 4.2691397140811e-06, "loss": 0.0026, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 730 }, { "completion_length": 117.66667175292969, "epoch": 0.32532265242545616, "grad_norm": 0.9680776596069336, "kl": 0.08181880414485931, "learning_rate": 4.2663931250039005e-06, "loss": 0.0033, "reward": 0.3333333432674408, "reward_std": 0.25819891691207886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.3333333432674408, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 731 }, { "completion_length": 170.83334350585938, "epoch": 0.3257676902536716, "grad_norm": 0.8730087280273438, "kl": 0.07460620254278183, "learning_rate": 4.2636422718528155e-06, "loss": 0.003, "reward": 0.18150000274181366, "reward_std": 0.3583281934261322, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18150000274181366, "step": 732 }, { "completion_length": 184.33334350585938, "epoch": 0.32621272808188695, "grad_norm": 0.8382135629653931, "kl": 0.05216635391116142, "learning_rate": 4.2608871612684074e-06, "loss": 0.0021, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 733 }, { "completion_length": 171.1666717529297, "epoch": 0.32665776591010237, "grad_norm": 0.9829415082931519, "kl": 0.050829045474529266, "learning_rate": 4.258127799901512e-06, "loss": 0.002, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 734 }, { "completion_length": 190.5, "epoch": 0.32710280373831774, "grad_norm": 0.7752060890197754, "kl": 0.04926741123199463, "learning_rate": 4.255364194413232e-06, "loss": 0.002, "reward": 0.2084999978542328, "reward_std": 0.1021445021033287, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 735 }, { "completion_length": 200.0, "epoch": 0.32754784156653316, "grad_norm": 0.7153200507164001, "kl": 0.040820520371198654, "learning_rate": 4.25259635147491e-06, "loss": 0.0016, "reward": 0.2083333432674408, "reward_std": 0.10206207633018494, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 736 }, { "completion_length": 200.0, "epoch": 0.3279928793947485, "grad_norm": 0.023354843258857727, "kl": 0.03863299638032913, "learning_rate": 4.249824277768122e-06, "loss": 0.0015, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 737 }, { "completion_length": 172.1666717529297, "epoch": 0.32843791722296395, "grad_norm": 0.796233594417572, "kl": 0.047746941447257996, "learning_rate": 4.2470479799846545e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 738 }, { "completion_length": 200.0, "epoch": 0.32888295505117937, "grad_norm": 0.024252623319625854, "kl": 0.04170938581228256, "learning_rate": 4.2442674648264914e-06, "loss": 0.0017, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 739 }, { "completion_length": 200.0, "epoch": 0.32932799287939474, "grad_norm": 0.8098192811012268, "kl": 0.055160801857709885, "learning_rate": 4.241482739005798e-06, "loss": 0.0022, "reward": 0.11166667193174362, "reward_std": 0.3388460874557495, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11166667193174362, "step": 740 }, { "completion_length": 177.33334350585938, "epoch": 0.32977303070761016, "grad_norm": 0.7267571091651917, "kl": 0.037675365805625916, "learning_rate": 4.238693809244904e-06, "loss": 0.0015, "reward": 0.1899999976158142, "reward_std": 0.26663535833358765, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1066666692495346, "step": 741 }, { "completion_length": 200.0, "epoch": 0.3302180685358255, "grad_norm": 0.03363126143813133, "kl": 0.04696328565478325, "learning_rate": 4.235900682276287e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 742 }, { "completion_length": 194.33334350585938, "epoch": 0.33066310636404095, "grad_norm": 0.6722932457923889, "kl": 0.040012698620557785, "learning_rate": 4.2331033648425565e-06, "loss": 0.0016, "reward": 0.12316666543483734, "reward_std": 0.37579911947250366, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12316666543483734, "step": 743 }, { "completion_length": 200.0, "epoch": 0.3311081441922563, "grad_norm": 0.9077341556549072, "kl": 0.04133099317550659, "learning_rate": 4.230301863696439e-06, "loss": 0.0017, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 744 }, { "completion_length": 156.83334350585938, "epoch": 0.33155318202047174, "grad_norm": 1.1381067037582397, "kl": 0.0650453120470047, "learning_rate": 4.22749618560076e-06, "loss": 0.0026, "reward": 0.27133333683013916, "reward_std": 0.09423092007637024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 745 }, { "completion_length": 200.0, "epoch": 0.33199821984868716, "grad_norm": 0.78874272108078, "kl": 0.02345721237361431, "learning_rate": 4.224686337328426e-06, "loss": 0.0009, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 746 }, { "completion_length": 179.0, "epoch": 0.3324432576769025, "grad_norm": 0.836664617061615, "kl": 0.04164649173617363, "learning_rate": 4.221872325662414e-06, "loss": 0.0017, "reward": 0.25, "reward_std": 0.15811388194561005, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 747 }, { "completion_length": 135.83334350585938, "epoch": 0.33288829550511795, "grad_norm": 0.9000113010406494, "kl": 0.06278669834136963, "learning_rate": 4.219054157395749e-06, "loss": 0.0025, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 748 }, { "completion_length": 191.83334350585938, "epoch": 0.3333333333333333, "grad_norm": 0.8174548745155334, "kl": 0.0394861064851284, "learning_rate": 4.21623183933149e-06, "loss": 0.0016, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 749 }, { "completion_length": 166.0, "epoch": 0.33377837116154874, "grad_norm": 0.7031174302101135, "kl": 0.03650489076972008, "learning_rate": 4.213405378282714e-06, "loss": 0.0015, "reward": 0.25033333897590637, "reward_std": 0.13729628920555115, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25033333897590637, "step": 750 }, { "completion_length": 192.83334350585938, "epoch": 0.3342234089897641, "grad_norm": 0.7927865386009216, "kl": 0.0499715581536293, "learning_rate": 4.210574781072501e-06, "loss": 0.002, "reward": 0.2084999978542328, "reward_std": 0.12935802340507507, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 751 }, { "completion_length": 162.33334350585938, "epoch": 0.3346684468179795, "grad_norm": 0.7981622219085693, "kl": 0.04853060469031334, "learning_rate": 4.207740054533913e-06, "loss": 0.0019, "reward": 0.22949998080730438, "reward_std": 0.14653019607067108, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949998080730438, "step": 752 }, { "completion_length": 200.0, "epoch": 0.33511348464619495, "grad_norm": 0.022546028718352318, "kl": 0.01208839938044548, "learning_rate": 4.204901205509981e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 753 }, { "completion_length": 200.0, "epoch": 0.3355585224744103, "grad_norm": 0.7191113829612732, "kl": 0.031925659626722336, "learning_rate": 4.202058240853689e-06, "loss": 0.0013, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 754 }, { "completion_length": 172.33334350585938, "epoch": 0.33600356030262574, "grad_norm": 0.8460274338722229, "kl": 0.047413043677806854, "learning_rate": 4.199211167427955e-06, "loss": 0.0019, "reward": 0.2121666520833969, "reward_std": 0.18116556107997894, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2121666669845581, "step": 755 }, { "completion_length": 114.66667175292969, "epoch": 0.3364485981308411, "grad_norm": 1.0755983591079712, "kl": 0.054288625717163086, "learning_rate": 4.196359992105614e-06, "loss": 0.0022, "reward": 0.1456666737794876, "reward_std": 0.26882386207580566, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1456666737794876, "step": 756 }, { "completion_length": 192.33334350585938, "epoch": 0.3368936359590565, "grad_norm": 0.7851623296737671, "kl": 0.03890561684966087, "learning_rate": 4.193504721769406e-06, "loss": 0.0016, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 757 }, { "completion_length": 173.83334350585938, "epoch": 0.3373386737872719, "grad_norm": 1.2191767692565918, "kl": 0.03187034651637077, "learning_rate": 4.190645363311955e-06, "loss": 0.0013, "reward": 0.20533333718776703, "reward_std": 0.06943102180957794, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20533333718776703, "step": 758 }, { "completion_length": 164.0, "epoch": 0.3377837116154873, "grad_norm": 0.6636083126068115, "kl": 0.13397148251533508, "learning_rate": 4.187781923635753e-06, "loss": 0.0054, "reward": 0.2254999876022339, "reward_std": 0.2528444528579712, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22550001740455627, "step": 759 }, { "completion_length": 127.66667175292969, "epoch": 0.33822874944370274, "grad_norm": 0.030917134135961533, "kl": 0.06076059490442276, "learning_rate": 4.184914409653147e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 760 }, { "completion_length": 194.0, "epoch": 0.3386737872719181, "grad_norm": 0.723721444606781, "kl": 0.043149642646312714, "learning_rate": 4.182042828286313e-06, "loss": 0.0017, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 761 }, { "completion_length": 193.33334350585938, "epoch": 0.3391188251001335, "grad_norm": 0.7052424550056458, "kl": 0.039253827184438705, "learning_rate": 4.179167186467255e-06, "loss": 0.0016, "reward": 0.250333309173584, "reward_std": 0.1122509092092514, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250333309173584, "step": 762 }, { "completion_length": 116.83333587646484, "epoch": 0.3395638629283489, "grad_norm": 1.4630470275878906, "kl": 0.026879621669650078, "learning_rate": 4.17628749113777e-06, "loss": 0.0011, "reward": 0.24266664683818817, "reward_std": 0.15977442264556885, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24266664683818817, "step": 763 }, { "completion_length": 124.0, "epoch": 0.3400089007565643, "grad_norm": 0.04300786554813385, "kl": 0.05142722651362419, "learning_rate": 4.173403749249444e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 764 }, { "completion_length": 171.6666717529297, "epoch": 0.3404539385847797, "grad_norm": 1.0912110805511475, "kl": 0.04297208786010742, "learning_rate": 4.170515967763633e-06, "loss": 0.0017, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 765 }, { "completion_length": 163.33334350585938, "epoch": 0.3408989764129951, "grad_norm": 0.9113920331001282, "kl": 0.1428011953830719, "learning_rate": 4.167624153651444e-06, "loss": 0.0057, "reward": 0.18649999797344208, "reward_std": 0.34627026319503784, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18649999797344208, "step": 766 }, { "completion_length": 197.5, "epoch": 0.3413440142412105, "grad_norm": 0.7323278188705444, "kl": 0.046782054007053375, "learning_rate": 4.1647283138937144e-06, "loss": 0.0019, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 767 }, { "completion_length": 183.1666717529297, "epoch": 0.3417890520694259, "grad_norm": 0.7725353837013245, "kl": 0.027050405740737915, "learning_rate": 4.1618284554810056e-06, "loss": 0.0011, "reward": 0.22949999570846558, "reward_std": 0.12337382137775421, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949999570846558, "step": 768 }, { "completion_length": 137.0, "epoch": 0.3422340898976413, "grad_norm": 0.03009336069226265, "kl": 0.057053741067647934, "learning_rate": 4.158924585413576e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 769 }, { "completion_length": 197.5, "epoch": 0.3426791277258567, "grad_norm": 0.601902425289154, "kl": 0.027302034199237823, "learning_rate": 4.156016710701369e-06, "loss": 0.0011, "reward": 0.2293333262205124, "reward_std": 0.09440692514181137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2293333262205124, "step": 770 }, { "completion_length": 200.0, "epoch": 0.3431241655540721, "grad_norm": 0.013710107654333115, "kl": 0.03290403261780739, "learning_rate": 4.153104838363997e-06, "loss": 0.0013, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 771 }, { "completion_length": 197.83334350585938, "epoch": 0.34356920338228747, "grad_norm": 0.6330693960189819, "kl": 0.03465873375535011, "learning_rate": 4.15018897543072e-06, "loss": 0.0014, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 772 }, { "completion_length": 167.6666717529297, "epoch": 0.3440142412105029, "grad_norm": 0.7272565364837646, "kl": 0.04871327802538872, "learning_rate": 4.1472691289404335e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 773 }, { "completion_length": 200.0, "epoch": 0.3444592790387183, "grad_norm": 0.025043027475476265, "kl": 0.03786566108465195, "learning_rate": 4.144345305941648e-06, "loss": 0.0015, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 774 }, { "completion_length": 200.0, "epoch": 0.3449043168669337, "grad_norm": 0.03370220214128494, "kl": 0.0471016988158226, "learning_rate": 4.141417513492473e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 775 }, { "completion_length": 200.0, "epoch": 0.3453493546951491, "grad_norm": 0.7220287919044495, "kl": 0.035600695759058, "learning_rate": 4.138485758660602e-06, "loss": 0.0014, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 776 }, { "completion_length": 120.66667175292969, "epoch": 0.34579439252336447, "grad_norm": 1.2347756624221802, "kl": 0.08316612988710403, "learning_rate": 4.135550048523293e-06, "loss": 0.0033, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 777 }, { "completion_length": 170.6666717529297, "epoch": 0.3462394303515799, "grad_norm": 0.7800046801567078, "kl": 0.053269438445568085, "learning_rate": 4.132610390167349e-06, "loss": 0.0021, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 778 }, { "completion_length": 157.1666717529297, "epoch": 0.34668446817979526, "grad_norm": 0.7412069439888, "kl": 0.06059323251247406, "learning_rate": 4.12966679068911e-06, "loss": 0.0024, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 779 }, { "completion_length": 200.0, "epoch": 0.3471295060080107, "grad_norm": 0.7685080766677856, "kl": 0.056760650128126144, "learning_rate": 4.126719257194425e-06, "loss": 0.0023, "reward": 0.12250000238418579, "reward_std": 0.31230995059013367, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12250000238418579, "step": 780 }, { "completion_length": 200.0, "epoch": 0.3475745438362261, "grad_norm": 0.018277641385793686, "kl": 0.03469054400920868, "learning_rate": 4.123767796798641e-06, "loss": 0.0014, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 781 }, { "completion_length": 133.5, "epoch": 0.34801958166444147, "grad_norm": 0.8028903007507324, "kl": 0.06514596194028854, "learning_rate": 4.120812416626586e-06, "loss": 0.0026, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 782 }, { "completion_length": 146.83334350585938, "epoch": 0.3484646194926569, "grad_norm": 1.0154013633728027, "kl": 0.0660717785358429, "learning_rate": 4.117853123812549e-06, "loss": 0.0026, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 783 }, { "completion_length": 161.6666717529297, "epoch": 0.34890965732087226, "grad_norm": 0.76609206199646, "kl": 0.05136913061141968, "learning_rate": 4.1148899255002636e-06, "loss": 0.0021, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 784 }, { "completion_length": 142.5, "epoch": 0.3493546951490877, "grad_norm": 0.03385661169886589, "kl": 0.05575406178832054, "learning_rate": 4.111922828842892e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 785 }, { "completion_length": 187.0, "epoch": 0.34979973297730305, "grad_norm": 0.7177822589874268, "kl": 0.030316343531012535, "learning_rate": 4.108951841003009e-06, "loss": 0.0012, "reward": 0.25050002336502075, "reward_std": 0.13747835159301758, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25050002336502075, "step": 786 }, { "completion_length": 178.5, "epoch": 0.35024477080551847, "grad_norm": 0.7809700965881348, "kl": 0.039071641862392426, "learning_rate": 4.105976969152578e-06, "loss": 0.0016, "reward": 0.31333333253860474, "reward_std": 0.1049412414431572, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 787 }, { "completion_length": 177.1666717529297, "epoch": 0.3506898086337339, "grad_norm": 0.7270780205726624, "kl": 0.042834021151065826, "learning_rate": 4.102998220472943e-06, "loss": 0.0017, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 788 }, { "completion_length": 200.0, "epoch": 0.35113484646194926, "grad_norm": 0.6192082762718201, "kl": 0.04256781190633774, "learning_rate": 4.100015602154802e-06, "loss": 0.0017, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 789 }, { "completion_length": 200.0, "epoch": 0.3515798842901647, "grad_norm": 0.014344203285872936, "kl": 0.03416243940591812, "learning_rate": 4.0970291213982e-06, "loss": 0.0014, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 790 }, { "completion_length": 200.0, "epoch": 0.35202492211838005, "grad_norm": 0.8113481998443604, "kl": 0.05264754593372345, "learning_rate": 4.094038785412504e-06, "loss": 0.0021, "reward": 0.11516666412353516, "reward_std": 0.3302728831768036, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11516666412353516, "step": 791 }, { "completion_length": 138.0, "epoch": 0.35246995994659547, "grad_norm": 1.720563292503357, "kl": 0.03622860834002495, "learning_rate": 4.091044601416383e-06, "loss": 0.0014, "reward": 0.27133333683013916, "reward_std": 0.1465587466955185, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 792 }, { "completion_length": 177.5, "epoch": 0.35291499777481083, "grad_norm": 0.7472648024559021, "kl": 0.04746413230895996, "learning_rate": 4.0880465766378015e-06, "loss": 0.0019, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 793 }, { "completion_length": 129.83334350585938, "epoch": 0.35336003560302626, "grad_norm": 1.1209585666656494, "kl": 0.0549950897693634, "learning_rate": 4.085044718313991e-06, "loss": 0.0022, "reward": 0.31349998712539673, "reward_std": 0.10458250343799591, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31349998712539673, "step": 794 }, { "completion_length": 193.1666717529297, "epoch": 0.3538050734312417, "grad_norm": 0.6405737400054932, "kl": 0.04125973582267761, "learning_rate": 4.08203903369144e-06, "loss": 0.0017, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 795 }, { "completion_length": 185.5, "epoch": 0.35425011125945705, "grad_norm": 0.8062145709991455, "kl": 0.04092112183570862, "learning_rate": 4.079029530025873e-06, "loss": 0.0016, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 796 }, { "completion_length": 142.83334350585938, "epoch": 0.35469514908767247, "grad_norm": 0.0398949459195137, "kl": 0.052285999059677124, "learning_rate": 4.076016214582232e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 797 }, { "completion_length": 154.0, "epoch": 0.35514018691588783, "grad_norm": 1.113144874572754, "kl": 0.07075363397598267, "learning_rate": 4.072999094634663e-06, "loss": 0.0028, "reward": 0.29233333468437195, "reward_std": 0.10222654044628143, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 798 }, { "completion_length": 200.0, "epoch": 0.35558522474410326, "grad_norm": 0.016199104487895966, "kl": 0.03834648057818413, "learning_rate": 4.069978177466495e-06, "loss": 0.0015, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 799 }, { "completion_length": 200.0, "epoch": 0.3560302625723186, "grad_norm": 0.9117387533187866, "kl": 0.18886041641235352, "learning_rate": 4.066953470370223e-06, "loss": 0.0076, "reward": 0.016166668385267258, "reward_std": 0.2665861248970032, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016166668385267258, "step": 800 }, { "completion_length": 178.33334350585938, "epoch": 0.35647530040053405, "grad_norm": 0.6704380512237549, "kl": 0.057386137545108795, "learning_rate": 4.063924980647492e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 801 }, { "completion_length": 168.33334350585938, "epoch": 0.35692033822874947, "grad_norm": 0.767604649066925, "kl": 0.0486370325088501, "learning_rate": 4.060892715609078e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 802 }, { "completion_length": 141.0, "epoch": 0.35736537605696483, "grad_norm": 1.3145936727523804, "kl": 0.04570293426513672, "learning_rate": 4.0578566825748685e-06, "loss": 0.0018, "reward": 0.29233333468437195, "reward_std": 0.15182314813137054, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 803 }, { "completion_length": 126.66667175292969, "epoch": 0.35781041388518026, "grad_norm": 0.6498439311981201, "kl": 0.06144469231367111, "learning_rate": 4.054816888873852e-06, "loss": 0.0025, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 804 }, { "completion_length": 185.6666717529297, "epoch": 0.3582554517133956, "grad_norm": 0.9341973662376404, "kl": 0.05542676895856857, "learning_rate": 4.051773341844088e-06, "loss": 0.0022, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 805 }, { "completion_length": 153.33334350585938, "epoch": 0.35870048954161105, "grad_norm": 0.6858371496200562, "kl": 0.06017071008682251, "learning_rate": 4.048726048832704e-06, "loss": 0.0024, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 806 }, { "completion_length": 182.6666717529297, "epoch": 0.3591455273698264, "grad_norm": 0.5970679521560669, "kl": 0.03451145440340042, "learning_rate": 4.045675017195866e-06, "loss": 0.0014, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 807 }, { "completion_length": 162.5, "epoch": 0.35959056519804183, "grad_norm": 0.6962899565696716, "kl": 0.05475949868559837, "learning_rate": 4.042620254298765e-06, "loss": 0.0022, "reward": 0.29233333468437195, "reward_std": 0.15182313323020935, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 808 }, { "completion_length": 183.83334350585938, "epoch": 0.36003560302625726, "grad_norm": 0.7941375970840454, "kl": 0.045727722346782684, "learning_rate": 4.039561767515599e-06, "loss": 0.0018, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 809 }, { "completion_length": 174.1666717529297, "epoch": 0.3604806408544726, "grad_norm": 0.6645864248275757, "kl": 0.052175264805555344, "learning_rate": 4.036499564229559e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 810 }, { "completion_length": 192.0, "epoch": 0.36092567868268804, "grad_norm": 0.7362174391746521, "kl": 0.05001889169216156, "learning_rate": 4.033433651832806e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 811 }, { "completion_length": 183.33334350585938, "epoch": 0.3613707165109034, "grad_norm": 0.700889527797699, "kl": 0.04670108109712601, "learning_rate": 4.0303640377264505e-06, "loss": 0.0019, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 812 }, { "completion_length": 121.66667175292969, "epoch": 0.36181575433911883, "grad_norm": 0.11063014715909958, "kl": 0.08557015657424927, "learning_rate": 4.027290729320545e-06, "loss": 0.0037, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 813 }, { "completion_length": 75.83333587646484, "epoch": 0.3622607921673342, "grad_norm": 1.2718409299850464, "kl": 0.08148349821567535, "learning_rate": 4.024213734034057e-06, "loss": 0.0033, "reward": 0.39666664600372314, "reward_std": 0.05062279850244522, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0833333358168602, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 814 }, { "completion_length": 186.5, "epoch": 0.3627058299955496, "grad_norm": 0.7700260281562805, "kl": 0.04646776616573334, "learning_rate": 4.021133059294855e-06, "loss": 0.0019, "reward": 0.22949999570846558, "reward_std": 0.12337382137775421, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949998080730438, "step": 815 }, { "completion_length": 147.5, "epoch": 0.36315086782376504, "grad_norm": 1.176891803741455, "kl": 0.05650541931390762, "learning_rate": 4.018048712539689e-06, "loss": 0.0023, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 816 }, { "completion_length": 180.0, "epoch": 0.3635959056519804, "grad_norm": 0.6683288216590881, "kl": 0.06407992541790009, "learning_rate": 4.014960701214173e-06, "loss": 0.0026, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 817 }, { "completion_length": 105.0, "epoch": 0.36404094348019583, "grad_norm": 0.9382845163345337, "kl": 0.06550440192222595, "learning_rate": 4.011869032772769e-06, "loss": 0.0026, "reward": 0.31333333253860474, "reward_std": 0.15350136160850525, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 818 }, { "completion_length": 165.6666717529297, "epoch": 0.3644859813084112, "grad_norm": 0.7827046513557434, "kl": 0.05532263219356537, "learning_rate": 4.008773714678766e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 819 }, { "completion_length": 145.6666717529297, "epoch": 0.3649310191366266, "grad_norm": 0.026940084993839264, "kl": 0.0543852262198925, "learning_rate": 4.005674754404263e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 820 }, { "completion_length": 138.6666717529297, "epoch": 0.365376056964842, "grad_norm": 0.035869237035512924, "kl": 0.06597190350294113, "learning_rate": 4.002572159430151e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 821 }, { "completion_length": 163.83334350585938, "epoch": 0.3658210947930574, "grad_norm": 0.910876989364624, "kl": 0.06616390496492386, "learning_rate": 3.999465937246096e-06, "loss": 0.0026, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 822 }, { "completion_length": 133.1666717529297, "epoch": 0.36626613262127283, "grad_norm": 0.8520764708518982, "kl": 0.08879172801971436, "learning_rate": 3.996356095350522e-06, "loss": 0.0036, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 823 }, { "completion_length": 170.0, "epoch": 0.3667111704494882, "grad_norm": 0.8743280172348022, "kl": 0.061830103397369385, "learning_rate": 3.993242641250586e-06, "loss": 0.0025, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 824 }, { "completion_length": 171.1666717529297, "epoch": 0.3671562082777036, "grad_norm": 0.7217028737068176, "kl": 0.056446269154548645, "learning_rate": 3.990125582462171e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 825 }, { "completion_length": 179.1666717529297, "epoch": 0.367601246105919, "grad_norm": 0.6563734412193298, "kl": 0.060024701058864594, "learning_rate": 3.987004926509854e-06, "loss": 0.0024, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 826 }, { "completion_length": 188.0, "epoch": 0.3680462839341344, "grad_norm": 0.6999355554580688, "kl": 0.045960139483213425, "learning_rate": 3.983880680926904e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 827 }, { "completion_length": 98.16667175292969, "epoch": 0.3684913217623498, "grad_norm": 0.04137030988931656, "kl": 0.08726416528224945, "learning_rate": 3.98075285325525e-06, "loss": 0.0038, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 828 }, { "completion_length": 160.1666717529297, "epoch": 0.3689363595905652, "grad_norm": 0.7065590620040894, "kl": 0.061148129403591156, "learning_rate": 3.977621451045469e-06, "loss": 0.0024, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 829 }, { "completion_length": 192.1666717529297, "epoch": 0.3693813974187806, "grad_norm": 0.7527098059654236, "kl": 0.055032290518283844, "learning_rate": 3.974486481856769e-06, "loss": 0.0022, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 830 }, { "completion_length": 157.6666717529297, "epoch": 0.369826435246996, "grad_norm": 0.8093323111534119, "kl": 0.06528506428003311, "learning_rate": 3.971347953256965e-06, "loss": 0.0026, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 831 }, { "completion_length": 136.5, "epoch": 0.3702714730752114, "grad_norm": 0.04360884055495262, "kl": 0.0801323652267456, "learning_rate": 3.968205872822468e-06, "loss": 0.0035, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 832 }, { "completion_length": 173.83334350585938, "epoch": 0.3707165109034268, "grad_norm": 0.7571378350257874, "kl": 0.05690945312380791, "learning_rate": 3.965060248138263e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 833 }, { "completion_length": 87.66667175292969, "epoch": 0.3711615487316422, "grad_norm": 0.06486710906028748, "kl": 0.10215651988983154, "learning_rate": 3.961911086797886e-06, "loss": 0.0044, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 834 }, { "completion_length": 169.83334350585938, "epoch": 0.37160658655985757, "grad_norm": 0.7843642830848694, "kl": 0.05757517367601395, "learning_rate": 3.958758396403418e-06, "loss": 0.0023, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 835 }, { "completion_length": 149.6666717529297, "epoch": 0.372051624388073, "grad_norm": 0.6219406127929688, "kl": 0.11813263595104218, "learning_rate": 3.955602184565452e-06, "loss": 0.0047, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 836 }, { "completion_length": 154.6666717529297, "epoch": 0.3724966622162884, "grad_norm": 0.699906051158905, "kl": 0.06392105668783188, "learning_rate": 3.952442458903087e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 837 }, { "completion_length": 200.0, "epoch": 0.3729417000445038, "grad_norm": 0.036947038024663925, "kl": 0.05290337651968002, "learning_rate": 3.9492792270439015e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 838 }, { "completion_length": 99.33333587646484, "epoch": 0.3733867378727192, "grad_norm": 1.119400978088379, "kl": 0.06555521488189697, "learning_rate": 3.946112496623939e-06, "loss": 0.0026, "reward": 0.3343333601951599, "reward_std": 0.10206206887960434, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33433330059051514, "step": 839 }, { "completion_length": 153.6666717529297, "epoch": 0.37383177570093457, "grad_norm": 0.030132388696074486, "kl": 0.057201653718948364, "learning_rate": 3.942942275287688e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 840 }, { "completion_length": 123.16667175292969, "epoch": 0.37427681352915, "grad_norm": 0.027864990755915642, "kl": 0.06204737722873688, "learning_rate": 3.939768570688064e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 841 }, { "completion_length": 161.0, "epoch": 0.37472185135736535, "grad_norm": 0.6778984069824219, "kl": 0.06228325888514519, "learning_rate": 3.936591390486393e-06, "loss": 0.0025, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 842 }, { "completion_length": 165.83334350585938, "epoch": 0.3751668891855808, "grad_norm": 0.03816313296556473, "kl": 0.06406964361667633, "learning_rate": 3.933410742352388e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 843 }, { "completion_length": 161.6666717529297, "epoch": 0.3756119270137962, "grad_norm": 0.8484237790107727, "kl": 0.09677344560623169, "learning_rate": 3.930226633964137e-06, "loss": 0.0039, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 844 }, { "completion_length": 135.5, "epoch": 0.37605696484201157, "grad_norm": 0.03836163505911827, "kl": 0.06688190996646881, "learning_rate": 3.927039073008077e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 845 }, { "completion_length": 200.0, "epoch": 0.376502002670227, "grad_norm": 0.03299454599618912, "kl": 0.052207015454769135, "learning_rate": 3.9238480671789836e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 846 }, { "completion_length": 189.33334350585938, "epoch": 0.37694704049844235, "grad_norm": 0.8058741092681885, "kl": 0.052587032318115234, "learning_rate": 3.920653624179945e-06, "loss": 0.0021, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 847 }, { "completion_length": 156.0, "epoch": 0.3773920783266578, "grad_norm": 0.9629629254341125, "kl": 0.18556594848632812, "learning_rate": 3.917455751722349e-06, "loss": 0.0074, "reward": 0.28033334016799927, "reward_std": 0.23433451354503632, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28033334016799927, "step": 848 }, { "completion_length": 132.33334350585938, "epoch": 0.37783711615487314, "grad_norm": 0.05315527319908142, "kl": 0.07583107054233551, "learning_rate": 3.914254457525862e-06, "loss": 0.0033, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 849 }, { "completion_length": 187.1666717529297, "epoch": 0.37828215398308856, "grad_norm": 0.7499144673347473, "kl": 0.051162999123334885, "learning_rate": 3.9110497493184084e-06, "loss": 0.002, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 850 }, { "completion_length": 153.33334350585938, "epoch": 0.378727191811304, "grad_norm": 0.6836094856262207, "kl": 0.06891427934169769, "learning_rate": 3.9078416348361555e-06, "loss": 0.0028, "reward": 0.3341667056083679, "reward_std": 0.10247030854225159, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 851 }, { "completion_length": 180.1666717529297, "epoch": 0.37917222963951935, "grad_norm": 0.8249133229255676, "kl": 0.04966040700674057, "learning_rate": 3.904630121823495e-06, "loss": 0.002, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 852 }, { "completion_length": 91.0, "epoch": 0.3796172674677348, "grad_norm": 1.4175060987472534, "kl": 0.15148359537124634, "learning_rate": 3.901415218033019e-06, "loss": 0.0061, "reward": 0.36516666412353516, "reward_std": 0.026536142453551292, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36516663432121277, "step": 853 }, { "completion_length": 146.5, "epoch": 0.38006230529595014, "grad_norm": 0.850635290145874, "kl": 0.06043955311179161, "learning_rate": 3.8981969312255075e-06, "loss": 0.0024, "reward": 0.31333333253860474, "reward_std": 0.10494124889373779, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 854 }, { "completion_length": 134.6666717529297, "epoch": 0.38050734312416556, "grad_norm": 0.028085196390748024, "kl": 0.061860792338848114, "learning_rate": 3.894975269169906e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 855 }, { "completion_length": 188.0, "epoch": 0.38095238095238093, "grad_norm": 1.085335612297058, "kl": 0.16889886558055878, "learning_rate": 3.891750239643309e-06, "loss": 0.0068, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 856 }, { "completion_length": 137.5, "epoch": 0.38139741878059635, "grad_norm": 0.8269402384757996, "kl": 0.06682190299034119, "learning_rate": 3.888521850430939e-06, "loss": 0.0027, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 857 }, { "completion_length": 200.0, "epoch": 0.3818424566088118, "grad_norm": 0.026867447420954704, "kl": 0.04740360379219055, "learning_rate": 3.885290109326131e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 858 }, { "completion_length": 200.0, "epoch": 0.38228749443702714, "grad_norm": 0.023793501779437065, "kl": 0.046479731798172, "learning_rate": 3.882055024130307e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 859 }, { "completion_length": 189.83334350585938, "epoch": 0.38273253226524256, "grad_norm": 0.6452245712280273, "kl": 0.050767965614795685, "learning_rate": 3.878816602652965e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 860 }, { "completion_length": 126.5, "epoch": 0.38317757009345793, "grad_norm": 0.02156522497534752, "kl": 0.062334608286619186, "learning_rate": 3.875574852711656e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 861 }, { "completion_length": 184.33334350585938, "epoch": 0.38362260792167335, "grad_norm": 1.0488053560256958, "kl": 0.050179775804281235, "learning_rate": 3.872329782131967e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 862 }, { "completion_length": 177.0, "epoch": 0.3840676457498887, "grad_norm": 0.7218441367149353, "kl": 0.05524627864360809, "learning_rate": 3.869081398747499e-06, "loss": 0.0022, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 863 }, { "completion_length": 169.5, "epoch": 0.38451268357810414, "grad_norm": 0.6805753707885742, "kl": 0.057348743081092834, "learning_rate": 3.865829710399852e-06, "loss": 0.0023, "reward": 0.3341667056083679, "reward_std": 0.0648086816072464, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 864 }, { "completion_length": 190.1666717529297, "epoch": 0.38495772140631956, "grad_norm": 0.7564582824707031, "kl": 0.06877424567937851, "learning_rate": 3.862574724938602e-06, "loss": 0.0028, "reward": 0.0833333358168602, "reward_std": 0.3685337007045746, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333283662796, "step": 865 }, { "completion_length": 130.6666717529297, "epoch": 0.38540275923453493, "grad_norm": 0.028254088014364243, "kl": 0.05616528540849686, "learning_rate": 3.859316450221286e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 866 }, { "completion_length": 155.33334350585938, "epoch": 0.38584779706275035, "grad_norm": 0.6590713858604431, "kl": 0.05754072964191437, "learning_rate": 3.856054894113382e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 867 }, { "completion_length": 155.5, "epoch": 0.3862928348909657, "grad_norm": 0.11690230667591095, "kl": 0.07130900025367737, "learning_rate": 3.852790064488286e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 868 }, { "completion_length": 193.6666717529297, "epoch": 0.38673787271918114, "grad_norm": 0.7855741381645203, "kl": 0.0497148297727108, "learning_rate": 3.8495219692273e-06, "loss": 0.002, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 869 }, { "completion_length": 149.1666717529297, "epoch": 0.3871829105473965, "grad_norm": 0.749443769454956, "kl": 0.044368281960487366, "learning_rate": 3.846250616219607e-06, "loss": 0.0018, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 870 }, { "completion_length": 147.5, "epoch": 0.38762794837561193, "grad_norm": 0.779934823513031, "kl": 0.09931713342666626, "learning_rate": 3.842976013362255e-06, "loss": 0.004, "reward": 0.2381666600704193, "reward_std": 0.3376213312149048, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2381666600704193, "step": 871 }, { "completion_length": 162.83334350585938, "epoch": 0.38807298620382735, "grad_norm": 0.0258543211966753, "kl": 0.05789042264223099, "learning_rate": 3.839698168560137e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 872 }, { "completion_length": 178.6666717529297, "epoch": 0.3885180240320427, "grad_norm": 0.7851041555404663, "kl": 0.0560050830245018, "learning_rate": 3.8364170897259715e-06, "loss": 0.0022, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 873 }, { "completion_length": 160.6666717529297, "epoch": 0.38896306186025814, "grad_norm": 0.7662412524223328, "kl": 0.07136371731758118, "learning_rate": 3.833132784780284e-06, "loss": 0.0029, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 874 }, { "completion_length": 160.33334350585938, "epoch": 0.3894080996884735, "grad_norm": 0.6216509938240051, "kl": 0.05862082540988922, "learning_rate": 3.82984526165139e-06, "loss": 0.0023, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 875 }, { "completion_length": 162.33334350585938, "epoch": 0.38985313751668893, "grad_norm": 0.6714538335800171, "kl": 0.06793516874313354, "learning_rate": 3.8265545282753706e-06, "loss": 0.0027, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 876 }, { "completion_length": 176.0, "epoch": 0.3902981753449043, "grad_norm": 0.6328492760658264, "kl": 0.07553236186504364, "learning_rate": 3.823260592596058e-06, "loss": 0.003, "reward": 0.20350000262260437, "reward_std": 0.36431294679641724, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20350000262260437, "step": 877 }, { "completion_length": 200.0, "epoch": 0.3907432131731197, "grad_norm": 0.6953088045120239, "kl": 0.03606265038251877, "learning_rate": 3.819963462565015e-06, "loss": 0.0014, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 878 }, { "completion_length": 168.6666717529297, "epoch": 0.39118825100133514, "grad_norm": 0.6372416615486145, "kl": 0.06722667813301086, "learning_rate": 3.816663146141514e-06, "loss": 0.0027, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 879 }, { "completion_length": 159.5, "epoch": 0.3916332888295505, "grad_norm": 0.8337939381599426, "kl": 0.06765174865722656, "learning_rate": 3.813359651292522e-06, "loss": 0.0027, "reward": 0.31333333253860474, "reward_std": 0.06864885985851288, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31333333253860474, "step": 880 }, { "completion_length": 164.5, "epoch": 0.39207832665776593, "grad_norm": 0.6367321610450745, "kl": 0.049886442720890045, "learning_rate": 3.810052985992677e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 881 }, { "completion_length": 151.1666717529297, "epoch": 0.3925233644859813, "grad_norm": 0.03452099859714508, "kl": 0.0649208277463913, "learning_rate": 3.8067431582242697e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 882 }, { "completion_length": 150.0, "epoch": 0.3929684023141967, "grad_norm": 0.8333684802055359, "kl": 0.061620742082595825, "learning_rate": 3.8034301759772263e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 883 }, { "completion_length": 132.33334350585938, "epoch": 0.3934134401424121, "grad_norm": 0.03911470249295235, "kl": 0.07865100353956223, "learning_rate": 3.8001140472490887e-06, "loss": 0.0034, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 884 }, { "completion_length": 130.1666717529297, "epoch": 0.3938584779706275, "grad_norm": 1.2934985160827637, "kl": 0.05334341526031494, "learning_rate": 3.796794780044992e-06, "loss": 0.0021, "reward": 0.17083333432674408, "reward_std": 0.2419474571943283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17083333432674408, "step": 885 }, { "completion_length": 139.1666717529297, "epoch": 0.3943035157988429, "grad_norm": 0.8030723929405212, "kl": 0.06765462458133698, "learning_rate": 3.7934723823776494e-06, "loss": 0.0027, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 886 }, { "completion_length": 105.5, "epoch": 0.3947485536270583, "grad_norm": 0.02923821657896042, "kl": 0.06622253358364105, "learning_rate": 3.7901468622673303e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 887 }, { "completion_length": 140.6666717529297, "epoch": 0.3951935914552737, "grad_norm": 0.7183970808982849, "kl": 0.0615064799785614, "learning_rate": 3.786818227741842e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 888 }, { "completion_length": 165.33334350585938, "epoch": 0.3956386292834891, "grad_norm": 0.733579158782959, "kl": 0.04961232468485832, "learning_rate": 3.78348648683651e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 889 }, { "completion_length": 171.33334350585938, "epoch": 0.3960836671117045, "grad_norm": 0.7921627163887024, "kl": 0.05507838726043701, "learning_rate": 3.780151647594159e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 890 }, { "completion_length": 171.0, "epoch": 0.3965287049399199, "grad_norm": 0.7230894565582275, "kl": 0.057939350605010986, "learning_rate": 3.7768137180650915e-06, "loss": 0.0023, "reward": 0.29233333468437195, "reward_std": 0.12961584329605103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 891 }, { "completion_length": 155.0, "epoch": 0.3969737427681353, "grad_norm": 0.7460453510284424, "kl": 0.08669416606426239, "learning_rate": 3.773472706307072e-06, "loss": 0.0035, "reward": 0.29216665029525757, "reward_std": 0.2053488940000534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216665029525757, "step": 892 }, { "completion_length": 127.0, "epoch": 0.39741878059635066, "grad_norm": 0.9885587692260742, "kl": 0.23846940696239471, "learning_rate": 3.7701286203853036e-06, "loss": 0.0095, "reward": 0.3103333115577698, "reward_std": 0.16084983944892883, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3103333115577698, "step": 893 }, { "completion_length": 133.1666717529297, "epoch": 0.3978638184245661, "grad_norm": 1.0468348264694214, "kl": 0.241329163312912, "learning_rate": 3.7667814683724126e-06, "loss": 0.0097, "reward": 0.34300002455711365, "reward_std": 0.08083315193653107, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34299999475479126, "step": 894 }, { "completion_length": 140.33334350585938, "epoch": 0.3983088562527815, "grad_norm": 0.965579628944397, "kl": 0.06833823025226593, "learning_rate": 3.7634312583484244e-06, "loss": 0.0027, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 895 }, { "completion_length": 148.5, "epoch": 0.3987538940809969, "grad_norm": 0.7001117467880249, "kl": 0.05449951812624931, "learning_rate": 3.7600779984007485e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 896 }, { "completion_length": 171.5, "epoch": 0.3991989319092123, "grad_norm": 0.678555965423584, "kl": 0.04612820968031883, "learning_rate": 3.756721696624156e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 897 }, { "completion_length": 177.6666717529297, "epoch": 0.39964396973742766, "grad_norm": 0.915947675704956, "kl": 0.05997948348522186, "learning_rate": 3.7533623611207607e-06, "loss": 0.0024, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 898 }, { "completion_length": 200.0, "epoch": 0.4000890075656431, "grad_norm": 0.05163982883095741, "kl": 0.049683578312397, "learning_rate": 3.7500000000000005e-06, "loss": 0.002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 899 }, { "completion_length": 168.6666717529297, "epoch": 0.40053404539385845, "grad_norm": 0.7753753066062927, "kl": 0.05140147730708122, "learning_rate": 3.7466346213786165e-06, "loss": 0.0021, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 900 }, { "completion_length": 176.1666717529297, "epoch": 0.4009790832220739, "grad_norm": 0.027781542390584946, "kl": 0.054599761962890625, "learning_rate": 3.743266233380635e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 901 }, { "completion_length": 83.66667175292969, "epoch": 0.4014241210502893, "grad_norm": 0.02804761379957199, "kl": 0.07853664457798004, "learning_rate": 3.7398948441373454e-06, "loss": 0.0034, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 902 }, { "completion_length": 141.6666717529297, "epoch": 0.40186915887850466, "grad_norm": 0.028074469417333603, "kl": 0.056927796453237534, "learning_rate": 3.7365204617872834e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 903 }, { "completion_length": 177.83334350585938, "epoch": 0.4023141967067201, "grad_norm": 0.8167720437049866, "kl": 0.06986045837402344, "learning_rate": 3.7331430944762105e-06, "loss": 0.0028, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 904 }, { "completion_length": 187.1666717529297, "epoch": 0.40275923453493545, "grad_norm": 0.7425169944763184, "kl": 0.04808403179049492, "learning_rate": 3.729762750357092e-06, "loss": 0.0019, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 905 }, { "completion_length": 139.5, "epoch": 0.4032042723631509, "grad_norm": 0.03980370983481407, "kl": 0.062199126929044724, "learning_rate": 3.7263794375900803e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 906 }, { "completion_length": 145.5, "epoch": 0.40364931019136624, "grad_norm": 0.022532425820827484, "kl": 0.056330885738134384, "learning_rate": 3.7229931643424943e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 907 }, { "completion_length": 198.83334350585938, "epoch": 0.40409434801958166, "grad_norm": 0.625041127204895, "kl": 0.05984826013445854, "learning_rate": 3.7196039387887995e-06, "loss": 0.0024, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 908 }, { "completion_length": 192.5, "epoch": 0.4045393858477971, "grad_norm": 0.7206208109855652, "kl": 0.050445057451725006, "learning_rate": 3.7162117691105894e-06, "loss": 0.002, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 909 }, { "completion_length": 200.0, "epoch": 0.40498442367601245, "grad_norm": 0.028280407190322876, "kl": 0.040611665695905685, "learning_rate": 3.71281666349656e-06, "loss": 0.0016, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 910 }, { "completion_length": 184.83334350585938, "epoch": 0.4054294615042279, "grad_norm": 0.7396180629730225, "kl": 0.04777958244085312, "learning_rate": 3.7094186301425006e-06, "loss": 0.0019, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 911 }, { "completion_length": 191.33334350585938, "epoch": 0.40587449933244324, "grad_norm": 1.011993408203125, "kl": 0.04695138335227966, "learning_rate": 3.706017677251266e-06, "loss": 0.0019, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 912 }, { "completion_length": 114.0, "epoch": 0.40631953716065866, "grad_norm": 0.029970509931445122, "kl": 0.06655572354793549, "learning_rate": 3.7026138130327547e-06, "loss": 0.003, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 913 }, { "completion_length": 180.33334350585938, "epoch": 0.40676457498887403, "grad_norm": 0.660056471824646, "kl": 0.041105978190898895, "learning_rate": 3.6992070457038998e-06, "loss": 0.0016, "reward": 0.29233333468437195, "reward_std": 0.10247080028057098, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 914 }, { "completion_length": 183.1666717529297, "epoch": 0.40720961281708945, "grad_norm": 0.7320570945739746, "kl": 0.045362964272499084, "learning_rate": 3.6957973834886387e-06, "loss": 0.0018, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 915 }, { "completion_length": 159.0, "epoch": 0.4076546506453049, "grad_norm": 0.7662878036499023, "kl": 0.05844952166080475, "learning_rate": 3.692384834617897e-06, "loss": 0.0023, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 916 }, { "completion_length": 184.1666717529297, "epoch": 0.40809968847352024, "grad_norm": 0.7281984090805054, "kl": 0.05307254195213318, "learning_rate": 3.688969407329569e-06, "loss": 0.0021, "reward": 0.27133333683013916, "reward_std": 0.12340772151947021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27133333683013916, "step": 917 }, { "completion_length": 177.6666717529297, "epoch": 0.40854472630173566, "grad_norm": 0.7347136735916138, "kl": 0.05313348397612572, "learning_rate": 3.6855511098684996e-06, "loss": 0.0021, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 918 }, { "completion_length": 200.0, "epoch": 0.40898976412995103, "grad_norm": 0.7880335450172424, "kl": 0.03796866908669472, "learning_rate": 3.682129950486459e-06, "loss": 0.0015, "reward": 0.2291666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2291666716337204, "step": 919 }, { "completion_length": 171.83334350585938, "epoch": 0.40943480195816645, "grad_norm": 0.03392661735415459, "kl": 0.05389983206987381, "learning_rate": 3.678705937442128e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 920 }, { "completion_length": 159.33334350585938, "epoch": 0.4098798397863818, "grad_norm": 0.8597182035446167, "kl": 0.0735437273979187, "learning_rate": 3.675279079001077e-06, "loss": 0.0029, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 921 }, { "completion_length": 196.5, "epoch": 0.41032487761459724, "grad_norm": 0.7532240152359009, "kl": 0.053770922124385834, "learning_rate": 3.6718493834357415e-06, "loss": 0.0022, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 922 }, { "completion_length": 153.6666717529297, "epoch": 0.41076991544281266, "grad_norm": 0.7527331709861755, "kl": 0.05144277960062027, "learning_rate": 3.6684168590254103e-06, "loss": 0.0021, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 923 }, { "completion_length": 182.5, "epoch": 0.411214953271028, "grad_norm": 0.5914585590362549, "kl": 0.05143951624631882, "learning_rate": 3.6649815140561995e-06, "loss": 0.0021, "reward": 0.21649998426437378, "reward_std": 0.3328048884868622, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21649998426437378, "step": 924 }, { "completion_length": 185.0, "epoch": 0.41165999109924345, "grad_norm": 0.6755291819572449, "kl": 0.04394887015223503, "learning_rate": 3.6615433568210313e-06, "loss": 0.0018, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 925 }, { "completion_length": 194.83334350585938, "epoch": 0.4121050289274588, "grad_norm": 0.8636181354522705, "kl": 0.047952380031347275, "learning_rate": 3.658102395619621e-06, "loss": 0.0019, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 926 }, { "completion_length": 200.0, "epoch": 0.41255006675567424, "grad_norm": 0.7906734347343445, "kl": 0.02828504703938961, "learning_rate": 3.65465863875845e-06, "loss": 0.0011, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 927 }, { "completion_length": 149.1666717529297, "epoch": 0.4129951045838896, "grad_norm": 0.8365933895111084, "kl": 0.08980211615562439, "learning_rate": 3.651212094550748e-06, "loss": 0.0036, "reward": 0.26616665720939636, "reward_std": 0.2690356373786926, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26616665720939636, "step": 928 }, { "completion_length": 199.6666717529297, "epoch": 0.413440142412105, "grad_norm": 0.6435677409172058, "kl": 0.03840624541044235, "learning_rate": 3.6477627713164767e-06, "loss": 0.0015, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 929 }, { "completion_length": 140.1666717529297, "epoch": 0.41388518024032045, "grad_norm": 0.02791665680706501, "kl": 0.0602748803794384, "learning_rate": 3.6443106773823025e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 930 }, { "completion_length": 190.0, "epoch": 0.4143302180685358, "grad_norm": 0.8331409096717834, "kl": 0.05726364254951477, "learning_rate": 3.6408558210815814e-06, "loss": 0.0023, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 931 }, { "completion_length": 193.1666717529297, "epoch": 0.41477525589675124, "grad_norm": 0.7008532285690308, "kl": 0.05072779580950737, "learning_rate": 3.6373982107543398e-06, "loss": 0.002, "reward": 0.2711666524410248, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 932 }, { "completion_length": 136.33334350585938, "epoch": 0.4152202937249666, "grad_norm": 0.02707936242222786, "kl": 0.05186247080564499, "learning_rate": 3.63393785474725e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 933 }, { "completion_length": 145.5, "epoch": 0.415665331553182, "grad_norm": 0.7188604474067688, "kl": 0.06224457547068596, "learning_rate": 3.6304747614136126e-06, "loss": 0.0025, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 934 }, { "completion_length": 117.66667175292969, "epoch": 0.4161103693813974, "grad_norm": 1.1184520721435547, "kl": 0.0940161943435669, "learning_rate": 3.6270089391133378e-06, "loss": 0.0038, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 935 }, { "completion_length": 158.5, "epoch": 0.4165554072096128, "grad_norm": 0.683738648891449, "kl": 0.05458883196115494, "learning_rate": 3.6235403962129218e-06, "loss": 0.0022, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 936 }, { "completion_length": 176.5, "epoch": 0.41700044503782824, "grad_norm": 0.7745259404182434, "kl": 0.053183965384960175, "learning_rate": 3.6200691410854284e-06, "loss": 0.0021, "reward": 0.31316667795181274, "reward_std": 0.0688314288854599, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 937 }, { "completion_length": 134.5, "epoch": 0.4174454828660436, "grad_norm": 0.028597315773367882, "kl": 0.06111491844058037, "learning_rate": 3.61659518211047e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 938 }, { "completion_length": 157.0, "epoch": 0.417890520694259, "grad_norm": 0.05513963848352432, "kl": 0.07339125871658325, "learning_rate": 3.6131185276741846e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 939 }, { "completion_length": 173.0, "epoch": 0.4183355585224744, "grad_norm": 0.7153098583221436, "kl": 0.053828366100788116, "learning_rate": 3.6096391861692183e-06, "loss": 0.0022, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 940 }, { "completion_length": 193.0, "epoch": 0.4187805963506898, "grad_norm": 0.738511323928833, "kl": 0.05540754646062851, "learning_rate": 3.6061571659947032e-06, "loss": 0.0022, "reward": 0.29233333468437195, "reward_std": 0.12961584329605103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29233333468437195, "step": 941 }, { "completion_length": 144.33334350585938, "epoch": 0.4192256341789052, "grad_norm": 0.01766749657690525, "kl": 0.05199790000915527, "learning_rate": 3.602672475556237e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 942 }, { "completion_length": 131.1666717529297, "epoch": 0.4196706720071206, "grad_norm": 0.02660481631755829, "kl": 0.05784265324473381, "learning_rate": 3.5991851232658647e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 943 }, { "completion_length": 191.6666717529297, "epoch": 0.420115709835336, "grad_norm": 0.6994498372077942, "kl": 0.039949461817741394, "learning_rate": 3.595695117542057e-06, "loss": 0.0016, "reward": 0.27116668224334717, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 944 }, { "completion_length": 166.83334350585938, "epoch": 0.4205607476635514, "grad_norm": 0.8509209752082825, "kl": 0.10748060047626495, "learning_rate": 3.5922024668096885e-06, "loss": 0.0043, "reward": 0.29216668009757996, "reward_std": 0.06493817269802094, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 945 }, { "completion_length": 143.1666717529297, "epoch": 0.4210057854917668, "grad_norm": 0.014815707691013813, "kl": 0.04326681047677994, "learning_rate": 3.5887071795000204e-06, "loss": 0.002, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 946 }, { "completion_length": 182.83334350585938, "epoch": 0.4214508233199822, "grad_norm": 0.8161847591400146, "kl": 0.0375329852104187, "learning_rate": 3.585209264050678e-06, "loss": 0.0015, "reward": 0.22949999570846558, "reward_std": 0.12337382137775421, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22949998080730438, "step": 947 }, { "completion_length": 171.1666717529297, "epoch": 0.4218958611481976, "grad_norm": 0.7375685572624207, "kl": 0.057841382920742035, "learning_rate": 3.5817087289056305e-06, "loss": 0.0023, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 948 }, { "completion_length": 155.0, "epoch": 0.42234089897641297, "grad_norm": 0.02155291475355625, "kl": 0.051909781992435455, "learning_rate": 3.5782055825151722e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 949 }, { "completion_length": 171.33334350585938, "epoch": 0.4227859368046284, "grad_norm": 0.635463297367096, "kl": 0.049717407673597336, "learning_rate": 3.5746998333358994e-06, "loss": 0.002, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 950 }, { "completion_length": 135.83334350585938, "epoch": 0.4232309746328438, "grad_norm": 0.04738845303654671, "kl": 0.07275395095348358, "learning_rate": 3.571191489830693e-06, "loss": 0.0032, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 951 }, { "completion_length": 148.5, "epoch": 0.4236760124610592, "grad_norm": 0.734367311000824, "kl": 0.0618259459733963, "learning_rate": 3.567680560468696e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 952 }, { "completion_length": 185.1666717529297, "epoch": 0.4241210502892746, "grad_norm": 0.6942402720451355, "kl": 0.04942780360579491, "learning_rate": 3.564167053725293e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 953 }, { "completion_length": 149.83334350585938, "epoch": 0.42456608811748997, "grad_norm": 0.6157485246658325, "kl": 0.048379119485616684, "learning_rate": 3.560650978082092e-06, "loss": 0.0019, "reward": 0.33416664600372314, "reward_std": 0.10247032344341278, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 954 }, { "completion_length": 181.33334350585938, "epoch": 0.4250111259457054, "grad_norm": 0.6741864085197449, "kl": 0.05080154538154602, "learning_rate": 3.5571323420269e-06, "loss": 0.002, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 955 }, { "completion_length": 164.33334350585938, "epoch": 0.42545616377392076, "grad_norm": 0.6833618879318237, "kl": 0.04868559539318085, "learning_rate": 3.5536111540537076e-06, "loss": 0.0019, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 956 }, { "completion_length": 153.83334350585938, "epoch": 0.4259012016021362, "grad_norm": 0.8367120027542114, "kl": 0.08897934854030609, "learning_rate": 3.5500874226626635e-06, "loss": 0.0036, "reward": 0.2783333361148834, "reward_std": 0.23923347890377045, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27833330631256104, "step": 957 }, { "completion_length": 180.1666717529297, "epoch": 0.4263462394303516, "grad_norm": 0.8038119673728943, "kl": 0.06598465144634247, "learning_rate": 3.546561156360057e-06, "loss": 0.0026, "reward": 0.31300002336502075, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31300002336502075, "step": 958 }, { "completion_length": 167.6666717529297, "epoch": 0.42679127725856697, "grad_norm": 0.8257107734680176, "kl": 0.06047782301902771, "learning_rate": 3.543032363658297e-06, "loss": 0.0024, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 959 }, { "completion_length": 151.1666717529297, "epoch": 0.4272363150867824, "grad_norm": 0.029956277459859848, "kl": 0.05927738547325134, "learning_rate": 3.5395010530758913e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 960 }, { "completion_length": 185.0, "epoch": 0.42768135291499776, "grad_norm": 0.7257969379425049, "kl": 0.043350182473659515, "learning_rate": 3.535967233137424e-06, "loss": 0.0017, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 961 }, { "completion_length": 164.33334350585938, "epoch": 0.4281263907432132, "grad_norm": 0.019633714109659195, "kl": 0.04942721128463745, "learning_rate": 3.5324309123735396e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 962 }, { "completion_length": 200.0, "epoch": 0.42857142857142855, "grad_norm": 0.6552579402923584, "kl": 0.055333852767944336, "learning_rate": 3.5288920993209175e-06, "loss": 0.0022, "reward": 0.2083333432674408, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2083333432674408, "step": 963 }, { "completion_length": 172.0, "epoch": 0.42901646639964397, "grad_norm": 0.8508467078208923, "kl": 0.047447673976421356, "learning_rate": 3.5253508025222545e-06, "loss": 0.0019, "reward": 0.2615000009536743, "reward_std": 0.13879159092903137, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2615000009536743, "step": 964 }, { "completion_length": 199.5, "epoch": 0.4294615042278594, "grad_norm": 0.6771219968795776, "kl": 0.0449552908539772, "learning_rate": 3.5218070305262427e-06, "loss": 0.0018, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 965 }, { "completion_length": 149.5, "epoch": 0.42990654205607476, "grad_norm": 0.046515047550201416, "kl": 0.06530429422855377, "learning_rate": 3.5182607918875495e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 966 }, { "completion_length": 180.1666717529297, "epoch": 0.4303515798842902, "grad_norm": 0.645074188709259, "kl": 0.051890529692173004, "learning_rate": 3.514712095166797e-06, "loss": 0.0021, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 967 }, { "completion_length": 177.5, "epoch": 0.43079661771250555, "grad_norm": 0.7000871896743774, "kl": 0.05255164951086044, "learning_rate": 3.511160948930539e-06, "loss": 0.0021, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 968 }, { "completion_length": 149.1666717529297, "epoch": 0.43124165554072097, "grad_norm": 0.02208912931382656, "kl": 0.05640026181936264, "learning_rate": 3.507607361751248e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 969 }, { "completion_length": 168.6666717529297, "epoch": 0.43168669336893634, "grad_norm": 0.032338086515665054, "kl": 0.059976160526275635, "learning_rate": 3.504051342207282e-06, "loss": 0.0027, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 970 }, { "completion_length": 151.0, "epoch": 0.43213173119715176, "grad_norm": 0.8209261894226074, "kl": 0.064157634973526, "learning_rate": 3.5004928988828748e-06, "loss": 0.0026, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 971 }, { "completion_length": 197.5, "epoch": 0.4325767690253672, "grad_norm": 0.6907802820205688, "kl": 0.05787642300128937, "learning_rate": 3.4969320403681105e-06, "loss": 0.0023, "reward": 0.2709999978542328, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 972 }, { "completion_length": 199.83334350585938, "epoch": 0.43302180685358255, "grad_norm": 0.802642285823822, "kl": 0.044948406517505646, "learning_rate": 3.493368775258904e-06, "loss": 0.0018, "reward": 0.2709999978542328, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2709999978542328, "step": 973 }, { "completion_length": 168.83334350585938, "epoch": 0.43346684468179797, "grad_norm": 0.6813015341758728, "kl": 0.04564934968948364, "learning_rate": 3.489803112156978e-06, "loss": 0.0018, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 974 }, { "completion_length": 161.33334350585938, "epoch": 0.43391188251001334, "grad_norm": 0.028628792613744736, "kl": 0.05609843134880066, "learning_rate": 3.486235059669846e-06, "loss": 0.0025, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 975 }, { "completion_length": 172.83334350585938, "epoch": 0.43435692033822876, "grad_norm": 0.7536810636520386, "kl": 0.07381969690322876, "learning_rate": 3.482664626410787e-06, "loss": 0.003, "reward": 0.3341667056083679, "reward_std": 0.0648086816072464, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33416664600372314, "step": 976 }, { "completion_length": 185.5, "epoch": 0.4348019581664441, "grad_norm": 0.7587660551071167, "kl": 0.04436537250876427, "learning_rate": 3.47909182099883e-06, "loss": 0.0018, "reward": 0.250333309173584, "reward_std": 0.1122509092092514, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250333309173584, "step": 977 }, { "completion_length": 174.6666717529297, "epoch": 0.43524699599465955, "grad_norm": 0.7341634035110474, "kl": 0.04806718975305557, "learning_rate": 3.4755166520587297e-06, "loss": 0.0019, "reward": 0.29216668009757996, "reward_std": 0.10255225747823715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 978 }, { "completion_length": 200.0, "epoch": 0.43569203382287497, "grad_norm": 0.01764507032930851, "kl": 0.046592336148023605, "learning_rate": 3.4719391282209437e-06, "loss": 0.0019, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 979 }, { "completion_length": 132.1666717529297, "epoch": 0.43613707165109034, "grad_norm": 0.921298086643219, "kl": 0.15562453866004944, "learning_rate": 3.4683592581216173e-06, "loss": 0.0062, "reward": 0.3048333525657654, "reward_std": 0.1743220090866089, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.304833322763443, "step": 980 }, { "completion_length": 119.16667175292969, "epoch": 0.43658210947930576, "grad_norm": 0.024678191170096397, "kl": 0.06529170274734497, "learning_rate": 3.464777050402559e-06, "loss": 0.0029, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 981 }, { "completion_length": 144.1666717529297, "epoch": 0.4370271473075211, "grad_norm": 0.016610290855169296, "kl": 0.04939974471926689, "learning_rate": 3.461192513711219e-06, "loss": 0.0023, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 982 }, { "completion_length": 149.83334350585938, "epoch": 0.43747218513573655, "grad_norm": 0.029446417465806007, "kl": 0.05275744944810867, "learning_rate": 3.4576056567006728e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 983 }, { "completion_length": 178.6666717529297, "epoch": 0.4379172229639519, "grad_norm": 0.7193699479103088, "kl": 0.057997625321149826, "learning_rate": 3.454016488029592e-06, "loss": 0.0023, "reward": 0.29216668009757996, "reward_std": 0.10255226492881775, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.29216668009757996, "step": 984 }, { "completion_length": 132.0, "epoch": 0.43836226079216734, "grad_norm": 0.016964778304100037, "kl": 0.052684418857097626, "learning_rate": 3.4504250163622334e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 985 }, { "completion_length": 191.6666717529297, "epoch": 0.43880729862038276, "grad_norm": 0.7094153761863708, "kl": 0.03705962002277374, "learning_rate": 3.446831250368412e-06, "loss": 0.0015, "reward": 0.27116668224334717, "reward_std": 0.09453976154327393, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27116668224334717, "step": 986 }, { "completion_length": 182.5, "epoch": 0.4392523364485981, "grad_norm": 0.7157971858978271, "kl": 0.05741278454661369, "learning_rate": 3.443235198723479e-06, "loss": 0.0023, "reward": 0.35499998927116394, "reward_std": 0.051439281553030014, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 987 }, { "completion_length": 195.0, "epoch": 0.43969737427681355, "grad_norm": 0.7865637540817261, "kl": 0.03509274870157242, "learning_rate": 3.4396368701083073e-06, "loss": 0.0014, "reward": 0.2084999978542328, "reward_std": 0.10238896310329437, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2084999978542328, "step": 988 }, { "completion_length": 162.1666717529297, "epoch": 0.4401424121050289, "grad_norm": 1.0707346200942993, "kl": 0.06345522403717041, "learning_rate": 3.436036273209261e-06, "loss": 0.0025, "reward": 0.33399999141693115, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33399999141693115, "step": 989 }, { "completion_length": 157.6666717529297, "epoch": 0.44058744993324434, "grad_norm": 0.7505767345428467, "kl": 0.06585465371608734, "learning_rate": 3.432433416718184e-06, "loss": 0.0026, "reward": 0.35500001907348633, "reward_std": 0.051439277827739716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35499998927116394, "step": 990 }, { "completion_length": 197.6666717529297, "epoch": 0.4410324877614597, "grad_norm": 0.7079508900642395, "kl": 0.05217638239264488, "learning_rate": 3.428828309332375e-06, "loss": 0.0021, "reward": 0.250166654586792, "reward_std": 0.07937358319759369, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.250166654586792, "step": 991 }, { "completion_length": 125.33333587646484, "epoch": 0.4414775255896751, "grad_norm": 0.022779956459999084, "kl": 0.06337767839431763, "learning_rate": 3.4252209597545634e-06, "loss": 0.0028, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 992 }, { "completion_length": 200.0, "epoch": 0.44192256341789055, "grad_norm": 0.02871246449649334, "kl": 0.045304443687200546, "learning_rate": 3.4216113766928926e-06, "loss": 0.0018, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 993 }, { "completion_length": 196.33334350585938, "epoch": 0.4423676012461059, "grad_norm": 0.7327296137809753, "kl": 0.04095136374235153, "learning_rate": 3.4179995688608996e-06, "loss": 0.0016, "reward": 0.2919999957084656, "reward_std": 0.06506611406803131, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2919999957084656, "step": 994 }, { "completion_length": 179.0, "epoch": 0.44281263907432133, "grad_norm": 0.6812580823898315, "kl": 0.054273735731840134, "learning_rate": 3.414385544977489e-06, "loss": 0.0022, "reward": 0.24016666412353516, "reward_std": 0.23370617628097534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24016666412353516, "step": 995 }, { "completion_length": 147.0, "epoch": 0.4432576769025367, "grad_norm": 0.029301166534423828, "kl": 0.05664734169840813, "learning_rate": 3.4107693137669167e-06, "loss": 0.0026, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 996 }, { "completion_length": 187.83334350585938, "epoch": 0.4437027147307521, "grad_norm": 0.7347012758255005, "kl": 0.0524878203868866, "learning_rate": 3.4071508839587676e-06, "loss": 0.0021, "reward": 0.31299999356269836, "reward_std": 0.06901304423809052, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.312999963760376, "step": 997 }, { "completion_length": 175.83334350585938, "epoch": 0.4441477525589675, "grad_norm": 0.7827162742614746, "kl": 0.05347862094640732, "learning_rate": 3.403530264287931e-06, "loss": 0.0021, "reward": 0.31316667795181274, "reward_std": 0.10506077855825424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31316667795181274, "step": 998 }, { "completion_length": 179.0, "epoch": 0.4445927903871829, "grad_norm": 0.0856582522392273, "kl": 0.05322853848338127, "learning_rate": 3.3999074634945854e-06, "loss": 0.0024, "reward": 0.3760000169277191, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37599998712539673, "step": 999 }, { "completion_length": 193.0, "epoch": 0.44503782821539833, "grad_norm": 0.7221679091453552, "kl": 0.03924994543194771, "learning_rate": 3.396282490324175e-06, "loss": 0.0016, "reward": 0.2711666524410248, "reward_std": 0.09453975409269333, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2711666524410248, "step": 1000 } ], "logging_steps": 1, "max_steps": 2247, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }