diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,24033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6675567423230975, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 196.6666717529297, + "epoch": 0.0004450378282153983, + "grad_norm": 0.7149407267570496, + "kl": 0.00043882918544113636, + "learning_rate": 2.2222222222222224e-08, + "loss": 0.0, + "reward": -0.312666654586792, + "reward_std": 0.39005160331726074, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.312666654586792, + "step": 1 + }, + { + "completion_length": 179.0, + "epoch": 0.0008900756564307966, + "grad_norm": 1.2034916877746582, + "kl": 0.00043298103264532983, + "learning_rate": 4.444444444444445e-08, + "loss": 0.0, + "reward": -0.2953333556652069, + "reward_std": 0.3479785621166229, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.2953333556652069, + "step": 2 + }, + { + "completion_length": 200.0, + "epoch": 0.0013351134846461949, + "grad_norm": 0.00107863440643996, + "kl": 0.0003771593910641968, + "learning_rate": 6.666666666666668e-08, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 3 + }, + { + "completion_length": 200.0, + "epoch": 0.0017801513128615932, + "grad_norm": 0.7310553789138794, + "kl": 0.00044770282693207264, + "learning_rate": 8.88888888888889e-08, + "loss": 0.0, + "reward": -0.085999995470047, + "reward_std": 0.3784494996070862, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.085999995470047, + "step": 4 + }, + { + "completion_length": 174.83334350585938, + "epoch": 0.0022251891410769915, + "grad_norm": 0.861896276473999, + "kl": 0.00042549317004159093, + "learning_rate": 1.1111111111111112e-07, + "loss": 0.0, + "reward": -0.09316667169332504, + "reward_std": 0.2892870306968689, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09316667169332504, + "step": 5 + }, + { + "completion_length": 200.0, + "epoch": 0.0026702269692923898, + "grad_norm": 0.0017526125302538276, + "kl": 0.0004182992852292955, + "learning_rate": 1.3333333333333336e-07, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 6 + }, + { + "completion_length": 166.5, + "epoch": 0.003115264797507788, + "grad_norm": 0.9585131406784058, + "kl": 0.00047177166561596096, + "learning_rate": 1.5555555555555556e-07, + "loss": 0.0, + "reward": -0.33766669034957886, + "reward_std": 0.2008279412984848, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3376666307449341, + "step": 7 + }, + { + "completion_length": 200.0, + "epoch": 0.0035603026257231864, + "grad_norm": 0.6275889873504639, + "kl": 0.0004587940056808293, + "learning_rate": 1.777777777777778e-07, + "loss": 0.0, + "reward": -0.4321666657924652, + "reward_std": 0.2741382122039795, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4321666657924652, + "step": 8 + }, + { + "completion_length": 152.5, + "epoch": 0.004005340453938585, + "grad_norm": 0.7820535898208618, + "kl": 0.0003722615947481245, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "reward": -0.24449998140335083, + "reward_std": 0.2453835904598236, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.24449998140335083, + "step": 9 + }, + { + "completion_length": 200.0, + "epoch": 0.004450378282153983, + "grad_norm": 0.6632036566734314, + "kl": 0.00043898209696635604, + "learning_rate": 2.2222222222222224e-07, + "loss": 0.0, + "reward": -0.006833334919065237, + "reward_std": 0.3229244351387024, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.006833334919065237, + "step": 10 + }, + { + "completion_length": 200.0, + "epoch": 0.004895416110369382, + "grad_norm": 0.7768407464027405, + "kl": 0.00042121915612369776, + "learning_rate": 2.444444444444445e-07, + "loss": 0.0, + "reward": -0.5506666898727417, + "reward_std": 0.06373277306556702, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5506666898727417, + "step": 11 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.0053404539385847796, + "grad_norm": 0.7278463244438171, + "kl": 0.0004068611597176641, + "learning_rate": 2.666666666666667e-07, + "loss": 0.0, + "reward": -0.2864999771118164, + "reward_std": 0.334197998046875, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.2864999771118164, + "step": 12 + }, + { + "completion_length": 200.0, + "epoch": 0.005785491766800178, + "grad_norm": 0.6500892043113708, + "kl": 0.0003313750494271517, + "learning_rate": 2.888888888888889e-07, + "loss": 0.0, + "reward": -0.32199999690055847, + "reward_std": 0.34629470109939575, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.32199999690055847, + "step": 13 + }, + { + "completion_length": 200.0, + "epoch": 0.006230529595015576, + "grad_norm": 0.7193189859390259, + "kl": 0.0003894752007909119, + "learning_rate": 3.111111111111111e-07, + "loss": 0.0, + "reward": -0.08683334290981293, + "reward_std": 0.32831722497940063, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.08683334290981293, + "step": 14 + }, + { + "completion_length": 200.0, + "epoch": 0.006675567423230975, + "grad_norm": 0.6580405235290527, + "kl": 0.00035931920865550637, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.0, + "reward": 0.006666670553386211, + "reward_std": 0.28985628485679626, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.006666670553386211, + "step": 15 + }, + { + "completion_length": 200.0, + "epoch": 0.007120605251446373, + "grad_norm": 0.7829955220222473, + "kl": 0.0004430452245287597, + "learning_rate": 3.555555555555556e-07, + "loss": 0.0, + "reward": -0.4845000207424164, + "reward_std": 0.30278095602989197, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4845000207424164, + "step": 16 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.0075656430796617715, + "grad_norm": 0.7301868796348572, + "kl": 0.0004354792181402445, + "learning_rate": 3.777777777777778e-07, + "loss": 0.0, + "reward": -0.5509999990463257, + "reward_std": 0.09809383004903793, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5509999990463257, + "step": 17 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.00801068090787717, + "grad_norm": 0.627457320690155, + "kl": 0.0004219269612804055, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": -0.382999986410141, + "reward_std": 0.258131742477417, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.382999986410141, + "step": 18 + }, + { + "completion_length": 198.5, + "epoch": 0.008455718736092568, + "grad_norm": 0.6126062273979187, + "kl": 0.00037288008024916053, + "learning_rate": 4.2222222222222226e-07, + "loss": 0.0, + "reward": -0.5198333263397217, + "reward_std": 0.06735106557607651, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5198333859443665, + "step": 19 + }, + { + "completion_length": 200.0, + "epoch": 0.008900756564307966, + "grad_norm": 0.7298072576522827, + "kl": 0.00042569750803522766, + "learning_rate": 4.444444444444445e-07, + "loss": 0.0, + "reward": -0.12316666543483734, + "reward_std": 0.38789814710617065, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.12316666543483734, + "step": 20 + }, + { + "completion_length": 200.0, + "epoch": 0.009345794392523364, + "grad_norm": 0.7508992552757263, + "kl": 0.0003784544242080301, + "learning_rate": 4.666666666666667e-07, + "loss": 0.0, + "reward": 0.007999996654689312, + "reward_std": 0.2865903079509735, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.007999996654689312, + "step": 21 + }, + { + "completion_length": 200.0, + "epoch": 0.009790832220738763, + "grad_norm": 0.7774270176887512, + "kl": 0.0004591545439325273, + "learning_rate": 4.88888888888889e-07, + "loss": 0.0, + "reward": -0.12583334743976593, + "reward_std": 0.3894747495651245, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.12583334743976593, + "step": 22 + }, + { + "completion_length": 178.6666717529297, + "epoch": 0.010235870048954161, + "grad_norm": 0.9822273254394531, + "kl": 0.0003580343909561634, + "learning_rate": 5.111111111111112e-07, + "loss": 0.0, + "reward": -0.40816670656204224, + "reward_std": 0.2941179871559143, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.40816670656204224, + "step": 23 + }, + { + "completion_length": 195.6666717529297, + "epoch": 0.010680907877169559, + "grad_norm": 0.7280099987983704, + "kl": 0.0003752989578060806, + "learning_rate": 5.333333333333335e-07, + "loss": 0.0, + "reward": -0.5036666393280029, + "reward_std": 0.06944254040718079, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5036666393280029, + "step": 24 + }, + { + "completion_length": 194.1666717529297, + "epoch": 0.011125945705384957, + "grad_norm": 0.7669874429702759, + "kl": 0.0003995794686488807, + "learning_rate": 5.555555555555555e-07, + "loss": 0.0, + "reward": -0.1613333374261856, + "reward_std": 0.3338997960090637, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1613333374261856, + "step": 25 + }, + { + "completion_length": 200.0, + "epoch": 0.011570983533600357, + "grad_norm": 0.0017349894624203444, + "kl": 0.00044584478018805385, + "learning_rate": 5.777777777777778e-07, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 26 + }, + { + "completion_length": 200.0, + "epoch": 0.012016021361815754, + "grad_norm": 0.7146498560905457, + "kl": 0.0004711821093223989, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "reward": 0.004333337303251028, + "reward_std": 0.2955717444419861, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.004333337303251028, + "step": 27 + }, + { + "completion_length": 194.0, + "epoch": 0.012461059190031152, + "grad_norm": 0.7583353519439697, + "kl": 0.0003412925580050796, + "learning_rate": 6.222222222222223e-07, + "loss": 0.0, + "reward": -0.32066667079925537, + "reward_std": 0.34830141067504883, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.32066667079925537, + "step": 28 + }, + { + "completion_length": 200.0, + "epoch": 0.01290609701824655, + "grad_norm": 0.6495766639709473, + "kl": 0.0003407001495361328, + "learning_rate": 6.444444444444445e-07, + "loss": 0.0, + "reward": 0.013833334669470787, + "reward_std": 0.2723016142845154, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.013833334669470787, + "step": 29 + }, + { + "completion_length": 200.0, + "epoch": 0.01335113484646195, + "grad_norm": 0.6830177307128906, + "kl": 0.00040841297595761716, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0, + "reward": -0.38466668128967285, + "reward_std": 0.3953356444835663, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.38466668128967285, + "step": 30 + }, + { + "completion_length": 197.0, + "epoch": 0.013796172674677348, + "grad_norm": 0.6479209661483765, + "kl": 0.00035427496186457574, + "learning_rate": 6.88888888888889e-07, + "loss": 0.0, + "reward": -0.41850003600120544, + "reward_std": 0.27061542868614197, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.41850003600120544, + "step": 31 + }, + { + "completion_length": 200.0, + "epoch": 0.014241210502892745, + "grad_norm": 0.001505751977674663, + "kl": 0.00040443878970108926, + "learning_rate": 7.111111111111112e-07, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 32 + }, + { + "completion_length": 194.83334350585938, + "epoch": 0.014686248331108143, + "grad_norm": 0.8729995489120483, + "kl": 0.00045898579992353916, + "learning_rate": 7.333333333333334e-07, + "loss": 0.0, + "reward": -0.4713333547115326, + "reward_std": 0.0908794030547142, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4713333547115326, + "step": 33 + }, + { + "completion_length": 200.0, + "epoch": 0.015131286159323543, + "grad_norm": 0.83029705286026, + "kl": 0.0004323194734752178, + "learning_rate": 7.555555555555556e-07, + "loss": 0.0, + "reward": -0.3368333578109741, + "reward_std": 0.35846197605133057, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.33683332800865173, + "step": 34 + }, + { + "completion_length": 200.0, + "epoch": 0.01557632398753894, + "grad_norm": 0.678088903427124, + "kl": 0.0004401813494041562, + "learning_rate": 7.777777777777779e-07, + "loss": 0.0, + "reward": -0.22800001502037048, + "reward_std": 0.38724154233932495, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.22800001502037048, + "step": 35 + }, + { + "completion_length": 200.0, + "epoch": 0.01602136181575434, + "grad_norm": 0.7426117658615112, + "kl": 0.0003528599627315998, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "reward": -0.19583332538604736, + "reward_std": 0.3525362014770508, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.19583332538604736, + "step": 36 + }, + { + "completion_length": 200.0, + "epoch": 0.016466399643969738, + "grad_norm": 0.0014730676775798202, + "kl": 0.000418979674577713, + "learning_rate": 8.222222222222223e-07, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 37 + }, + { + "completion_length": 200.0, + "epoch": 0.016911437472185136, + "grad_norm": 0.6773275136947632, + "kl": 0.0003312948392704129, + "learning_rate": 8.444444444444445e-07, + "loss": 0.0, + "reward": 0.016833335161209106, + "reward_std": 0.2649531364440918, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.016833335161209106, + "step": 38 + }, + { + "completion_length": 200.0, + "epoch": 0.017356475300400534, + "grad_norm": 0.4805554449558258, + "kl": 0.00037392970989458263, + "learning_rate": 8.666666666666668e-07, + "loss": 0.0, + "reward": -0.4266667068004608, + "reward_std": 0.2752167582511902, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4266667068004608, + "step": 39 + }, + { + "completion_length": 188.33334350585938, + "epoch": 0.017801513128615932, + "grad_norm": 0.8252691626548767, + "kl": 0.00034736632369458675, + "learning_rate": 8.88888888888889e-07, + "loss": 0.0, + "reward": -0.43516668677330017, + "reward_std": 0.10999348759651184, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.43516668677330017, + "step": 40 + }, + { + "completion_length": 195.83334350585938, + "epoch": 0.01824655095683133, + "grad_norm": 0.6390711069107056, + "kl": 0.00039921089773997664, + "learning_rate": 9.111111111111113e-07, + "loss": 0.0, + "reward": -0.44216665625572205, + "reward_std": 0.1345755159854889, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.44216665625572205, + "step": 41 + }, + { + "completion_length": 200.0, + "epoch": 0.018691588785046728, + "grad_norm": 0.584726095199585, + "kl": 0.000410672917496413, + "learning_rate": 9.333333333333334e-07, + "loss": 0.0, + "reward": -0.08250001072883606, + "reward_std": 0.3218010365962982, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.08250000327825546, + "step": 42 + }, + { + "completion_length": 200.0, + "epoch": 0.01913662661326213, + "grad_norm": 0.7584457993507385, + "kl": 0.0004279993590898812, + "learning_rate": 9.555555555555556e-07, + "loss": 0.0, + "reward": -0.1798333376646042, + "reward_std": 0.4852597117424011, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1798333376646042, + "step": 43 + }, + { + "completion_length": 200.0, + "epoch": 0.019581664441477527, + "grad_norm": 0.6866025328636169, + "kl": 0.0004159708914812654, + "learning_rate": 9.77777777777778e-07, + "loss": 0.0, + "reward": 0.022333335131406784, + "reward_std": 0.25148093700408936, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.022333335131406784, + "step": 44 + }, + { + "completion_length": 200.0, + "epoch": 0.020026702269692925, + "grad_norm": 0.6524875164031982, + "kl": 0.0004553616454359144, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0, + "reward": -0.30799999833106995, + "reward_std": 0.33663925528526306, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.30799999833106995, + "step": 45 + }, + { + "completion_length": 197.1666717529297, + "epoch": 0.020471740097908322, + "grad_norm": 1.1161606311798096, + "kl": 0.0004416520241647959, + "learning_rate": 1.0222222222222223e-06, + "loss": 0.0, + "reward": -0.609333336353302, + "reward_std": 0.2037426233291626, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.609333336353302, + "step": 46 + }, + { + "completion_length": 200.0, + "epoch": 0.02091677792612372, + "grad_norm": 0.0014684926718473434, + "kl": 0.0004343845648691058, + "learning_rate": 1.0444444444444445e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 47 + }, + { + "completion_length": 200.0, + "epoch": 0.021361815754339118, + "grad_norm": 0.7850491404533386, + "kl": 0.0004720585129689425, + "learning_rate": 1.066666666666667e-06, + "loss": 0.0, + "reward": -0.24550001323223114, + "reward_std": 0.4167482554912567, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.24550001323223114, + "step": 48 + }, + { + "completion_length": 174.0, + "epoch": 0.021806853582554516, + "grad_norm": 0.6916747093200684, + "kl": 0.0003871291410177946, + "learning_rate": 1.0888888888888889e-06, + "loss": 0.0, + "reward": -0.3348333239555359, + "reward_std": 0.15262427926063538, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3348333537578583, + "step": 49 + }, + { + "completion_length": 200.0, + "epoch": 0.022251891410769914, + "grad_norm": 0.6415843367576599, + "kl": 0.0004480450297705829, + "learning_rate": 1.111111111111111e-06, + "loss": 0.0, + "reward": -0.5586666464805603, + "reward_std": 0.04939500615000725, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5586667060852051, + "step": 50 + }, + { + "completion_length": 200.0, + "epoch": 0.022696929238985315, + "grad_norm": 0.7905615568161011, + "kl": 0.00043076984002254903, + "learning_rate": 1.1333333333333334e-06, + "loss": 0.0, + "reward": -0.5353333353996277, + "reward_std": 0.01973491534590721, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5353333353996277, + "step": 51 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.023141967067200713, + "grad_norm": 0.6714709401130676, + "kl": 0.000462042837170884, + "learning_rate": 1.1555555555555556e-06, + "loss": 0.0, + "reward": -0.45649999380111694, + "reward_std": 0.0799018144607544, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.45650002360343933, + "step": 52 + }, + { + "completion_length": 200.0, + "epoch": 0.02358700489541611, + "grad_norm": 0.002091527683660388, + "kl": 0.0004879847401753068, + "learning_rate": 1.1777777777777778e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 53 + }, + { + "completion_length": 200.0, + "epoch": 0.02403204272363151, + "grad_norm": 0.7136774659156799, + "kl": 0.00044702840386889875, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0, + "reward": 0.019333332777023315, + "reward_std": 0.25882941484451294, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.019333332777023315, + "step": 54 + }, + { + "completion_length": 191.6666717529297, + "epoch": 0.024477080551846907, + "grad_norm": 1.0719506740570068, + "kl": 0.00041414215229451656, + "learning_rate": 1.2222222222222223e-06, + "loss": 0.0, + "reward": 0.0806666687130928, + "reward_std": 0.10859404504299164, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0806666687130928, + "step": 55 + }, + { + "completion_length": 200.0, + "epoch": 0.024922118380062305, + "grad_norm": 0.002473922213539481, + "kl": 0.0005147390766069293, + "learning_rate": 1.2444444444444445e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 56 + }, + { + "completion_length": 200.0, + "epoch": 0.025367156208277702, + "grad_norm": 0.7581794857978821, + "kl": 0.0004257292894180864, + "learning_rate": 1.2666666666666669e-06, + "loss": 0.0, + "reward": -0.09583333134651184, + "reward_std": 0.34220486879348755, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09583333134651184, + "step": 57 + }, + { + "completion_length": 200.0, + "epoch": 0.0258121940364931, + "grad_norm": 0.6923669576644897, + "kl": 0.0004169998865108937, + "learning_rate": 1.288888888888889e-06, + "loss": 0.0, + "reward": -0.3475000262260437, + "reward_std": 0.3798324763774872, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3475000262260437, + "step": 58 + }, + { + "completion_length": 109.0, + "epoch": 0.0262572318647085, + "grad_norm": 1.0712668895721436, + "kl": 0.00041128776501864195, + "learning_rate": 1.3111111111111112e-06, + "loss": 0.0, + "reward": -0.06133333593606949, + "reward_std": 0.20354819297790527, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.06133333966135979, + "step": 59 + }, + { + "completion_length": 200.0, + "epoch": 0.0267022696929239, + "grad_norm": 0.6178426742553711, + "kl": 0.0004559112712740898, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0, + "reward": 0.023666664958000183, + "reward_std": 0.2482149600982666, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.023666664958000183, + "step": 60 + }, + { + "completion_length": 200.0, + "epoch": 0.027147307521139297, + "grad_norm": 0.6320557594299316, + "kl": 0.0005056136287748814, + "learning_rate": 1.3555555555555558e-06, + "loss": 0.0, + "reward": -0.4624999761581421, + "reward_std": 0.29319530725479126, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4624999761581421, + "step": 61 + }, + { + "completion_length": 196.83334350585938, + "epoch": 0.027592345349354695, + "grad_norm": 0.7537997364997864, + "kl": 0.0004290025099180639, + "learning_rate": 1.377777777777778e-06, + "loss": 0.0, + "reward": -0.48366665840148926, + "reward_std": 0.12520170211791992, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.48366665840148926, + "step": 62 + }, + { + "completion_length": 200.0, + "epoch": 0.028037383177570093, + "grad_norm": 0.0018171067349612713, + "kl": 0.0004468559636734426, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 63 + }, + { + "completion_length": 200.0, + "epoch": 0.02848242100578549, + "grad_norm": 0.6882118582725525, + "kl": 0.0004662362043745816, + "learning_rate": 1.4222222222222223e-06, + "loss": 0.0, + "reward": 0.01850000023841858, + "reward_std": 0.26087066531181335, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.01850000023841858, + "step": 64 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.02892745883400089, + "grad_norm": 0.7403289675712585, + "kl": 0.00043322655255906284, + "learning_rate": 1.4444444444444445e-06, + "loss": 0.0, + "reward": -0.47933337092399597, + "reward_std": 0.09026111662387848, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.47933337092399597, + "step": 65 + }, + { + "completion_length": 200.0, + "epoch": 0.029372496662216287, + "grad_norm": 0.0017793107545003295, + "kl": 0.00043216149788349867, + "learning_rate": 1.4666666666666669e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 66 + }, + { + "completion_length": 199.0, + "epoch": 0.029817534490431688, + "grad_norm": 0.6182725429534912, + "kl": 0.00037733028875663877, + "learning_rate": 1.4888888888888888e-06, + "loss": 0.0, + "reward": -0.07999999821186066, + "reward_std": 0.32039913535118103, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.08000000566244125, + "step": 67 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.030262572318647086, + "grad_norm": 1.1008906364440918, + "kl": 0.0003855052054859698, + "learning_rate": 1.5111111111111112e-06, + "loss": 0.0, + "reward": -0.04516666755080223, + "reward_std": 0.15019378066062927, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.04516666755080223, + "step": 68 + }, + { + "completion_length": 200.0, + "epoch": 0.030707610146862484, + "grad_norm": 0.6984473466873169, + "kl": 0.00044030696153640747, + "learning_rate": 1.5333333333333334e-06, + "loss": 0.0, + "reward": -0.34150001406669617, + "reward_std": 0.364574134349823, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.34150001406669617, + "step": 69 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.03115264797507788, + "grad_norm": 0.6155555248260498, + "kl": 0.00039145484333857894, + "learning_rate": 1.5555555555555558e-06, + "loss": 0.0, + "reward": -0.3800000548362732, + "reward_std": 0.2700370252132416, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3799999952316284, + "step": 70 + }, + { + "completion_length": 200.0, + "epoch": 0.03159768580329328, + "grad_norm": 0.5811487436294556, + "kl": 0.0003899557632394135, + "learning_rate": 1.5777777777777778e-06, + "loss": 0.0, + "reward": 0.014166663400828838, + "reward_std": 0.2714851498603821, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.014166663400828838, + "step": 71 + }, + { + "completion_length": 200.0, + "epoch": 0.03204272363150868, + "grad_norm": 0.6491996049880981, + "kl": 0.0004053341690450907, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0, + "reward": 0.01066666841506958, + "reward_std": 0.2800583243370056, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.01066666841506958, + "step": 72 + }, + { + "completion_length": 192.6666717529297, + "epoch": 0.032487761459724075, + "grad_norm": 0.7621211409568787, + "kl": 0.0005313883302733302, + "learning_rate": 1.6222222222222223e-06, + "loss": 0.0, + "reward": -0.4951666593551636, + "reward_std": 0.05715736746788025, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4951666593551636, + "step": 73 + }, + { + "completion_length": 200.0, + "epoch": 0.032932799287939477, + "grad_norm": 0.7010623216629028, + "kl": 0.00042472081258893013, + "learning_rate": 1.6444444444444447e-06, + "loss": 0.0, + "reward": -0.0989999994635582, + "reward_std": 0.34915727376937866, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.0989999994635582, + "step": 74 + }, + { + "completion_length": 200.0, + "epoch": 0.03337783711615487, + "grad_norm": 0.7583237886428833, + "kl": 0.00046256266068667173, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0, + "reward": -0.09333333373069763, + "reward_std": 0.33824294805526733, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09333333373069763, + "step": 75 + }, + { + "completion_length": 200.0, + "epoch": 0.03382287494437027, + "grad_norm": 0.7639651894569397, + "kl": 0.00039581506280228496, + "learning_rate": 1.688888888888889e-06, + "loss": 0.0, + "reward": -0.2276666760444641, + "reward_std": 0.3896222710609436, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.2276666760444641, + "step": 76 + }, + { + "completion_length": 164.1666717529297, + "epoch": 0.03426791277258567, + "grad_norm": 0.8515862822532654, + "kl": 0.0004627097805496305, + "learning_rate": 1.7111111111111112e-06, + "loss": 0.0, + "reward": -0.3383333683013916, + "reward_std": 0.19050845503807068, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3383333683013916, + "step": 77 + }, + { + "completion_length": 200.0, + "epoch": 0.03471295060080107, + "grad_norm": 0.6962697505950928, + "kl": 0.000421873846789822, + "learning_rate": 1.7333333333333336e-06, + "loss": 0.0, + "reward": 0.013333330862224102, + "reward_std": 0.2735263705253601, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.013333330862224102, + "step": 78 + }, + { + "completion_length": 200.0, + "epoch": 0.03515798842901647, + "grad_norm": 0.6623450517654419, + "kl": 0.0004432189743965864, + "learning_rate": 1.7555555555555556e-06, + "loss": 0.0, + "reward": -0.0011666715145111084, + "reward_std": 0.3090439736843109, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.0011666715145111084, + "step": 79 + }, + { + "completion_length": 145.5, + "epoch": 0.035603026257231864, + "grad_norm": 0.9290313720703125, + "kl": 0.0004976950585842133, + "learning_rate": 1.777777777777778e-06, + "loss": 0.0, + "reward": -0.1913333386182785, + "reward_std": 0.20369061827659607, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1913333535194397, + "step": 80 + }, + { + "completion_length": 200.0, + "epoch": 0.036048064085447265, + "grad_norm": 0.00226940237917006, + "kl": 0.00044577824883162975, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 81 + }, + { + "completion_length": 200.0, + "epoch": 0.03649310191366266, + "grad_norm": 0.7341117262840271, + "kl": 0.0003961599140893668, + "learning_rate": 1.8222222222222225e-06, + "loss": 0.0, + "reward": 0.012833337299525738, + "reward_std": 0.27475112676620483, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.012833337299525738, + "step": 82 + }, + { + "completion_length": 176.83334350585938, + "epoch": 0.03693813974187806, + "grad_norm": 1.1084420680999756, + "kl": 0.0004944322863593698, + "learning_rate": 1.8444444444444445e-06, + "loss": 0.0, + "reward": -0.4231666922569275, + "reward_std": 0.38362035155296326, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4231666922569275, + "step": 83 + }, + { + "completion_length": 200.0, + "epoch": 0.037383177570093455, + "grad_norm": 0.5538046956062317, + "kl": 0.0004035194288007915, + "learning_rate": 1.8666666666666669e-06, + "loss": 0.0, + "reward": 0.020999997854232788, + "reward_std": 0.2547469735145569, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.020999997854232788, + "step": 84 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.037828215398308856, + "grad_norm": 0.6329994797706604, + "kl": 0.00045469467295333743, + "learning_rate": 1.888888888888889e-06, + "loss": 0.0, + "reward": -0.1913333386182785, + "reward_std": 0.3470242917537689, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1913333386182785, + "step": 85 + }, + { + "completion_length": 200.0, + "epoch": 0.03827325322652426, + "grad_norm": 0.7051041722297668, + "kl": 0.00048318642075173557, + "learning_rate": 1.9111111111111112e-06, + "loss": 0.0, + "reward": 0.012499998323619366, + "reward_std": 0.27556759119033813, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.012499998323619366, + "step": 86 + }, + { + "completion_length": 200.0, + "epoch": 0.03871829105473965, + "grad_norm": 0.9833978414535522, + "kl": 0.0004773414693772793, + "learning_rate": 1.9333333333333336e-06, + "loss": 0.0, + "reward": -0.37816667556762695, + "reward_std": 0.35067784786224365, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.37816667556762695, + "step": 87 + }, + { + "completion_length": 200.0, + "epoch": 0.039163328882955054, + "grad_norm": 0.7608519196510315, + "kl": 0.0004284613241907209, + "learning_rate": 1.955555555555556e-06, + "loss": 0.0, + "reward": -0.4285000264644623, + "reward_std": 0.27370041608810425, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4285000264644623, + "step": 88 + }, + { + "completion_length": 200.0, + "epoch": 0.03960836671117045, + "grad_norm": 0.8064699769020081, + "kl": 0.00046465283958241343, + "learning_rate": 1.977777777777778e-06, + "loss": 0.0, + "reward": -0.6141666769981384, + "reward_std": 0.05563242360949516, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.6141666769981384, + "step": 89 + }, + { + "completion_length": 200.0, + "epoch": 0.04005340453938585, + "grad_norm": 0.7693816423416138, + "kl": 0.0004991068853996694, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0, + "reward": -0.21366667747497559, + "reward_std": 0.42142030596733093, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.21366667747497559, + "step": 90 + }, + { + "completion_length": 170.83334350585938, + "epoch": 0.040498442367601244, + "grad_norm": 0.8475862145423889, + "kl": 0.0004466826212592423, + "learning_rate": 2.0222222222222223e-06, + "loss": 0.0, + "reward": -0.25600001215934753, + "reward_std": 0.3458733558654785, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.25600001215934753, + "step": 91 + }, + { + "completion_length": 200.0, + "epoch": 0.040943480195816645, + "grad_norm": 0.001588360988534987, + "kl": 0.00045517744729295373, + "learning_rate": 2.0444444444444447e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 92 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.04138851802403204, + "grad_norm": 0.7196416854858398, + "kl": 0.0005219130543991923, + "learning_rate": 2.0666666666666666e-06, + "loss": 0.0, + "reward": -0.28700000047683716, + "reward_std": 0.23125484585762024, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.28700003027915955, + "step": 93 + }, + { + "completion_length": 200.0, + "epoch": 0.04183355585224744, + "grad_norm": 0.0016818700823932886, + "kl": 0.0004399843164719641, + "learning_rate": 2.088888888888889e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 94 + }, + { + "completion_length": 187.33334350585938, + "epoch": 0.04227859368046284, + "grad_norm": 0.662451446056366, + "kl": 0.0005664011696353555, + "learning_rate": 2.1111111111111114e-06, + "loss": 0.0, + "reward": -0.07083334028720856, + "reward_std": 0.4042125344276428, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.07083334028720856, + "step": 95 + }, + { + "completion_length": 200.0, + "epoch": 0.042723631508678236, + "grad_norm": 0.8778610229492188, + "kl": 0.0005331834545359015, + "learning_rate": 2.133333333333334e-06, + "loss": 0.0, + "reward": 0.013499995693564415, + "reward_std": 0.27311810851097107, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.013499995693564415, + "step": 96 + }, + { + "completion_length": 200.0, + "epoch": 0.04316866933689364, + "grad_norm": 0.7499815821647644, + "kl": 0.0005722627975046635, + "learning_rate": 2.1555555555555558e-06, + "loss": 0.0, + "reward": -0.36016663908958435, + "reward_std": 0.37692034244537354, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.36016666889190674, + "step": 97 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.04361370716510903, + "grad_norm": 0.6992683410644531, + "kl": 0.0006015513790771365, + "learning_rate": 2.1777777777777777e-06, + "loss": 0.0, + "reward": -0.3303333520889282, + "reward_std": 0.3531128168106079, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3303333520889282, + "step": 98 + }, + { + "completion_length": 200.0, + "epoch": 0.044058744993324434, + "grad_norm": 0.7034227848052979, + "kl": 0.0005352528532966971, + "learning_rate": 2.2e-06, + "loss": 0.0, + "reward": -0.5301666855812073, + "reward_std": 0.03528975695371628, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5301666855812073, + "step": 99 + }, + { + "completion_length": 200.0, + "epoch": 0.04450378282153983, + "grad_norm": 0.7136642932891846, + "kl": 0.0005618830909952521, + "learning_rate": 2.222222222222222e-06, + "loss": 0.0, + "reward": 0.023666664958000183, + "reward_std": 0.2482149600982666, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.023666664958000183, + "step": 100 + }, + { + "completion_length": 200.0, + "epoch": 0.04494882064975523, + "grad_norm": 0.9584755301475525, + "kl": 0.0008303936338052154, + "learning_rate": 2.2444444444444445e-06, + "loss": 0.0, + "reward": -0.34933334589004517, + "reward_std": 0.369188129901886, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.34933334589004517, + "step": 101 + }, + { + "completion_length": 200.0, + "epoch": 0.04539385847797063, + "grad_norm": 0.810990035533905, + "kl": 0.0007378787267953157, + "learning_rate": 2.266666666666667e-06, + "loss": 0.0, + "reward": -0.5558333396911621, + "reward_std": 0.02642286941409111, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5558333396911621, + "step": 102 + }, + { + "completion_length": 200.0, + "epoch": 0.045838896306186025, + "grad_norm": 0.8462039232254028, + "kl": 0.0007519976934418082, + "learning_rate": 2.2888888888888892e-06, + "loss": 0.0, + "reward": -0.5571666955947876, + "reward_std": 0.042873844504356384, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5571666955947876, + "step": 103 + }, + { + "completion_length": 152.33334350585938, + "epoch": 0.046283934134401426, + "grad_norm": 0.8582832217216492, + "kl": 0.0006616115570068359, + "learning_rate": 2.311111111111111e-06, + "loss": 0.0, + "reward": -0.20516666769981384, + "reward_std": 0.3065997064113617, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.20516668260097504, + "step": 104 + }, + { + "completion_length": 200.0, + "epoch": 0.04672897196261682, + "grad_norm": 0.751700758934021, + "kl": 0.0007129245204851031, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.0, + "reward": -0.24383334815502167, + "reward_std": 0.40426644682884216, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.24383334815502167, + "step": 105 + }, + { + "completion_length": 200.0, + "epoch": 0.04717400979083222, + "grad_norm": 0.7948376536369324, + "kl": 0.0009101564064621925, + "learning_rate": 2.3555555555555555e-06, + "loss": 0.0, + "reward": -0.26483333110809326, + "reward_std": 0.44175583124160767, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.26483333110809326, + "step": 106 + }, + { + "completion_length": 200.0, + "epoch": 0.047619047619047616, + "grad_norm": 0.6845387816429138, + "kl": 0.0005481390398927033, + "learning_rate": 2.377777777777778e-06, + "loss": 0.0, + "reward": -0.08950000256299973, + "reward_std": 0.3333369195461273, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.08950000256299973, + "step": 107 + }, + { + "completion_length": 174.33334350585938, + "epoch": 0.04806408544726302, + "grad_norm": 0.9041478633880615, + "kl": 0.0010797386057674885, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0, + "reward": -0.3813333511352539, + "reward_std": 0.1361376941204071, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3813333511352539, + "step": 108 + }, + { + "completion_length": 200.0, + "epoch": 0.04850912327547842, + "grad_norm": 0.8546884059906006, + "kl": 0.0009791944175958633, + "learning_rate": 2.4222222222222223e-06, + "loss": 0.0, + "reward": -0.17666666209697723, + "reward_std": 0.3318732678890228, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.17666666209697723, + "step": 109 + }, + { + "completion_length": 200.0, + "epoch": 0.04895416110369381, + "grad_norm": 0.002719539450481534, + "kl": 0.0005873936461284757, + "learning_rate": 2.4444444444444447e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 110 + }, + { + "completion_length": 200.0, + "epoch": 0.049399198931909215, + "grad_norm": 0.7770993113517761, + "kl": 0.0009914538823068142, + "learning_rate": 2.466666666666667e-06, + "loss": 0.0, + "reward": -0.3736667037010193, + "reward_std": 0.3873085379600525, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3736667037010193, + "step": 111 + }, + { + "completion_length": 200.0, + "epoch": 0.04984423676012461, + "grad_norm": 0.6850268244743347, + "kl": 0.0007792222313582897, + "learning_rate": 2.488888888888889e-06, + "loss": 0.0, + "reward": -0.5395000576972961, + "reward_std": 0.025041967630386353, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5395000576972961, + "step": 112 + }, + { + "completion_length": 198.33334350585938, + "epoch": 0.05028927458834001, + "grad_norm": 0.7239099144935608, + "kl": 0.001012115739285946, + "learning_rate": 2.5111111111111114e-06, + "loss": 0.0, + "reward": -0.42133331298828125, + "reward_std": 0.27339982986450195, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.42133331298828125, + "step": 113 + }, + { + "completion_length": 200.0, + "epoch": 0.050734312416555405, + "grad_norm": 0.5771994590759277, + "kl": 0.0015950507950037718, + "learning_rate": 2.5333333333333338e-06, + "loss": 0.0001, + "reward": -0.621666669845581, + "reward_std": 0.031443070620298386, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.621666669845581, + "step": 114 + }, + { + "completion_length": 187.6666717529297, + "epoch": 0.051179350244770806, + "grad_norm": 0.8059555292129517, + "kl": 0.0011016380740329623, + "learning_rate": 2.5555555555555557e-06, + "loss": 0.0, + "reward": -0.31283336877822876, + "reward_std": 0.25179630517959595, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.31283336877822876, + "step": 115 + }, + { + "completion_length": 200.0, + "epoch": 0.0516243880729862, + "grad_norm": 0.7699070572853088, + "kl": 0.000991647131741047, + "learning_rate": 2.577777777777778e-06, + "loss": 0.0, + "reward": -0.4051666855812073, + "reward_std": 0.26059579849243164, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4051666855812073, + "step": 116 + }, + { + "completion_length": 200.0, + "epoch": 0.0520694259012016, + "grad_norm": 0.6547927856445312, + "kl": 0.0008666824433021247, + "learning_rate": 2.6e-06, + "loss": 0.0, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 117 + }, + { + "completion_length": 198.1666717529297, + "epoch": 0.052514463729417, + "grad_norm": 0.8903563022613525, + "kl": 0.0012721801176667213, + "learning_rate": 2.6222222222222225e-06, + "loss": 0.0001, + "reward": -0.5, + "reward_std": 0.10164842009544373, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.5, + "step": 118 + }, + { + "completion_length": 200.0, + "epoch": 0.0529595015576324, + "grad_norm": 0.7130769491195679, + "kl": 0.0009466246119700372, + "learning_rate": 2.6444444444444444e-06, + "loss": 0.0, + "reward": -0.07766667008399963, + "reward_std": 0.36592990159988403, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.07766667008399963, + "step": 119 + }, + { + "completion_length": 200.0, + "epoch": 0.0534045393858478, + "grad_norm": 0.010055916383862495, + "kl": 0.0016336208209395409, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 120 + }, + { + "completion_length": 200.0, + "epoch": 0.05384957721406319, + "grad_norm": 0.7626153826713562, + "kl": 0.0019612584728747606, + "learning_rate": 2.6888888888888892e-06, + "loss": 0.0001, + "reward": -0.44099998474121094, + "reward_std": 0.27870485186576843, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4410000443458557, + "step": 121 + }, + { + "completion_length": 200.0, + "epoch": 0.054294615042278595, + "grad_norm": 0.7897221446037292, + "kl": 0.001940418384037912, + "learning_rate": 2.7111111111111116e-06, + "loss": 0.0001, + "reward": -0.20816665887832642, + "reward_std": 0.36510735750198364, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.20816665887832642, + "step": 122 + }, + { + "completion_length": 200.0, + "epoch": 0.05473965287049399, + "grad_norm": 0.004020801745355129, + "kl": 0.0008703676867298782, + "learning_rate": 2.7333333333333336e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 123 + }, + { + "completion_length": 200.0, + "epoch": 0.05518469069870939, + "grad_norm": 0.005421373061835766, + "kl": 0.0008717196760699153, + "learning_rate": 2.755555555555556e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 124 + }, + { + "completion_length": 178.33334350585938, + "epoch": 0.05562972852692479, + "grad_norm": 0.8171259164810181, + "kl": 0.0014178266283124685, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.0001, + "reward": -0.16316668689250946, + "reward_std": 0.3424929082393646, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.16316668689250946, + "step": 125 + }, + { + "completion_length": 200.0, + "epoch": 0.056074766355140186, + "grad_norm": 0.811120331287384, + "kl": 0.00203678198158741, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0001, + "reward": -0.4819999933242798, + "reward_std": 0.3060758113861084, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4819999933242798, + "step": 126 + }, + { + "completion_length": 200.0, + "epoch": 0.05651980418335559, + "grad_norm": 0.004288077354431152, + "kl": 0.00076559919398278, + "learning_rate": 2.8222222222222223e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 127 + }, + { + "completion_length": 200.0, + "epoch": 0.05696484201157098, + "grad_norm": 0.7662628293037415, + "kl": 0.0013816291466355324, + "learning_rate": 2.8444444444444446e-06, + "loss": 0.0001, + "reward": -0.4233333468437195, + "reward_std": 0.2736119031906128, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.4233333468437195, + "step": 128 + }, + { + "completion_length": 200.0, + "epoch": 0.05740987983978638, + "grad_norm": 0.7000979781150818, + "kl": 0.0022010253742337227, + "learning_rate": 2.866666666666667e-06, + "loss": 0.0001, + "reward": -0.43416666984558105, + "reward_std": 0.2743176221847534, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.43416666984558105, + "step": 129 + }, + { + "completion_length": 189.5, + "epoch": 0.05785491766800178, + "grad_norm": 0.6776527762413025, + "kl": 0.0026975106447935104, + "learning_rate": 2.888888888888889e-06, + "loss": 0.0001, + "reward": -0.035999998450279236, + "reward_std": 0.27313145995140076, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.03599999472498894, + "step": 130 + }, + { + "completion_length": 200.0, + "epoch": 0.05829995549621718, + "grad_norm": 0.7056086659431458, + "kl": 0.001560688717290759, + "learning_rate": 2.9111111111111114e-06, + "loss": 0.0001, + "reward": -0.31833332777023315, + "reward_std": 0.3454664647579193, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.31833332777023315, + "step": 131 + }, + { + "completion_length": 172.5, + "epoch": 0.05874499332443257, + "grad_norm": 1.0837831497192383, + "kl": 0.0017425650730729103, + "learning_rate": 2.9333333333333338e-06, + "loss": 0.0001, + "reward": -0.3088333308696747, + "reward_std": 0.3001648783683777, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3088333308696747, + "step": 132 + }, + { + "completion_length": 200.0, + "epoch": 0.059190031152647975, + "grad_norm": 0.7270302176475525, + "kl": 0.0014150540810078382, + "learning_rate": 2.955555555555556e-06, + "loss": 0.0001, + "reward": -0.53083336353302, + "reward_std": 0.02674633078277111, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.53083336353302, + "step": 133 + }, + { + "completion_length": 181.5, + "epoch": 0.059635068980863376, + "grad_norm": 0.6710302233695984, + "kl": 0.0028289398178458214, + "learning_rate": 2.9777777777777777e-06, + "loss": 0.0001, + "reward": -0.15183335542678833, + "reward_std": 0.3105810284614563, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.15183335542678833, + "step": 134 + }, + { + "completion_length": 200.0, + "epoch": 0.06008010680907877, + "grad_norm": 0.8479624390602112, + "kl": 0.0018316200003027916, + "learning_rate": 3e-06, + "loss": 0.0001, + "reward": 0.02383333444595337, + "reward_std": 0.24780671298503876, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02383333444595337, + "step": 135 + }, + { + "completion_length": 200.0, + "epoch": 0.06052514463729417, + "grad_norm": 0.8236755728721619, + "kl": 0.002227437449619174, + "learning_rate": 3.0222222222222225e-06, + "loss": 0.0001, + "reward": -0.3340000510215759, + "reward_std": 0.35908883810043335, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3340000510215759, + "step": 136 + }, + { + "completion_length": 200.0, + "epoch": 0.060970182465509566, + "grad_norm": 0.011570720933377743, + "kl": 0.0022140974178910255, + "learning_rate": 3.044444444444445e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 137 + }, + { + "completion_length": 200.0, + "epoch": 0.06141522029372497, + "grad_norm": 0.7572616338729858, + "kl": 0.003316813614219427, + "learning_rate": 3.066666666666667e-06, + "loss": 0.0001, + "reward": -0.18433332443237305, + "reward_std": 0.340387225151062, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.18433332443237305, + "step": 138 + }, + { + "completion_length": 200.0, + "epoch": 0.06186025812194036, + "grad_norm": 0.00493138050660491, + "kl": 0.0011617927812039852, + "learning_rate": 3.088888888888889e-06, + "loss": 0.0, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 139 + }, + { + "completion_length": 200.0, + "epoch": 0.06230529595015576, + "grad_norm": 0.6994268894195557, + "kl": 0.003730988595634699, + "learning_rate": 3.1111111111111116e-06, + "loss": 0.0001, + "reward": -0.10350000858306885, + "reward_std": 0.35402247309684753, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.10350000858306885, + "step": 140 + }, + { + "completion_length": 188.6666717529297, + "epoch": 0.06275033377837116, + "grad_norm": 0.7533054351806641, + "kl": 0.007150403223931789, + "learning_rate": 3.133333333333334e-06, + "loss": 0.0003, + "reward": -0.21416668593883514, + "reward_std": 0.36423152685165405, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.21416667103767395, + "step": 141 + }, + { + "completion_length": 200.0, + "epoch": 0.06319537160658656, + "grad_norm": 0.853130042552948, + "kl": 0.0037856251001358032, + "learning_rate": 3.1555555555555555e-06, + "loss": 0.0002, + "reward": -0.45483335852622986, + "reward_std": 0.23466865718364716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.45483335852622986, + "step": 142 + }, + { + "completion_length": 184.5, + "epoch": 0.06364040943480195, + "grad_norm": 0.6948954463005066, + "kl": 0.003319720271974802, + "learning_rate": 3.177777777777778e-06, + "loss": 0.0001, + "reward": -0.33100003004074097, + "reward_std": 0.12057197093963623, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.33100003004074097, + "step": 143 + }, + { + "completion_length": 200.0, + "epoch": 0.06408544726301736, + "grad_norm": 0.6649389266967773, + "kl": 0.004403320141136646, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0002, + "reward": -0.33233335614204407, + "reward_std": 0.3574983775615692, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3323333263397217, + "step": 144 + }, + { + "completion_length": 200.0, + "epoch": 0.06453048509123276, + "grad_norm": 0.008188747800886631, + "kl": 0.0016727236798033118, + "learning_rate": 3.2222222222222227e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 145 + }, + { + "completion_length": 197.0, + "epoch": 0.06497552291944815, + "grad_norm": 0.7850582599639893, + "kl": 0.004564880859106779, + "learning_rate": 3.2444444444444446e-06, + "loss": 0.0002, + "reward": -0.18733333051204681, + "reward_std": 0.36280059814453125, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.18733333051204681, + "step": 146 + }, + { + "completion_length": 200.0, + "epoch": 0.06542056074766354, + "grad_norm": 0.8551555275917053, + "kl": 0.005018829368054867, + "learning_rate": 3.266666666666667e-06, + "loss": 0.0002, + "reward": -0.22599999606609344, + "reward_std": 0.3867609202861786, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.22599999606609344, + "step": 147 + }, + { + "completion_length": 200.0, + "epoch": 0.06586559857587895, + "grad_norm": 0.011043811216950417, + "kl": 0.0033470559865236282, + "learning_rate": 3.2888888888888894e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 148 + }, + { + "completion_length": 187.6666717529297, + "epoch": 0.06631063640409435, + "grad_norm": 0.7791420221328735, + "kl": 0.007582447957247496, + "learning_rate": 3.3111111111111118e-06, + "loss": 0.0003, + "reward": -0.24450001120567322, + "reward_std": 0.23151740431785583, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.24450001120567322, + "step": 149 + }, + { + "completion_length": 200.0, + "epoch": 0.06675567423230974, + "grad_norm": 0.7961557507514954, + "kl": 0.005281232297420502, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0002, + "reward": 0.025166666135191917, + "reward_std": 0.2445407211780548, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.025166666135191917, + "step": 150 + }, + { + "completion_length": 191.1666717529297, + "epoch": 0.06720071206052515, + "grad_norm": 0.7541393041610718, + "kl": 0.008166976273059845, + "learning_rate": 3.3555555555555557e-06, + "loss": 0.0003, + "reward": -0.26350000500679016, + "reward_std": 0.3062899112701416, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.26350000500679016, + "step": 151 + }, + { + "completion_length": 138.5, + "epoch": 0.06764574988874054, + "grad_norm": 1.3180961608886719, + "kl": 0.0019082196522504091, + "learning_rate": 3.377777777777778e-06, + "loss": 0.0001, + "reward": -0.19033333659172058, + "reward_std": 0.2720842957496643, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.19033333659172058, + "step": 152 + }, + { + "completion_length": 200.0, + "epoch": 0.06809078771695594, + "grad_norm": 0.014968675561249256, + "kl": 0.0043890452943742275, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 153 + }, + { + "completion_length": 200.0, + "epoch": 0.06853582554517133, + "grad_norm": 1.0916509628295898, + "kl": 0.006353219039738178, + "learning_rate": 3.4222222222222224e-06, + "loss": 0.0003, + "reward": -0.014833331108093262, + "reward_std": 0.4068416953086853, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.014833331108093262, + "step": 154 + }, + { + "completion_length": 200.0, + "epoch": 0.06898086337338674, + "grad_norm": 0.7793182730674744, + "kl": 0.00863957405090332, + "learning_rate": 3.444444444444445e-06, + "loss": 0.0003, + "reward": -0.10183333605527878, + "reward_std": 0.4037016034126282, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.10183333605527878, + "step": 155 + }, + { + "completion_length": 200.0, + "epoch": 0.06942590120160214, + "grad_norm": 0.012789091095328331, + "kl": 0.0035248759668320417, + "learning_rate": 3.4666666666666672e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 156 + }, + { + "completion_length": 199.83334350585938, + "epoch": 0.06987093902981753, + "grad_norm": 0.8552259802818298, + "kl": 0.007797297090291977, + "learning_rate": 3.4888888888888896e-06, + "loss": 0.0003, + "reward": -0.32883334159851074, + "reward_std": 0.3524216115474701, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.32883334159851074, + "step": 157 + }, + { + "completion_length": 200.0, + "epoch": 0.07031597685803294, + "grad_norm": 0.7439639568328857, + "kl": 0.008432665839791298, + "learning_rate": 3.511111111111111e-06, + "loss": 0.0003, + "reward": -0.3968333601951599, + "reward_std": 0.25783050060272217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3968333601951599, + "step": 158 + }, + { + "completion_length": 200.0, + "epoch": 0.07076101468624833, + "grad_norm": 0.8293086290359497, + "kl": 0.015361580066382885, + "learning_rate": 3.5333333333333335e-06, + "loss": 0.0006, + "reward": -0.09950000047683716, + "reward_std": 0.34847769141197205, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09950000047683716, + "step": 159 + }, + { + "completion_length": 200.0, + "epoch": 0.07120605251446373, + "grad_norm": 0.02794010564684868, + "kl": 0.011896876618266106, + "learning_rate": 3.555555555555556e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 160 + }, + { + "completion_length": 168.5, + "epoch": 0.07165109034267912, + "grad_norm": 1.1416176557540894, + "kl": 0.01758456416428089, + "learning_rate": 3.577777777777778e-06, + "loss": 0.0007, + "reward": -0.21533334255218506, + "reward_std": 0.36748045682907104, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.21533334255218506, + "step": 161 + }, + { + "completion_length": 200.0, + "epoch": 0.07209612817089453, + "grad_norm": 0.8033695220947266, + "kl": 0.016339905560016632, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0007, + "reward": 0.022333335131406784, + "reward_std": 0.25148093700408936, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.022333335131406784, + "step": 162 + }, + { + "completion_length": 200.0, + "epoch": 0.07254116599910992, + "grad_norm": 0.013050896115601063, + "kl": 0.005038695875555277, + "learning_rate": 3.6222222222222226e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 163 + }, + { + "completion_length": 200.0, + "epoch": 0.07298620382732532, + "grad_norm": 0.8717074990272522, + "kl": 0.01940556988120079, + "learning_rate": 3.644444444444445e-06, + "loss": 0.0008, + "reward": -0.44333335757255554, + "reward_std": 0.27983972430229187, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.44333332777023315, + "step": 164 + }, + { + "completion_length": 200.0, + "epoch": 0.07343124165554073, + "grad_norm": 0.7769482135772705, + "kl": 0.008692685514688492, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.0003, + "reward": -0.1525000035762787, + "reward_std": 0.4397725462913513, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1525000035762787, + "step": 165 + }, + { + "completion_length": 200.0, + "epoch": 0.07387627948375612, + "grad_norm": 0.009532845579087734, + "kl": 0.003257167525589466, + "learning_rate": 3.688888888888889e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 166 + }, + { + "completion_length": 200.0, + "epoch": 0.07432131731197152, + "grad_norm": 0.766445517539978, + "kl": 0.01774817332625389, + "learning_rate": 3.7111111111111113e-06, + "loss": 0.0007, + "reward": 0.01966666243970394, + "reward_std": 0.25801295042037964, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.01966666243970394, + "step": 167 + }, + { + "completion_length": 192.83334350585938, + "epoch": 0.07476635514018691, + "grad_norm": 0.7344871759414673, + "kl": 0.019676849246025085, + "learning_rate": 3.7333333333333337e-06, + "loss": 0.0008, + "reward": -0.3711666762828827, + "reward_std": 0.27725324034690857, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.3711666762828827, + "step": 168 + }, + { + "completion_length": 200.0, + "epoch": 0.07521139296840232, + "grad_norm": 0.708225429058075, + "kl": 0.01603546366095543, + "learning_rate": 3.7555555555555557e-06, + "loss": 0.0006, + "reward": -0.007833331823348999, + "reward_std": 0.32537388801574707, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.007833331823348999, + "step": 169 + }, + { + "completion_length": 200.0, + "epoch": 0.07565643079661771, + "grad_norm": 0.012713441625237465, + "kl": 0.005252152215689421, + "learning_rate": 3.777777777777778e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 170 + }, + { + "completion_length": 200.0, + "epoch": 0.07610146862483311, + "grad_norm": 0.013440362177789211, + "kl": 0.006324666552245617, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 171 + }, + { + "completion_length": 200.0, + "epoch": 0.07654650645304852, + "grad_norm": 0.018537871539592743, + "kl": 0.0071646831929683685, + "learning_rate": 3.8222222222222224e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 172 + }, + { + "completion_length": 200.0, + "epoch": 0.07699154428126391, + "grad_norm": 0.01842048391699791, + "kl": 0.008270082995295525, + "learning_rate": 3.844444444444445e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 173 + }, + { + "completion_length": 200.0, + "epoch": 0.0774365821094793, + "grad_norm": 0.7180718779563904, + "kl": 0.023133087903261185, + "learning_rate": 3.866666666666667e-06, + "loss": 0.0009, + "reward": -0.32500001788139343, + "reward_std": 0.35587525367736816, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.32500001788139343, + "step": 174 + }, + { + "completion_length": 200.0, + "epoch": 0.0778816199376947, + "grad_norm": 0.8401187062263489, + "kl": 0.013387969695031643, + "learning_rate": 3.88888888888889e-06, + "loss": 0.0005, + "reward": 0.029499998316168785, + "reward_std": 0.2993685007095337, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.029499998316168785, + "step": 175 + }, + { + "completion_length": 200.0, + "epoch": 0.07832665776591011, + "grad_norm": 0.0299467034637928, + "kl": 0.015433305874466896, + "learning_rate": 3.911111111111112e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 176 + }, + { + "completion_length": 200.0, + "epoch": 0.0787716955941255, + "grad_norm": 0.01781369000673294, + "kl": 0.00665412237867713, + "learning_rate": 3.9333333333333335e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 177 + }, + { + "completion_length": 200.0, + "epoch": 0.0792167334223409, + "grad_norm": 0.029804501682519913, + "kl": 0.015158621594309807, + "learning_rate": 3.955555555555556e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 178 + }, + { + "completion_length": 200.0, + "epoch": 0.0796617712505563, + "grad_norm": 0.020057376474142075, + "kl": 0.005600334610790014, + "learning_rate": 3.977777777777778e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 179 + }, + { + "completion_length": 200.0, + "epoch": 0.0801068090787717, + "grad_norm": 0.6821267008781433, + "kl": 0.01735800690948963, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0007, + "reward": -0.2288333624601364, + "reward_std": 0.3883763253688812, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.2288333624601364, + "step": 180 + }, + { + "completion_length": 200.0, + "epoch": 0.08055184690698709, + "grad_norm": 0.8316364884376526, + "kl": 0.03854802995920181, + "learning_rate": 4.022222222222222e-06, + "loss": 0.0015, + "reward": -0.11349999904632568, + "reward_std": 0.37036940455436707, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.11349999904632568, + "step": 181 + }, + { + "completion_length": 200.0, + "epoch": 0.08099688473520249, + "grad_norm": 0.8583576679229736, + "kl": 0.014207671396434307, + "learning_rate": 4.044444444444445e-06, + "loss": 0.0006, + "reward": 0.002499997615814209, + "reward_std": 0.30006250739097595, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.002499997615814209, + "step": 182 + }, + { + "completion_length": 200.0, + "epoch": 0.0814419225634179, + "grad_norm": 0.0153994495049119, + "kl": 0.007102414965629578, + "learning_rate": 4.066666666666667e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 183 + }, + { + "completion_length": 200.0, + "epoch": 0.08188696039163329, + "grad_norm": 0.8327325582504272, + "kl": 0.022632954642176628, + "learning_rate": 4.088888888888889e-06, + "loss": 0.0009, + "reward": 0.014666667208075523, + "reward_std": 0.27026039361953735, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.014666667208075523, + "step": 184 + }, + { + "completion_length": 200.0, + "epoch": 0.08233199821984868, + "grad_norm": 0.010787020437419415, + "kl": 0.004412154667079449, + "learning_rate": 4.111111111111111e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 185 + }, + { + "completion_length": 200.0, + "epoch": 0.08277703604806408, + "grad_norm": 0.029312826693058014, + "kl": 0.01270313374698162, + "learning_rate": 4.133333333333333e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 186 + }, + { + "completion_length": 175.1666717529297, + "epoch": 0.08322207387627949, + "grad_norm": 0.8411778211593628, + "kl": 0.028145212680101395, + "learning_rate": 4.155555555555556e-06, + "loss": 0.0011, + "reward": -0.01066666841506958, + "reward_std": 0.17476919293403625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.01066666841506958, + "step": 187 + }, + { + "completion_length": 200.0, + "epoch": 0.08366711170449488, + "grad_norm": 0.023354342207312584, + "kl": 0.00851635355502367, + "learning_rate": 4.177777777777778e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 188 + }, + { + "completion_length": 192.5, + "epoch": 0.08411214953271028, + "grad_norm": 0.8554210662841797, + "kl": 0.01744459569454193, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0007, + "reward": -0.04233333468437195, + "reward_std": 0.2719578444957733, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.04233333468437195, + "step": 189 + }, + { + "completion_length": 177.33334350585938, + "epoch": 0.08455718736092568, + "grad_norm": 0.8024124503135681, + "kl": 0.015208413824439049, + "learning_rate": 4.222222222222223e-06, + "loss": 0.0006, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 190 + }, + { + "completion_length": 200.0, + "epoch": 0.08500222518914108, + "grad_norm": 0.6993169188499451, + "kl": 0.014287407509982586, + "learning_rate": 4.244444444444445e-06, + "loss": 0.0006, + "reward": -0.1301666796207428, + "reward_std": 0.39577287435531616, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1301666796207428, + "step": 191 + }, + { + "completion_length": 200.0, + "epoch": 0.08544726301735647, + "grad_norm": 0.6678306460380554, + "kl": 0.013930333778262138, + "learning_rate": 4.266666666666668e-06, + "loss": 0.0006, + "reward": 0.043666668236255646, + "reward_std": 0.26521819829940796, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.043666668236255646, + "step": 192 + }, + { + "completion_length": 200.0, + "epoch": 0.08589230084557187, + "grad_norm": 0.028331147506833076, + "kl": 0.017971333116292953, + "learning_rate": 4.288888888888889e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 193 + }, + { + "completion_length": 200.0, + "epoch": 0.08633733867378728, + "grad_norm": 0.027453351765871048, + "kl": 0.009431993588805199, + "learning_rate": 4.3111111111111115e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 194 + }, + { + "completion_length": 200.0, + "epoch": 0.08678237650200267, + "grad_norm": 0.773445725440979, + "kl": 0.023339729756116867, + "learning_rate": 4.333333333333334e-06, + "loss": 0.0009, + "reward": -0.10633333772420883, + "reward_std": 0.35838064551353455, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.10633333772420883, + "step": 195 + }, + { + "completion_length": 200.0, + "epoch": 0.08722741433021806, + "grad_norm": 0.014650012366473675, + "kl": 0.007890871725976467, + "learning_rate": 4.3555555555555555e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 196 + }, + { + "completion_length": 200.0, + "epoch": 0.08767245215843347, + "grad_norm": 0.025273794308304787, + "kl": 0.01200440526008606, + "learning_rate": 4.377777777777778e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 197 + }, + { + "completion_length": 192.33334350585938, + "epoch": 0.08811748998664887, + "grad_norm": 0.7459567785263062, + "kl": 0.01451034378260374, + "learning_rate": 4.4e-06, + "loss": 0.0006, + "reward": -0.26500001549720764, + "reward_std": 0.33120569586753845, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.26500001549720764, + "step": 198 + }, + { + "completion_length": 200.0, + "epoch": 0.08856252781486426, + "grad_norm": 0.010312313213944435, + "kl": 0.0038611218333244324, + "learning_rate": 4.422222222222223e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 199 + }, + { + "completion_length": 200.0, + "epoch": 0.08900756564307966, + "grad_norm": 0.6166565418243408, + "kl": 0.009738167747855186, + "learning_rate": 4.444444444444444e-06, + "loss": 0.0004, + "reward": 0.0035000047646462917, + "reward_std": 0.2976129949092865, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0035000047646462917, + "step": 200 + }, + { + "completion_length": 200.0, + "epoch": 0.08945260347129506, + "grad_norm": 0.008906065486371517, + "kl": 0.0037507950328290462, + "learning_rate": 4.4666666666666665e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 201 + }, + { + "completion_length": 198.1666717529297, + "epoch": 0.08989764129951046, + "grad_norm": 0.9748014807701111, + "kl": 0.009199577383697033, + "learning_rate": 4.488888888888889e-06, + "loss": 0.0004, + "reward": 0.04983333498239517, + "reward_std": 0.18411998450756073, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.04983333498239517, + "step": 202 + }, + { + "completion_length": 200.0, + "epoch": 0.09034267912772585, + "grad_norm": 0.04424808546900749, + "kl": 0.01891401596367359, + "learning_rate": 4.511111111111111e-06, + "loss": 0.0008, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 203 + }, + { + "completion_length": 200.0, + "epoch": 0.09078771695594126, + "grad_norm": 0.008351677097380161, + "kl": 0.003573081223294139, + "learning_rate": 4.533333333333334e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 204 + }, + { + "completion_length": 200.0, + "epoch": 0.09123275478415666, + "grad_norm": 0.7445893883705139, + "kl": 0.014485888183116913, + "learning_rate": 4.555555555555556e-06, + "loss": 0.0006, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 205 + }, + { + "completion_length": 200.0, + "epoch": 0.09167779261237205, + "grad_norm": 0.6814647316932678, + "kl": 0.017133373767137527, + "learning_rate": 4.5777777777777785e-06, + "loss": 0.0007, + "reward": -0.08250000327825546, + "reward_std": 0.37661266326904297, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.08250000327825546, + "step": 206 + }, + { + "completion_length": 200.0, + "epoch": 0.09212283044058744, + "grad_norm": 0.013401404023170471, + "kl": 0.0045086476020514965, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 207 + }, + { + "completion_length": 200.0, + "epoch": 0.09256786826880285, + "grad_norm": 0.015258646570146084, + "kl": 0.005985723342746496, + "learning_rate": 4.622222222222222e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 208 + }, + { + "completion_length": 200.0, + "epoch": 0.09301290609701825, + "grad_norm": 0.017956389114260674, + "kl": 0.0076329647563397884, + "learning_rate": 4.644444444444445e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 209 + }, + { + "completion_length": 200.0, + "epoch": 0.09345794392523364, + "grad_norm": 0.6878367066383362, + "kl": 0.00822862982749939, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0003, + "reward": 0.025333335623145103, + "reward_std": 0.24413248896598816, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.025333335623145103, + "step": 210 + }, + { + "completion_length": 185.6666717529297, + "epoch": 0.09390298175344905, + "grad_norm": 0.7083204388618469, + "kl": 0.008970928378403187, + "learning_rate": 4.6888888888888895e-06, + "loss": 0.0004, + "reward": -0.10883334279060364, + "reward_std": 0.31478020548820496, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.10883334279060364, + "step": 211 + }, + { + "completion_length": 200.0, + "epoch": 0.09434801958166444, + "grad_norm": 0.010259282775223255, + "kl": 0.005621683783829212, + "learning_rate": 4.711111111111111e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 212 + }, + { + "completion_length": 200.0, + "epoch": 0.09479305740987984, + "grad_norm": 0.63719642162323, + "kl": 0.017996463924646378, + "learning_rate": 4.7333333333333335e-06, + "loss": 0.0007, + "reward": -0.2148333489894867, + "reward_std": 0.37843701243400574, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.2148333489894867, + "step": 213 + }, + { + "completion_length": 200.0, + "epoch": 0.09523809523809523, + "grad_norm": 0.7535067796707153, + "kl": 0.009188219904899597, + "learning_rate": 4.755555555555556e-06, + "loss": 0.0004, + "reward": 0.025833334773778915, + "reward_std": 0.24290774762630463, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.025833334773778915, + "step": 214 + }, + { + "completion_length": 200.0, + "epoch": 0.09568313306631064, + "grad_norm": 0.03754749149084091, + "kl": 0.023878587409853935, + "learning_rate": 4.777777777777778e-06, + "loss": 0.001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 215 + }, + { + "completion_length": 195.1666717529297, + "epoch": 0.09612817089452604, + "grad_norm": 0.7495411038398743, + "kl": 0.020265337079763412, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0008, + "reward": -0.1301666796207428, + "reward_std": 0.2998015582561493, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1301666796207428, + "step": 216 + }, + { + "completion_length": 200.0, + "epoch": 0.09657320872274143, + "grad_norm": 0.8328439593315125, + "kl": 0.012733696028590202, + "learning_rate": 4.822222222222222e-06, + "loss": 0.0005, + "reward": -0.10983332991600037, + "reward_std": 0.36434730887413025, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.10983332991600037, + "step": 217 + }, + { + "completion_length": 200.0, + "epoch": 0.09701824655095684, + "grad_norm": 0.014833502471446991, + "kl": 0.005706641357392073, + "learning_rate": 4.8444444444444446e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 218 + }, + { + "completion_length": 200.0, + "epoch": 0.09746328437917223, + "grad_norm": 0.01487264595925808, + "kl": 0.007111959159374237, + "learning_rate": 4.866666666666667e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 219 + }, + { + "completion_length": 191.5, + "epoch": 0.09790832220738763, + "grad_norm": 0.6546903252601624, + "kl": 0.03114478290081024, + "learning_rate": 4.888888888888889e-06, + "loss": 0.0012, + "reward": -0.02316666767001152, + "reward_std": 0.3049527406692505, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.02316666767001152, + "step": 220 + }, + { + "completion_length": 200.0, + "epoch": 0.09835336003560302, + "grad_norm": 0.012690722942352295, + "kl": 0.004840874578803778, + "learning_rate": 4.911111111111112e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 221 + }, + { + "completion_length": 200.0, + "epoch": 0.09879839786381843, + "grad_norm": 0.8420057892799377, + "kl": 0.01995580643415451, + "learning_rate": 4.933333333333334e-06, + "loss": 0.0008, + "reward": -0.11483334004878998, + "reward_std": 0.37201637029647827, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.11483334004878998, + "step": 222 + }, + { + "completion_length": 162.6666717529297, + "epoch": 0.09924343569203382, + "grad_norm": 0.7063876390457153, + "kl": 0.018741153180599213, + "learning_rate": 4.9555555555555565e-06, + "loss": 0.0007, + "reward": -0.15850001573562622, + "reward_std": 0.2223670333623886, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.15850001573562622, + "step": 223 + }, + { + "completion_length": 142.1666717529297, + "epoch": 0.09968847352024922, + "grad_norm": 1.3013415336608887, + "kl": 0.0186506025493145, + "learning_rate": 4.977777777777778e-06, + "loss": 0.0007, + "reward": 0.08866667002439499, + "reward_std": 0.11559354513883591, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.08866667002439499, + "step": 224 + }, + { + "completion_length": 200.0, + "epoch": 0.10013351134846461, + "grad_norm": 0.8148112297058105, + "kl": 0.014559872448444366, + "learning_rate": 5e-06, + "loss": 0.0006, + "reward": 0.011333337053656578, + "reward_std": 0.27842533588409424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.011333337053656578, + "step": 225 + }, + { + "completion_length": 200.0, + "epoch": 0.10057854917668002, + "grad_norm": 0.7223935127258301, + "kl": 0.01846488006412983, + "learning_rate": 4.999996982499377e-06, + "loss": 0.0007, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 226 + }, + { + "completion_length": 176.83334350585938, + "epoch": 0.10102358700489542, + "grad_norm": 0.855940580368042, + "kl": 0.023898562416434288, + "learning_rate": 4.9999879300047904e-06, + "loss": 0.001, + "reward": -0.06350000202655792, + "reward_std": 0.24804334342479706, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.06350000202655792, + "step": 227 + }, + { + "completion_length": 178.1666717529297, + "epoch": 0.10146862483311081, + "grad_norm": 1.3559764623641968, + "kl": 0.012699100188910961, + "learning_rate": 4.999972842538094e-06, + "loss": 0.0005, + "reward": 0.0833333358168602, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 228 + }, + { + "completion_length": 200.0, + "epoch": 0.10191366266132622, + "grad_norm": 0.11357201635837555, + "kl": 0.02485606074333191, + "learning_rate": 4.999951720135707e-06, + "loss": 0.001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 229 + }, + { + "completion_length": 193.5, + "epoch": 0.10235870048954161, + "grad_norm": 1.0450315475463867, + "kl": 0.022413700819015503, + "learning_rate": 4.999924562848623e-06, + "loss": 0.0009, + "reward": -0.27666670083999634, + "reward_std": 0.32778817415237427, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.27666670083999634, + "step": 230 + }, + { + "completion_length": 200.0, + "epoch": 0.102803738317757, + "grad_norm": 0.022118445485830307, + "kl": 0.011586411856114864, + "learning_rate": 4.999891370742395e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 231 + }, + { + "completion_length": 200.0, + "epoch": 0.1032487761459724, + "grad_norm": 0.032451968640089035, + "kl": 0.01942615956068039, + "learning_rate": 4.999852143897152e-06, + "loss": 0.0008, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 232 + }, + { + "completion_length": 187.33334350585938, + "epoch": 0.10369381397418781, + "grad_norm": 1.0973916053771973, + "kl": 0.028928395360708237, + "learning_rate": 4.999806882407586e-06, + "loss": 0.0012, + "reward": 0.09416666626930237, + "reward_std": 0.07552593946456909, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.09416666626930237, + "step": 233 + }, + { + "completion_length": 200.0, + "epoch": 0.1041388518024032, + "grad_norm": 0.8492021560668945, + "kl": 0.020361032336950302, + "learning_rate": 4.9997555863829584e-06, + "loss": 0.0008, + "reward": -0.0728333443403244, + "reward_std": 0.3074478507041931, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.0728333443403244, + "step": 234 + }, + { + "completion_length": 200.0, + "epoch": 0.1045838896306186, + "grad_norm": 0.7689980268478394, + "kl": 0.010467594489455223, + "learning_rate": 4.999698255947099e-06, + "loss": 0.0004, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 235 + }, + { + "completion_length": 200.0, + "epoch": 0.105028927458834, + "grad_norm": 0.022514864802360535, + "kl": 0.016714762896299362, + "learning_rate": 4.9996348912384025e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 236 + }, + { + "completion_length": 200.0, + "epoch": 0.1054739652870494, + "grad_norm": 0.7497798800468445, + "kl": 0.035260215401649475, + "learning_rate": 4.999565492409831e-06, + "loss": 0.0014, + "reward": -0.07899999618530273, + "reward_std": 0.3160455822944641, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.07899999618530273, + "step": 237 + }, + { + "completion_length": 180.33334350585938, + "epoch": 0.1059190031152648, + "grad_norm": 1.1552839279174805, + "kl": 0.020859047770500183, + "learning_rate": 4.999490059628914e-06, + "loss": 0.0008, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 238 + }, + { + "completion_length": 200.0, + "epoch": 0.10636404094348019, + "grad_norm": 0.014450768008828163, + "kl": 0.00946769304573536, + "learning_rate": 4.999408593077747e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 239 + }, + { + "completion_length": 200.0, + "epoch": 0.1068090787716956, + "grad_norm": 0.6490687131881714, + "kl": 0.013119819574058056, + "learning_rate": 4.999321092952989e-06, + "loss": 0.0005, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 240 + }, + { + "completion_length": 198.1666717529297, + "epoch": 0.10725411659991099, + "grad_norm": 0.8787567615509033, + "kl": 0.027725404128432274, + "learning_rate": 4.999227559465865e-06, + "loss": 0.0011, + "reward": -0.060833342373371124, + "reward_std": 0.2964027523994446, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.060833342373371124, + "step": 241 + }, + { + "completion_length": 200.0, + "epoch": 0.10769915442812639, + "grad_norm": 0.6240781545639038, + "kl": 0.011513415724039078, + "learning_rate": 4.999127992842167e-06, + "loss": 0.0005, + "reward": 0.019833337515592575, + "reward_std": 0.2576046586036682, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.019833337515592575, + "step": 242 + }, + { + "completion_length": 200.0, + "epoch": 0.1081441922563418, + "grad_norm": 0.021726641803979874, + "kl": 0.011281192302703857, + "learning_rate": 4.999022393322246e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 243 + }, + { + "completion_length": 200.0, + "epoch": 0.10858923008455719, + "grad_norm": 0.01203103642910719, + "kl": 0.007502372842282057, + "learning_rate": 4.998910761161022e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 244 + }, + { + "completion_length": 200.0, + "epoch": 0.10903426791277258, + "grad_norm": 0.03352230042219162, + "kl": 0.015283550135791302, + "learning_rate": 4.998793096627973e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 245 + }, + { + "completion_length": 180.0, + "epoch": 0.10947930574098798, + "grad_norm": 0.6931204199790955, + "kl": 0.025455031543970108, + "learning_rate": 4.998669400007142e-06, + "loss": 0.001, + "reward": -0.0663333386182785, + "reward_std": 0.25824224948883057, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.0663333386182785, + "step": 246 + }, + { + "completion_length": 179.33334350585938, + "epoch": 0.10992434356920339, + "grad_norm": 0.7052227258682251, + "kl": 0.01645379140973091, + "learning_rate": 4.998539671597134e-06, + "loss": 0.0007, + "reward": -0.1290000081062317, + "reward_std": 0.243107408285141, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1290000081062317, + "step": 247 + }, + { + "completion_length": 200.0, + "epoch": 0.11036938139741878, + "grad_norm": 0.6620118021965027, + "kl": 0.01992572657763958, + "learning_rate": 4.998403911711112e-06, + "loss": 0.0008, + "reward": 0.02500000223517418, + "reward_std": 0.24494896829128265, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02500000223517418, + "step": 248 + }, + { + "completion_length": 200.0, + "epoch": 0.11081441922563418, + "grad_norm": 0.012199520133435726, + "kl": 0.01109787356108427, + "learning_rate": 4.9982621206768e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 249 + }, + { + "completion_length": 200.0, + "epoch": 0.11125945705384958, + "grad_norm": 0.6646403074264526, + "kl": 0.017368197441101074, + "learning_rate": 4.998114298836483e-06, + "loss": 0.0007, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 250 + }, + { + "completion_length": 200.0, + "epoch": 0.11170449488206498, + "grad_norm": 0.64476478099823, + "kl": 0.018091298639774323, + "learning_rate": 4.997960446547002e-06, + "loss": 0.0007, + "reward": 0.007833331823348999, + "reward_std": 0.28699856996536255, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.007833331823348999, + "step": 251 + }, + { + "completion_length": 200.0, + "epoch": 0.11214953271028037, + "grad_norm": 0.01579858362674713, + "kl": 0.009715499356389046, + "learning_rate": 4.997800564179758e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 252 + }, + { + "completion_length": 200.0, + "epoch": 0.11259457053849577, + "grad_norm": 0.7200601696968079, + "kl": 0.02384258806705475, + "learning_rate": 4.997634652120704e-06, + "loss": 0.001, + "reward": 0.016166668385267258, + "reward_std": 0.2665861248970032, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.016166668385267258, + "step": 253 + }, + { + "completion_length": 193.33334350585938, + "epoch": 0.11303960836671118, + "grad_norm": 0.7668142318725586, + "kl": 0.029656527563929558, + "learning_rate": 4.997462710770356e-06, + "loss": 0.0012, + "reward": -0.13450001180171967, + "reward_std": 0.29981911182403564, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.13450001180171967, + "step": 254 + }, + { + "completion_length": 200.0, + "epoch": 0.11348464619492657, + "grad_norm": 0.02032412961125374, + "kl": 0.014308085665106773, + "learning_rate": 4.997284740543776e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 255 + }, + { + "completion_length": 198.1666717529297, + "epoch": 0.11392968402314196, + "grad_norm": 0.025216558948159218, + "kl": 0.022948317229747772, + "learning_rate": 4.997100741870587e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 256 + }, + { + "completion_length": 200.0, + "epoch": 0.11437472185135737, + "grad_norm": 0.0239882729947567, + "kl": 0.01751275360584259, + "learning_rate": 4.996910715194963e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 257 + }, + { + "completion_length": 200.0, + "epoch": 0.11481975967957277, + "grad_norm": 0.028994852676987648, + "kl": 0.01953146606683731, + "learning_rate": 4.9967146609756254e-06, + "loss": 0.0008, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 258 + }, + { + "completion_length": 200.0, + "epoch": 0.11526479750778816, + "grad_norm": 0.7920787334442139, + "kl": 0.024792861193418503, + "learning_rate": 4.996512579685851e-06, + "loss": 0.001, + "reward": 0.034333329647779465, + "reward_std": 0.28770241141319275, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.034333329647779465, + "step": 259 + }, + { + "completion_length": 200.0, + "epoch": 0.11570983533600356, + "grad_norm": 0.027261994779109955, + "kl": 0.01596745476126671, + "learning_rate": 4.996304471813464e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 260 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.11615487316421896, + "grad_norm": 0.7512991428375244, + "kl": 0.020810922607779503, + "learning_rate": 4.996090337860836e-06, + "loss": 0.0008, + "reward": 0.03583333641290665, + "reward_std": 0.218412846326828, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.03583333641290665, + "step": 261 + }, + { + "completion_length": 200.0, + "epoch": 0.11659991099243436, + "grad_norm": 0.01981634460389614, + "kl": 0.02145499363541603, + "learning_rate": 4.995870178344888e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 262 + }, + { + "completion_length": 200.0, + "epoch": 0.11704494882064975, + "grad_norm": 0.015385876409709454, + "kl": 0.010222600772976875, + "learning_rate": 4.995643993797084e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 263 + }, + { + "completion_length": 200.0, + "epoch": 0.11748998664886515, + "grad_norm": 0.010453257709741592, + "kl": 0.004751277156174183, + "learning_rate": 4.995411784763434e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 264 + }, + { + "completion_length": 200.0, + "epoch": 0.11793502447708056, + "grad_norm": 0.02353510819375515, + "kl": 0.01597435772418976, + "learning_rate": 4.995173551804491e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 265 + }, + { + "completion_length": 200.0, + "epoch": 0.11838006230529595, + "grad_norm": 0.6973554491996765, + "kl": 0.013702675700187683, + "learning_rate": 4.9949292954953486e-06, + "loss": 0.0005, + "reward": 0.020166665315628052, + "reward_std": 0.2567881941795349, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.020166665315628052, + "step": 266 + }, + { + "completion_length": 200.0, + "epoch": 0.11882510013351134, + "grad_norm": 0.7307512760162354, + "kl": 0.02334902063012123, + "learning_rate": 4.994679016425642e-06, + "loss": 0.0009, + "reward": 0.007999996654689312, + "reward_std": 0.2865903079509735, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.007999996654689312, + "step": 267 + }, + { + "completion_length": 196.5, + "epoch": 0.11927013796172675, + "grad_norm": 0.7821613550186157, + "kl": 0.023126548156142235, + "learning_rate": 4.994422715199546e-06, + "loss": 0.0009, + "reward": -0.35100001096725464, + "reward_std": 0.24427853524684906, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.35100001096725464, + "step": 268 + }, + { + "completion_length": 200.0, + "epoch": 0.11971517578994215, + "grad_norm": 0.014924556948244572, + "kl": 0.010245325975120068, + "learning_rate": 4.99416039243577e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 269 + }, + { + "completion_length": 200.0, + "epoch": 0.12016021361815754, + "grad_norm": 0.011302834376692772, + "kl": 0.0074605681002140045, + "learning_rate": 4.993892048767563e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 270 + }, + { + "completion_length": 196.83334350585938, + "epoch": 0.12060525144637294, + "grad_norm": 0.7507510185241699, + "kl": 0.01992712914943695, + "learning_rate": 4.993617684842707e-06, + "loss": 0.0008, + "reward": 0.0560000017285347, + "reward_std": 0.1690147966146469, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0560000017285347, + "step": 271 + }, + { + "completion_length": 186.33334350585938, + "epoch": 0.12105028927458834, + "grad_norm": 0.642690896987915, + "kl": 0.03827090188860893, + "learning_rate": 4.9933373013235156e-06, + "loss": 0.0015, + "reward": -0.01666666567325592, + "reward_std": 0.2657710909843445, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.01666666567325592, + "step": 272 + }, + { + "completion_length": 200.0, + "epoch": 0.12149532710280374, + "grad_norm": 0.016800519078969955, + "kl": 0.007222745567560196, + "learning_rate": 4.993050898886833e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 273 + }, + { + "completion_length": 200.0, + "epoch": 0.12194036493101913, + "grad_norm": 0.758976936340332, + "kl": 0.018932368606328964, + "learning_rate": 4.992758478224039e-06, + "loss": 0.0008, + "reward": -0.24283334612846375, + "reward_std": 0.41019290685653687, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.24283334612846375, + "step": 274 + }, + { + "completion_length": 200.0, + "epoch": 0.12238540275923454, + "grad_norm": 0.020172713324427605, + "kl": 0.012656650505959988, + "learning_rate": 4.992460040041034e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 275 + }, + { + "completion_length": 200.0, + "epoch": 0.12283044058744993, + "grad_norm": 0.01721290498971939, + "kl": 0.008657376281917095, + "learning_rate": 4.992155585058248e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 276 + }, + { + "completion_length": 186.83334350585938, + "epoch": 0.12327547841566533, + "grad_norm": 0.6902337074279785, + "kl": 0.06522956490516663, + "learning_rate": 4.991845114010638e-06, + "loss": 0.0026, + "reward": -0.09683333337306976, + "reward_std": 0.308468759059906, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09683333337306976, + "step": 277 + }, + { + "completion_length": 200.0, + "epoch": 0.12372051624388072, + "grad_norm": 0.013619703240692616, + "kl": 0.009151134639978409, + "learning_rate": 4.99152862764768e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 278 + }, + { + "completion_length": 200.0, + "epoch": 0.12416555407209613, + "grad_norm": 0.01365516148507595, + "kl": 0.005762772168964148, + "learning_rate": 4.99120612673337e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 279 + }, + { + "completion_length": 200.0, + "epoch": 0.12461059190031153, + "grad_norm": 0.03844517469406128, + "kl": 0.009146707132458687, + "learning_rate": 4.990877612046228e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 280 + }, + { + "completion_length": 200.0, + "epoch": 0.12505562972852693, + "grad_norm": 0.010963845066726208, + "kl": 0.004802503623068333, + "learning_rate": 4.9905430843792886e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 281 + }, + { + "completion_length": 200.0, + "epoch": 0.12550066755674233, + "grad_norm": 0.018975911661982536, + "kl": 0.00843195803463459, + "learning_rate": 4.9902025445401e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 282 + }, + { + "completion_length": 191.33334350585938, + "epoch": 0.12594570538495772, + "grad_norm": 0.9152283072471619, + "kl": 0.01359584927558899, + "learning_rate": 4.989855993350728e-06, + "loss": 0.0005, + "reward": 0.08433333039283752, + "reward_std": 0.09961257874965668, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.08433333039283752, + "step": 283 + }, + { + "completion_length": 199.33334350585938, + "epoch": 0.12639074321317312, + "grad_norm": 0.6327641606330872, + "kl": 0.011363311670720577, + "learning_rate": 4.989503431647744e-06, + "loss": 0.0005, + "reward": 0.05283333361148834, + "reward_std": 0.17677150666713715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.05283333361148834, + "step": 284 + }, + { + "completion_length": 200.0, + "epoch": 0.1268357810413885, + "grad_norm": 0.015086417086422443, + "kl": 0.006470312364399433, + "learning_rate": 4.9891448602822355e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 285 + }, + { + "completion_length": 200.0, + "epoch": 0.1272808188696039, + "grad_norm": 0.6536492705345154, + "kl": 0.015819404274225235, + "learning_rate": 4.988780280119792e-06, + "loss": 0.0006, + "reward": -0.09083332866430283, + "reward_std": 0.33507877588272095, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09083332866430283, + "step": 286 + }, + { + "completion_length": 200.0, + "epoch": 0.1277258566978193, + "grad_norm": 0.008064402267336845, + "kl": 0.002744730096310377, + "learning_rate": 4.988409692040511e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 287 + }, + { + "completion_length": 180.83334350585938, + "epoch": 0.12817089452603472, + "grad_norm": 0.819420337677002, + "kl": 0.06618692725896835, + "learning_rate": 4.988033096938991e-06, + "loss": 0.0026, + "reward": 0.021166665479540825, + "reward_std": 0.24596862494945526, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.021166665479540825, + "step": 288 + }, + { + "completion_length": 200.0, + "epoch": 0.12861593235425012, + "grad_norm": 0.017804304137825966, + "kl": 0.0107121542096138, + "learning_rate": 4.9876504957243345e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 289 + }, + { + "completion_length": 200.0, + "epoch": 0.1290609701824655, + "grad_norm": 0.009461048990488052, + "kl": 0.005491574760526419, + "learning_rate": 4.987261889320141e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 290 + }, + { + "completion_length": 200.0, + "epoch": 0.1295060080106809, + "grad_norm": 0.020793907344341278, + "kl": 0.006585664115846157, + "learning_rate": 4.986867278664505e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 291 + }, + { + "completion_length": 200.0, + "epoch": 0.1299510458388963, + "grad_norm": 0.01623818278312683, + "kl": 0.014776560477912426, + "learning_rate": 4.9864666647100176e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 292 + }, + { + "completion_length": 200.0, + "epoch": 0.1303960836671117, + "grad_norm": 0.015048404224216938, + "kl": 0.006943022832274437, + "learning_rate": 4.986060048423761e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 293 + }, + { + "completion_length": 200.0, + "epoch": 0.1308411214953271, + "grad_norm": 0.10332320630550385, + "kl": 0.018736328929662704, + "learning_rate": 4.985647430787308e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 294 + }, + { + "completion_length": 200.0, + "epoch": 0.1312861593235425, + "grad_norm": 0.7041745185852051, + "kl": 0.013492944650352001, + "learning_rate": 4.985228812796717e-06, + "loss": 0.0005, + "reward": 0.0403333380818367, + "reward_std": 0.20739012956619263, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0403333380818367, + "step": 295 + }, + { + "completion_length": 180.33334350585938, + "epoch": 0.1317311971517579, + "grad_norm": 1.2074172496795654, + "kl": 0.017207711935043335, + "learning_rate": 4.984804195462532e-06, + "loss": 0.0007, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 296 + }, + { + "completion_length": 200.0, + "epoch": 0.1321762349799733, + "grad_norm": 0.007344384212046862, + "kl": 0.0033924030140042305, + "learning_rate": 4.984373579809778e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 297 + }, + { + "completion_length": 200.0, + "epoch": 0.1326212728081887, + "grad_norm": 0.6581267714500427, + "kl": 0.004817100241780281, + "learning_rate": 4.983936966877964e-06, + "loss": 0.0002, + "reward": 0.02033333107829094, + "reward_std": 0.2563799321651459, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02033333107829094, + "step": 298 + }, + { + "completion_length": 200.0, + "epoch": 0.1330663106364041, + "grad_norm": 0.6918825507164001, + "kl": 0.008517519570887089, + "learning_rate": 4.983494357721074e-06, + "loss": 0.0003, + "reward": -0.029333334416151047, + "reward_std": 0.3780378997325897, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.029333334416151047, + "step": 299 + }, + { + "completion_length": 200.0, + "epoch": 0.13351134846461948, + "grad_norm": 0.6355785727500916, + "kl": 0.007732154801487923, + "learning_rate": 4.983045753407564e-06, + "loss": 0.0003, + "reward": -0.12200000137090683, + "reward_std": 0.3381112515926361, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.12200000137090683, + "step": 300 + }, + { + "completion_length": 200.0, + "epoch": 0.13395638629283488, + "grad_norm": 0.00788823515176773, + "kl": 0.004913420882076025, + "learning_rate": 4.982591155020367e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 301 + }, + { + "completion_length": 200.0, + "epoch": 0.1344014241210503, + "grad_norm": 0.019300375133752823, + "kl": 0.0115005848929286, + "learning_rate": 4.9821305636568835e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 302 + }, + { + "completion_length": 200.0, + "epoch": 0.1348464619492657, + "grad_norm": 0.5956396460533142, + "kl": 0.014617225155234337, + "learning_rate": 4.981663980428981e-06, + "loss": 0.0006, + "reward": 0.019499998539686203, + "reward_std": 0.2584211826324463, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.019499998539686203, + "step": 303 + }, + { + "completion_length": 200.0, + "epoch": 0.1352914997774811, + "grad_norm": 0.013532106764614582, + "kl": 0.007755351718515158, + "learning_rate": 4.981191406462991e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 304 + }, + { + "completion_length": 200.0, + "epoch": 0.13573653760569648, + "grad_norm": 0.6766030788421631, + "kl": 0.019166380167007446, + "learning_rate": 4.9807128428997085e-06, + "loss": 0.0008, + "reward": 0.020166665315628052, + "reward_std": 0.2567881941795349, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.020166665315628052, + "step": 305 + }, + { + "completion_length": 200.0, + "epoch": 0.13618157543391188, + "grad_norm": 0.0559423454105854, + "kl": 0.023686787113547325, + "learning_rate": 4.980228290894386e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 306 + }, + { + "completion_length": 200.0, + "epoch": 0.13662661326212727, + "grad_norm": 0.03602616861462593, + "kl": 0.021832283586263657, + "learning_rate": 4.979737751616732e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 307 + }, + { + "completion_length": 200.0, + "epoch": 0.13707165109034267, + "grad_norm": 1.0004466772079468, + "kl": 0.01577741652727127, + "learning_rate": 4.979241226250908e-06, + "loss": 0.0006, + "reward": 0.010333329439163208, + "reward_std": 0.2808748185634613, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.010333329439163208, + "step": 308 + }, + { + "completion_length": 200.0, + "epoch": 0.1375166889185581, + "grad_norm": 0.01729634776711464, + "kl": 0.007570373825728893, + "learning_rate": 4.9787387159955265e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 309 + }, + { + "completion_length": 200.0, + "epoch": 0.13796172674677348, + "grad_norm": 0.0166346225887537, + "kl": 0.007312558591365814, + "learning_rate": 4.978230222063649e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 310 + }, + { + "completion_length": 200.0, + "epoch": 0.13840676457498888, + "grad_norm": 0.018261654302477837, + "kl": 0.009910644963383675, + "learning_rate": 4.9777157456827785e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 311 + }, + { + "completion_length": 200.0, + "epoch": 0.13885180240320427, + "grad_norm": 0.02585846185684204, + "kl": 0.011723216623067856, + "learning_rate": 4.977195288094863e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 312 + }, + { + "completion_length": 200.0, + "epoch": 0.13929684023141967, + "grad_norm": 0.02551482431590557, + "kl": 0.011289785616099834, + "learning_rate": 4.976668850556284e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 313 + }, + { + "completion_length": 200.0, + "epoch": 0.13974187805963506, + "grad_norm": 0.022060496732592583, + "kl": 0.01366241555660963, + "learning_rate": 4.976136434337866e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 314 + }, + { + "completion_length": 200.0, + "epoch": 0.14018691588785046, + "grad_norm": 0.5639723539352417, + "kl": 0.006856884807348251, + "learning_rate": 4.97559804072486e-06, + "loss": 0.0003, + "reward": 0.022499999031424522, + "reward_std": 0.2510727047920227, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.022499999031424522, + "step": 315 + }, + { + "completion_length": 196.0, + "epoch": 0.14063195371606588, + "grad_norm": 0.8882031440734863, + "kl": 0.012949245050549507, + "learning_rate": 4.9750536710169485e-06, + "loss": 0.0005, + "reward": 0.061000000685453415, + "reward_std": 0.1567673534154892, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.061000000685453415, + "step": 316 + }, + { + "completion_length": 200.0, + "epoch": 0.14107699154428127, + "grad_norm": 0.738691508769989, + "kl": 0.018215883523225784, + "learning_rate": 4.97450332652824e-06, + "loss": 0.0007, + "reward": -0.19816666841506958, + "reward_std": 0.35409173369407654, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.19816666841506958, + "step": 317 + }, + { + "completion_length": 200.0, + "epoch": 0.14152202937249667, + "grad_norm": 0.008259186521172523, + "kl": 0.004001545254141092, + "learning_rate": 4.973947008587268e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 318 + }, + { + "completion_length": 183.0, + "epoch": 0.14196706720071206, + "grad_norm": 0.8659139275550842, + "kl": 0.02181245945394039, + "learning_rate": 4.973384718536982e-06, + "loss": 0.0009, + "reward": 0.03533333167433739, + "reward_std": 0.18040475249290466, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.03533333167433739, + "step": 319 + }, + { + "completion_length": 200.0, + "epoch": 0.14241210502892745, + "grad_norm": 0.013459905050694942, + "kl": 0.008506972342729568, + "learning_rate": 4.972816457734752e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 320 + }, + { + "completion_length": 200.0, + "epoch": 0.14285714285714285, + "grad_norm": 0.6455613970756531, + "kl": 0.04512510821223259, + "learning_rate": 4.972242227552358e-06, + "loss": 0.0018, + "reward": 0.008666664361953735, + "reward_std": 0.2849573493003845, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.008666664361953735, + "step": 321 + }, + { + "completion_length": 182.33334350585938, + "epoch": 0.14330218068535824, + "grad_norm": 0.7501981854438782, + "kl": 0.01761593297123909, + "learning_rate": 4.971662029375995e-06, + "loss": 0.0007, + "reward": -0.17916667461395264, + "reward_std": 0.331107497215271, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.17916667461395264, + "step": 322 + }, + { + "completion_length": 200.0, + "epoch": 0.14374721851357367, + "grad_norm": 0.008645527996122837, + "kl": 0.003186706220731139, + "learning_rate": 4.97107586460626e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 323 + }, + { + "completion_length": 175.83334350585938, + "epoch": 0.14419225634178906, + "grad_norm": 0.7112619876861572, + "kl": 0.030575290322303772, + "learning_rate": 4.970483734658154e-06, + "loss": 0.0012, + "reward": 0.08583333343267441, + "reward_std": 0.12314936518669128, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.08583333343267441, + "step": 324 + }, + { + "completion_length": 200.0, + "epoch": 0.14463729417000445, + "grad_norm": 0.035192981362342834, + "kl": 0.008340008556842804, + "learning_rate": 4.969885640961081e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 325 + }, + { + "completion_length": 190.0, + "epoch": 0.14508233199821985, + "grad_norm": 0.7466467618942261, + "kl": 0.0285557322204113, + "learning_rate": 4.969281584958838e-06, + "loss": 0.0011, + "reward": 0.015166670083999634, + "reward_std": 0.2690356373786926, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.015166670083999634, + "step": 326 + }, + { + "completion_length": 191.1666717529297, + "epoch": 0.14552736982643524, + "grad_norm": 0.802043616771698, + "kl": 0.025279924273490906, + "learning_rate": 4.968671568109617e-06, + "loss": 0.001, + "reward": 0.0898333415389061, + "reward_std": 0.21741704642772675, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0898333415389061, + "step": 327 + }, + { + "completion_length": 167.5, + "epoch": 0.14597240765465064, + "grad_norm": 0.812122642993927, + "kl": 0.01720350608229637, + "learning_rate": 4.968055591885999e-06, + "loss": 0.0007, + "reward": -0.06016666814684868, + "reward_std": 0.24665558338165283, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.06016666814684868, + "step": 328 + }, + { + "completion_length": 200.0, + "epoch": 0.14641744548286603, + "grad_norm": 0.7395609021186829, + "kl": 0.05103464424610138, + "learning_rate": 4.967433657774952e-06, + "loss": 0.002, + "reward": -0.07100000232458115, + "reward_std": 0.3038104772567749, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.07100000232458115, + "step": 329 + }, + { + "completion_length": 179.83334350585938, + "epoch": 0.14686248331108145, + "grad_norm": 0.7787045836448669, + "kl": 0.04071980342268944, + "learning_rate": 4.9668057672778225e-06, + "loss": 0.0016, + "reward": 0.047833334654569626, + "reward_std": 0.21287593245506287, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.047833334654569626, + "step": 330 + }, + { + "completion_length": 200.0, + "epoch": 0.14730752113929685, + "grad_norm": 0.6824626326560974, + "kl": 0.008245850913226604, + "learning_rate": 4.966171921910341e-06, + "loss": 0.0003, + "reward": -0.09216667711734772, + "reward_std": 0.34250280261039734, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09216667711734772, + "step": 331 + }, + { + "completion_length": 200.0, + "epoch": 0.14775255896751224, + "grad_norm": 0.01006829272955656, + "kl": 0.004723408259451389, + "learning_rate": 4.96553212320261e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 332 + }, + { + "completion_length": 200.0, + "epoch": 0.14819759679572764, + "grad_norm": 0.7323021292686462, + "kl": 0.010274862870573997, + "learning_rate": 4.9648863726991035e-06, + "loss": 0.0004, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 333 + }, + { + "completion_length": 200.0, + "epoch": 0.14864263462394303, + "grad_norm": 0.024399923160672188, + "kl": 0.013128578662872314, + "learning_rate": 4.964234671958663e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 334 + }, + { + "completion_length": 200.0, + "epoch": 0.14908767245215843, + "grad_norm": 0.010077173821628094, + "kl": 0.004746645223349333, + "learning_rate": 4.963577022554496e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 335 + }, + { + "completion_length": 200.0, + "epoch": 0.14953271028037382, + "grad_norm": 0.011270418763160706, + "kl": 0.005329379811882973, + "learning_rate": 4.962913426074166e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 336 + }, + { + "completion_length": 200.0, + "epoch": 0.14997774810858924, + "grad_norm": 0.012212223373353481, + "kl": 0.006592839956283569, + "learning_rate": 4.9622438841195986e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 337 + }, + { + "completion_length": 200.0, + "epoch": 0.15042278593680464, + "grad_norm": 0.007356896065175533, + "kl": 0.003410342149436474, + "learning_rate": 4.961568398307065e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 338 + }, + { + "completion_length": 199.5, + "epoch": 0.15086782376502003, + "grad_norm": 0.6488639116287231, + "kl": 0.01651693880558014, + "learning_rate": 4.960886970267191e-06, + "loss": 0.0007, + "reward": 0.05366666615009308, + "reward_std": 0.17473027110099792, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.05366666615009308, + "step": 339 + }, + { + "completion_length": 200.0, + "epoch": 0.15131286159323543, + "grad_norm": 0.024112718179821968, + "kl": 0.018834218382835388, + "learning_rate": 4.960199601644943e-06, + "loss": 0.0008, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 340 + }, + { + "completion_length": 200.0, + "epoch": 0.15175789942145082, + "grad_norm": 0.013589066453278065, + "kl": 0.01101887971162796, + "learning_rate": 4.959506294099629e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 341 + }, + { + "completion_length": 200.0, + "epoch": 0.15220293724966621, + "grad_norm": 0.010311473160982132, + "kl": 0.007442638278007507, + "learning_rate": 4.958807049304893e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 342 + }, + { + "completion_length": 171.1666717529297, + "epoch": 0.1526479750778816, + "grad_norm": 5.103739261627197, + "kl": 0.3760009706020355, + "learning_rate": 4.958101868948715e-06, + "loss": 0.015, + "reward": 0.1758333444595337, + "reward_std": 0.12451573461294174, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1758333444595337, + "step": 343 + }, + { + "completion_length": 200.0, + "epoch": 0.15309301290609703, + "grad_norm": 0.013698318973183632, + "kl": 0.011166570708155632, + "learning_rate": 4.957390754733398e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 344 + }, + { + "completion_length": 200.0, + "epoch": 0.15353805073431243, + "grad_norm": 0.014360117726027966, + "kl": 0.010611571371555328, + "learning_rate": 4.956673708375574e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 345 + }, + { + "completion_length": 189.5, + "epoch": 0.15398308856252782, + "grad_norm": 0.7445936799049377, + "kl": 0.028817251324653625, + "learning_rate": 4.955950731606192e-06, + "loss": 0.0012, + "reward": 0.07383333891630173, + "reward_std": 0.12533222138881683, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.07383333891630173, + "step": 346 + }, + { + "completion_length": 184.0, + "epoch": 0.15442812639074321, + "grad_norm": 0.7008059024810791, + "kl": 0.05760132521390915, + "learning_rate": 4.9552218261705185e-06, + "loss": 0.0023, + "reward": -0.22599999606609344, + "reward_std": 0.35765790939331055, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.22599999606609344, + "step": 347 + }, + { + "completion_length": 200.0, + "epoch": 0.1548731642189586, + "grad_norm": 0.6954376697540283, + "kl": 0.007197665050625801, + "learning_rate": 4.954486993828132e-06, + "loss": 0.0003, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 348 + }, + { + "completion_length": 200.0, + "epoch": 0.155318202047174, + "grad_norm": 0.009056415408849716, + "kl": 0.007077607326209545, + "learning_rate": 4.953746236352917e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 349 + }, + { + "completion_length": 200.0, + "epoch": 0.1557632398753894, + "grad_norm": 0.01317316759377718, + "kl": 0.01229158602654934, + "learning_rate": 4.952999555533065e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 350 + }, + { + "completion_length": 195.5, + "epoch": 0.15620827770360482, + "grad_norm": 0.6882670521736145, + "kl": 0.024929411709308624, + "learning_rate": 4.952246953171062e-06, + "loss": 0.001, + "reward": -0.04200000315904617, + "reward_std": 0.33165282011032104, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.04200000315904617, + "step": 351 + }, + { + "completion_length": 195.6666717529297, + "epoch": 0.15665331553182021, + "grad_norm": 0.676509439945221, + "kl": 0.04691646993160248, + "learning_rate": 4.951488431083689e-06, + "loss": 0.0019, + "reward": -0.13633333146572113, + "reward_std": 0.29477566480636597, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.13633333146572113, + "step": 352 + }, + { + "completion_length": 200.0, + "epoch": 0.1570983533600356, + "grad_norm": 0.6416396498680115, + "kl": 0.01036627497524023, + "learning_rate": 4.950723991102022e-06, + "loss": 0.0004, + "reward": -0.014666667208075523, + "reward_std": 0.34211206436157227, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.014666667208075523, + "step": 353 + }, + { + "completion_length": 200.0, + "epoch": 0.157543391188251, + "grad_norm": 0.6665915846824646, + "kl": 0.03278065845370293, + "learning_rate": 4.949953635071417e-06, + "loss": 0.0013, + "reward": -0.08783333748579025, + "reward_std": 0.3303600549697876, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.08783333748579025, + "step": 354 + }, + { + "completion_length": 191.5, + "epoch": 0.1579884290164664, + "grad_norm": 0.9799753427505493, + "kl": 0.01656440459191799, + "learning_rate": 4.949177364851515e-06, + "loss": 0.0007, + "reward": 0.0429999977350235, + "reward_std": 0.20085816085338593, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0429999977350235, + "step": 355 + }, + { + "completion_length": 200.0, + "epoch": 0.1584334668446818, + "grad_norm": 0.016740066930651665, + "kl": 0.01629803143441677, + "learning_rate": 4.9483951823162326e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 356 + }, + { + "completion_length": 200.0, + "epoch": 0.1588785046728972, + "grad_norm": 0.014137927442789078, + "kl": 0.007187969516962767, + "learning_rate": 4.947607089353758e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 357 + }, + { + "completion_length": 200.0, + "epoch": 0.1593235425011126, + "grad_norm": 0.6014176607131958, + "kl": 0.018046831712126732, + "learning_rate": 4.946813087866549e-06, + "loss": 0.0007, + "reward": 0.007333338260650635, + "reward_std": 0.2882232666015625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.007333338260650635, + "step": 358 + }, + { + "completion_length": 200.0, + "epoch": 0.159768580329328, + "grad_norm": 0.013204401358962059, + "kl": 0.013151012361049652, + "learning_rate": 4.946013179771325e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 359 + }, + { + "completion_length": 199.5, + "epoch": 0.1602136181575434, + "grad_norm": 0.6359446048736572, + "kl": 0.02717038244009018, + "learning_rate": 4.9452073669990656e-06, + "loss": 0.0011, + "reward": 0.03733333572745323, + "reward_std": 0.2147386074066162, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.03733333572745323, + "step": 360 + }, + { + "completion_length": 200.0, + "epoch": 0.1606586559857588, + "grad_norm": 0.013449462130665779, + "kl": 0.006884987931698561, + "learning_rate": 4.944395651495002e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 361 + }, + { + "completion_length": 187.6666717529297, + "epoch": 0.16110369381397419, + "grad_norm": 0.818776547908783, + "kl": 0.0218803733587265, + "learning_rate": 4.9435780352186154e-06, + "loss": 0.0009, + "reward": -0.00916666816920042, + "reward_std": 0.21735171973705292, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.009166665375232697, + "step": 362 + }, + { + "completion_length": 200.0, + "epoch": 0.16154873164218958, + "grad_norm": 0.013731640763580799, + "kl": 0.0065148696303367615, + "learning_rate": 4.942754520143634e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 363 + }, + { + "completion_length": 200.0, + "epoch": 0.16199376947040497, + "grad_norm": 0.009971830062568188, + "kl": 0.012283856980502605, + "learning_rate": 4.9419251082580216e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 364 + }, + { + "completion_length": 200.0, + "epoch": 0.16243880729862037, + "grad_norm": 0.6981242895126343, + "kl": 0.022827019914984703, + "learning_rate": 4.94108980156398e-06, + "loss": 0.0009, + "reward": -0.02850000187754631, + "reward_std": 0.3187059760093689, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.02850000187754631, + "step": 365 + }, + { + "completion_length": 186.1666717529297, + "epoch": 0.1628838451268358, + "grad_norm": 0.7663524746894836, + "kl": 0.02265150099992752, + "learning_rate": 4.940248602077939e-06, + "loss": 0.0009, + "reward": 0.006000000052154064, + "reward_std": 0.2016005963087082, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.006000000052154064, + "step": 366 + }, + { + "completion_length": 200.0, + "epoch": 0.16332888295505119, + "grad_norm": 0.01284782588481903, + "kl": 0.009670613333582878, + "learning_rate": 4.939401511830556e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 367 + }, + { + "completion_length": 200.0, + "epoch": 0.16377392078326658, + "grad_norm": 0.01885703206062317, + "kl": 0.009269410744309425, + "learning_rate": 4.938548532866706e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 368 + }, + { + "completion_length": 200.0, + "epoch": 0.16421895861148197, + "grad_norm": 0.6328880190849304, + "kl": 0.01867607608437538, + "learning_rate": 4.937689667245481e-06, + "loss": 0.0007, + "reward": 0.021500002592802048, + "reward_std": 0.2535221576690674, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.021500002592802048, + "step": 369 + }, + { + "completion_length": 200.0, + "epoch": 0.16466399643969737, + "grad_norm": 0.010342692025005817, + "kl": 0.0059143840335309505, + "learning_rate": 4.936824917040184e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 370 + }, + { + "completion_length": 200.0, + "epoch": 0.16510903426791276, + "grad_norm": 0.6799391508102417, + "kl": 0.01873181015253067, + "learning_rate": 4.935954284338321e-06, + "loss": 0.0007, + "reward": 0.005833338014781475, + "reward_std": 0.2918975353240967, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.005833338014781475, + "step": 371 + }, + { + "completion_length": 200.0, + "epoch": 0.16555407209612816, + "grad_norm": 0.017115725204348564, + "kl": 0.010264448821544647, + "learning_rate": 4.9350777712415995e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 372 + }, + { + "completion_length": 200.0, + "epoch": 0.16599910992434358, + "grad_norm": 0.010852769017219543, + "kl": 0.007528107613325119, + "learning_rate": 4.934195379865925e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 373 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.16644414775255897, + "grad_norm": 1.29231595993042, + "kl": 0.18531188368797302, + "learning_rate": 4.933307112341388e-06, + "loss": 0.0074, + "reward": 0.1628333330154419, + "reward_std": 0.13582256436347961, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1628333330154419, + "step": 374 + }, + { + "completion_length": 200.0, + "epoch": 0.16688918558077437, + "grad_norm": 0.009757625870406628, + "kl": 0.01133672520518303, + "learning_rate": 4.932412970812269e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 375 + }, + { + "completion_length": 199.5, + "epoch": 0.16733422340898976, + "grad_norm": 0.01278277114033699, + "kl": 0.009305099956691265, + "learning_rate": 4.931512957437024e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 376 + }, + { + "completion_length": 200.0, + "epoch": 0.16777926123720516, + "grad_norm": 0.013093134388327599, + "kl": 0.007520940154790878, + "learning_rate": 4.930607074388287e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 377 + }, + { + "completion_length": 200.0, + "epoch": 0.16822429906542055, + "grad_norm": 0.011228191666305065, + "kl": 0.008690441027283669, + "learning_rate": 4.92969532385286e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 378 + }, + { + "completion_length": 185.5, + "epoch": 0.16866933689363595, + "grad_norm": 0.7297086119651794, + "kl": 0.026256537064909935, + "learning_rate": 4.928777708031709e-06, + "loss": 0.0011, + "reward": 0.016166668385267258, + "reward_std": 0.16865399479866028, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.016166668385267258, + "step": 379 + }, + { + "completion_length": 200.0, + "epoch": 0.16911437472185137, + "grad_norm": 0.015303281135857105, + "kl": 0.01288022380322218, + "learning_rate": 4.927854229139959e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 380 + }, + { + "completion_length": 200.0, + "epoch": 0.16955941255006676, + "grad_norm": 0.011454589664936066, + "kl": 0.006882285233587027, + "learning_rate": 4.9269248894068886e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 381 + }, + { + "completion_length": 200.0, + "epoch": 0.17000445037828216, + "grad_norm": 0.014687180519104004, + "kl": 0.014141609892249107, + "learning_rate": 4.9259896910759246e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 382 + }, + { + "completion_length": 200.0, + "epoch": 0.17044948820649755, + "grad_norm": 0.016531087458133698, + "kl": 0.00648513063788414, + "learning_rate": 4.925048636404635e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 383 + }, + { + "completion_length": 200.0, + "epoch": 0.17089452603471295, + "grad_norm": 0.00957447849214077, + "kl": 0.0061697340570390224, + "learning_rate": 4.9241017276647295e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 384 + }, + { + "completion_length": 200.0, + "epoch": 0.17133956386292834, + "grad_norm": 0.01072862558066845, + "kl": 0.005429576151072979, + "learning_rate": 4.923148967142043e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 385 + }, + { + "completion_length": 200.0, + "epoch": 0.17178460169114373, + "grad_norm": 0.024660624563694, + "kl": 0.009238356724381447, + "learning_rate": 4.9221903571365406e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 386 + }, + { + "completion_length": 200.0, + "epoch": 0.17222963951935916, + "grad_norm": 0.7495405673980713, + "kl": 0.0185337346047163, + "learning_rate": 4.921225899962308e-06, + "loss": 0.0007, + "reward": -0.13200001418590546, + "reward_std": 0.3997148871421814, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.13200001418590546, + "step": 387 + }, + { + "completion_length": 200.0, + "epoch": 0.17267467734757455, + "grad_norm": 0.010481936857104301, + "kl": 0.00567130371928215, + "learning_rate": 4.920255597947545e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 388 + }, + { + "completion_length": 200.0, + "epoch": 0.17311971517578995, + "grad_norm": 0.007845093496143818, + "kl": 0.003956751897931099, + "learning_rate": 4.919279453434561e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 389 + }, + { + "completion_length": 181.0, + "epoch": 0.17356475300400534, + "grad_norm": 0.6615269184112549, + "kl": 0.030248617753386497, + "learning_rate": 4.918297468779771e-06, + "loss": 0.0012, + "reward": -0.13499999046325684, + "reward_std": 0.35328683257102966, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.13499999046325684, + "step": 390 + }, + { + "completion_length": 200.0, + "epoch": 0.17400979083222073, + "grad_norm": 0.691724419593811, + "kl": 0.02659853920340538, + "learning_rate": 4.917309646353682e-06, + "loss": 0.0011, + "reward": -0.09699999541044235, + "reward_std": 0.34400463104248047, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09699999541044235, + "step": 391 + }, + { + "completion_length": 184.5, + "epoch": 0.17445482866043613, + "grad_norm": 0.6306662559509277, + "kl": 0.0601261667907238, + "learning_rate": 4.916315988540903e-06, + "loss": 0.0024, + "reward": -0.09049999713897705, + "reward_std": 0.25501197576522827, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09049999713897705, + "step": 392 + }, + { + "completion_length": 200.0, + "epoch": 0.17489986648865152, + "grad_norm": 0.012944119051098824, + "kl": 0.01151751633733511, + "learning_rate": 4.9153164977401215e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 393 + }, + { + "completion_length": 200.0, + "epoch": 0.17534490431686695, + "grad_norm": 0.7263959646224976, + "kl": 0.011329833418130875, + "learning_rate": 4.914311176364109e-06, + "loss": 0.0005, + "reward": 0.021500002592802048, + "reward_std": 0.2535221576690674, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.021500002592802048, + "step": 394 + }, + { + "completion_length": 200.0, + "epoch": 0.17578994214508234, + "grad_norm": 0.00732870027422905, + "kl": 0.004891596734523773, + "learning_rate": 4.913300026839714e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 395 + }, + { + "completion_length": 200.0, + "epoch": 0.17623497997329773, + "grad_norm": 0.7324749827384949, + "kl": 0.01928497850894928, + "learning_rate": 4.912283051607849e-06, + "loss": 0.0008, + "reward": 0.02199999988079071, + "reward_std": 0.25229746103286743, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02199999988079071, + "step": 396 + }, + { + "completion_length": 200.0, + "epoch": 0.17668001780151313, + "grad_norm": 0.8718001842498779, + "kl": 0.13316845893859863, + "learning_rate": 4.911260253123494e-06, + "loss": 0.0053, + "reward": -0.1536666750907898, + "reward_std": 0.30690693855285645, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1536666750907898, + "step": 397 + }, + { + "completion_length": 200.0, + "epoch": 0.17712505562972852, + "grad_norm": 0.7277849912643433, + "kl": 0.01658281497657299, + "learning_rate": 4.9102316338556844e-06, + "loss": 0.0007, + "reward": -0.0003333290515001863, + "reward_std": 0.3070027232170105, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.0003333290515001863, + "step": 398 + }, + { + "completion_length": 200.0, + "epoch": 0.17757009345794392, + "grad_norm": 0.6445873975753784, + "kl": 0.02959112823009491, + "learning_rate": 4.909197196287509e-06, + "loss": 0.0012, + "reward": -0.1274999976158142, + "reward_std": 0.44477805495262146, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1274999976158142, + "step": 399 + }, + { + "completion_length": 172.33334350585938, + "epoch": 0.1780151312861593, + "grad_norm": 0.7523168921470642, + "kl": 0.0322515144944191, + "learning_rate": 4.908156942916101e-06, + "loss": 0.0013, + "reward": -0.03199999779462814, + "reward_std": 0.2558077275753021, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.03199999779462814, + "step": 400 + }, + { + "completion_length": 167.0, + "epoch": 0.17846016911437473, + "grad_norm": 0.8686386942863464, + "kl": 0.056089848279953, + "learning_rate": 4.90711087625263e-06, + "loss": 0.0022, + "reward": 0.07116666436195374, + "reward_std": 0.14513224363327026, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.07116666436195374, + "step": 401 + }, + { + "completion_length": 200.0, + "epoch": 0.17890520694259013, + "grad_norm": 0.00890452042222023, + "kl": 0.005404898431152105, + "learning_rate": 4.906058998822303e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 402 + }, + { + "completion_length": 200.0, + "epoch": 0.17935024477080552, + "grad_norm": 0.9637120962142944, + "kl": 0.02167494222521782, + "learning_rate": 4.905001313164353e-06, + "loss": 0.0009, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 403 + }, + { + "completion_length": 200.0, + "epoch": 0.17979528259902092, + "grad_norm": 0.007997624576091766, + "kl": 0.004169671796262264, + "learning_rate": 4.9039378218320325e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 404 + }, + { + "completion_length": 200.0, + "epoch": 0.1802403204272363, + "grad_norm": 0.016796903684735298, + "kl": 0.011683585122227669, + "learning_rate": 4.902868527392612e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 405 + }, + { + "completion_length": 200.0, + "epoch": 0.1806853582554517, + "grad_norm": 0.015744149684906006, + "kl": 0.012497194111347198, + "learning_rate": 4.9017934324273655e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 406 + }, + { + "completion_length": 199.1666717529297, + "epoch": 0.1811303960836671, + "grad_norm": 0.5119321346282959, + "kl": 0.028500860556960106, + "learning_rate": 4.900712539531577e-06, + "loss": 0.0011, + "reward": 0.03933333605527878, + "reward_std": 0.2756495475769043, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.03933333605527878, + "step": 407 + }, + { + "completion_length": 193.6666717529297, + "epoch": 0.18157543391188252, + "grad_norm": 0.009107636287808418, + "kl": 0.009039022959768772, + "learning_rate": 4.89962585131452e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 408 + }, + { + "completion_length": 200.0, + "epoch": 0.18202047174009792, + "grad_norm": 0.013876304030418396, + "kl": 0.009923950769007206, + "learning_rate": 4.898533370399459e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 409 + }, + { + "completion_length": 200.0, + "epoch": 0.1824655095683133, + "grad_norm": 0.03001904860138893, + "kl": 0.01629229262471199, + "learning_rate": 4.897435099423647e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 410 + }, + { + "completion_length": 200.0, + "epoch": 0.1829105473965287, + "grad_norm": 0.020497044548392296, + "kl": 0.0096663823351264, + "learning_rate": 4.896331041038309e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 411 + }, + { + "completion_length": 200.0, + "epoch": 0.1833555852247441, + "grad_norm": 0.9498317241668701, + "kl": 0.01545047014951706, + "learning_rate": 4.895221197908643e-06, + "loss": 0.0006, + "reward": 0.001833329675719142, + "reward_std": 0.3016955256462097, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.001833329675719142, + "step": 412 + }, + { + "completion_length": 200.0, + "epoch": 0.1838006230529595, + "grad_norm": 0.006678743753582239, + "kl": 0.003321468597277999, + "learning_rate": 4.89410557271381e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 413 + }, + { + "completion_length": 200.0, + "epoch": 0.1842456608811749, + "grad_norm": 0.009199023246765137, + "kl": 0.005622576922178268, + "learning_rate": 4.8929841681469295e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 414 + }, + { + "completion_length": 200.0, + "epoch": 0.1846906987093903, + "grad_norm": 0.7114378213882446, + "kl": 0.02522313967347145, + "learning_rate": 4.891856986915073e-06, + "loss": 0.001, + "reward": -0.10500000417232513, + "reward_std": 0.31319066882133484, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.10500000417232513, + "step": 415 + }, + { + "completion_length": 200.0, + "epoch": 0.1851357365376057, + "grad_norm": 0.011489290744066238, + "kl": 0.008540185168385506, + "learning_rate": 4.8907240317392565e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 416 + }, + { + "completion_length": 200.0, + "epoch": 0.1855807743658211, + "grad_norm": 0.6549782156944275, + "kl": 0.023001134395599365, + "learning_rate": 4.889585305354436e-06, + "loss": 0.0009, + "reward": 0.05899999663233757, + "reward_std": 0.30181917548179626, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.05899999663233757, + "step": 417 + }, + { + "completion_length": 200.0, + "epoch": 0.1860258121940365, + "grad_norm": 0.6743258833885193, + "kl": 0.014783745631575584, + "learning_rate": 4.888440810509496e-06, + "loss": 0.0006, + "reward": -0.007166664116084576, + "reward_std": 0.267223060131073, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.007166664116084576, + "step": 418 + }, + { + "completion_length": 200.0, + "epoch": 0.1864708500222519, + "grad_norm": 0.011519741266965866, + "kl": 0.006074823439121246, + "learning_rate": 4.887290549967247e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 419 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.18691588785046728, + "grad_norm": 0.8046799302101135, + "kl": 0.018539931625127792, + "learning_rate": 4.886134526504421e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.07905694097280502, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 420 + }, + { + "completion_length": 200.0, + "epoch": 0.18736092567868268, + "grad_norm": 0.02228482812643051, + "kl": 0.022595927119255066, + "learning_rate": 4.884972742911656e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 421 + }, + { + "completion_length": 170.0, + "epoch": 0.1878059635068981, + "grad_norm": 1.0265932083129883, + "kl": 0.03965630382299423, + "learning_rate": 4.8838052019935005e-06, + "loss": 0.0016, + "reward": -0.19983333349227905, + "reward_std": 0.36561259627342224, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.19983333349227905, + "step": 422 + }, + { + "completion_length": 200.0, + "epoch": 0.1882510013351135, + "grad_norm": 0.008268559351563454, + "kl": 0.006413072347640991, + "learning_rate": 4.882631906568398e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 423 + }, + { + "completion_length": 200.0, + "epoch": 0.1886960391633289, + "grad_norm": 0.011764715425670147, + "kl": 0.00969620794057846, + "learning_rate": 4.881452859468685e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 424 + }, + { + "completion_length": 200.0, + "epoch": 0.18914107699154428, + "grad_norm": 0.013174543157219887, + "kl": 0.012654460966587067, + "learning_rate": 4.880268063540581e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 425 + }, + { + "completion_length": 200.0, + "epoch": 0.18958611481975968, + "grad_norm": 0.009335462003946304, + "kl": 0.011132560670375824, + "learning_rate": 4.8790775216441835e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 426 + }, + { + "completion_length": 141.1666717529297, + "epoch": 0.19003115264797507, + "grad_norm": 0.8435110449790955, + "kl": 0.023837899789214134, + "learning_rate": 4.877881236653463e-06, + "loss": 0.001, + "reward": -0.03983333706855774, + "reward_std": 0.24120646715164185, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.03983333706855774, + "step": 427 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.19047619047619047, + "grad_norm": 0.6499157547950745, + "kl": 0.01160873007029295, + "learning_rate": 4.8766792114562495e-06, + "loss": 0.0005, + "reward": -0.09033333510160446, + "reward_std": 0.351275771856308, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09033333510160446, + "step": 428 + }, + { + "completion_length": 197.83334350585938, + "epoch": 0.1909212283044059, + "grad_norm": 0.015310406684875488, + "kl": 0.010684727691113949, + "learning_rate": 4.875471448954234e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 429 + }, + { + "completion_length": 200.0, + "epoch": 0.19136626613262128, + "grad_norm": 0.6541863679885864, + "kl": 0.01754167675971985, + "learning_rate": 4.874257952062957e-06, + "loss": 0.0007, + "reward": 0.00533333420753479, + "reward_std": 0.2931222915649414, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.00533333420753479, + "step": 430 + }, + { + "completion_length": 200.0, + "epoch": 0.19181130396083668, + "grad_norm": 0.009647169150412083, + "kl": 0.005298088304698467, + "learning_rate": 4.873038723711798e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 431 + }, + { + "completion_length": 200.0, + "epoch": 0.19225634178905207, + "grad_norm": 0.010216983035206795, + "kl": 0.007101266644895077, + "learning_rate": 4.871813766843977e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 432 + }, + { + "completion_length": 200.0, + "epoch": 0.19270137961726747, + "grad_norm": 0.007988468743860722, + "kl": 0.003836569143459201, + "learning_rate": 4.870583084416539e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 433 + }, + { + "completion_length": 200.0, + "epoch": 0.19314641744548286, + "grad_norm": 0.7281708717346191, + "kl": 0.016352718695998192, + "learning_rate": 4.869346679400353e-06, + "loss": 0.0007, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 434 + }, + { + "completion_length": 200.0, + "epoch": 0.19359145527369825, + "grad_norm": 0.009215595200657845, + "kl": 0.006540496833622456, + "learning_rate": 4.868104554780101e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 435 + }, + { + "completion_length": 178.0, + "epoch": 0.19403649310191368, + "grad_norm": 0.6501026749610901, + "kl": 0.030550524592399597, + "learning_rate": 4.866856713554271e-06, + "loss": 0.0012, + "reward": 0.04383333772420883, + "reward_std": 0.15619271993637085, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.04383333772420883, + "step": 436 + }, + { + "completion_length": 200.0, + "epoch": 0.19448153093012907, + "grad_norm": 0.010590564459562302, + "kl": 0.006376064382493496, + "learning_rate": 4.865603158735155e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 437 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.19492656875834447, + "grad_norm": 0.5724078416824341, + "kl": 0.015208952128887177, + "learning_rate": 4.864343893348834e-06, + "loss": 0.0006, + "reward": 0.0690000057220459, + "reward_std": 0.20461182296276093, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0690000057220459, + "step": 438 + }, + { + "completion_length": 200.0, + "epoch": 0.19537160658655986, + "grad_norm": 0.5766968727111816, + "kl": 0.011872484348714352, + "learning_rate": 4.863078920435173e-06, + "loss": 0.0005, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 439 + }, + { + "completion_length": 200.0, + "epoch": 0.19581664441477525, + "grad_norm": 0.0435682088136673, + "kl": 0.017930179834365845, + "learning_rate": 4.861808243047822e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 440 + }, + { + "completion_length": 200.0, + "epoch": 0.19626168224299065, + "grad_norm": 0.012740753591060638, + "kl": 0.005490332376211882, + "learning_rate": 4.860531864254192e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 441 + }, + { + "completion_length": 200.0, + "epoch": 0.19670672007120604, + "grad_norm": 0.007875868119299412, + "kl": 0.003752867691218853, + "learning_rate": 4.8592497871354646e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 442 + }, + { + "completion_length": 200.0, + "epoch": 0.19715175789942144, + "grad_norm": 0.00968019850552082, + "kl": 0.0069991848431527615, + "learning_rate": 4.857962014786575e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 443 + }, + { + "completion_length": 200.0, + "epoch": 0.19759679572763686, + "grad_norm": 0.6327012777328491, + "kl": 0.013870742172002792, + "learning_rate": 4.856668550316203e-06, + "loss": 0.0006, + "reward": 0.008500000461935997, + "reward_std": 0.2853655517101288, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.008500000461935997, + "step": 444 + }, + { + "completion_length": 200.0, + "epoch": 0.19804183355585225, + "grad_norm": 0.6365529298782349, + "kl": 0.01527687069028616, + "learning_rate": 4.855369396846778e-06, + "loss": 0.0006, + "reward": 0.0004999985685572028, + "reward_std": 0.248800128698349, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0004999985685572028, + "step": 445 + }, + { + "completion_length": 200.0, + "epoch": 0.19848687138406765, + "grad_norm": 0.013388743624091148, + "kl": 0.0054934462532401085, + "learning_rate": 4.854064557514452e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 446 + }, + { + "completion_length": 147.33334350585938, + "epoch": 0.19893190921228304, + "grad_norm": 0.80791836977005, + "kl": 0.06670716404914856, + "learning_rate": 4.8527540354691095e-06, + "loss": 0.0027, + "reward": 0.14483334124088287, + "reward_std": 0.055607251822948456, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.14483334124088287, + "step": 447 + }, + { + "completion_length": 200.0, + "epoch": 0.19937694704049844, + "grad_norm": 0.008610348217189312, + "kl": 0.0053973570466041565, + "learning_rate": 4.8514378338743525e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 448 + }, + { + "completion_length": 200.0, + "epoch": 0.19982198486871383, + "grad_norm": 0.6462050676345825, + "kl": 0.022491535171866417, + "learning_rate": 4.850115955907491e-06, + "loss": 0.0009, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 449 + }, + { + "completion_length": 200.0, + "epoch": 0.20026702269692923, + "grad_norm": 0.013753926381468773, + "kl": 0.003668756689876318, + "learning_rate": 4.8487884047595395e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 450 + }, + { + "completion_length": 200.0, + "epoch": 0.20071206052514465, + "grad_norm": 0.01476313453167677, + "kl": 0.009049910120666027, + "learning_rate": 4.847455183635207e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 451 + }, + { + "completion_length": 194.5, + "epoch": 0.20115709835336004, + "grad_norm": 0.7963857650756836, + "kl": 0.024605944752693176, + "learning_rate": 4.846116295752891e-06, + "loss": 0.001, + "reward": 0.08583333343267441, + "reward_std": 0.1649368703365326, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.08583333343267441, + "step": 452 + }, + { + "completion_length": 197.33334350585938, + "epoch": 0.20160213618157544, + "grad_norm": 0.008257864974439144, + "kl": 0.00925783533602953, + "learning_rate": 4.844771744344666e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 453 + }, + { + "completion_length": 169.5, + "epoch": 0.20204717400979083, + "grad_norm": 0.7012379169464111, + "kl": 0.032544177025556564, + "learning_rate": 4.843421532656281e-06, + "loss": 0.0013, + "reward": 0.028833335265517235, + "reward_std": 0.24847166240215302, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.028833335265517235, + "step": 454 + }, + { + "completion_length": 133.5, + "epoch": 0.20249221183800623, + "grad_norm": 1.1525821685791016, + "kl": 0.030363596975803375, + "learning_rate": 4.8420656639471466e-06, + "loss": 0.0012, + "reward": -0.10766666382551193, + "reward_std": 0.2986413836479187, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.10766667127609253, + "step": 455 + }, + { + "completion_length": 189.1666717529297, + "epoch": 0.20293724966622162, + "grad_norm": 0.01882368139922619, + "kl": 0.018681103363633156, + "learning_rate": 4.84070414149033e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 456 + }, + { + "completion_length": 192.33334350585938, + "epoch": 0.20338228749443701, + "grad_norm": 0.907418429851532, + "kl": 0.016113460063934326, + "learning_rate": 4.83933696857255e-06, + "loss": 0.0006, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 457 + }, + { + "completion_length": 200.0, + "epoch": 0.20382732532265244, + "grad_norm": 0.02320541813969612, + "kl": 0.012556570582091808, + "learning_rate": 4.83796414849416e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 458 + }, + { + "completion_length": 189.1666717529297, + "epoch": 0.20427236315086783, + "grad_norm": 0.8501640558242798, + "kl": 0.023711485788226128, + "learning_rate": 4.836585684569148e-06, + "loss": 0.0009, + "reward": 0.05000000447034836, + "reward_std": 0.18371173739433289, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.05000000447034836, + "step": 459 + }, + { + "completion_length": 200.0, + "epoch": 0.20471740097908322, + "grad_norm": 0.0207956675440073, + "kl": 0.008212253451347351, + "learning_rate": 4.83520158012513e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 460 + }, + { + "completion_length": 199.33334350585938, + "epoch": 0.20516243880729862, + "grad_norm": 0.01620439812541008, + "kl": 0.011511600576341152, + "learning_rate": 4.833811838503331e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 461 + }, + { + "completion_length": 200.0, + "epoch": 0.205607476635514, + "grad_norm": 0.8750971555709839, + "kl": 0.007969997823238373, + "learning_rate": 4.83241646305859e-06, + "loss": 0.0003, + "reward": -0.0003333290515001863, + "reward_std": 0.3070027232170105, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.0003333290515001863, + "step": 462 + }, + { + "completion_length": 200.0, + "epoch": 0.2060525144637294, + "grad_norm": 0.012503387406468391, + "kl": 0.005554807838052511, + "learning_rate": 4.8310154571593435e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 463 + }, + { + "completion_length": 200.0, + "epoch": 0.2064975522919448, + "grad_norm": 0.01813841424882412, + "kl": 0.00543005159124732, + "learning_rate": 4.829608824187621e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 464 + }, + { + "completion_length": 200.0, + "epoch": 0.20694259012016022, + "grad_norm": 0.007825582288205624, + "kl": 0.007653496228158474, + "learning_rate": 4.828196567539034e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 465 + }, + { + "completion_length": 200.0, + "epoch": 0.20738762794837562, + "grad_norm": 0.03445260971784592, + "kl": 0.011151362210512161, + "learning_rate": 4.826778690622772e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 466 + }, + { + "completion_length": 192.6666717529297, + "epoch": 0.207832665776591, + "grad_norm": 0.6254943609237671, + "kl": 0.0196918286383152, + "learning_rate": 4.82535519686159e-06, + "loss": 0.0008, + "reward": -0.05883334204554558, + "reward_std": 0.3185080885887146, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.05883333832025528, + "step": 467 + }, + { + "completion_length": 183.33334350585938, + "epoch": 0.2082777036048064, + "grad_norm": 0.7328706383705139, + "kl": 0.045631419867277145, + "learning_rate": 4.823926089691803e-06, + "loss": 0.0018, + "reward": -0.11100000143051147, + "reward_std": 0.37771734595298767, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.11100000143051147, + "step": 468 + }, + { + "completion_length": 200.0, + "epoch": 0.2087227414330218, + "grad_norm": 0.013255412690341473, + "kl": 0.011069796979427338, + "learning_rate": 4.822491372563276e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 469 + }, + { + "completion_length": 200.0, + "epoch": 0.2091677792612372, + "grad_norm": 0.022944483906030655, + "kl": 0.013812100514769554, + "learning_rate": 4.821051048939416e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 470 + }, + { + "completion_length": 200.0, + "epoch": 0.2096128170894526, + "grad_norm": 0.009076782502233982, + "kl": 0.0035323970951139927, + "learning_rate": 4.819605122297167e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 471 + }, + { + "completion_length": 200.0, + "epoch": 0.210057854917668, + "grad_norm": 0.047525037080049515, + "kl": 0.008770107291638851, + "learning_rate": 4.818153596126995e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 472 + }, + { + "completion_length": 200.0, + "epoch": 0.2105028927458834, + "grad_norm": 0.7111804485321045, + "kl": 0.02760786935687065, + "learning_rate": 4.816696473932886e-06, + "loss": 0.0011, + "reward": -0.012666663154959679, + "reward_std": 0.33721309900283813, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.012666663154959679, + "step": 473 + }, + { + "completion_length": 70.66667175292969, + "epoch": 0.2109479305740988, + "grad_norm": 1.4313840866088867, + "kl": 0.02774934470653534, + "learning_rate": 4.815233759232333e-06, + "loss": 0.0011, + "reward": 0.01666666753590107, + "reward_std": 0.054006174206733704, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.01666666753590107, + "step": 474 + }, + { + "completion_length": 200.0, + "epoch": 0.2113929684023142, + "grad_norm": 0.008336659520864487, + "kl": 0.003154546720907092, + "learning_rate": 4.8137654555563305e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 475 + }, + { + "completion_length": 200.0, + "epoch": 0.2118380062305296, + "grad_norm": 0.6730583310127258, + "kl": 0.02612406387925148, + "learning_rate": 4.812291566449363e-06, + "loss": 0.001, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 476 + }, + { + "completion_length": 200.0, + "epoch": 0.21228304405874499, + "grad_norm": 0.6641086935997009, + "kl": 0.013326774351298809, + "learning_rate": 4.810812095469401e-06, + "loss": 0.0005, + "reward": 0.011333337053656578, + "reward_std": 0.27842533588409424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.011333337053656578, + "step": 477 + }, + { + "completion_length": 196.0, + "epoch": 0.21272808188696038, + "grad_norm": 0.02167276106774807, + "kl": 0.016343414783477783, + "learning_rate": 4.809327046187888e-06, + "loss": 0.0007, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 478 + }, + { + "completion_length": 200.0, + "epoch": 0.2131731197151758, + "grad_norm": 0.6960393786430359, + "kl": 0.00471863616257906, + "learning_rate": 4.807836422189733e-06, + "loss": 0.0002, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 479 + }, + { + "completion_length": 200.0, + "epoch": 0.2136181575433912, + "grad_norm": 0.03306792676448822, + "kl": 0.022989537566900253, + "learning_rate": 4.806340227073304e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 480 + }, + { + "completion_length": 186.1666717529297, + "epoch": 0.2140631953716066, + "grad_norm": 0.7957750558853149, + "kl": 0.0064355782233178616, + "learning_rate": 4.8048384644504165e-06, + "loss": 0.0003, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 481 + }, + { + "completion_length": 190.0, + "epoch": 0.21450823319982198, + "grad_norm": 0.7082236409187317, + "kl": 0.05344567820429802, + "learning_rate": 4.8033311379463255e-06, + "loss": 0.0021, + "reward": -0.0898333415389061, + "reward_std": 0.3882114887237549, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.0898333415389061, + "step": 482 + }, + { + "completion_length": 200.0, + "epoch": 0.21495327102803738, + "grad_norm": 0.019929101690649986, + "kl": 0.008283906616270542, + "learning_rate": 4.801818251199718e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 483 + }, + { + "completion_length": 185.5, + "epoch": 0.21539830885625277, + "grad_norm": 0.6174662113189697, + "kl": 0.02444113790988922, + "learning_rate": 4.800299807862705e-06, + "loss": 0.001, + "reward": 0.00916666816920042, + "reward_std": 0.22120074927806854, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.00916666816920042, + "step": 484 + }, + { + "completion_length": 200.0, + "epoch": 0.21584334668446817, + "grad_norm": 0.6985958814620972, + "kl": 0.047310732305049896, + "learning_rate": 4.798775811600807e-06, + "loss": 0.0019, + "reward": -0.09299999475479126, + "reward_std": 0.3469115197658539, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.09299999475479126, + "step": 485 + }, + { + "completion_length": 200.0, + "epoch": 0.2162883845126836, + "grad_norm": 0.008483149111270905, + "kl": 0.004622921347618103, + "learning_rate": 4.7972462660929546e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 486 + }, + { + "completion_length": 190.6666717529297, + "epoch": 0.21673342234089898, + "grad_norm": 0.8138934969902039, + "kl": 0.022382408380508423, + "learning_rate": 4.795711175031467e-06, + "loss": 0.0009, + "reward": 0.1458333432674408, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 487 + }, + { + "completion_length": 200.0, + "epoch": 0.21717846016911438, + "grad_norm": 0.010969250462949276, + "kl": 0.005018382798880339, + "learning_rate": 4.79417054212206e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 488 + }, + { + "completion_length": 200.0, + "epoch": 0.21762349799732977, + "grad_norm": 0.007726567331701517, + "kl": 0.003069926518946886, + "learning_rate": 4.792624371083819e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 489 + }, + { + "completion_length": 200.0, + "epoch": 0.21806853582554517, + "grad_norm": 0.00889444351196289, + "kl": 0.004114137962460518, + "learning_rate": 4.791072665649203e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 490 + }, + { + "completion_length": 187.0, + "epoch": 0.21851357365376056, + "grad_norm": 0.9759646654129028, + "kl": 0.02656198851764202, + "learning_rate": 4.789515429564029e-06, + "loss": 0.0011, + "reward": 0.13449999690055847, + "reward_std": 0.023270150646567345, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.13449999690055847, + "step": 491 + }, + { + "completion_length": 164.6666717529297, + "epoch": 0.21895861148197596, + "grad_norm": 0.698348343372345, + "kl": 0.03639654442667961, + "learning_rate": 4.787952666587465e-06, + "loss": 0.0015, + "reward": 0.06016666814684868, + "reward_std": 0.2878718078136444, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.06016666814684868, + "step": 492 + }, + { + "completion_length": 181.83334350585938, + "epoch": 0.21940364931019138, + "grad_norm": 0.6999357342720032, + "kl": 0.0428909957408905, + "learning_rate": 4.786384380492024e-06, + "loss": 0.0017, + "reward": 0.16099999845027924, + "reward_std": 0.13888844847679138, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.16099999845027924, + "step": 493 + }, + { + "completion_length": 184.83334350585938, + "epoch": 0.21984868713840677, + "grad_norm": 0.7373248338699341, + "kl": 0.051096897572278976, + "learning_rate": 4.784810575063546e-06, + "loss": 0.002, + "reward": 0.019499998539686203, + "reward_std": 0.3235452175140381, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.019499998539686203, + "step": 494 + }, + { + "completion_length": 198.5, + "epoch": 0.22029372496662217, + "grad_norm": 0.8084774017333984, + "kl": 0.012246180325746536, + "learning_rate": 4.783231254101201e-06, + "loss": 0.0005, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 495 + }, + { + "completion_length": 200.0, + "epoch": 0.22073876279483756, + "grad_norm": 0.008205811493098736, + "kl": 0.007350212894380093, + "learning_rate": 4.781646421417469e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 496 + }, + { + "completion_length": 200.0, + "epoch": 0.22118380062305296, + "grad_norm": 0.018955878913402557, + "kl": 0.005687872879207134, + "learning_rate": 4.780056080838138e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 497 + }, + { + "completion_length": 200.0, + "epoch": 0.22162883845126835, + "grad_norm": 0.17352764308452606, + "kl": 0.04136586934328079, + "learning_rate": 4.77846023620229e-06, + "loss": 0.0017, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 498 + }, + { + "completion_length": 199.0, + "epoch": 0.22207387627948375, + "grad_norm": 0.014824754558503628, + "kl": 0.008487005718052387, + "learning_rate": 4.776858891362296e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 499 + }, + { + "completion_length": 200.0, + "epoch": 0.22251891410769917, + "grad_norm": 0.7017592191696167, + "kl": 0.012654486112296581, + "learning_rate": 4.775252050183802e-06, + "loss": 0.0005, + "reward": 0.025499999523162842, + "reward_std": 0.24372422695159912, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.025499999523162842, + "step": 500 + }, + { + "completion_length": 200.0, + "epoch": 0.22296395193591456, + "grad_norm": 0.8674989342689514, + "kl": 0.024426866322755814, + "learning_rate": 4.773639716545723e-06, + "loss": 0.001, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 501 + }, + { + "completion_length": 200.0, + "epoch": 0.22340898976412996, + "grad_norm": 0.008451178669929504, + "kl": 0.0035606666933745146, + "learning_rate": 4.772021894340235e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 502 + }, + { + "completion_length": 200.0, + "epoch": 0.22385402759234535, + "grad_norm": 0.014713208191096783, + "kl": 0.00692252442240715, + "learning_rate": 4.77039858747276e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 503 + }, + { + "completion_length": 200.0, + "epoch": 0.22429906542056074, + "grad_norm": 0.7125733494758606, + "kl": 0.031231265515089035, + "learning_rate": 4.768769799861962e-06, + "loss": 0.0012, + "reward": 0.019166668877005577, + "reward_std": 0.2592376470565796, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.019166668877005577, + "step": 504 + }, + { + "completion_length": 199.33334350585938, + "epoch": 0.22474410324877614, + "grad_norm": 0.6759501099586487, + "kl": 0.01907402276992798, + "learning_rate": 4.767135535439736e-06, + "loss": 0.0008, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 505 + }, + { + "completion_length": 200.0, + "epoch": 0.22518914107699153, + "grad_norm": 0.008459432050585747, + "kl": 0.009604415856301785, + "learning_rate": 4.765495798151196e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 506 + }, + { + "completion_length": 191.33334350585938, + "epoch": 0.22563417890520696, + "grad_norm": 0.9782493710517883, + "kl": 0.030173055827617645, + "learning_rate": 4.763850591954668e-06, + "loss": 0.0012, + "reward": 0.1875, + "reward_std": 0.1530931293964386, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 507 + }, + { + "completion_length": 196.33334350585938, + "epoch": 0.22607921673342235, + "grad_norm": 0.7100333571434021, + "kl": 0.020358001813292503, + "learning_rate": 4.762199920821683e-06, + "loss": 0.0008, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 508 + }, + { + "completion_length": 199.5, + "epoch": 0.22652425456163774, + "grad_norm": 0.011824924498796463, + "kl": 0.0119178993627429, + "learning_rate": 4.760543788736961e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 509 + }, + { + "completion_length": 200.0, + "epoch": 0.22696929238985314, + "grad_norm": 0.007140854839235544, + "kl": 0.005115116946399212, + "learning_rate": 4.758882199698405e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 510 + }, + { + "completion_length": 190.1666717529297, + "epoch": 0.22741433021806853, + "grad_norm": 0.7087447643280029, + "kl": 0.018751170486211777, + "learning_rate": 4.757215157717091e-06, + "loss": 0.0008, + "reward": 0.1458333432674408, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 511 + }, + { + "completion_length": 194.0, + "epoch": 0.22785936804628393, + "grad_norm": 0.05364219471812248, + "kl": 0.0219450481235981, + "learning_rate": 4.7555426668172614e-06, + "loss": 0.0009, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 512 + }, + { + "completion_length": 179.33334350585938, + "epoch": 0.22830440587449932, + "grad_norm": 0.7024439573287964, + "kl": 0.027089372277259827, + "learning_rate": 4.753864731036308e-06, + "loss": 0.0011, + "reward": 0.187666654586792, + "reward_std": 0.10494124889373779, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.187666654586792, + "step": 513 + }, + { + "completion_length": 200.0, + "epoch": 0.22874944370271474, + "grad_norm": 0.010735830292105675, + "kl": 0.008807472884654999, + "learning_rate": 4.752181354424769e-06, + "loss": 0.0004, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 514 + }, + { + "completion_length": 200.0, + "epoch": 0.22919448153093014, + "grad_norm": 0.008272531442344189, + "kl": 0.014129738323390484, + "learning_rate": 4.750492541046318e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 515 + }, + { + "completion_length": 200.0, + "epoch": 0.22963951935914553, + "grad_norm": 0.006092743948101997, + "kl": 0.0037111197598278522, + "learning_rate": 4.74879829497775e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 516 + }, + { + "completion_length": 200.0, + "epoch": 0.23008455718736093, + "grad_norm": 0.7442413568496704, + "kl": 0.005196265410631895, + "learning_rate": 4.747098620308975e-06, + "loss": 0.0002, + "reward": 0.1041666716337204, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 517 + }, + { + "completion_length": 186.0, + "epoch": 0.23052959501557632, + "grad_norm": 0.9965373277664185, + "kl": 0.03707145154476166, + "learning_rate": 4.7453935211430105e-06, + "loss": 0.0015, + "reward": 0.2083333432674408, + "reward_std": 0.15138253569602966, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 518 + }, + { + "completion_length": 200.0, + "epoch": 0.23097463284379172, + "grad_norm": 0.6923635005950928, + "kl": 0.025277286767959595, + "learning_rate": 4.743683001595965e-06, + "loss": 0.001, + "reward": 0.04116666316986084, + "reward_std": 0.3334938883781433, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.04116666316986084, + "step": 519 + }, + { + "completion_length": 200.0, + "epoch": 0.2314196706720071, + "grad_norm": 0.009316305629909039, + "kl": 0.003812011331319809, + "learning_rate": 4.741967065797036e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 520 + }, + { + "completion_length": 163.83334350585938, + "epoch": 0.23186470850022253, + "grad_norm": 1.0490024089813232, + "kl": 0.03154284879565239, + "learning_rate": 4.740245717888491e-06, + "loss": 0.0013, + "reward": 0.13366666436195374, + "reward_std": 0.11832442134618759, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.13366666436195374, + "step": 521 + }, + { + "completion_length": 200.0, + "epoch": 0.23230974632843793, + "grad_norm": 0.5622179508209229, + "kl": 0.00910902488976717, + "learning_rate": 4.738518962025665e-06, + "loss": 0.0004, + "reward": 0.1041666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 522 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.23275478415665332, + "grad_norm": 0.6993998885154724, + "kl": 0.028999265283346176, + "learning_rate": 4.736786802376948e-06, + "loss": 0.0012, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 523 + }, + { + "completion_length": 200.0, + "epoch": 0.23319982198486872, + "grad_norm": 0.011826745234429836, + "kl": 0.005504906177520752, + "learning_rate": 4.735049243123774e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 524 + }, + { + "completion_length": 200.0, + "epoch": 0.2336448598130841, + "grad_norm": 0.8087031245231628, + "kl": 0.06020812317728996, + "learning_rate": 4.7333062884606114e-06, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 525 + }, + { + "completion_length": 167.33334350585938, + "epoch": 0.2340898976412995, + "grad_norm": 0.7873777151107788, + "kl": 0.03628785163164139, + "learning_rate": 4.731557942594956e-06, + "loss": 0.0015, + "reward": 0.0885000079870224, + "reward_std": 0.27989909052848816, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0885000079870224, + "step": 526 + }, + { + "completion_length": 187.1666717529297, + "epoch": 0.2345349354695149, + "grad_norm": 0.7097110152244568, + "kl": 0.034623414278030396, + "learning_rate": 4.729804209747313e-06, + "loss": 0.0014, + "reward": -0.0033333352766931057, + "reward_std": 0.3099939823150635, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.0033333352766931057, + "step": 527 + }, + { + "completion_length": 200.0, + "epoch": 0.2349799732977303, + "grad_norm": 0.0095818554982543, + "kl": 0.003446843009442091, + "learning_rate": 4.728045094151194e-06, + "loss": 0.0001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 528 + }, + { + "completion_length": 158.6666717529297, + "epoch": 0.23542501112594572, + "grad_norm": 1.1593365669250488, + "kl": 0.038195449858903885, + "learning_rate": 4.726280600053109e-06, + "loss": 0.0015, + "reward": 0.1041666716337204, + "reward_std": 0.0940965861082077, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 529 + }, + { + "completion_length": 195.1666717529297, + "epoch": 0.2358700489541611, + "grad_norm": 0.6258925795555115, + "kl": 0.021456394344568253, + "learning_rate": 4.724510731712543e-06, + "loss": 0.0009, + "reward": 0.12516666948795319, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12516666948795319, + "step": 530 + }, + { + "completion_length": 185.33334350585938, + "epoch": 0.2363150867823765, + "grad_norm": 0.8283340930938721, + "kl": 0.04360618814826012, + "learning_rate": 4.722735493401961e-06, + "loss": 0.0017, + "reward": 0.2291666716337204, + "reward_std": 0.14613065123558044, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 531 + }, + { + "completion_length": 198.33334350585938, + "epoch": 0.2367601246105919, + "grad_norm": 0.6890245676040649, + "kl": 0.012334956787526608, + "learning_rate": 4.720954889406789e-06, + "loss": 0.0005, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 532 + }, + { + "completion_length": 200.0, + "epoch": 0.2372051624388073, + "grad_norm": 0.006551599130034447, + "kl": 0.00498668709769845, + "learning_rate": 4.719168924025407e-06, + "loss": 0.0002, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 533 + }, + { + "completion_length": 190.83334350585938, + "epoch": 0.2376502002670227, + "grad_norm": 0.7857916355133057, + "kl": 0.037860430777072906, + "learning_rate": 4.7173776015691345e-06, + "loss": 0.0015, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 534 + }, + { + "completion_length": 189.1666717529297, + "epoch": 0.23809523809523808, + "grad_norm": 0.7188397645950317, + "kl": 0.02583630383014679, + "learning_rate": 4.715580926362225e-06, + "loss": 0.001, + "reward": 0.0755000039935112, + "reward_std": 0.3398551344871521, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.007833331823348999, + "step": 535 + }, + { + "completion_length": 200.0, + "epoch": 0.2385402759234535, + "grad_norm": 0.7328579425811768, + "kl": 0.014590962789952755, + "learning_rate": 4.713778902741855e-06, + "loss": 0.0006, + "reward": 0.1458333432674408, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 536 + }, + { + "completion_length": 151.6666717529297, + "epoch": 0.2389853137516689, + "grad_norm": 0.8885166049003601, + "kl": 0.0610353946685791, + "learning_rate": 4.7119715350581096e-06, + "loss": 0.0024, + "reward": 0.250166654586792, + "reward_std": 0.15827244520187378, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1668333262205124, + "step": 537 + }, + { + "completion_length": 200.0, + "epoch": 0.2394303515798843, + "grad_norm": 0.6243503093719482, + "kl": 0.011630935594439507, + "learning_rate": 4.710158827673974e-06, + "loss": 0.0005, + "reward": 0.1458333432674408, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 538 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.2398753894080997, + "grad_norm": 0.6943646669387817, + "kl": 0.02668793499469757, + "learning_rate": 4.708340784965326e-06, + "loss": 0.0011, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 539 + }, + { + "completion_length": 185.83334350585938, + "epoch": 0.24032042723631508, + "grad_norm": 0.7564919590950012, + "kl": 0.036508020013570786, + "learning_rate": 4.7065174113209225e-06, + "loss": 0.0015, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 540 + }, + { + "completion_length": 193.1666717529297, + "epoch": 0.24076546506453048, + "grad_norm": 1.0034822225570679, + "kl": 0.038777224719524384, + "learning_rate": 4.7046887111423865e-06, + "loss": 0.0016, + "reward": 0.1041666716337204, + "reward_std": 0.0940965861082077, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 541 + }, + { + "completion_length": 184.83334350585938, + "epoch": 0.24121050289274587, + "grad_norm": 0.8872579336166382, + "kl": 0.009863987565040588, + "learning_rate": 4.702854688844202e-06, + "loss": 0.0004, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 542 + }, + { + "completion_length": 112.66667175292969, + "epoch": 0.2416555407209613, + "grad_norm": 0.8163735866546631, + "kl": 0.059337299317121506, + "learning_rate": 4.701015348853699e-06, + "loss": 0.0024, + "reward": 0.2866666913032532, + "reward_std": 0.26798635721206665, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.03666666895151138, + "step": 543 + }, + { + "completion_length": 111.5, + "epoch": 0.2421005785491767, + "grad_norm": 1.2242677211761475, + "kl": 0.08483864367008209, + "learning_rate": 4.699170695611047e-06, + "loss": 0.0034, + "reward": 0.250166654586792, + "reward_std": 0.13693124055862427, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1668333262205124, + "step": 544 + }, + { + "completion_length": 196.6666717529297, + "epoch": 0.24254561637739208, + "grad_norm": 0.6293683648109436, + "kl": 0.014732254669070244, + "learning_rate": 4.697320733569238e-06, + "loss": 0.0006, + "reward": 0.1458333432674408, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 545 + }, + { + "completion_length": 200.0, + "epoch": 0.24299065420560748, + "grad_norm": 0.015561908483505249, + "kl": 0.006728894077241421, + "learning_rate": 4.695465467194082e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 546 + }, + { + "completion_length": 199.0, + "epoch": 0.24343569203382287, + "grad_norm": 0.6494888663291931, + "kl": 0.04665082320570946, + "learning_rate": 4.693604900964193e-06, + "loss": 0.0019, + "reward": 0.1875, + "reward_std": 0.10458251088857651, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 547 + }, + { + "completion_length": 164.1666717529297, + "epoch": 0.24388072986203826, + "grad_norm": 0.9423882365226746, + "kl": 0.06301107257604599, + "learning_rate": 4.691739039370979e-06, + "loss": 0.0025, + "reward": 0.2291666716337204, + "reward_std": 0.16614501178264618, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 548 + }, + { + "completion_length": 191.5, + "epoch": 0.24432576769025366, + "grad_norm": 0.7398462891578674, + "kl": 0.028107035905122757, + "learning_rate": 4.68986788691863e-06, + "loss": 0.0011, + "reward": 0.1458333432674408, + "reward_std": 0.0940965861082077, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 549 + }, + { + "completion_length": 200.0, + "epoch": 0.24477080551846908, + "grad_norm": 0.015271569602191448, + "kl": 0.006307273171842098, + "learning_rate": 4.68799144812411e-06, + "loss": 0.0003, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 550 + }, + { + "completion_length": 161.1666717529297, + "epoch": 0.24521584334668448, + "grad_norm": 0.8253611922264099, + "kl": 0.061717789620161057, + "learning_rate": 4.686109727517142e-06, + "loss": 0.0025, + "reward": 0.2916666865348816, + "reward_std": 0.23273734748363495, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0416666679084301, + "step": 551 + }, + { + "completion_length": 149.6666717529297, + "epoch": 0.24566088117489987, + "grad_norm": 0.9274349808692932, + "kl": 0.060279231518507004, + "learning_rate": 4.6842227296402025e-06, + "loss": 0.0024, + "reward": 0.2293333262205124, + "reward_std": 0.18415391445159912, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1459999978542328, + "step": 552 + }, + { + "completion_length": 180.5, + "epoch": 0.24610591900311526, + "grad_norm": 0.7610526084899902, + "kl": 0.040498070418834686, + "learning_rate": 4.6823304590485025e-06, + "loss": 0.0016, + "reward": 0.1459999978542328, + "reward_std": 0.16642116010189056, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1459999978542328, + "step": 553 + }, + { + "completion_length": 174.83334350585938, + "epoch": 0.24655095683133066, + "grad_norm": 0.820443868637085, + "kl": 0.07258454710245132, + "learning_rate": 4.680432920309986e-06, + "loss": 0.0029, + "reward": 0.2708333432674408, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 554 + }, + { + "completion_length": 177.33334350585938, + "epoch": 0.24699599465954605, + "grad_norm": 0.7389416694641113, + "kl": 0.04997410625219345, + "learning_rate": 4.678530118005313e-06, + "loss": 0.002, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 555 + }, + { + "completion_length": 176.83334350585938, + "epoch": 0.24744103248776145, + "grad_norm": 0.7619123458862305, + "kl": 0.0550689622759819, + "learning_rate": 4.676622056727848e-06, + "loss": 0.0022, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 556 + }, + { + "completion_length": 180.6666717529297, + "epoch": 0.24788607031597687, + "grad_norm": 0.6843514442443848, + "kl": 0.031852152198553085, + "learning_rate": 4.674708741083651e-06, + "loss": 0.0013, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 557 + }, + { + "completion_length": 95.5, + "epoch": 0.24833110814419226, + "grad_norm": 1.1454741954803467, + "kl": 0.08709007501602173, + "learning_rate": 4.6727901756914694e-06, + "loss": 0.0035, + "reward": 0.375166654586792, + "reward_std": 0.15811441838741302, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12516666948795319, + "step": 558 + }, + { + "completion_length": 184.5, + "epoch": 0.24877614597240766, + "grad_norm": 0.6881827712059021, + "kl": 0.05518307909369469, + "learning_rate": 4.670866365182719e-06, + "loss": 0.0022, + "reward": 0.1458333432674408, + "reward_std": 0.0940965861082077, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 559 + }, + { + "completion_length": 188.1666717529297, + "epoch": 0.24922118380062305, + "grad_norm": 0.8532994389533997, + "kl": 0.03818666934967041, + "learning_rate": 4.66893731420148e-06, + "loss": 0.0015, + "reward": 0.1875, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 560 + }, + { + "completion_length": 200.0, + "epoch": 0.24966622162883845, + "grad_norm": 0.8707519769668579, + "kl": 0.02029397338628769, + "learning_rate": 4.667003027404483e-06, + "loss": 0.0008, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 561 + }, + { + "completion_length": 167.5, + "epoch": 0.25011125945705387, + "grad_norm": 0.7573645710945129, + "kl": 0.08380292356014252, + "learning_rate": 4.665063509461098e-06, + "loss": 0.0034, + "reward": 0.15600000321865082, + "reward_std": 0.3755556046962738, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.07266666740179062, + "step": 562 + }, + { + "completion_length": 200.0, + "epoch": 0.25055629728526924, + "grad_norm": 0.7199256420135498, + "kl": 0.027807530015707016, + "learning_rate": 4.663118765053319e-06, + "loss": 0.0011, + "reward": 0.1875, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 563 + }, + { + "completion_length": 129.33334350585938, + "epoch": 0.25100133511348466, + "grad_norm": 0.9969347715377808, + "kl": 0.09646206349134445, + "learning_rate": 4.661168798875763e-06, + "loss": 0.0039, + "reward": 0.3958333432674408, + "reward_std": 0.16614501178264618, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 564 + }, + { + "completion_length": 145.5, + "epoch": 0.2514463729417, + "grad_norm": 0.6874487996101379, + "kl": 0.07618147879838943, + "learning_rate": 4.6592136156356476e-06, + "loss": 0.003, + "reward": 0.25, + "reward_std": 0.15811388194561005, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 565 + }, + { + "completion_length": 200.0, + "epoch": 0.25189141076991545, + "grad_norm": 0.8305972814559937, + "kl": 0.022951535880565643, + "learning_rate": 4.6572532200527875e-06, + "loss": 0.0009, + "reward": -0.05400000140070915, + "reward_std": 0.37837284803390503, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.05400000140070915, + "step": 566 + }, + { + "completion_length": 153.1666717529297, + "epoch": 0.2523364485981308, + "grad_norm": 0.9877628684043884, + "kl": 0.0713796615600586, + "learning_rate": 4.655287616859578e-06, + "loss": 0.0029, + "reward": 0.187666654586792, + "reward_std": 0.06828372180461884, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.18766668438911438, + "step": 567 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.25278148642634624, + "grad_norm": 0.7665106058120728, + "kl": 0.04054148495197296, + "learning_rate": 4.6533168108009855e-06, + "loss": 0.0016, + "reward": 0.2291666716337204, + "reward_std": 0.14613065123558044, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 568 + }, + { + "completion_length": 170.33334350585938, + "epoch": 0.25322652425456166, + "grad_norm": 0.8183974623680115, + "kl": 0.047635436058044434, + "learning_rate": 4.651340806634538e-06, + "loss": 0.0019, + "reward": 0.3543333411216736, + "reward_std": 0.12293358892202377, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.187666654586792, + "step": 569 + }, + { + "completion_length": 168.6666717529297, + "epoch": 0.253671562082777, + "grad_norm": 0.7534694075584412, + "kl": 0.04617027938365936, + "learning_rate": 4.64935960913031e-06, + "loss": 0.0018, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 570 + }, + { + "completion_length": 113.16667175292969, + "epoch": 0.25411659991099245, + "grad_norm": 1.2481725215911865, + "kl": 0.07855173945426941, + "learning_rate": 4.647373223070913e-06, + "loss": 0.0031, + "reward": 0.2916666865348816, + "reward_std": 0.10206207633018494, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 571 + }, + { + "completion_length": 103.66667175292969, + "epoch": 0.2545616377392078, + "grad_norm": 1.0237010717391968, + "kl": 0.11217594146728516, + "learning_rate": 4.645381653251485e-06, + "loss": 0.0045, + "reward": 0.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 572 + }, + { + "completion_length": 195.5, + "epoch": 0.25500667556742324, + "grad_norm": 0.886687695980072, + "kl": 0.016482416540384293, + "learning_rate": 4.643384904479675e-06, + "loss": 0.0007, + "reward": 0.1458333432674408, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 573 + }, + { + "completion_length": 114.33333587646484, + "epoch": 0.2554517133956386, + "grad_norm": 0.04589557647705078, + "kl": 0.08041426539421082, + "learning_rate": 4.641382981575637e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.5, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 574 + }, + { + "completion_length": 200.0, + "epoch": 0.255896751223854, + "grad_norm": 0.8335212469100952, + "kl": 0.060845568776130676, + "learning_rate": 4.639375889372013e-06, + "loss": 0.0024, + "reward": 0.125, + "reward_std": 0.1369306445121765, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 575 + }, + { + "completion_length": 192.6666717529297, + "epoch": 0.25634178905206945, + "grad_norm": 0.8120817542076111, + "kl": 0.03659413754940033, + "learning_rate": 4.637363632713924e-06, + "loss": 0.0015, + "reward": 0.1875, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 576 + }, + { + "completion_length": 181.6666717529297, + "epoch": 0.2567868268802848, + "grad_norm": 0.723743736743927, + "kl": 0.028544750064611435, + "learning_rate": 4.63534621645896e-06, + "loss": 0.0011, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 577 + }, + { + "completion_length": 123.0, + "epoch": 0.25723186470850024, + "grad_norm": 0.8822214007377625, + "kl": 0.15481743216514587, + "learning_rate": 4.6333236454771644e-06, + "loss": 0.0062, + "reward": 0.3333333432674408, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 578 + }, + { + "completion_length": 196.0, + "epoch": 0.2576769025367156, + "grad_norm": 0.8370240926742554, + "kl": 0.045977018773555756, + "learning_rate": 4.6312959246510245e-06, + "loss": 0.0018, + "reward": 0.1875, + "reward_std": 0.10458251088857651, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 579 + }, + { + "completion_length": 150.6666717529297, + "epoch": 0.258121940364931, + "grad_norm": 0.7318511009216309, + "kl": 0.05776175111532211, + "learning_rate": 4.629263058875458e-06, + "loss": 0.0023, + "reward": 0.2291666716337204, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 580 + }, + { + "completion_length": 155.0, + "epoch": 0.2585669781931464, + "grad_norm": 0.9142718315124512, + "kl": 0.069237619638443, + "learning_rate": 4.627225053057806e-06, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 581 + }, + { + "completion_length": 165.0, + "epoch": 0.2590120160213618, + "grad_norm": 0.7616869807243347, + "kl": 0.0599093958735466, + "learning_rate": 4.6251819121178145e-06, + "loss": 0.0024, + "reward": 0.3125, + "reward_std": 0.1530931293964386, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 582 + }, + { + "completion_length": 154.6666717529297, + "epoch": 0.25945705384957723, + "grad_norm": 0.806352972984314, + "kl": 0.08792692422866821, + "learning_rate": 4.623133640987628e-06, + "loss": 0.0035, + "reward": 0.2918333411216736, + "reward_std": 0.1881493180990219, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12516666948795319, + "step": 583 + }, + { + "completion_length": 189.1666717529297, + "epoch": 0.2599020916777926, + "grad_norm": 0.7354152798652649, + "kl": 0.03302110731601715, + "learning_rate": 4.621080244611772e-06, + "loss": 0.0013, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 584 + }, + { + "completion_length": 197.6666717529297, + "epoch": 0.260347129506008, + "grad_norm": 0.6742915511131287, + "kl": 0.032543033361434937, + "learning_rate": 4.619021727947147e-06, + "loss": 0.0013, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 585 + }, + { + "completion_length": 151.83334350585938, + "epoch": 0.2607921673342234, + "grad_norm": 0.9699811935424805, + "kl": 0.08444561809301376, + "learning_rate": 4.616958095963014e-06, + "loss": 0.0034, + "reward": 0.1276666671037674, + "reward_std": 0.48804986476898193, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1223333403468132, + "step": 586 + }, + { + "completion_length": 169.1666717529297, + "epoch": 0.2612372051624388, + "grad_norm": 1.1708648204803467, + "kl": 0.04619307070970535, + "learning_rate": 4.6148893536409815e-06, + "loss": 0.0018, + "reward": 0.2293333262205124, + "reward_std": 0.09440692514181137, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2293333262205124, + "step": 587 + }, + { + "completion_length": 109.33333587646484, + "epoch": 0.2616822429906542, + "grad_norm": 0.978725016117096, + "kl": 0.08659862726926804, + "learning_rate": 4.612815505974993e-06, + "loss": 0.0035, + "reward": 0.3333333432674408, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 588 + }, + { + "completion_length": 185.5, + "epoch": 0.2621272808188696, + "grad_norm": 0.8235928416252136, + "kl": 0.052005793899297714, + "learning_rate": 4.610736557971321e-06, + "loss": 0.0021, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 589 + }, + { + "completion_length": 174.1666717529297, + "epoch": 0.262572318647085, + "grad_norm": 0.976612389087677, + "kl": 0.045445047318935394, + "learning_rate": 4.608652514648544e-06, + "loss": 0.0018, + "reward": 0.2916666865348816, + "reward_std": 0.10206206887960434, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 590 + }, + { + "completion_length": 200.0, + "epoch": 0.2630173564753004, + "grad_norm": 0.014436294324696064, + "kl": 0.03448406979441643, + "learning_rate": 4.606563381037544e-06, + "loss": 0.0014, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 591 + }, + { + "completion_length": 200.0, + "epoch": 0.2634623943035158, + "grad_norm": 0.8270597457885742, + "kl": 0.034238051623106, + "learning_rate": 4.604469162181492e-06, + "loss": 0.0014, + "reward": 0.01300000213086605, + "reward_std": 0.2962499260902405, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.01300000213086605, + "step": 592 + }, + { + "completion_length": 148.0, + "epoch": 0.2639074321317312, + "grad_norm": 0.8803796768188477, + "kl": 0.06960055232048035, + "learning_rate": 4.6023698631358326e-06, + "loss": 0.0028, + "reward": 0.437666654586792, + "reward_std": 0.10446371138095856, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.10433333367109299, + "step": 593 + }, + { + "completion_length": 200.0, + "epoch": 0.2643524699599466, + "grad_norm": 0.7200313210487366, + "kl": 0.05139869078993797, + "learning_rate": 4.6002654889682755e-06, + "loss": 0.0021, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 594 + }, + { + "completion_length": 193.6666717529297, + "epoch": 0.26479750778816197, + "grad_norm": 0.7966943979263306, + "kl": 0.040248602628707886, + "learning_rate": 4.598156044758779e-06, + "loss": 0.0016, + "reward": 0.1458333432674408, + "reward_std": 0.12289901822805405, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 595 + }, + { + "completion_length": 200.0, + "epoch": 0.2652425456163774, + "grad_norm": 0.755216121673584, + "kl": 0.019312169402837753, + "learning_rate": 4.5960415355995444e-06, + "loss": 0.0008, + "reward": 0.1458333432674408, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 596 + }, + { + "completion_length": 186.1666717529297, + "epoch": 0.2656875834445928, + "grad_norm": 0.853950023651123, + "kl": 0.039613865315914154, + "learning_rate": 4.593921966594997e-06, + "loss": 0.0016, + "reward": 0.026833336800336838, + "reward_std": 0.4285675883293152, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.026833336800336838, + "step": 597 + }, + { + "completion_length": 191.1666717529297, + "epoch": 0.2661326212728082, + "grad_norm": 0.7605770826339722, + "kl": 0.05703141912817955, + "learning_rate": 4.591797342861778e-06, + "loss": 0.0023, + "reward": 0.1875, + "reward_std": 0.246855229139328, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02083333395421505, + "step": 598 + }, + { + "completion_length": 200.0, + "epoch": 0.2665776591010236, + "grad_norm": 0.8873202800750732, + "kl": 0.03285577893257141, + "learning_rate": 4.589667669528729e-06, + "loss": 0.0013, + "reward": 0.0833333358168602, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 599 + }, + { + "completion_length": 152.5, + "epoch": 0.26702269692923897, + "grad_norm": 0.7901185154914856, + "kl": 0.07483705878257751, + "learning_rate": 4.587532951736884e-06, + "loss": 0.003, + "reward": 0.2916666865348816, + "reward_std": 0.1881931722164154, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 600 + }, + { + "completion_length": 147.1666717529297, + "epoch": 0.2674677347574544, + "grad_norm": 0.9961167573928833, + "kl": 0.0769721046090126, + "learning_rate": 4.585393194639452e-06, + "loss": 0.0031, + "reward": 0.3333333432674408, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 601 + }, + { + "completion_length": 200.0, + "epoch": 0.26791277258566976, + "grad_norm": 0.7630622386932373, + "kl": 0.035730935633182526, + "learning_rate": 4.583248403401808e-06, + "loss": 0.0014, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 602 + }, + { + "completion_length": 160.33334350585938, + "epoch": 0.2683578104138852, + "grad_norm": 0.896909236907959, + "kl": 0.07461149990558624, + "learning_rate": 4.581098583201478e-06, + "loss": 0.003, + "reward": 0.375, + "reward_std": 0.1369306445121765, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 603 + }, + { + "completion_length": 116.83333587646484, + "epoch": 0.2688028482421006, + "grad_norm": 0.9277684688568115, + "kl": 0.08133234083652496, + "learning_rate": 4.578943739228131e-06, + "loss": 0.0033, + "reward": 0.24300000071525574, + "reward_std": 0.22485819458961487, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.15966667234897614, + "step": 604 + }, + { + "completion_length": 184.83334350585938, + "epoch": 0.26924788607031597, + "grad_norm": 0.7694399952888489, + "kl": 0.058748748153448105, + "learning_rate": 4.576783876683559e-06, + "loss": 0.0023, + "reward": 0.1875, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 605 + }, + { + "completion_length": 136.33334350585938, + "epoch": 0.2696929238985314, + "grad_norm": 0.9464454054832458, + "kl": 0.054710254073143005, + "learning_rate": 4.574619000781674e-06, + "loss": 0.0022, + "reward": 0.250333309173584, + "reward_std": 0.1122509092092514, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250333309173584, + "step": 606 + }, + { + "completion_length": 179.6666717529297, + "epoch": 0.27013796172674676, + "grad_norm": 0.8664156198501587, + "kl": 0.07143112272024155, + "learning_rate": 4.572449116748485e-06, + "loss": 0.0029, + "reward": 0.3333333432674408, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 607 + }, + { + "completion_length": 200.0, + "epoch": 0.2705829995549622, + "grad_norm": 0.9034672975540161, + "kl": 0.026494070887565613, + "learning_rate": 4.570274229822095e-06, + "loss": 0.0011, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 608 + }, + { + "completion_length": 200.0, + "epoch": 0.27102803738317754, + "grad_norm": 0.6816303133964539, + "kl": 0.04936773702502251, + "learning_rate": 4.5680943452526814e-06, + "loss": 0.002, + "reward": -0.1028333306312561, + "reward_std": 0.36173439025878906, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.1028333306312561, + "step": 609 + }, + { + "completion_length": 89.5, + "epoch": 0.27147307521139297, + "grad_norm": 1.2434215545654297, + "kl": 0.10808803886175156, + "learning_rate": 4.565909468302486e-06, + "loss": 0.0043, + "reward": 0.382999986410141, + "reward_std": 0.14375397562980652, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.13300000131130219, + "step": 610 + }, + { + "completion_length": 200.0, + "epoch": 0.2719181130396084, + "grad_norm": 0.03729372099041939, + "kl": 0.05879303440451622, + "learning_rate": 4.563719604245804e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 611 + }, + { + "completion_length": 171.83334350585938, + "epoch": 0.27236315086782376, + "grad_norm": 0.7619086503982544, + "kl": 0.14301355183124542, + "learning_rate": 4.561524758368968e-06, + "loss": 0.0057, + "reward": 0.22450000047683716, + "reward_std": 0.4011252820491791, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.025499999523162842, + "step": 612 + }, + { + "completion_length": 161.83334350585938, + "epoch": 0.2728081886960392, + "grad_norm": 0.8366971015930176, + "kl": 0.07396448403596878, + "learning_rate": 4.559324935970337e-06, + "loss": 0.003, + "reward": 0.3543333411216736, + "reward_std": 0.22915641963481903, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.021000001579523087, + "step": 613 + }, + { + "completion_length": 178.1666717529297, + "epoch": 0.27325322652425454, + "grad_norm": 0.9588587880134583, + "kl": 0.06857272237539291, + "learning_rate": 4.5571201423602825e-06, + "loss": 0.0027, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 614 + }, + { + "completion_length": 185.83334350585938, + "epoch": 0.27369826435246997, + "grad_norm": 0.7504873871803284, + "kl": 0.07397520542144775, + "learning_rate": 4.554910382861178e-06, + "loss": 0.003, + "reward": 0.2916666865348816, + "reward_std": 0.2457980364561081, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0416666679084301, + "step": 615 + }, + { + "completion_length": 199.33334350585938, + "epoch": 0.27414330218068533, + "grad_norm": 0.8504765629768372, + "kl": 0.06764481961727142, + "learning_rate": 4.552695662807385e-06, + "loss": 0.0027, + "reward": 0.1459999978542328, + "reward_std": 0.12306909263134003, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1459999978542328, + "step": 616 + }, + { + "completion_length": 91.66667175292969, + "epoch": 0.27458834000890076, + "grad_norm": 0.24826188385486603, + "kl": 0.1672677993774414, + "learning_rate": 4.550475987545238e-06, + "loss": 0.0067, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.5, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 617 + }, + { + "completion_length": 187.0, + "epoch": 0.2750333778371162, + "grad_norm": 0.9045337438583374, + "kl": 0.07695237547159195, + "learning_rate": 4.548251362433033e-06, + "loss": 0.0031, + "reward": 0.1458333432674408, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 618 + }, + { + "completion_length": 106.66667175292969, + "epoch": 0.27547841566533154, + "grad_norm": 0.9984831213951111, + "kl": 0.17150583863258362, + "learning_rate": 4.546021792841019e-06, + "loss": 0.0069, + "reward": 0.3543333411216736, + "reward_std": 0.22915641963481903, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.021000001579523087, + "step": 619 + }, + { + "completion_length": 86.0, + "epoch": 0.27592345349354697, + "grad_norm": 0.9944882392883301, + "kl": 0.17101812362670898, + "learning_rate": 4.543787284151374e-06, + "loss": 0.0068, + "reward": 0.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 620 + }, + { + "completion_length": 141.5, + "epoch": 0.27636849132176233, + "grad_norm": 1.0072808265686035, + "kl": 0.11485801637172699, + "learning_rate": 4.541547841758207e-06, + "loss": 0.0046, + "reward": 0.3958333432674408, + "reward_std": 0.16614501178264618, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 621 + }, + { + "completion_length": 156.0, + "epoch": 0.27681352914997776, + "grad_norm": 1.1295337677001953, + "kl": 0.121914803981781, + "learning_rate": 4.539303471067531e-06, + "loss": 0.0049, + "reward": 0.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 622 + }, + { + "completion_length": 168.5, + "epoch": 0.2772585669781931, + "grad_norm": 0.791140615940094, + "kl": 0.06966628134250641, + "learning_rate": 4.537054177497259e-06, + "loss": 0.0028, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 623 + }, + { + "completion_length": 120.0, + "epoch": 0.27770360480640854, + "grad_norm": 0.9863029718399048, + "kl": 0.10272692143917084, + "learning_rate": 4.534799966477186e-06, + "loss": 0.0041, + "reward": 0.3959999978542328, + "reward_std": 0.1658191978931427, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.06266666948795319, + "step": 624 + }, + { + "completion_length": 200.0, + "epoch": 0.27814864263462397, + "grad_norm": 0.8481348752975464, + "kl": 0.06225643306970596, + "learning_rate": 4.532540843448979e-06, + "loss": 0.0025, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 625 + }, + { + "completion_length": 144.0, + "epoch": 0.27859368046283933, + "grad_norm": 0.9757879376411438, + "kl": 0.09962724894285202, + "learning_rate": 4.530276813866162e-06, + "loss": 0.004, + "reward": 0.31316667795181274, + "reward_std": 0.0688314288854599, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 626 + }, + { + "completion_length": 194.6666717529297, + "epoch": 0.27903871829105475, + "grad_norm": 0.931898295879364, + "kl": 0.07902925461530685, + "learning_rate": 4.528007883194102e-06, + "loss": 0.0032, + "reward": 0.1875, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 627 + }, + { + "completion_length": 190.83334350585938, + "epoch": 0.2794837561192701, + "grad_norm": 0.831519365310669, + "kl": 0.06486264616250992, + "learning_rate": 4.525734056910002e-06, + "loss": 0.0026, + "reward": 0.2918333411216736, + "reward_std": 0.2042061984539032, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12516666948795319, + "step": 628 + }, + { + "completion_length": 111.33333587646484, + "epoch": 0.27992879394748554, + "grad_norm": 0.9687957167625427, + "kl": 0.1136898398399353, + "learning_rate": 4.523455340502878e-06, + "loss": 0.0045, + "reward": 0.375166654586792, + "reward_std": 0.15811441838741302, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12516666948795319, + "step": 629 + }, + { + "completion_length": 120.66667175292969, + "epoch": 0.2803738317757009, + "grad_norm": 0.0987262949347496, + "kl": 0.11421214044094086, + "learning_rate": 4.521171739473552e-06, + "loss": 0.0046, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.5, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 630 + }, + { + "completion_length": 120.83333587646484, + "epoch": 0.28081886960391633, + "grad_norm": 0.8952674269676208, + "kl": 0.09326162189245224, + "learning_rate": 4.5188832593346386e-06, + "loss": 0.0037, + "reward": 0.4583333432674408, + "reward_std": 0.10206206887960434, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0416666679084301, + "step": 631 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.28126390743213175, + "grad_norm": 0.06504479050636292, + "kl": 0.12895925343036652, + "learning_rate": 4.51658990561053e-06, + "loss": 0.0052, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.5, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 632 + }, + { + "completion_length": 118.16667175292969, + "epoch": 0.2817089452603471, + "grad_norm": 1.197392463684082, + "kl": 0.39798209071159363, + "learning_rate": 4.514291683837383e-06, + "loss": 0.0159, + "reward": 0.2523333430290222, + "reward_std": 0.41331908106803894, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.08100000023841858, + "step": 633 + }, + { + "completion_length": 141.33334350585938, + "epoch": 0.28215398308856254, + "grad_norm": 1.023812174797058, + "kl": 0.0909874364733696, + "learning_rate": 4.511988599563107e-06, + "loss": 0.0036, + "reward": 0.3958333432674408, + "reward_std": 0.16614501178264618, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 634 + }, + { + "completion_length": 107.5, + "epoch": 0.2825990209167779, + "grad_norm": 1.506011962890625, + "kl": 0.10861188173294067, + "learning_rate": 4.509680658347347e-06, + "loss": 0.0043, + "reward": 0.3959999978542328, + "reward_std": 0.2002398669719696, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.06266666948795319, + "step": 635 + }, + { + "completion_length": 150.1666717529297, + "epoch": 0.28304405874499333, + "grad_norm": 1.0660347938537598, + "kl": 0.1432390809059143, + "learning_rate": 4.507367865761476e-06, + "loss": 0.0057, + "reward": 0.3543333411216736, + "reward_std": 0.20028147101402283, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.10433333367109299, + "step": 636 + }, + { + "completion_length": 190.33334350585938, + "epoch": 0.2834890965732087, + "grad_norm": 0.9527061581611633, + "kl": 0.07011144608259201, + "learning_rate": 4.505050227388575e-06, + "loss": 0.0028, + "reward": 0.1875, + "reward_std": 0.246855229139328, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02083333395421505, + "step": 637 + }, + { + "completion_length": 117.16667175292969, + "epoch": 0.2839341344014241, + "grad_norm": 1.230230689048767, + "kl": 0.09981067478656769, + "learning_rate": 4.502727748823425e-06, + "loss": 0.004, + "reward": 0.2084999978542328, + "reward_std": 0.23266606032848358, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.04183333367109299, + "step": 638 + }, + { + "completion_length": 129.0, + "epoch": 0.28437917222963954, + "grad_norm": 0.8897629976272583, + "kl": 0.09878586232662201, + "learning_rate": 4.50040043567249e-06, + "loss": 0.004, + "reward": 0.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 639 + }, + { + "completion_length": 92.83333587646484, + "epoch": 0.2848242100578549, + "grad_norm": 0.9260250926017761, + "kl": 0.08705293387174606, + "learning_rate": 4.498068293553906e-06, + "loss": 0.0035, + "reward": 0.437666654586792, + "reward_std": 0.15268486738204956, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.021000001579523087, + "step": 640 + }, + { + "completion_length": 195.5, + "epoch": 0.28526924788607033, + "grad_norm": 0.7902094125747681, + "kl": 0.07190477102994919, + "learning_rate": 4.495731328097464e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.1881931722164154, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 641 + }, + { + "completion_length": 200.0, + "epoch": 0.2857142857142857, + "grad_norm": 0.8260470032691956, + "kl": 0.0487191379070282, + "learning_rate": 4.4933895449446e-06, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.12909945845603943, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 642 + }, + { + "completion_length": 126.33333587646484, + "epoch": 0.2861593235425011, + "grad_norm": 0.7838308811187744, + "kl": 0.12842848896980286, + "learning_rate": 4.491042949748381e-06, + "loss": 0.0051, + "reward": 0.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 643 + }, + { + "completion_length": 154.6666717529297, + "epoch": 0.2866043613707165, + "grad_norm": 0.820894181728363, + "kl": 0.060759272426366806, + "learning_rate": 4.488691548173487e-06, + "loss": 0.0024, + "reward": 0.28716668486595154, + "reward_std": 0.21236613392829895, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12049999833106995, + "step": 644 + }, + { + "completion_length": 147.33334350585938, + "epoch": 0.2870493991989319, + "grad_norm": 0.7532824873924255, + "kl": 0.0968950018286705, + "learning_rate": 4.486335345896204e-06, + "loss": 0.0039, + "reward": 0.4166666865348816, + "reward_std": 0.20412415266036987, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 645 + }, + { + "completion_length": 179.5, + "epoch": 0.28749443702714733, + "grad_norm": 0.7337256073951721, + "kl": 0.0560641810297966, + "learning_rate": 4.483974348604407e-06, + "loss": 0.0022, + "reward": 0.20866666734218597, + "reward_std": 0.12942281365394592, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.20866666734218597, + "step": 646 + }, + { + "completion_length": 191.33334350585938, + "epoch": 0.2879394748553627, + "grad_norm": 0.665947675704956, + "kl": 0.04197518154978752, + "learning_rate": 4.48160856199754e-06, + "loss": 0.0017, + "reward": 0.187666654586792, + "reward_std": 0.20557884871959686, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.10433333367109299, + "step": 647 + }, + { + "completion_length": 168.33334350585938, + "epoch": 0.2883845126835781, + "grad_norm": 0.7467770576477051, + "kl": 0.05759914964437485, + "learning_rate": 4.479237991786617e-06, + "loss": 0.0023, + "reward": 0.10266666859388351, + "reward_std": 0.39400848746299744, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.019333332777023315, + "step": 648 + }, + { + "completion_length": 200.0, + "epoch": 0.2888295505117935, + "grad_norm": 0.7417876720428467, + "kl": 0.05739980190992355, + "learning_rate": 4.476862643694194e-06, + "loss": 0.0023, + "reward": 0.0416666679084301, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0416666679084301, + "step": 649 + }, + { + "completion_length": 196.83334350585938, + "epoch": 0.2892745883400089, + "grad_norm": 0.6486422419548035, + "kl": 0.05907044932246208, + "learning_rate": 4.474482523454363e-06, + "loss": 0.0024, + "reward": 0.1041666716337204, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02083333395421505, + "step": 650 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.2897196261682243, + "grad_norm": 0.8684423565864563, + "kl": 0.05625095218420029, + "learning_rate": 4.472097636812736e-06, + "loss": 0.0022, + "reward": 0.437666654586792, + "reward_std": 0.15268486738204956, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.021000001579523087, + "step": 651 + }, + { + "completion_length": 143.0, + "epoch": 0.2901646639964397, + "grad_norm": 0.8793902397155762, + "kl": 0.07028966397047043, + "learning_rate": 4.469707989526429e-06, + "loss": 0.0028, + "reward": 0.3958333432674408, + "reward_std": 0.16614501178264618, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 652 + }, + { + "completion_length": 177.5, + "epoch": 0.2906097018246551, + "grad_norm": 0.9311726093292236, + "kl": 0.05447518453001976, + "learning_rate": 4.467313587364053e-06, + "loss": 0.0022, + "reward": 0.2708333432674408, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 653 + }, + { + "completion_length": 165.33334350585938, + "epoch": 0.2910547396528705, + "grad_norm": 0.6879425644874573, + "kl": 0.06616301834583282, + "learning_rate": 4.464914436105695e-06, + "loss": 0.0026, + "reward": 0.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 654 + }, + { + "completion_length": 200.0, + "epoch": 0.2914997774810859, + "grad_norm": 0.6475021243095398, + "kl": 0.039877697825431824, + "learning_rate": 4.462510541542909e-06, + "loss": 0.0016, + "reward": 0.04933333396911621, + "reward_std": 0.38236457109451294, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.04933333396911621, + "step": 655 + }, + { + "completion_length": 143.83334350585938, + "epoch": 0.2919448153093013, + "grad_norm": 0.9102987051010132, + "kl": 0.09255407750606537, + "learning_rate": 4.460101909478696e-06, + "loss": 0.0037, + "reward": 0.14616666734218597, + "reward_std": 0.1465870589017868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.14616666734218597, + "step": 656 + }, + { + "completion_length": 126.16667175292969, + "epoch": 0.2923898531375167, + "grad_norm": 0.045655757188797, + "kl": 0.07757769525051117, + "learning_rate": 4.457688545727496e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.5, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 657 + }, + { + "completion_length": 191.6666717529297, + "epoch": 0.29283489096573206, + "grad_norm": 0.6900054216384888, + "kl": 0.1448201835155487, + "learning_rate": 4.45527045611517e-06, + "loss": 0.0058, + "reward": 0.010500004515051842, + "reward_std": 0.3959311842918396, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.07283332943916321, + "step": 658 + }, + { + "completion_length": 119.33333587646484, + "epoch": 0.2932799287939475, + "grad_norm": 0.9801993370056152, + "kl": 0.10018780082464218, + "learning_rate": 4.452847646478987e-06, + "loss": 0.004, + "reward": 0.35483333468437195, + "reward_std": 0.0940093994140625, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27149999141693115, + "step": 659 + }, + { + "completion_length": 193.5, + "epoch": 0.2937249666221629, + "grad_norm": 0.7721496224403381, + "kl": 0.04845193028450012, + "learning_rate": 4.4504201226676124e-06, + "loss": 0.0019, + "reward": 0.2708333432674408, + "reward_std": 0.25515520572662354, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02083333395421505, + "step": 660 + }, + { + "completion_length": 164.0, + "epoch": 0.2941700044503783, + "grad_norm": 0.8990196585655212, + "kl": 0.038245782256126404, + "learning_rate": 4.4479878905410875e-06, + "loss": 0.0015, + "reward": 0.3959999978542328, + "reward_std": 0.12286578863859177, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1459999978542328, + "step": 661 + }, + { + "completion_length": 197.0, + "epoch": 0.2946150422785937, + "grad_norm": 0.7893727421760559, + "kl": 0.04667172580957413, + "learning_rate": 4.445550955970823e-06, + "loss": 0.0019, + "reward": 0.1458333432674408, + "reward_std": 0.12289901822805405, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 662 + }, + { + "completion_length": 110.33333587646484, + "epoch": 0.29506008010680906, + "grad_norm": 0.8910045623779297, + "kl": 0.08671041578054428, + "learning_rate": 4.443109324839581e-06, + "loss": 0.0035, + "reward": 0.4375, + "reward_std": 0.1530931293964386, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02083333395421505, + "step": 663 + }, + { + "completion_length": 193.6666717529297, + "epoch": 0.2955051179350245, + "grad_norm": 0.7910698056221008, + "kl": 0.043277036398649216, + "learning_rate": 4.440663003041459e-06, + "loss": 0.0017, + "reward": 0.2291666716337204, + "reward_std": 0.22935599088668823, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 664 + }, + { + "completion_length": 186.33334350585938, + "epoch": 0.29595015576323985, + "grad_norm": 0.8543537855148315, + "kl": 0.06220155954360962, + "learning_rate": 4.43821199648188e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 665 + }, + { + "completion_length": 172.1666717529297, + "epoch": 0.2963951935914553, + "grad_norm": 0.8467397093772888, + "kl": 0.05557259917259216, + "learning_rate": 4.435756311077573e-06, + "loss": 0.0022, + "reward": 0.250166654586792, + "reward_std": 0.2372765839099884, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.08349999785423279, + "step": 666 + }, + { + "completion_length": 111.66667175292969, + "epoch": 0.2968402314196707, + "grad_norm": 0.0691637396812439, + "kl": 0.09109967201948166, + "learning_rate": 4.4332959527565666e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.5, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 667 + }, + { + "completion_length": 171.5, + "epoch": 0.29728526924788606, + "grad_norm": 0.6657293438911438, + "kl": 0.06025252863764763, + "learning_rate": 4.430830927458166e-06, + "loss": 0.0024, + "reward": 0.3961666524410248, + "reward_std": 0.14607451856136322, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.14616666734218597, + "step": 668 + }, + { + "completion_length": 119.66667175292969, + "epoch": 0.2977303070761015, + "grad_norm": 0.9089164137840271, + "kl": 0.08549092710018158, + "learning_rate": 4.428361241132943e-06, + "loss": 0.0034, + "reward": 0.375166654586792, + "reward_std": 0.19339123368263245, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.04183333367109299, + "step": 669 + }, + { + "completion_length": 200.0, + "epoch": 0.29817534490431685, + "grad_norm": 0.6968740820884705, + "kl": 0.04054586589336395, + "learning_rate": 4.425886899742722e-06, + "loss": 0.0016, + "reward": 0.1458333432674408, + "reward_std": 0.12289901822805405, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 670 + }, + { + "completion_length": 121.33333587646484, + "epoch": 0.2986203827325323, + "grad_norm": 1.0022284984588623, + "kl": 0.08514213562011719, + "learning_rate": 4.423407909260564e-06, + "loss": 0.0034, + "reward": 0.437666654586792, + "reward_std": 0.10446371138095856, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.10433333367109299, + "step": 671 + }, + { + "completion_length": 184.1666717529297, + "epoch": 0.29906542056074764, + "grad_norm": 0.7887581586837769, + "kl": 0.038804031908512115, + "learning_rate": 4.420924275670753e-06, + "loss": 0.0016, + "reward": 0.2083333432674408, + "reward_std": 0.17078250646591187, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 672 + }, + { + "completion_length": 198.0, + "epoch": 0.29951045838896306, + "grad_norm": 0.8230785131454468, + "kl": 0.04383291304111481, + "learning_rate": 4.4184360049687826e-06, + "loss": 0.0018, + "reward": 0.2293333262205124, + "reward_std": 0.12313678115606308, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2293333262205124, + "step": 673 + }, + { + "completion_length": 200.0, + "epoch": 0.2999554962171785, + "grad_norm": 0.7423299551010132, + "kl": 0.03673369064927101, + "learning_rate": 4.41594310316134e-06, + "loss": 0.0015, + "reward": 0.0625, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 674 + }, + { + "completion_length": 137.0, + "epoch": 0.30040053404539385, + "grad_norm": 0.8925609588623047, + "kl": 0.08815717697143555, + "learning_rate": 4.4134455762662895e-06, + "loss": 0.0035, + "reward": 0.4166666865348816, + "reward_std": 0.20412413775920868, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.4166666865348816, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 675 + }, + { + "completion_length": 97.5, + "epoch": 0.3008455718736093, + "grad_norm": 0.03086089715361595, + "kl": 0.06561337411403656, + "learning_rate": 4.410943430312663e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.5, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 676 + }, + { + "completion_length": 200.0, + "epoch": 0.30129060970182464, + "grad_norm": 0.03762960061430931, + "kl": 0.025554362684488297, + "learning_rate": 4.408436671340643e-06, + "loss": 0.001, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 677 + }, + { + "completion_length": 193.6666717529297, + "epoch": 0.30173564753004006, + "grad_norm": 0.7618845701217651, + "kl": 0.03975854814052582, + "learning_rate": 4.405925305401547e-06, + "loss": 0.0016, + "reward": 0.2084999978542328, + "reward_std": 0.10238896310329437, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2084999978542328, + "step": 678 + }, + { + "completion_length": 143.0, + "epoch": 0.30218068535825543, + "grad_norm": 1.127150535583496, + "kl": 0.08482104539871216, + "learning_rate": 4.4034093385578125e-06, + "loss": 0.0034, + "reward": 0.3336666524410248, + "reward_std": 0.21874704957008362, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.08366666734218597, + "step": 679 + }, + { + "completion_length": 144.5, + "epoch": 0.30262572318647085, + "grad_norm": 0.03559992089867592, + "kl": 0.05165047571063042, + "learning_rate": 4.400888776882985e-06, + "loss": 0.0021, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 680 + }, + { + "completion_length": 150.0, + "epoch": 0.3030707610146863, + "grad_norm": 0.8135696649551392, + "kl": 0.07061418145895004, + "learning_rate": 4.398363626461702e-06, + "loss": 0.0028, + "reward": 0.3543333411216736, + "reward_std": 0.20028147101402283, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.10433333367109299, + "step": 681 + }, + { + "completion_length": 200.0, + "epoch": 0.30351579884290164, + "grad_norm": 0.7969770431518555, + "kl": 0.03894883021712303, + "learning_rate": 4.395833893389676e-06, + "loss": 0.0016, + "reward": 0.12433333694934845, + "reward_std": 0.3078192174434662, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12433333694934845, + "step": 682 + }, + { + "completion_length": 200.0, + "epoch": 0.30396083667111706, + "grad_norm": 0.7363507747650146, + "kl": 0.03019530698657036, + "learning_rate": 4.393299583773688e-06, + "loss": 0.0012, + "reward": 0.125, + "reward_std": 0.11180339753627777, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 683 + }, + { + "completion_length": 195.5, + "epoch": 0.30440587449933243, + "grad_norm": 0.7232860326766968, + "kl": 0.03277706354856491, + "learning_rate": 4.390760703731559e-06, + "loss": 0.0013, + "reward": 0.1875, + "reward_std": 0.06846532225608826, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 684 + }, + { + "completion_length": 198.5, + "epoch": 0.30485091232754785, + "grad_norm": 0.7945019602775574, + "kl": 0.04717344790697098, + "learning_rate": 4.388217259392148e-06, + "loss": 0.0019, + "reward": 0.14266666769981384, + "reward_std": 0.3285280168056488, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.14266666769981384, + "step": 685 + }, + { + "completion_length": 186.0, + "epoch": 0.3052959501557632, + "grad_norm": 0.7385047078132629, + "kl": 0.04403085261583328, + "learning_rate": 4.38566925689533e-06, + "loss": 0.0018, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 686 + }, + { + "completion_length": 163.6666717529297, + "epoch": 0.30574098798397864, + "grad_norm": 0.856914758682251, + "kl": 0.05494067072868347, + "learning_rate": 4.383116702391988e-06, + "loss": 0.0022, + "reward": 0.35466668009757996, + "reward_std": 0.09423092007637024, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 687 + }, + { + "completion_length": 196.6666717529297, + "epoch": 0.30618602581219406, + "grad_norm": 0.8300358057022095, + "kl": 0.043124958872795105, + "learning_rate": 4.3805596020439845e-06, + "loss": 0.0017, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 688 + }, + { + "completion_length": 169.6666717529297, + "epoch": 0.30663106364040943, + "grad_norm": 0.749025285243988, + "kl": 0.05098932236433029, + "learning_rate": 4.3779979620241644e-06, + "loss": 0.002, + "reward": 0.20083332061767578, + "reward_std": 0.40986746549606323, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.03416666388511658, + "step": 689 + }, + { + "completion_length": 163.1666717529297, + "epoch": 0.30707610146862485, + "grad_norm": 0.9226408004760742, + "kl": 0.055235203355550766, + "learning_rate": 4.375431788516326e-06, + "loss": 0.0022, + "reward": 0.39633333683013916, + "reward_std": 0.09396524727344513, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22966668009757996, + "step": 690 + }, + { + "completion_length": 145.1666717529297, + "epoch": 0.3075211392968402, + "grad_norm": 0.8971224427223206, + "kl": 0.05605591833591461, + "learning_rate": 4.372861087715215e-06, + "loss": 0.0022, + "reward": 0.4586666524410248, + "reward_std": 0.06403334438800812, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12533333897590637, + "step": 691 + }, + { + "completion_length": 149.5, + "epoch": 0.30796617712505564, + "grad_norm": 0.8178679943084717, + "kl": 0.05255182832479477, + "learning_rate": 4.3702858658265044e-06, + "loss": 0.0021, + "reward": 0.3958333432674408, + "reward_std": 0.16614501178264618, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 692 + }, + { + "completion_length": 200.0, + "epoch": 0.308411214953271, + "grad_norm": 0.8197976350784302, + "kl": 0.03835117816925049, + "learning_rate": 4.367706129066781e-06, + "loss": 0.0015, + "reward": 0.1875, + "reward_std": 0.10458251088857651, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1875, + "step": 693 + }, + { + "completion_length": 172.83334350585938, + "epoch": 0.30885625278148643, + "grad_norm": 0.8643909692764282, + "kl": 0.04992937296628952, + "learning_rate": 4.36512188366353e-06, + "loss": 0.002, + "reward": 0.27116668224334717, + "reward_std": 0.09453975409269333, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27116668224334717, + "step": 694 + }, + { + "completion_length": 200.0, + "epoch": 0.30930129060970185, + "grad_norm": 0.7522907853126526, + "kl": 0.04939102381467819, + "learning_rate": 4.36253313585512e-06, + "loss": 0.002, + "reward": 0.02083333395421505, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.02083333395421505, + "step": 695 + }, + { + "completion_length": 197.6666717529297, + "epoch": 0.3097463284379172, + "grad_norm": 0.7550695538520813, + "kl": 0.02722327411174774, + "learning_rate": 4.359939891890793e-06, + "loss": 0.0011, + "reward": 0.2084999978542328, + "reward_std": 0.10238896310329437, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2084999978542328, + "step": 696 + }, + { + "completion_length": 200.0, + "epoch": 0.31019136626613264, + "grad_norm": 0.023237159475684166, + "kl": 0.013823822140693665, + "learning_rate": 4.357342158030638e-06, + "loss": 0.0006, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 697 + }, + { + "completion_length": 200.0, + "epoch": 0.310636404094348, + "grad_norm": 0.6470344066619873, + "kl": 0.028774775564670563, + "learning_rate": 4.354739940545587e-06, + "loss": 0.0012, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 698 + }, + { + "completion_length": 178.0, + "epoch": 0.31108144192256343, + "grad_norm": 0.8396903872489929, + "kl": 0.036307577043771744, + "learning_rate": 4.352133245717393e-06, + "loss": 0.0015, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 699 + }, + { + "completion_length": 194.0, + "epoch": 0.3115264797507788, + "grad_norm": 0.8275584578514099, + "kl": 0.03899259492754936, + "learning_rate": 4.349522079838622e-06, + "loss": 0.0016, + "reward": 0.125, + "reward_std": 0.07905694097280502, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 700 + }, + { + "completion_length": 185.6666717529297, + "epoch": 0.3119715175789942, + "grad_norm": 0.043758708983659744, + "kl": 0.049132104963064194, + "learning_rate": 4.346906449212627e-06, + "loss": 0.002, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 701 + }, + { + "completion_length": 196.83334350585938, + "epoch": 0.31241655540720964, + "grad_norm": 0.8897082209587097, + "kl": 0.03444843739271164, + "learning_rate": 4.344286360153541e-06, + "loss": 0.0014, + "reward": 0.1875, + "reward_std": 0.18957190215587616, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1041666716337204, + "step": 702 + }, + { + "completion_length": 177.0, + "epoch": 0.312861593235425, + "grad_norm": 0.8655028343200684, + "kl": 0.05428258329629898, + "learning_rate": 4.341661818986263e-06, + "loss": 0.0022, + "reward": 0.3966667056083679, + "reward_std": 0.05062280222773552, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 703 + }, + { + "completion_length": 200.0, + "epoch": 0.31330663106364043, + "grad_norm": 0.7451438903808594, + "kl": 0.03979836031794548, + "learning_rate": 4.339032832046434e-06, + "loss": 0.0016, + "reward": 0.187666654586792, + "reward_std": 0.13138745725154877, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.187666654586792, + "step": 704 + }, + { + "completion_length": 197.1666717529297, + "epoch": 0.3137516688918558, + "grad_norm": 0.7860096096992493, + "kl": 0.04280473291873932, + "learning_rate": 4.336399405680432e-06, + "loss": 0.0017, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 705 + }, + { + "completion_length": 200.0, + "epoch": 0.3141967067200712, + "grad_norm": 0.9517030715942383, + "kl": 0.07379502058029175, + "learning_rate": 4.333761546245348e-06, + "loss": 0.003, + "reward": 0.0416666679084301, + "reward_std": 0.10206207633018494, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0416666679084301, + "step": 706 + }, + { + "completion_length": 180.1666717529297, + "epoch": 0.3146417445482866, + "grad_norm": 0.8798065185546875, + "kl": 0.050519704818725586, + "learning_rate": 4.331119260108977e-06, + "loss": 0.002, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 707 + }, + { + "completion_length": 190.1666717529297, + "epoch": 0.315086782376502, + "grad_norm": 0.7868778705596924, + "kl": 0.07131218910217285, + "learning_rate": 4.328472553649799e-06, + "loss": 0.0029, + "reward": 0.125, + "reward_std": 0.20916502177715302, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0416666679084301, + "step": 708 + }, + { + "completion_length": 200.0, + "epoch": 0.31553182020471743, + "grad_norm": 0.029699545353651047, + "kl": 0.04753357172012329, + "learning_rate": 4.325821433256963e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 709 + }, + { + "completion_length": 188.83334350585938, + "epoch": 0.3159768580329328, + "grad_norm": 0.7864294648170471, + "kl": 0.04412949085235596, + "learning_rate": 4.323165905330277e-06, + "loss": 0.0018, + "reward": -0.10866667330265045, + "reward_std": 0.38256901502609253, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.10866667330265045, + "step": 710 + }, + { + "completion_length": 183.6666717529297, + "epoch": 0.3164218958611482, + "grad_norm": 0.8975881338119507, + "kl": 0.04495091363787651, + "learning_rate": 4.320505976280186e-06, + "loss": 0.0018, + "reward": 0.35466668009757996, + "reward_std": 0.09423092007637024, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 711 + }, + { + "completion_length": 184.0, + "epoch": 0.3168669336893636, + "grad_norm": 0.7394328117370605, + "kl": 0.08109882473945618, + "learning_rate": 4.3178416525277586e-06, + "loss": 0.0032, + "reward": 0.25, + "reward_std": 0.22360679507255554, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333358168602, + "step": 712 + }, + { + "completion_length": 197.5, + "epoch": 0.317311971517579, + "grad_norm": 0.7964750528335571, + "kl": 0.0377558134496212, + "learning_rate": 4.315172940504677e-06, + "loss": 0.0015, + "reward": 0.2293333262205124, + "reward_std": 0.09440692514181137, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2293333262205124, + "step": 713 + }, + { + "completion_length": 196.83334350585938, + "epoch": 0.3177570093457944, + "grad_norm": 1.8717976808547974, + "kl": 0.176216721534729, + "learning_rate": 4.312499846653211e-06, + "loss": 0.007, + "reward": 0.2293333262205124, + "reward_std": 0.09440692514181137, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2293333262205124, + "step": 714 + }, + { + "completion_length": 141.0, + "epoch": 0.3182020471740098, + "grad_norm": 1.7549928426742554, + "kl": 0.2996341586112976, + "learning_rate": 4.309822377426211e-06, + "loss": 0.012, + "reward": 0.10500000417232513, + "reward_std": 0.5367960333824158, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.25, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": -0.14500001072883606, + "step": 715 + }, + { + "completion_length": 195.1666717529297, + "epoch": 0.3186470850022252, + "grad_norm": 0.813319206237793, + "kl": 0.05743285268545151, + "learning_rate": 4.307140539287089e-06, + "loss": 0.0023, + "reward": 0.1458333432674408, + "reward_std": 0.2002602517604828, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0625, + "step": 716 + }, + { + "completion_length": 186.5, + "epoch": 0.3190921228304406, + "grad_norm": 0.8374386429786682, + "kl": 0.03491507098078728, + "learning_rate": 4.304454338709803e-06, + "loss": 0.0014, + "reward": 0.22966668009757996, + "reward_std": 0.1666717231273651, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22966668009757996, + "step": 717 + }, + { + "completion_length": 192.0, + "epoch": 0.319537160658656, + "grad_norm": 0.7661421298980713, + "kl": 0.04614044725894928, + "learning_rate": 4.3017637821788436e-06, + "loss": 0.0018, + "reward": 0.2916666865348816, + "reward_std": 0.10206206887960434, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 718 + }, + { + "completion_length": 198.0, + "epoch": 0.31998219848687137, + "grad_norm": 0.8050372004508972, + "kl": 0.018003419041633606, + "learning_rate": 4.2990688761892155e-06, + "loss": 0.0007, + "reward": 0.1666666716337204, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 719 + }, + { + "completion_length": 162.0, + "epoch": 0.3204272363150868, + "grad_norm": 0.758633017539978, + "kl": 0.06661681085824966, + "learning_rate": 4.296369627246422e-06, + "loss": 0.0027, + "reward": 0.33416664600372314, + "reward_std": 0.10247032344341278, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 720 + }, + { + "completion_length": 171.33334350585938, + "epoch": 0.32087227414330216, + "grad_norm": 0.8165731430053711, + "kl": 0.06688942015171051, + "learning_rate": 4.293666041866453e-06, + "loss": 0.0027, + "reward": 0.31283333897590637, + "reward_std": 0.20551829040050507, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.14616666734218597, + "step": 721 + }, + { + "completion_length": 194.33334350585938, + "epoch": 0.3213173119715176, + "grad_norm": 0.8424144387245178, + "kl": 0.04467558115720749, + "learning_rate": 4.290958126575764e-06, + "loss": 0.0018, + "reward": 0.2293333262205124, + "reward_std": 0.09440692514181137, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2293333262205124, + "step": 722 + }, + { + "completion_length": 171.83334350585938, + "epoch": 0.32176234979973295, + "grad_norm": 0.8838493824005127, + "kl": 0.05616327375173569, + "learning_rate": 4.2882458879112634e-06, + "loss": 0.0022, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 723 + }, + { + "completion_length": 199.0, + "epoch": 0.32220738762794837, + "grad_norm": 0.7267130017280579, + "kl": 0.04860411211848259, + "learning_rate": 4.285529332420298e-06, + "loss": 0.0019, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 724 + }, + { + "completion_length": 200.0, + "epoch": 0.3226524254561638, + "grad_norm": 0.8249167203903198, + "kl": 0.04377565532922745, + "learning_rate": 4.282808466660632e-06, + "loss": 0.0018, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 725 + }, + { + "completion_length": 190.1666717529297, + "epoch": 0.32309746328437916, + "grad_norm": 0.7198425531387329, + "kl": 0.04959031194448471, + "learning_rate": 4.280083297200439e-06, + "loss": 0.002, + "reward": 0.2711666524410248, + "reward_std": 0.09453975409269333, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 726 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.3235425011125946, + "grad_norm": 0.7675707340240479, + "kl": 0.04044210538268089, + "learning_rate": 4.277353830618279e-06, + "loss": 0.0016, + "reward": 0.187666654586792, + "reward_std": 0.10494124889373779, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.187666654586792, + "step": 727 + }, + { + "completion_length": 200.0, + "epoch": 0.32398753894080995, + "grad_norm": 0.7638272643089294, + "kl": 0.03953151777386665, + "learning_rate": 4.274620073503084e-06, + "loss": 0.0016, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 728 + }, + { + "completion_length": 152.6666717529297, + "epoch": 0.32443257676902537, + "grad_norm": 0.05756811052560806, + "kl": 0.06373923271894455, + "learning_rate": 4.2718820324541475e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.5, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 729 + }, + { + "completion_length": 162.0, + "epoch": 0.32487761459724074, + "grad_norm": 0.7738885283470154, + "kl": 0.06379255652427673, + "learning_rate": 4.2691397140811e-06, + "loss": 0.0026, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 730 + }, + { + "completion_length": 117.66667175292969, + "epoch": 0.32532265242545616, + "grad_norm": 0.9680776596069336, + "kl": 0.08181880414485931, + "learning_rate": 4.2663931250039005e-06, + "loss": 0.0033, + "reward": 0.3333333432674408, + "reward_std": 0.25819891691207886, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.3333333432674408, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0, + "step": 731 + }, + { + "completion_length": 170.83334350585938, + "epoch": 0.3257676902536716, + "grad_norm": 0.8730087280273438, + "kl": 0.07460620254278183, + "learning_rate": 4.2636422718528155e-06, + "loss": 0.003, + "reward": 0.18150000274181366, + "reward_std": 0.3583281934261322, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.18150000274181366, + "step": 732 + }, + { + "completion_length": 184.33334350585938, + "epoch": 0.32621272808188695, + "grad_norm": 0.8382135629653931, + "kl": 0.05216635391116142, + "learning_rate": 4.2608871612684074e-06, + "loss": 0.0021, + "reward": 0.2711666524410248, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 733 + }, + { + "completion_length": 171.1666717529297, + "epoch": 0.32665776591010237, + "grad_norm": 0.9829415082931519, + "kl": 0.050829045474529266, + "learning_rate": 4.258127799901512e-06, + "loss": 0.002, + "reward": 0.2711666524410248, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 734 + }, + { + "completion_length": 190.5, + "epoch": 0.32710280373831774, + "grad_norm": 0.7752060890197754, + "kl": 0.04926741123199463, + "learning_rate": 4.255364194413232e-06, + "loss": 0.002, + "reward": 0.2084999978542328, + "reward_std": 0.1021445021033287, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2084999978542328, + "step": 735 + }, + { + "completion_length": 200.0, + "epoch": 0.32754784156653316, + "grad_norm": 0.7153200507164001, + "kl": 0.040820520371198654, + "learning_rate": 4.25259635147491e-06, + "loss": 0.0016, + "reward": 0.2083333432674408, + "reward_std": 0.10206207633018494, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 736 + }, + { + "completion_length": 200.0, + "epoch": 0.3279928793947485, + "grad_norm": 0.023354843258857727, + "kl": 0.03863299638032913, + "learning_rate": 4.249824277768122e-06, + "loss": 0.0015, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 737 + }, + { + "completion_length": 172.1666717529297, + "epoch": 0.32843791722296395, + "grad_norm": 0.796233594417572, + "kl": 0.047746941447257996, + "learning_rate": 4.2470479799846545e-06, + "loss": 0.0019, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 738 + }, + { + "completion_length": 200.0, + "epoch": 0.32888295505117937, + "grad_norm": 0.024252623319625854, + "kl": 0.04170938581228256, + "learning_rate": 4.2442674648264914e-06, + "loss": 0.0017, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 739 + }, + { + "completion_length": 200.0, + "epoch": 0.32932799287939474, + "grad_norm": 0.8098192811012268, + "kl": 0.055160801857709885, + "learning_rate": 4.241482739005798e-06, + "loss": 0.0022, + "reward": 0.11166667193174362, + "reward_std": 0.3388460874557495, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.11166667193174362, + "step": 740 + }, + { + "completion_length": 177.33334350585938, + "epoch": 0.32977303070761016, + "grad_norm": 0.7267571091651917, + "kl": 0.037675365805625916, + "learning_rate": 4.238693809244904e-06, + "loss": 0.0015, + "reward": 0.1899999976158142, + "reward_std": 0.26663535833358765, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1066666692495346, + "step": 741 + }, + { + "completion_length": 200.0, + "epoch": 0.3302180685358255, + "grad_norm": 0.03363126143813133, + "kl": 0.04696328565478325, + "learning_rate": 4.235900682276287e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 742 + }, + { + "completion_length": 194.33334350585938, + "epoch": 0.33066310636404095, + "grad_norm": 0.6722932457923889, + "kl": 0.040012698620557785, + "learning_rate": 4.2331033648425565e-06, + "loss": 0.0016, + "reward": 0.12316666543483734, + "reward_std": 0.37579911947250366, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12316666543483734, + "step": 743 + }, + { + "completion_length": 200.0, + "epoch": 0.3311081441922563, + "grad_norm": 0.9077341556549072, + "kl": 0.04133099317550659, + "learning_rate": 4.230301863696439e-06, + "loss": 0.0017, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 744 + }, + { + "completion_length": 156.83334350585938, + "epoch": 0.33155318202047174, + "grad_norm": 1.1381067037582397, + "kl": 0.0650453120470047, + "learning_rate": 4.22749618560076e-06, + "loss": 0.0026, + "reward": 0.27133333683013916, + "reward_std": 0.09423092007637024, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 745 + }, + { + "completion_length": 200.0, + "epoch": 0.33199821984868716, + "grad_norm": 0.78874272108078, + "kl": 0.02345721237361431, + "learning_rate": 4.224686337328426e-06, + "loss": 0.0009, + "reward": 0.1458333432674408, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 746 + }, + { + "completion_length": 179.0, + "epoch": 0.3324432576769025, + "grad_norm": 0.836664617061615, + "kl": 0.04164649173617363, + "learning_rate": 4.221872325662414e-06, + "loss": 0.0017, + "reward": 0.25, + "reward_std": 0.15811388194561005, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1666666716337204, + "step": 747 + }, + { + "completion_length": 135.83334350585938, + "epoch": 0.33288829550511795, + "grad_norm": 0.9000113010406494, + "kl": 0.06278669834136963, + "learning_rate": 4.219054157395749e-06, + "loss": 0.0025, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 748 + }, + { + "completion_length": 191.83334350585938, + "epoch": 0.3333333333333333, + "grad_norm": 0.8174548745155334, + "kl": 0.0394861064851284, + "learning_rate": 4.21623183933149e-06, + "loss": 0.0016, + "reward": 0.2293333262205124, + "reward_std": 0.09440692514181137, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2293333262205124, + "step": 749 + }, + { + "completion_length": 166.0, + "epoch": 0.33377837116154874, + "grad_norm": 0.7031174302101135, + "kl": 0.03650489076972008, + "learning_rate": 4.213405378282714e-06, + "loss": 0.0015, + "reward": 0.25033333897590637, + "reward_std": 0.13729628920555115, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25033333897590637, + "step": 750 + }, + { + "completion_length": 192.83334350585938, + "epoch": 0.3342234089897641, + "grad_norm": 0.7927865386009216, + "kl": 0.0499715581536293, + "learning_rate": 4.210574781072501e-06, + "loss": 0.002, + "reward": 0.2084999978542328, + "reward_std": 0.12935802340507507, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2084999978542328, + "step": 751 + }, + { + "completion_length": 162.33334350585938, + "epoch": 0.3346684468179795, + "grad_norm": 0.7981622219085693, + "kl": 0.04853060469031334, + "learning_rate": 4.207740054533913e-06, + "loss": 0.0019, + "reward": 0.22949998080730438, + "reward_std": 0.14653019607067108, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22949998080730438, + "step": 752 + }, + { + "completion_length": 200.0, + "epoch": 0.33511348464619495, + "grad_norm": 0.022546028718352318, + "kl": 0.01208839938044548, + "learning_rate": 4.204901205509981e-06, + "loss": 0.0005, + "reward": 0.125, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.125, + "step": 753 + }, + { + "completion_length": 200.0, + "epoch": 0.3355585224744103, + "grad_norm": 0.7191113829612732, + "kl": 0.031925659626722336, + "learning_rate": 4.202058240853689e-06, + "loss": 0.0013, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 754 + }, + { + "completion_length": 172.33334350585938, + "epoch": 0.33600356030262574, + "grad_norm": 0.8460274338722229, + "kl": 0.047413043677806854, + "learning_rate": 4.199211167427955e-06, + "loss": 0.0019, + "reward": 0.2121666520833969, + "reward_std": 0.18116556107997894, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2121666669845581, + "step": 755 + }, + { + "completion_length": 114.66667175292969, + "epoch": 0.3364485981308411, + "grad_norm": 1.0755983591079712, + "kl": 0.054288625717163086, + "learning_rate": 4.196359992105614e-06, + "loss": 0.0022, + "reward": 0.1456666737794876, + "reward_std": 0.26882386207580566, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1456666737794876, + "step": 756 + }, + { + "completion_length": 192.33334350585938, + "epoch": 0.3368936359590565, + "grad_norm": 0.7851623296737671, + "kl": 0.03890561684966087, + "learning_rate": 4.193504721769406e-06, + "loss": 0.0016, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 757 + }, + { + "completion_length": 173.83334350585938, + "epoch": 0.3373386737872719, + "grad_norm": 1.2191767692565918, + "kl": 0.03187034651637077, + "learning_rate": 4.190645363311955e-06, + "loss": 0.0013, + "reward": 0.20533333718776703, + "reward_std": 0.06943102180957794, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.20533333718776703, + "step": 758 + }, + { + "completion_length": 164.0, + "epoch": 0.3377837116154873, + "grad_norm": 0.6636083126068115, + "kl": 0.13397148251533508, + "learning_rate": 4.187781923635753e-06, + "loss": 0.0054, + "reward": 0.2254999876022339, + "reward_std": 0.2528444528579712, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22550001740455627, + "step": 759 + }, + { + "completion_length": 127.66667175292969, + "epoch": 0.33822874944370274, + "grad_norm": 0.030917134135961533, + "kl": 0.06076059490442276, + "learning_rate": 4.184914409653147e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 760 + }, + { + "completion_length": 194.0, + "epoch": 0.3386737872719181, + "grad_norm": 0.723721444606781, + "kl": 0.043149642646312714, + "learning_rate": 4.182042828286313e-06, + "loss": 0.0017, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 761 + }, + { + "completion_length": 193.33334350585938, + "epoch": 0.3391188251001335, + "grad_norm": 0.7052424550056458, + "kl": 0.039253827184438705, + "learning_rate": 4.179167186467255e-06, + "loss": 0.0016, + "reward": 0.250333309173584, + "reward_std": 0.1122509092092514, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250333309173584, + "step": 762 + }, + { + "completion_length": 116.83333587646484, + "epoch": 0.3395638629283489, + "grad_norm": 1.4630470275878906, + "kl": 0.026879621669650078, + "learning_rate": 4.17628749113777e-06, + "loss": 0.0011, + "reward": 0.24266664683818817, + "reward_std": 0.15977442264556885, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.24266664683818817, + "step": 763 + }, + { + "completion_length": 124.0, + "epoch": 0.3400089007565643, + "grad_norm": 0.04300786554813385, + "kl": 0.05142722651362419, + "learning_rate": 4.173403749249444e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 764 + }, + { + "completion_length": 171.6666717529297, + "epoch": 0.3404539385847797, + "grad_norm": 1.0912110805511475, + "kl": 0.04297208786010742, + "learning_rate": 4.170515967763633e-06, + "loss": 0.0017, + "reward": 0.2711666524410248, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 765 + }, + { + "completion_length": 163.33334350585938, + "epoch": 0.3408989764129951, + "grad_norm": 0.9113920331001282, + "kl": 0.1428011953830719, + "learning_rate": 4.167624153651444e-06, + "loss": 0.0057, + "reward": 0.18649999797344208, + "reward_std": 0.34627026319503784, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.18649999797344208, + "step": 766 + }, + { + "completion_length": 197.5, + "epoch": 0.3413440142412105, + "grad_norm": 0.7323278188705444, + "kl": 0.046782054007053375, + "learning_rate": 4.1647283138937144e-06, + "loss": 0.0019, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 767 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.3417890520694259, + "grad_norm": 0.7725353837013245, + "kl": 0.027050405740737915, + "learning_rate": 4.1618284554810056e-06, + "loss": 0.0011, + "reward": 0.22949999570846558, + "reward_std": 0.12337382137775421, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22949999570846558, + "step": 768 + }, + { + "completion_length": 137.0, + "epoch": 0.3422340898976413, + "grad_norm": 0.03009336069226265, + "kl": 0.057053741067647934, + "learning_rate": 4.158924585413576e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 769 + }, + { + "completion_length": 197.5, + "epoch": 0.3426791277258567, + "grad_norm": 0.601902425289154, + "kl": 0.027302034199237823, + "learning_rate": 4.156016710701369e-06, + "loss": 0.0011, + "reward": 0.2293333262205124, + "reward_std": 0.09440692514181137, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2293333262205124, + "step": 770 + }, + { + "completion_length": 200.0, + "epoch": 0.3431241655540721, + "grad_norm": 0.013710107654333115, + "kl": 0.03290403261780739, + "learning_rate": 4.153104838363997e-06, + "loss": 0.0013, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 771 + }, + { + "completion_length": 197.83334350585938, + "epoch": 0.34356920338228747, + "grad_norm": 0.6330693960189819, + "kl": 0.03465873375535011, + "learning_rate": 4.15018897543072e-06, + "loss": 0.0014, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 772 + }, + { + "completion_length": 167.6666717529297, + "epoch": 0.3440142412105029, + "grad_norm": 0.7272565364837646, + "kl": 0.04871327802538872, + "learning_rate": 4.1472691289404335e-06, + "loss": 0.0019, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 773 + }, + { + "completion_length": 200.0, + "epoch": 0.3444592790387183, + "grad_norm": 0.025043027475476265, + "kl": 0.03786566108465195, + "learning_rate": 4.144345305941648e-06, + "loss": 0.0015, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 774 + }, + { + "completion_length": 200.0, + "epoch": 0.3449043168669337, + "grad_norm": 0.03370220214128494, + "kl": 0.0471016988158226, + "learning_rate": 4.141417513492473e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 775 + }, + { + "completion_length": 200.0, + "epoch": 0.3453493546951491, + "grad_norm": 0.7220287919044495, + "kl": 0.035600695759058, + "learning_rate": 4.138485758660602e-06, + "loss": 0.0014, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 776 + }, + { + "completion_length": 120.66667175292969, + "epoch": 0.34579439252336447, + "grad_norm": 1.2347756624221802, + "kl": 0.08316612988710403, + "learning_rate": 4.135550048523293e-06, + "loss": 0.0033, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 777 + }, + { + "completion_length": 170.6666717529297, + "epoch": 0.3462394303515799, + "grad_norm": 0.7800046801567078, + "kl": 0.053269438445568085, + "learning_rate": 4.132610390167349e-06, + "loss": 0.0021, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 778 + }, + { + "completion_length": 157.1666717529297, + "epoch": 0.34668446817979526, + "grad_norm": 0.7412069439888, + "kl": 0.06059323251247406, + "learning_rate": 4.12966679068911e-06, + "loss": 0.0024, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 779 + }, + { + "completion_length": 200.0, + "epoch": 0.3471295060080107, + "grad_norm": 0.7685080766677856, + "kl": 0.056760650128126144, + "learning_rate": 4.126719257194425e-06, + "loss": 0.0023, + "reward": 0.12250000238418579, + "reward_std": 0.31230995059013367, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12250000238418579, + "step": 780 + }, + { + "completion_length": 200.0, + "epoch": 0.3475745438362261, + "grad_norm": 0.018277641385793686, + "kl": 0.03469054400920868, + "learning_rate": 4.123767796798641e-06, + "loss": 0.0014, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 781 + }, + { + "completion_length": 133.5, + "epoch": 0.34801958166444147, + "grad_norm": 0.8028903007507324, + "kl": 0.06514596194028854, + "learning_rate": 4.120812416626586e-06, + "loss": 0.0026, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 782 + }, + { + "completion_length": 146.83334350585938, + "epoch": 0.3484646194926569, + "grad_norm": 1.0154013633728027, + "kl": 0.0660717785358429, + "learning_rate": 4.117853123812549e-06, + "loss": 0.0026, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 783 + }, + { + "completion_length": 161.6666717529297, + "epoch": 0.34890965732087226, + "grad_norm": 0.76609206199646, + "kl": 0.05136913061141968, + "learning_rate": 4.1148899255002636e-06, + "loss": 0.0021, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 784 + }, + { + "completion_length": 142.5, + "epoch": 0.3493546951490877, + "grad_norm": 0.03385661169886589, + "kl": 0.05575406178832054, + "learning_rate": 4.111922828842892e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 785 + }, + { + "completion_length": 187.0, + "epoch": 0.34979973297730305, + "grad_norm": 0.7177822589874268, + "kl": 0.030316343531012535, + "learning_rate": 4.108951841003009e-06, + "loss": 0.0012, + "reward": 0.25050002336502075, + "reward_std": 0.13747835159301758, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25050002336502075, + "step": 786 + }, + { + "completion_length": 178.5, + "epoch": 0.35024477080551847, + "grad_norm": 0.7809700965881348, + "kl": 0.039071641862392426, + "learning_rate": 4.105976969152578e-06, + "loss": 0.0016, + "reward": 0.31333333253860474, + "reward_std": 0.1049412414431572, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 787 + }, + { + "completion_length": 177.1666717529297, + "epoch": 0.3506898086337339, + "grad_norm": 0.7270780205726624, + "kl": 0.042834021151065826, + "learning_rate": 4.102998220472943e-06, + "loss": 0.0017, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 788 + }, + { + "completion_length": 200.0, + "epoch": 0.35113484646194926, + "grad_norm": 0.6192082762718201, + "kl": 0.04256781190633774, + "learning_rate": 4.100015602154802e-06, + "loss": 0.0017, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 789 + }, + { + "completion_length": 200.0, + "epoch": 0.3515798842901647, + "grad_norm": 0.014344203285872936, + "kl": 0.03416243940591812, + "learning_rate": 4.0970291213982e-06, + "loss": 0.0014, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 790 + }, + { + "completion_length": 200.0, + "epoch": 0.35202492211838005, + "grad_norm": 0.8113481998443604, + "kl": 0.05264754593372345, + "learning_rate": 4.094038785412504e-06, + "loss": 0.0021, + "reward": 0.11516666412353516, + "reward_std": 0.3302728831768036, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.11516666412353516, + "step": 791 + }, + { + "completion_length": 138.0, + "epoch": 0.35246995994659547, + "grad_norm": 1.720563292503357, + "kl": 0.03622860834002495, + "learning_rate": 4.091044601416383e-06, + "loss": 0.0014, + "reward": 0.27133333683013916, + "reward_std": 0.1465587466955185, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 792 + }, + { + "completion_length": 177.5, + "epoch": 0.35291499777481083, + "grad_norm": 0.7472648024559021, + "kl": 0.04746413230895996, + "learning_rate": 4.0880465766378015e-06, + "loss": 0.0019, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 793 + }, + { + "completion_length": 129.83334350585938, + "epoch": 0.35336003560302626, + "grad_norm": 1.1209585666656494, + "kl": 0.0549950897693634, + "learning_rate": 4.085044718313991e-06, + "loss": 0.0022, + "reward": 0.31349998712539673, + "reward_std": 0.10458250343799591, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31349998712539673, + "step": 794 + }, + { + "completion_length": 193.1666717529297, + "epoch": 0.3538050734312417, + "grad_norm": 0.6405737400054932, + "kl": 0.04125973582267761, + "learning_rate": 4.08203903369144e-06, + "loss": 0.0017, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 795 + }, + { + "completion_length": 185.5, + "epoch": 0.35425011125945705, + "grad_norm": 0.8062145709991455, + "kl": 0.04092112183570862, + "learning_rate": 4.079029530025873e-06, + "loss": 0.0016, + "reward": 0.2084999978542328, + "reward_std": 0.10238896310329437, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2084999978542328, + "step": 796 + }, + { + "completion_length": 142.83334350585938, + "epoch": 0.35469514908767247, + "grad_norm": 0.0398949459195137, + "kl": 0.052285999059677124, + "learning_rate": 4.076016214582232e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 797 + }, + { + "completion_length": 154.0, + "epoch": 0.35514018691588783, + "grad_norm": 1.113144874572754, + "kl": 0.07075363397598267, + "learning_rate": 4.072999094634663e-06, + "loss": 0.0028, + "reward": 0.29233333468437195, + "reward_std": 0.10222654044628143, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 798 + }, + { + "completion_length": 200.0, + "epoch": 0.35558522474410326, + "grad_norm": 0.016199104487895966, + "kl": 0.03834648057818413, + "learning_rate": 4.069978177466495e-06, + "loss": 0.0015, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 799 + }, + { + "completion_length": 200.0, + "epoch": 0.3560302625723186, + "grad_norm": 0.9117387533187866, + "kl": 0.18886041641235352, + "learning_rate": 4.066953470370223e-06, + "loss": 0.0076, + "reward": 0.016166668385267258, + "reward_std": 0.2665861248970032, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.016166668385267258, + "step": 800 + }, + { + "completion_length": 178.33334350585938, + "epoch": 0.35647530040053405, + "grad_norm": 0.6704380512237549, + "kl": 0.057386137545108795, + "learning_rate": 4.063924980647492e-06, + "loss": 0.0023, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 801 + }, + { + "completion_length": 168.33334350585938, + "epoch": 0.35692033822874947, + "grad_norm": 0.767604649066925, + "kl": 0.0486370325088501, + "learning_rate": 4.060892715609078e-06, + "loss": 0.0019, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 802 + }, + { + "completion_length": 141.0, + "epoch": 0.35736537605696483, + "grad_norm": 1.3145936727523804, + "kl": 0.04570293426513672, + "learning_rate": 4.0578566825748685e-06, + "loss": 0.0018, + "reward": 0.29233333468437195, + "reward_std": 0.15182314813137054, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 803 + }, + { + "completion_length": 126.66667175292969, + "epoch": 0.35781041388518026, + "grad_norm": 0.6498439311981201, + "kl": 0.06144469231367111, + "learning_rate": 4.054816888873852e-06, + "loss": 0.0025, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 804 + }, + { + "completion_length": 185.6666717529297, + "epoch": 0.3582554517133956, + "grad_norm": 0.9341973662376404, + "kl": 0.05542676895856857, + "learning_rate": 4.051773341844088e-06, + "loss": 0.0022, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 805 + }, + { + "completion_length": 153.33334350585938, + "epoch": 0.35870048954161105, + "grad_norm": 0.6858371496200562, + "kl": 0.06017071008682251, + "learning_rate": 4.048726048832704e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 806 + }, + { + "completion_length": 182.6666717529297, + "epoch": 0.3591455273698264, + "grad_norm": 0.5970679521560669, + "kl": 0.03451145440340042, + "learning_rate": 4.045675017195866e-06, + "loss": 0.0014, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 807 + }, + { + "completion_length": 162.5, + "epoch": 0.35959056519804183, + "grad_norm": 0.6962899565696716, + "kl": 0.05475949868559837, + "learning_rate": 4.042620254298765e-06, + "loss": 0.0022, + "reward": 0.29233333468437195, + "reward_std": 0.15182313323020935, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 808 + }, + { + "completion_length": 183.83334350585938, + "epoch": 0.36003560302625726, + "grad_norm": 0.7941375970840454, + "kl": 0.045727722346782684, + "learning_rate": 4.039561767515599e-06, + "loss": 0.0018, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 809 + }, + { + "completion_length": 174.1666717529297, + "epoch": 0.3604806408544726, + "grad_norm": 0.6645864248275757, + "kl": 0.052175264805555344, + "learning_rate": 4.036499564229559e-06, + "loss": 0.0021, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 810 + }, + { + "completion_length": 192.0, + "epoch": 0.36092567868268804, + "grad_norm": 0.7362174391746521, + "kl": 0.05001889169216156, + "learning_rate": 4.033433651832806e-06, + "loss": 0.002, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 811 + }, + { + "completion_length": 183.33334350585938, + "epoch": 0.3613707165109034, + "grad_norm": 0.700889527797699, + "kl": 0.04670108109712601, + "learning_rate": 4.0303640377264505e-06, + "loss": 0.0019, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 812 + }, + { + "completion_length": 121.66667175292969, + "epoch": 0.36181575433911883, + "grad_norm": 0.11063014715909958, + "kl": 0.08557015657424927, + "learning_rate": 4.027290729320545e-06, + "loss": 0.0037, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 813 + }, + { + "completion_length": 75.83333587646484, + "epoch": 0.3622607921673342, + "grad_norm": 1.2718409299850464, + "kl": 0.08148349821567535, + "learning_rate": 4.024213734034057e-06, + "loss": 0.0033, + "reward": 0.39666664600372314, + "reward_std": 0.05062279850244522, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 814 + }, + { + "completion_length": 186.5, + "epoch": 0.3627058299955496, + "grad_norm": 0.7700260281562805, + "kl": 0.04646776616573334, + "learning_rate": 4.021133059294855e-06, + "loss": 0.0019, + "reward": 0.22949999570846558, + "reward_std": 0.12337382137775421, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22949998080730438, + "step": 815 + }, + { + "completion_length": 147.5, + "epoch": 0.36315086782376504, + "grad_norm": 1.176891803741455, + "kl": 0.05650541931390762, + "learning_rate": 4.018048712539689e-06, + "loss": 0.0023, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 816 + }, + { + "completion_length": 180.0, + "epoch": 0.3635959056519804, + "grad_norm": 0.6683288216590881, + "kl": 0.06407992541790009, + "learning_rate": 4.014960701214173e-06, + "loss": 0.0026, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 817 + }, + { + "completion_length": 105.0, + "epoch": 0.36404094348019583, + "grad_norm": 0.9382845163345337, + "kl": 0.06550440192222595, + "learning_rate": 4.011869032772769e-06, + "loss": 0.0026, + "reward": 0.31333333253860474, + "reward_std": 0.15350136160850525, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 818 + }, + { + "completion_length": 165.6666717529297, + "epoch": 0.3644859813084112, + "grad_norm": 0.7827046513557434, + "kl": 0.05532263219356537, + "learning_rate": 4.008773714678766e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 819 + }, + { + "completion_length": 145.6666717529297, + "epoch": 0.3649310191366266, + "grad_norm": 0.026940084993839264, + "kl": 0.0543852262198925, + "learning_rate": 4.005674754404263e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 820 + }, + { + "completion_length": 138.6666717529297, + "epoch": 0.365376056964842, + "grad_norm": 0.035869237035512924, + "kl": 0.06597190350294113, + "learning_rate": 4.002572159430151e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 821 + }, + { + "completion_length": 163.83334350585938, + "epoch": 0.3658210947930574, + "grad_norm": 0.910876989364624, + "kl": 0.06616390496492386, + "learning_rate": 3.999465937246096e-06, + "loss": 0.0026, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 822 + }, + { + "completion_length": 133.1666717529297, + "epoch": 0.36626613262127283, + "grad_norm": 0.8520764708518982, + "kl": 0.08879172801971436, + "learning_rate": 3.996356095350522e-06, + "loss": 0.0036, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 823 + }, + { + "completion_length": 170.0, + "epoch": 0.3667111704494882, + "grad_norm": 0.8743280172348022, + "kl": 0.061830103397369385, + "learning_rate": 3.993242641250586e-06, + "loss": 0.0025, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 824 + }, + { + "completion_length": 171.1666717529297, + "epoch": 0.3671562082777036, + "grad_norm": 0.7217028737068176, + "kl": 0.056446269154548645, + "learning_rate": 3.990125582462171e-06, + "loss": 0.0023, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 825 + }, + { + "completion_length": 179.1666717529297, + "epoch": 0.367601246105919, + "grad_norm": 0.6563734412193298, + "kl": 0.060024701058864594, + "learning_rate": 3.987004926509854e-06, + "loss": 0.0024, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 826 + }, + { + "completion_length": 188.0, + "epoch": 0.3680462839341344, + "grad_norm": 0.6999355554580688, + "kl": 0.045960139483213425, + "learning_rate": 3.983880680926904e-06, + "loss": 0.0018, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 827 + }, + { + "completion_length": 98.16667175292969, + "epoch": 0.3684913217623498, + "grad_norm": 0.04137030988931656, + "kl": 0.08726416528224945, + "learning_rate": 3.98075285325525e-06, + "loss": 0.0038, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 828 + }, + { + "completion_length": 160.1666717529297, + "epoch": 0.3689363595905652, + "grad_norm": 0.7065590620040894, + "kl": 0.061148129403591156, + "learning_rate": 3.977621451045469e-06, + "loss": 0.0024, + "reward": 0.33416664600372314, + "reward_std": 0.10247032344341278, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 829 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.3693813974187806, + "grad_norm": 0.7527098059654236, + "kl": 0.055032290518283844, + "learning_rate": 3.974486481856769e-06, + "loss": 0.0022, + "reward": 0.2711666524410248, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 830 + }, + { + "completion_length": 157.6666717529297, + "epoch": 0.369826435246996, + "grad_norm": 0.8093323111534119, + "kl": 0.06528506428003311, + "learning_rate": 3.971347953256965e-06, + "loss": 0.0026, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 831 + }, + { + "completion_length": 136.5, + "epoch": 0.3702714730752114, + "grad_norm": 0.04360884055495262, + "kl": 0.0801323652267456, + "learning_rate": 3.968205872822468e-06, + "loss": 0.0035, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 832 + }, + { + "completion_length": 173.83334350585938, + "epoch": 0.3707165109034268, + "grad_norm": 0.7571378350257874, + "kl": 0.05690945312380791, + "learning_rate": 3.965060248138263e-06, + "loss": 0.0023, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 833 + }, + { + "completion_length": 87.66667175292969, + "epoch": 0.3711615487316422, + "grad_norm": 0.06486710906028748, + "kl": 0.10215651988983154, + "learning_rate": 3.961911086797886e-06, + "loss": 0.0044, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 834 + }, + { + "completion_length": 169.83334350585938, + "epoch": 0.37160658655985757, + "grad_norm": 0.7843642830848694, + "kl": 0.05757517367601395, + "learning_rate": 3.958758396403418e-06, + "loss": 0.0023, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 835 + }, + { + "completion_length": 149.6666717529297, + "epoch": 0.372051624388073, + "grad_norm": 0.6219406127929688, + "kl": 0.11813263595104218, + "learning_rate": 3.955602184565452e-06, + "loss": 0.0047, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 836 + }, + { + "completion_length": 154.6666717529297, + "epoch": 0.3724966622162884, + "grad_norm": 0.699906051158905, + "kl": 0.06392105668783188, + "learning_rate": 3.952442458903087e-06, + "loss": 0.0026, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 837 + }, + { + "completion_length": 200.0, + "epoch": 0.3729417000445038, + "grad_norm": 0.036947038024663925, + "kl": 0.05290337651968002, + "learning_rate": 3.9492792270439015e-06, + "loss": 0.0021, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 838 + }, + { + "completion_length": 99.33333587646484, + "epoch": 0.3733867378727192, + "grad_norm": 1.119400978088379, + "kl": 0.06555521488189697, + "learning_rate": 3.946112496623939e-06, + "loss": 0.0026, + "reward": 0.3343333601951599, + "reward_std": 0.10206206887960434, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33433330059051514, + "step": 839 + }, + { + "completion_length": 153.6666717529297, + "epoch": 0.37383177570093457, + "grad_norm": 0.030132388696074486, + "kl": 0.057201653718948364, + "learning_rate": 3.942942275287688e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 840 + }, + { + "completion_length": 123.16667175292969, + "epoch": 0.37427681352915, + "grad_norm": 0.027864990755915642, + "kl": 0.06204737722873688, + "learning_rate": 3.939768570688064e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 841 + }, + { + "completion_length": 161.0, + "epoch": 0.37472185135736535, + "grad_norm": 0.6778984069824219, + "kl": 0.06228325888514519, + "learning_rate": 3.936591390486393e-06, + "loss": 0.0025, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 842 + }, + { + "completion_length": 165.83334350585938, + "epoch": 0.3751668891855808, + "grad_norm": 0.03816313296556473, + "kl": 0.06406964361667633, + "learning_rate": 3.933410742352388e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 843 + }, + { + "completion_length": 161.6666717529297, + "epoch": 0.3756119270137962, + "grad_norm": 0.8484237790107727, + "kl": 0.09677344560623169, + "learning_rate": 3.930226633964137e-06, + "loss": 0.0039, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 844 + }, + { + "completion_length": 135.5, + "epoch": 0.37605696484201157, + "grad_norm": 0.03836163505911827, + "kl": 0.06688190996646881, + "learning_rate": 3.927039073008077e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 845 + }, + { + "completion_length": 200.0, + "epoch": 0.376502002670227, + "grad_norm": 0.03299454599618912, + "kl": 0.052207015454769135, + "learning_rate": 3.9238480671789836e-06, + "loss": 0.0021, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 846 + }, + { + "completion_length": 189.33334350585938, + "epoch": 0.37694704049844235, + "grad_norm": 0.8058741092681885, + "kl": 0.052587032318115234, + "learning_rate": 3.920653624179945e-06, + "loss": 0.0021, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 847 + }, + { + "completion_length": 156.0, + "epoch": 0.3773920783266578, + "grad_norm": 0.9629629254341125, + "kl": 0.18556594848632812, + "learning_rate": 3.917455751722349e-06, + "loss": 0.0074, + "reward": 0.28033334016799927, + "reward_std": 0.23433451354503632, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.28033334016799927, + "step": 848 + }, + { + "completion_length": 132.33334350585938, + "epoch": 0.37783711615487314, + "grad_norm": 0.05315527319908142, + "kl": 0.07583107054233551, + "learning_rate": 3.914254457525862e-06, + "loss": 0.0033, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 849 + }, + { + "completion_length": 187.1666717529297, + "epoch": 0.37828215398308856, + "grad_norm": 0.7499144673347473, + "kl": 0.051162999123334885, + "learning_rate": 3.9110497493184084e-06, + "loss": 0.002, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 850 + }, + { + "completion_length": 153.33334350585938, + "epoch": 0.378727191811304, + "grad_norm": 0.6836094856262207, + "kl": 0.06891427934169769, + "learning_rate": 3.9078416348361555e-06, + "loss": 0.0028, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 851 + }, + { + "completion_length": 180.1666717529297, + "epoch": 0.37917222963951935, + "grad_norm": 0.8249133229255676, + "kl": 0.04966040700674057, + "learning_rate": 3.904630121823495e-06, + "loss": 0.002, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 852 + }, + { + "completion_length": 91.0, + "epoch": 0.3796172674677348, + "grad_norm": 1.4175060987472534, + "kl": 0.15148359537124634, + "learning_rate": 3.901415218033019e-06, + "loss": 0.0061, + "reward": 0.36516666412353516, + "reward_std": 0.026536142453551292, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.36516663432121277, + "step": 853 + }, + { + "completion_length": 146.5, + "epoch": 0.38006230529595014, + "grad_norm": 0.850635290145874, + "kl": 0.06043955311179161, + "learning_rate": 3.8981969312255075e-06, + "loss": 0.0024, + "reward": 0.31333333253860474, + "reward_std": 0.10494124889373779, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 854 + }, + { + "completion_length": 134.6666717529297, + "epoch": 0.38050734312416556, + "grad_norm": 0.028085196390748024, + "kl": 0.061860792338848114, + "learning_rate": 3.894975269169906e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 855 + }, + { + "completion_length": 188.0, + "epoch": 0.38095238095238093, + "grad_norm": 1.085335612297058, + "kl": 0.16889886558055878, + "learning_rate": 3.891750239643309e-06, + "loss": 0.0068, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 856 + }, + { + "completion_length": 137.5, + "epoch": 0.38139741878059635, + "grad_norm": 0.8269402384757996, + "kl": 0.06682190299034119, + "learning_rate": 3.888521850430939e-06, + "loss": 0.0027, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 857 + }, + { + "completion_length": 200.0, + "epoch": 0.3818424566088118, + "grad_norm": 0.026867447420954704, + "kl": 0.04740360379219055, + "learning_rate": 3.885290109326131e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 858 + }, + { + "completion_length": 200.0, + "epoch": 0.38228749443702714, + "grad_norm": 0.023793501779437065, + "kl": 0.046479731798172, + "learning_rate": 3.882055024130307e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 859 + }, + { + "completion_length": 189.83334350585938, + "epoch": 0.38273253226524256, + "grad_norm": 0.6452245712280273, + "kl": 0.050767965614795685, + "learning_rate": 3.878816602652965e-06, + "loss": 0.002, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 860 + }, + { + "completion_length": 126.5, + "epoch": 0.38317757009345793, + "grad_norm": 0.02156522497534752, + "kl": 0.062334608286619186, + "learning_rate": 3.875574852711656e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 861 + }, + { + "completion_length": 184.33334350585938, + "epoch": 0.38362260792167335, + "grad_norm": 1.0488053560256958, + "kl": 0.050179775804281235, + "learning_rate": 3.872329782131967e-06, + "loss": 0.002, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 862 + }, + { + "completion_length": 177.0, + "epoch": 0.3840676457498887, + "grad_norm": 0.7218441367149353, + "kl": 0.05524627864360809, + "learning_rate": 3.869081398747499e-06, + "loss": 0.0022, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 863 + }, + { + "completion_length": 169.5, + "epoch": 0.38451268357810414, + "grad_norm": 0.6805753707885742, + "kl": 0.057348743081092834, + "learning_rate": 3.865829710399852e-06, + "loss": 0.0023, + "reward": 0.3341667056083679, + "reward_std": 0.0648086816072464, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 864 + }, + { + "completion_length": 190.1666717529297, + "epoch": 0.38495772140631956, + "grad_norm": 0.7564582824707031, + "kl": 0.06877424567937851, + "learning_rate": 3.862574724938602e-06, + "loss": 0.0028, + "reward": 0.0833333358168602, + "reward_std": 0.3685337007045746, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.0833333283662796, + "step": 865 + }, + { + "completion_length": 130.6666717529297, + "epoch": 0.38540275923453493, + "grad_norm": 0.028254088014364243, + "kl": 0.05616528540849686, + "learning_rate": 3.859316450221286e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 866 + }, + { + "completion_length": 155.33334350585938, + "epoch": 0.38584779706275035, + "grad_norm": 0.6590713858604431, + "kl": 0.05754072964191437, + "learning_rate": 3.856054894113382e-06, + "loss": 0.0023, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 867 + }, + { + "completion_length": 155.5, + "epoch": 0.3862928348909657, + "grad_norm": 0.11690230667591095, + "kl": 0.07130900025367737, + "learning_rate": 3.852790064488286e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 868 + }, + { + "completion_length": 193.6666717529297, + "epoch": 0.38673787271918114, + "grad_norm": 0.7855741381645203, + "kl": 0.0497148297727108, + "learning_rate": 3.8495219692273e-06, + "loss": 0.002, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 869 + }, + { + "completion_length": 149.1666717529297, + "epoch": 0.3871829105473965, + "grad_norm": 0.749443769454956, + "kl": 0.044368281960487366, + "learning_rate": 3.846250616219607e-06, + "loss": 0.0018, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 870 + }, + { + "completion_length": 147.5, + "epoch": 0.38762794837561193, + "grad_norm": 0.779934823513031, + "kl": 0.09931713342666626, + "learning_rate": 3.842976013362255e-06, + "loss": 0.004, + "reward": 0.2381666600704193, + "reward_std": 0.3376213312149048, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2381666600704193, + "step": 871 + }, + { + "completion_length": 162.83334350585938, + "epoch": 0.38807298620382735, + "grad_norm": 0.0258543211966753, + "kl": 0.05789042264223099, + "learning_rate": 3.839698168560137e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 872 + }, + { + "completion_length": 178.6666717529297, + "epoch": 0.3885180240320427, + "grad_norm": 0.7851041555404663, + "kl": 0.0560050830245018, + "learning_rate": 3.8364170897259715e-06, + "loss": 0.0022, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 873 + }, + { + "completion_length": 160.6666717529297, + "epoch": 0.38896306186025814, + "grad_norm": 0.7662412524223328, + "kl": 0.07136371731758118, + "learning_rate": 3.833132784780284e-06, + "loss": 0.0029, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 874 + }, + { + "completion_length": 160.33334350585938, + "epoch": 0.3894080996884735, + "grad_norm": 0.6216509938240051, + "kl": 0.05862082540988922, + "learning_rate": 3.82984526165139e-06, + "loss": 0.0023, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 875 + }, + { + "completion_length": 162.33334350585938, + "epoch": 0.38985313751668893, + "grad_norm": 0.6714538335800171, + "kl": 0.06793516874313354, + "learning_rate": 3.8265545282753706e-06, + "loss": 0.0027, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 876 + }, + { + "completion_length": 176.0, + "epoch": 0.3902981753449043, + "grad_norm": 0.6328492760658264, + "kl": 0.07553236186504364, + "learning_rate": 3.823260592596058e-06, + "loss": 0.003, + "reward": 0.20350000262260437, + "reward_std": 0.36431294679641724, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.20350000262260437, + "step": 877 + }, + { + "completion_length": 200.0, + "epoch": 0.3907432131731197, + "grad_norm": 0.6953088045120239, + "kl": 0.03606265038251877, + "learning_rate": 3.819963462565015e-06, + "loss": 0.0014, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 878 + }, + { + "completion_length": 168.6666717529297, + "epoch": 0.39118825100133514, + "grad_norm": 0.6372416615486145, + "kl": 0.06722667813301086, + "learning_rate": 3.816663146141514e-06, + "loss": 0.0027, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 879 + }, + { + "completion_length": 159.5, + "epoch": 0.3916332888295505, + "grad_norm": 0.8337939381599426, + "kl": 0.06765174865722656, + "learning_rate": 3.813359651292522e-06, + "loss": 0.0027, + "reward": 0.31333333253860474, + "reward_std": 0.06864885985851288, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 880 + }, + { + "completion_length": 164.5, + "epoch": 0.39207832665776593, + "grad_norm": 0.6367321610450745, + "kl": 0.049886442720890045, + "learning_rate": 3.810052985992677e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 881 + }, + { + "completion_length": 151.1666717529297, + "epoch": 0.3925233644859813, + "grad_norm": 0.03452099859714508, + "kl": 0.0649208277463913, + "learning_rate": 3.8067431582242697e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 882 + }, + { + "completion_length": 150.0, + "epoch": 0.3929684023141967, + "grad_norm": 0.8333684802055359, + "kl": 0.061620742082595825, + "learning_rate": 3.8034301759772263e-06, + "loss": 0.0025, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 883 + }, + { + "completion_length": 132.33334350585938, + "epoch": 0.3934134401424121, + "grad_norm": 0.03911470249295235, + "kl": 0.07865100353956223, + "learning_rate": 3.8001140472490887e-06, + "loss": 0.0034, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 884 + }, + { + "completion_length": 130.1666717529297, + "epoch": 0.3938584779706275, + "grad_norm": 1.2934985160827637, + "kl": 0.05334341526031494, + "learning_rate": 3.796794780044992e-06, + "loss": 0.0021, + "reward": 0.17083333432674408, + "reward_std": 0.2419474571943283, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.17083333432674408, + "step": 885 + }, + { + "completion_length": 139.1666717529297, + "epoch": 0.3943035157988429, + "grad_norm": 0.8030723929405212, + "kl": 0.06765462458133698, + "learning_rate": 3.7934723823776494e-06, + "loss": 0.0027, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 886 + }, + { + "completion_length": 105.5, + "epoch": 0.3947485536270583, + "grad_norm": 0.02923821657896042, + "kl": 0.06622253358364105, + "learning_rate": 3.7901468622673303e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 887 + }, + { + "completion_length": 140.6666717529297, + "epoch": 0.3951935914552737, + "grad_norm": 0.7183970808982849, + "kl": 0.0615064799785614, + "learning_rate": 3.786818227741842e-06, + "loss": 0.0025, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 888 + }, + { + "completion_length": 165.33334350585938, + "epoch": 0.3956386292834891, + "grad_norm": 0.733579158782959, + "kl": 0.04961232468485832, + "learning_rate": 3.78348648683651e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 889 + }, + { + "completion_length": 171.33334350585938, + "epoch": 0.3960836671117045, + "grad_norm": 0.7921627163887024, + "kl": 0.05507838726043701, + "learning_rate": 3.780151647594159e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 890 + }, + { + "completion_length": 171.0, + "epoch": 0.3965287049399199, + "grad_norm": 0.7230894565582275, + "kl": 0.057939350605010986, + "learning_rate": 3.7768137180650915e-06, + "loss": 0.0023, + "reward": 0.29233333468437195, + "reward_std": 0.12961584329605103, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 891 + }, + { + "completion_length": 155.0, + "epoch": 0.3969737427681353, + "grad_norm": 0.7460453510284424, + "kl": 0.08669416606426239, + "learning_rate": 3.773472706307072e-06, + "loss": 0.0035, + "reward": 0.29216665029525757, + "reward_std": 0.2053488940000534, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216665029525757, + "step": 892 + }, + { + "completion_length": 127.0, + "epoch": 0.39741878059635066, + "grad_norm": 0.9885587692260742, + "kl": 0.23846940696239471, + "learning_rate": 3.7701286203853036e-06, + "loss": 0.0095, + "reward": 0.3103333115577698, + "reward_std": 0.16084983944892883, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3103333115577698, + "step": 893 + }, + { + "completion_length": 133.1666717529297, + "epoch": 0.3978638184245661, + "grad_norm": 1.0468348264694214, + "kl": 0.241329163312912, + "learning_rate": 3.7667814683724126e-06, + "loss": 0.0097, + "reward": 0.34300002455711365, + "reward_std": 0.08083315193653107, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.34299999475479126, + "step": 894 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.3983088562527815, + "grad_norm": 0.965579628944397, + "kl": 0.06833823025226593, + "learning_rate": 3.7634312583484244e-06, + "loss": 0.0027, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 895 + }, + { + "completion_length": 148.5, + "epoch": 0.3987538940809969, + "grad_norm": 0.7001117467880249, + "kl": 0.05449951812624931, + "learning_rate": 3.7600779984007485e-06, + "loss": 0.0022, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 896 + }, + { + "completion_length": 171.5, + "epoch": 0.3991989319092123, + "grad_norm": 0.678555965423584, + "kl": 0.04612820968031883, + "learning_rate": 3.756721696624156e-06, + "loss": 0.0018, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 897 + }, + { + "completion_length": 177.6666717529297, + "epoch": 0.39964396973742766, + "grad_norm": 0.915947675704956, + "kl": 0.05997948348522186, + "learning_rate": 3.7533623611207607e-06, + "loss": 0.0024, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 898 + }, + { + "completion_length": 200.0, + "epoch": 0.4000890075656431, + "grad_norm": 0.05163982883095741, + "kl": 0.049683578312397, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.002, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 899 + }, + { + "completion_length": 168.6666717529297, + "epoch": 0.40053404539385845, + "grad_norm": 0.7753753066062927, + "kl": 0.05140147730708122, + "learning_rate": 3.7466346213786165e-06, + "loss": 0.0021, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 900 + }, + { + "completion_length": 176.1666717529297, + "epoch": 0.4009790832220739, + "grad_norm": 0.027781542390584946, + "kl": 0.054599761962890625, + "learning_rate": 3.743266233380635e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 901 + }, + { + "completion_length": 83.66667175292969, + "epoch": 0.4014241210502893, + "grad_norm": 0.02804761379957199, + "kl": 0.07853664457798004, + "learning_rate": 3.7398948441373454e-06, + "loss": 0.0034, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 902 + }, + { + "completion_length": 141.6666717529297, + "epoch": 0.40186915887850466, + "grad_norm": 0.028074469417333603, + "kl": 0.056927796453237534, + "learning_rate": 3.7365204617872834e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 903 + }, + { + "completion_length": 177.83334350585938, + "epoch": 0.4023141967067201, + "grad_norm": 0.8167720437049866, + "kl": 0.06986045837402344, + "learning_rate": 3.7331430944762105e-06, + "loss": 0.0028, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 904 + }, + { + "completion_length": 187.1666717529297, + "epoch": 0.40275923453493545, + "grad_norm": 0.7425169944763184, + "kl": 0.04808403179049492, + "learning_rate": 3.729762750357092e-06, + "loss": 0.0019, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 905 + }, + { + "completion_length": 139.5, + "epoch": 0.4032042723631509, + "grad_norm": 0.03980370983481407, + "kl": 0.062199126929044724, + "learning_rate": 3.7263794375900803e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 906 + }, + { + "completion_length": 145.5, + "epoch": 0.40364931019136624, + "grad_norm": 0.022532425820827484, + "kl": 0.056330885738134384, + "learning_rate": 3.7229931643424943e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 907 + }, + { + "completion_length": 198.83334350585938, + "epoch": 0.40409434801958166, + "grad_norm": 0.625041127204895, + "kl": 0.05984826013445854, + "learning_rate": 3.7196039387887995e-06, + "loss": 0.0024, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 908 + }, + { + "completion_length": 192.5, + "epoch": 0.4045393858477971, + "grad_norm": 0.7206208109855652, + "kl": 0.050445057451725006, + "learning_rate": 3.7162117691105894e-06, + "loss": 0.002, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 909 + }, + { + "completion_length": 200.0, + "epoch": 0.40498442367601245, + "grad_norm": 0.028280407190322876, + "kl": 0.040611665695905685, + "learning_rate": 3.71281666349656e-06, + "loss": 0.0016, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 910 + }, + { + "completion_length": 184.83334350585938, + "epoch": 0.4054294615042279, + "grad_norm": 0.7396180629730225, + "kl": 0.04777958244085312, + "learning_rate": 3.7094186301425006e-06, + "loss": 0.0019, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 911 + }, + { + "completion_length": 191.33334350585938, + "epoch": 0.40587449933244324, + "grad_norm": 1.011993408203125, + "kl": 0.04695138335227966, + "learning_rate": 3.706017677251266e-06, + "loss": 0.0019, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 912 + }, + { + "completion_length": 114.0, + "epoch": 0.40631953716065866, + "grad_norm": 0.029970509931445122, + "kl": 0.06655572354793549, + "learning_rate": 3.7026138130327547e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 913 + }, + { + "completion_length": 180.33334350585938, + "epoch": 0.40676457498887403, + "grad_norm": 0.660056471824646, + "kl": 0.041105978190898895, + "learning_rate": 3.6992070457038998e-06, + "loss": 0.0016, + "reward": 0.29233333468437195, + "reward_std": 0.10247080028057098, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 914 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.40720961281708945, + "grad_norm": 0.7320570945739746, + "kl": 0.045362964272499084, + "learning_rate": 3.6957973834886387e-06, + "loss": 0.0018, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 915 + }, + { + "completion_length": 159.0, + "epoch": 0.4076546506453049, + "grad_norm": 0.7662878036499023, + "kl": 0.05844952166080475, + "learning_rate": 3.692384834617897e-06, + "loss": 0.0023, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 916 + }, + { + "completion_length": 184.1666717529297, + "epoch": 0.40809968847352024, + "grad_norm": 0.7281984090805054, + "kl": 0.05307254195213318, + "learning_rate": 3.688969407329569e-06, + "loss": 0.0021, + "reward": 0.27133333683013916, + "reward_std": 0.12340772151947021, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 917 + }, + { + "completion_length": 177.6666717529297, + "epoch": 0.40854472630173566, + "grad_norm": 0.7347136735916138, + "kl": 0.05313348397612572, + "learning_rate": 3.6855511098684996e-06, + "loss": 0.0021, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 918 + }, + { + "completion_length": 200.0, + "epoch": 0.40898976412995103, + "grad_norm": 0.7880335450172424, + "kl": 0.03796866908669472, + "learning_rate": 3.682129950486459e-06, + "loss": 0.0015, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 919 + }, + { + "completion_length": 171.83334350585938, + "epoch": 0.40943480195816645, + "grad_norm": 0.03392661735415459, + "kl": 0.05389983206987381, + "learning_rate": 3.678705937442128e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 920 + }, + { + "completion_length": 159.33334350585938, + "epoch": 0.4098798397863818, + "grad_norm": 0.8597182035446167, + "kl": 0.0735437273979187, + "learning_rate": 3.675279079001077e-06, + "loss": 0.0029, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 921 + }, + { + "completion_length": 196.5, + "epoch": 0.41032487761459724, + "grad_norm": 0.7532240152359009, + "kl": 0.053770922124385834, + "learning_rate": 3.6718493834357415e-06, + "loss": 0.0022, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 922 + }, + { + "completion_length": 153.6666717529297, + "epoch": 0.41076991544281266, + "grad_norm": 0.7527331709861755, + "kl": 0.05144277960062027, + "learning_rate": 3.6684168590254103e-06, + "loss": 0.0021, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 923 + }, + { + "completion_length": 182.5, + "epoch": 0.411214953271028, + "grad_norm": 0.5914585590362549, + "kl": 0.05143951624631882, + "learning_rate": 3.6649815140561995e-06, + "loss": 0.0021, + "reward": 0.21649998426437378, + "reward_std": 0.3328048884868622, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.21649998426437378, + "step": 924 + }, + { + "completion_length": 185.0, + "epoch": 0.41165999109924345, + "grad_norm": 0.6755291819572449, + "kl": 0.04394887015223503, + "learning_rate": 3.6615433568210313e-06, + "loss": 0.0018, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 925 + }, + { + "completion_length": 194.83334350585938, + "epoch": 0.4121050289274588, + "grad_norm": 0.8636181354522705, + "kl": 0.047952380031347275, + "learning_rate": 3.658102395619621e-06, + "loss": 0.0019, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 926 + }, + { + "completion_length": 200.0, + "epoch": 0.41255006675567424, + "grad_norm": 0.7906734347343445, + "kl": 0.02828504703938961, + "learning_rate": 3.65465863875845e-06, + "loss": 0.0011, + "reward": 0.1458333432674408, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1458333432674408, + "step": 927 + }, + { + "completion_length": 149.1666717529297, + "epoch": 0.4129951045838896, + "grad_norm": 0.8365933895111084, + "kl": 0.08980211615562439, + "learning_rate": 3.651212094550748e-06, + "loss": 0.0036, + "reward": 0.26616665720939636, + "reward_std": 0.2690356373786926, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.26616665720939636, + "step": 928 + }, + { + "completion_length": 199.6666717529297, + "epoch": 0.413440142412105, + "grad_norm": 0.6435677409172058, + "kl": 0.03840624541044235, + "learning_rate": 3.6477627713164767e-06, + "loss": 0.0015, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 929 + }, + { + "completion_length": 140.1666717529297, + "epoch": 0.41388518024032045, + "grad_norm": 0.02791665680706501, + "kl": 0.0602748803794384, + "learning_rate": 3.6443106773823025e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 930 + }, + { + "completion_length": 190.0, + "epoch": 0.4143302180685358, + "grad_norm": 0.8331409096717834, + "kl": 0.05726364254951477, + "learning_rate": 3.6408558210815814e-06, + "loss": 0.0023, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 931 + }, + { + "completion_length": 193.1666717529297, + "epoch": 0.41477525589675124, + "grad_norm": 0.7008532285690308, + "kl": 0.05072779580950737, + "learning_rate": 3.6373982107543398e-06, + "loss": 0.002, + "reward": 0.2711666524410248, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 932 + }, + { + "completion_length": 136.33334350585938, + "epoch": 0.4152202937249666, + "grad_norm": 0.02707936242222786, + "kl": 0.05186247080564499, + "learning_rate": 3.63393785474725e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 933 + }, + { + "completion_length": 145.5, + "epoch": 0.415665331553182, + "grad_norm": 0.7188604474067688, + "kl": 0.06224457547068596, + "learning_rate": 3.6304747614136126e-06, + "loss": 0.0025, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 934 + }, + { + "completion_length": 117.66667175292969, + "epoch": 0.4161103693813974, + "grad_norm": 1.1184520721435547, + "kl": 0.0940161943435669, + "learning_rate": 3.6270089391133378e-06, + "loss": 0.0038, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 935 + }, + { + "completion_length": 158.5, + "epoch": 0.4165554072096128, + "grad_norm": 0.683738648891449, + "kl": 0.05458883196115494, + "learning_rate": 3.6235403962129218e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 936 + }, + { + "completion_length": 176.5, + "epoch": 0.41700044503782824, + "grad_norm": 0.7745259404182434, + "kl": 0.053183965384960175, + "learning_rate": 3.6200691410854284e-06, + "loss": 0.0021, + "reward": 0.31316667795181274, + "reward_std": 0.0688314288854599, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 937 + }, + { + "completion_length": 134.5, + "epoch": 0.4174454828660436, + "grad_norm": 0.028597315773367882, + "kl": 0.06111491844058037, + "learning_rate": 3.61659518211047e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 938 + }, + { + "completion_length": 157.0, + "epoch": 0.417890520694259, + "grad_norm": 0.05513963848352432, + "kl": 0.07339125871658325, + "learning_rate": 3.6131185276741846e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 939 + }, + { + "completion_length": 173.0, + "epoch": 0.4183355585224744, + "grad_norm": 0.7153098583221436, + "kl": 0.053828366100788116, + "learning_rate": 3.6096391861692183e-06, + "loss": 0.0022, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 940 + }, + { + "completion_length": 193.0, + "epoch": 0.4187805963506898, + "grad_norm": 0.738511323928833, + "kl": 0.05540754646062851, + "learning_rate": 3.6061571659947032e-06, + "loss": 0.0022, + "reward": 0.29233333468437195, + "reward_std": 0.12961584329605103, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 941 + }, + { + "completion_length": 144.33334350585938, + "epoch": 0.4192256341789052, + "grad_norm": 0.01766749657690525, + "kl": 0.05199790000915527, + "learning_rate": 3.602672475556237e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 942 + }, + { + "completion_length": 131.1666717529297, + "epoch": 0.4196706720071206, + "grad_norm": 0.02660481631755829, + "kl": 0.05784265324473381, + "learning_rate": 3.5991851232658647e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 943 + }, + { + "completion_length": 191.6666717529297, + "epoch": 0.420115709835336, + "grad_norm": 0.6994498372077942, + "kl": 0.039949461817741394, + "learning_rate": 3.595695117542057e-06, + "loss": 0.0016, + "reward": 0.27116668224334717, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 944 + }, + { + "completion_length": 166.83334350585938, + "epoch": 0.4205607476635514, + "grad_norm": 0.8509209752082825, + "kl": 0.10748060047626495, + "learning_rate": 3.5922024668096885e-06, + "loss": 0.0043, + "reward": 0.29216668009757996, + "reward_std": 0.06493817269802094, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 945 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.4210057854917668, + "grad_norm": 0.014815707691013813, + "kl": 0.04326681047677994, + "learning_rate": 3.5887071795000204e-06, + "loss": 0.002, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 946 + }, + { + "completion_length": 182.83334350585938, + "epoch": 0.4214508233199822, + "grad_norm": 0.8161847591400146, + "kl": 0.0375329852104187, + "learning_rate": 3.585209264050678e-06, + "loss": 0.0015, + "reward": 0.22949999570846558, + "reward_std": 0.12337382137775421, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22949998080730438, + "step": 947 + }, + { + "completion_length": 171.1666717529297, + "epoch": 0.4218958611481976, + "grad_norm": 0.7375685572624207, + "kl": 0.057841382920742035, + "learning_rate": 3.5817087289056305e-06, + "loss": 0.0023, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 948 + }, + { + "completion_length": 155.0, + "epoch": 0.42234089897641297, + "grad_norm": 0.02155291475355625, + "kl": 0.051909781992435455, + "learning_rate": 3.5782055825151722e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 949 + }, + { + "completion_length": 171.33334350585938, + "epoch": 0.4227859368046284, + "grad_norm": 0.635463297367096, + "kl": 0.049717407673597336, + "learning_rate": 3.5746998333358994e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 950 + }, + { + "completion_length": 135.83334350585938, + "epoch": 0.4232309746328438, + "grad_norm": 0.04738845303654671, + "kl": 0.07275395095348358, + "learning_rate": 3.571191489830693e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 951 + }, + { + "completion_length": 148.5, + "epoch": 0.4236760124610592, + "grad_norm": 0.734367311000824, + "kl": 0.0618259459733963, + "learning_rate": 3.567680560468696e-06, + "loss": 0.0025, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 952 + }, + { + "completion_length": 185.1666717529297, + "epoch": 0.4241210502892746, + "grad_norm": 0.6942402720451355, + "kl": 0.04942780360579491, + "learning_rate": 3.564167053725293e-06, + "loss": 0.002, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 953 + }, + { + "completion_length": 149.83334350585938, + "epoch": 0.42456608811748997, + "grad_norm": 0.6157485246658325, + "kl": 0.048379119485616684, + "learning_rate": 3.560650978082092e-06, + "loss": 0.0019, + "reward": 0.33416664600372314, + "reward_std": 0.10247032344341278, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 954 + }, + { + "completion_length": 181.33334350585938, + "epoch": 0.4250111259457054, + "grad_norm": 0.6741864085197449, + "kl": 0.05080154538154602, + "learning_rate": 3.5571323420269e-06, + "loss": 0.002, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 955 + }, + { + "completion_length": 164.33334350585938, + "epoch": 0.42545616377392076, + "grad_norm": 0.6833618879318237, + "kl": 0.04868559539318085, + "learning_rate": 3.5536111540537076e-06, + "loss": 0.0019, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 956 + }, + { + "completion_length": 153.83334350585938, + "epoch": 0.4259012016021362, + "grad_norm": 0.8367120027542114, + "kl": 0.08897934854030609, + "learning_rate": 3.5500874226626635e-06, + "loss": 0.0036, + "reward": 0.2783333361148834, + "reward_std": 0.23923347890377045, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27833330631256104, + "step": 957 + }, + { + "completion_length": 180.1666717529297, + "epoch": 0.4263462394303516, + "grad_norm": 0.8038119673728943, + "kl": 0.06598465144634247, + "learning_rate": 3.546561156360057e-06, + "loss": 0.0026, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 958 + }, + { + "completion_length": 167.6666717529297, + "epoch": 0.42679127725856697, + "grad_norm": 0.8257107734680176, + "kl": 0.06047782301902771, + "learning_rate": 3.543032363658297e-06, + "loss": 0.0024, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 959 + }, + { + "completion_length": 151.1666717529297, + "epoch": 0.4272363150867824, + "grad_norm": 0.029956277459859848, + "kl": 0.05927738547325134, + "learning_rate": 3.5395010530758913e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 960 + }, + { + "completion_length": 185.0, + "epoch": 0.42768135291499776, + "grad_norm": 0.7257969379425049, + "kl": 0.043350182473659515, + "learning_rate": 3.535967233137424e-06, + "loss": 0.0017, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 961 + }, + { + "completion_length": 164.33334350585938, + "epoch": 0.4281263907432132, + "grad_norm": 0.019633714109659195, + "kl": 0.04942721128463745, + "learning_rate": 3.5324309123735396e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 962 + }, + { + "completion_length": 200.0, + "epoch": 0.42857142857142855, + "grad_norm": 0.6552579402923584, + "kl": 0.055333852767944336, + "learning_rate": 3.5288920993209175e-06, + "loss": 0.0022, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 963 + }, + { + "completion_length": 172.0, + "epoch": 0.42901646639964397, + "grad_norm": 0.8508467078208923, + "kl": 0.047447673976421356, + "learning_rate": 3.5253508025222545e-06, + "loss": 0.0019, + "reward": 0.2615000009536743, + "reward_std": 0.13879159092903137, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2615000009536743, + "step": 964 + }, + { + "completion_length": 199.5, + "epoch": 0.4294615042278594, + "grad_norm": 0.6771219968795776, + "kl": 0.0449552908539772, + "learning_rate": 3.5218070305262427e-06, + "loss": 0.0018, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 965 + }, + { + "completion_length": 149.5, + "epoch": 0.42990654205607476, + "grad_norm": 0.046515047550201416, + "kl": 0.06530429422855377, + "learning_rate": 3.5182607918875495e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 966 + }, + { + "completion_length": 180.1666717529297, + "epoch": 0.4303515798842902, + "grad_norm": 0.645074188709259, + "kl": 0.051890529692173004, + "learning_rate": 3.514712095166797e-06, + "loss": 0.0021, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 967 + }, + { + "completion_length": 177.5, + "epoch": 0.43079661771250555, + "grad_norm": 0.7000871896743774, + "kl": 0.05255164951086044, + "learning_rate": 3.511160948930539e-06, + "loss": 0.0021, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 968 + }, + { + "completion_length": 149.1666717529297, + "epoch": 0.43124165554072097, + "grad_norm": 0.02208912931382656, + "kl": 0.05640026181936264, + "learning_rate": 3.507607361751248e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 969 + }, + { + "completion_length": 168.6666717529297, + "epoch": 0.43168669336893634, + "grad_norm": 0.032338086515665054, + "kl": 0.059976160526275635, + "learning_rate": 3.504051342207282e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 970 + }, + { + "completion_length": 151.0, + "epoch": 0.43213173119715176, + "grad_norm": 0.8209261894226074, + "kl": 0.064157634973526, + "learning_rate": 3.5004928988828748e-06, + "loss": 0.0026, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 971 + }, + { + "completion_length": 197.5, + "epoch": 0.4325767690253672, + "grad_norm": 0.6907802820205688, + "kl": 0.05787642300128937, + "learning_rate": 3.4969320403681105e-06, + "loss": 0.0023, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 972 + }, + { + "completion_length": 199.83334350585938, + "epoch": 0.43302180685358255, + "grad_norm": 0.802642285823822, + "kl": 0.044948406517505646, + "learning_rate": 3.493368775258904e-06, + "loss": 0.0018, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 973 + }, + { + "completion_length": 168.83334350585938, + "epoch": 0.43346684468179797, + "grad_norm": 0.6813015341758728, + "kl": 0.04564934968948364, + "learning_rate": 3.489803112156978e-06, + "loss": 0.0018, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 974 + }, + { + "completion_length": 161.33334350585938, + "epoch": 0.43391188251001334, + "grad_norm": 0.028628792613744736, + "kl": 0.05609843134880066, + "learning_rate": 3.486235059669846e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 975 + }, + { + "completion_length": 172.83334350585938, + "epoch": 0.43435692033822876, + "grad_norm": 0.7536810636520386, + "kl": 0.07381969690322876, + "learning_rate": 3.482664626410787e-06, + "loss": 0.003, + "reward": 0.3341667056083679, + "reward_std": 0.0648086816072464, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 976 + }, + { + "completion_length": 185.5, + "epoch": 0.4348019581664441, + "grad_norm": 0.7587660551071167, + "kl": 0.04436537250876427, + "learning_rate": 3.47909182099883e-06, + "loss": 0.0018, + "reward": 0.250333309173584, + "reward_std": 0.1122509092092514, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250333309173584, + "step": 977 + }, + { + "completion_length": 174.6666717529297, + "epoch": 0.43524699599465955, + "grad_norm": 0.7341634035110474, + "kl": 0.04806718975305557, + "learning_rate": 3.4755166520587297e-06, + "loss": 0.0019, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 978 + }, + { + "completion_length": 200.0, + "epoch": 0.43569203382287497, + "grad_norm": 0.01764507032930851, + "kl": 0.046592336148023605, + "learning_rate": 3.4719391282209437e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 979 + }, + { + "completion_length": 132.1666717529297, + "epoch": 0.43613707165109034, + "grad_norm": 0.921298086643219, + "kl": 0.15562453866004944, + "learning_rate": 3.4683592581216173e-06, + "loss": 0.0062, + "reward": 0.3048333525657654, + "reward_std": 0.1743220090866089, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.304833322763443, + "step": 980 + }, + { + "completion_length": 119.16667175292969, + "epoch": 0.43658210947930576, + "grad_norm": 0.024678191170096397, + "kl": 0.06529170274734497, + "learning_rate": 3.464777050402559e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 981 + }, + { + "completion_length": 144.1666717529297, + "epoch": 0.4370271473075211, + "grad_norm": 0.016610290855169296, + "kl": 0.04939974471926689, + "learning_rate": 3.461192513711219e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 982 + }, + { + "completion_length": 149.83334350585938, + "epoch": 0.43747218513573655, + "grad_norm": 0.029446417465806007, + "kl": 0.05275744944810867, + "learning_rate": 3.4576056567006728e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 983 + }, + { + "completion_length": 178.6666717529297, + "epoch": 0.4379172229639519, + "grad_norm": 0.7193699479103088, + "kl": 0.057997625321149826, + "learning_rate": 3.454016488029592e-06, + "loss": 0.0023, + "reward": 0.29216668009757996, + "reward_std": 0.10255226492881775, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 984 + }, + { + "completion_length": 132.0, + "epoch": 0.43836226079216734, + "grad_norm": 0.016964778304100037, + "kl": 0.052684418857097626, + "learning_rate": 3.4504250163622334e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 985 + }, + { + "completion_length": 191.6666717529297, + "epoch": 0.43880729862038276, + "grad_norm": 0.7094153761863708, + "kl": 0.03705962002277374, + "learning_rate": 3.446831250368412e-06, + "loss": 0.0015, + "reward": 0.27116668224334717, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27116668224334717, + "step": 986 + }, + { + "completion_length": 182.5, + "epoch": 0.4392523364485981, + "grad_norm": 0.7157971858978271, + "kl": 0.05741278454661369, + "learning_rate": 3.443235198723479e-06, + "loss": 0.0023, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 987 + }, + { + "completion_length": 195.0, + "epoch": 0.43969737427681355, + "grad_norm": 0.7865637540817261, + "kl": 0.03509274870157242, + "learning_rate": 3.4396368701083073e-06, + "loss": 0.0014, + "reward": 0.2084999978542328, + "reward_std": 0.10238896310329437, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2084999978542328, + "step": 988 + }, + { + "completion_length": 162.1666717529297, + "epoch": 0.4401424121050289, + "grad_norm": 1.0707346200942993, + "kl": 0.06345522403717041, + "learning_rate": 3.436036273209261e-06, + "loss": 0.0025, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 989 + }, + { + "completion_length": 157.6666717529297, + "epoch": 0.44058744993324434, + "grad_norm": 0.7505767345428467, + "kl": 0.06585465371608734, + "learning_rate": 3.432433416718184e-06, + "loss": 0.0026, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 990 + }, + { + "completion_length": 197.6666717529297, + "epoch": 0.4410324877614597, + "grad_norm": 0.7079508900642395, + "kl": 0.05217638239264488, + "learning_rate": 3.428828309332375e-06, + "loss": 0.0021, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 991 + }, + { + "completion_length": 125.33333587646484, + "epoch": 0.4414775255896751, + "grad_norm": 0.022779956459999084, + "kl": 0.06337767839431763, + "learning_rate": 3.4252209597545634e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 992 + }, + { + "completion_length": 200.0, + "epoch": 0.44192256341789055, + "grad_norm": 0.02871246449649334, + "kl": 0.045304443687200546, + "learning_rate": 3.4216113766928926e-06, + "loss": 0.0018, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 993 + }, + { + "completion_length": 196.33334350585938, + "epoch": 0.4423676012461059, + "grad_norm": 0.7327296137809753, + "kl": 0.04095136374235153, + "learning_rate": 3.4179995688608996e-06, + "loss": 0.0016, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 994 + }, + { + "completion_length": 179.0, + "epoch": 0.44281263907432133, + "grad_norm": 0.6812580823898315, + "kl": 0.054273735731840134, + "learning_rate": 3.414385544977489e-06, + "loss": 0.0022, + "reward": 0.24016666412353516, + "reward_std": 0.23370617628097534, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.24016666412353516, + "step": 995 + }, + { + "completion_length": 147.0, + "epoch": 0.4432576769025367, + "grad_norm": 0.029301166534423828, + "kl": 0.05664734169840813, + "learning_rate": 3.4107693137669167e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 996 + }, + { + "completion_length": 187.83334350585938, + "epoch": 0.4437027147307521, + "grad_norm": 0.7347012758255005, + "kl": 0.0524878203868866, + "learning_rate": 3.4071508839587676e-06, + "loss": 0.0021, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 997 + }, + { + "completion_length": 175.83334350585938, + "epoch": 0.4441477525589675, + "grad_norm": 0.7827162742614746, + "kl": 0.05347862094640732, + "learning_rate": 3.403530264287931e-06, + "loss": 0.0021, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 998 + }, + { + "completion_length": 179.0, + "epoch": 0.4445927903871829, + "grad_norm": 0.0856582522392273, + "kl": 0.05322853848338127, + "learning_rate": 3.3999074634945854e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 999 + }, + { + "completion_length": 193.0, + "epoch": 0.44503782821539833, + "grad_norm": 0.7221679091453552, + "kl": 0.03924994543194771, + "learning_rate": 3.396282490324175e-06, + "loss": 0.0016, + "reward": 0.2711666524410248, + "reward_std": 0.09453975409269333, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 1000 + }, + { + "completion_length": 118.0, + "epoch": 0.4454828660436137, + "grad_norm": 0.025427184998989105, + "kl": 0.05986163020133972, + "learning_rate": 3.392655353527385e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1001 + }, + { + "completion_length": 160.83334350585938, + "epoch": 0.4459279038718291, + "grad_norm": 0.8170287013053894, + "kl": 0.05823221802711487, + "learning_rate": 3.389026061860126e-06, + "loss": 0.0023, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1002 + }, + { + "completion_length": 200.0, + "epoch": 0.4463729417000445, + "grad_norm": 0.030482221394777298, + "kl": 0.04448040574789047, + "learning_rate": 3.3853946240835113e-06, + "loss": 0.0018, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1003 + }, + { + "completion_length": 160.5, + "epoch": 0.4468179795282599, + "grad_norm": 0.6799116134643555, + "kl": 0.0437270812690258, + "learning_rate": 3.3817610489638313e-06, + "loss": 0.0017, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1004 + }, + { + "completion_length": 183.83334350585938, + "epoch": 0.4472630173564753, + "grad_norm": 0.7161902785301208, + "kl": 0.04126816242933273, + "learning_rate": 3.3781253452725395e-06, + "loss": 0.0017, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 1005 + }, + { + "completion_length": 198.83334350585938, + "epoch": 0.4477080551846907, + "grad_norm": 0.7520974278450012, + "kl": 0.04543256014585495, + "learning_rate": 3.3744875217862266e-06, + "loss": 0.0018, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1006 + }, + { + "completion_length": 154.33334350585938, + "epoch": 0.4481530930129061, + "grad_norm": 0.8436493277549744, + "kl": 0.061019204556941986, + "learning_rate": 3.3708475872866e-06, + "loss": 0.0024, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1007 + }, + { + "completion_length": 200.0, + "epoch": 0.4485981308411215, + "grad_norm": 0.6720322966575623, + "kl": 0.033872686326503754, + "learning_rate": 3.3672055505604624e-06, + "loss": 0.0014, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 1008 + }, + { + "completion_length": 197.83334350585938, + "epoch": 0.4490431686693369, + "grad_norm": 0.7746611833572388, + "kl": 0.054266974329948425, + "learning_rate": 3.3635614203996938e-06, + "loss": 0.0022, + "reward": 0.27116668224334717, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27116668224334717, + "step": 1009 + }, + { + "completion_length": 193.0, + "epoch": 0.4494882064975523, + "grad_norm": 0.725085973739624, + "kl": 0.09663119912147522, + "learning_rate": 3.3599152056012246e-06, + "loss": 0.0039, + "reward": 0.032499998807907104, + "reward_std": 0.30418860912323, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.032499998807907104, + "step": 1010 + }, + { + "completion_length": 162.83334350585938, + "epoch": 0.4499332443257677, + "grad_norm": 0.6782302856445312, + "kl": 0.051349788904190063, + "learning_rate": 3.356266914967021e-06, + "loss": 0.0021, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1011 + }, + { + "completion_length": 159.1666717529297, + "epoch": 0.45037828215398307, + "grad_norm": 0.7623798847198486, + "kl": 0.09882908314466476, + "learning_rate": 3.352616557304057e-06, + "loss": 0.004, + "reward": 0.21299999952316284, + "reward_std": 0.25778907537460327, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.21299999952316284, + "step": 1012 + }, + { + "completion_length": 166.5, + "epoch": 0.4508233199821985, + "grad_norm": 0.7910959720611572, + "kl": 0.05198938772082329, + "learning_rate": 3.3489641414242986e-06, + "loss": 0.0021, + "reward": 0.35466668009757996, + "reward_std": 0.09423092007637024, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 1013 + }, + { + "completion_length": 183.5, + "epoch": 0.4512683578104139, + "grad_norm": 0.8101857304573059, + "kl": 0.05365820974111557, + "learning_rate": 3.3453096761446795e-06, + "loss": 0.0021, + "reward": 0.20000001788139343, + "reward_std": 0.25355708599090576, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.20000001788139343, + "step": 1014 + }, + { + "completion_length": 196.0, + "epoch": 0.4517133956386293, + "grad_norm": 0.8162935376167297, + "kl": 0.04988335818052292, + "learning_rate": 3.34165317028708e-06, + "loss": 0.002, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1015 + }, + { + "completion_length": 192.0, + "epoch": 0.4521584334668447, + "grad_norm": 0.7259402871131897, + "kl": 0.05293646082282066, + "learning_rate": 3.3379946326783074e-06, + "loss": 0.0021, + "reward": 0.2711666524410248, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 1016 + }, + { + "completion_length": 166.1666717529297, + "epoch": 0.45260347129506007, + "grad_norm": 0.6532735228538513, + "kl": 0.04412105679512024, + "learning_rate": 3.3343340721500743e-06, + "loss": 0.0018, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1017 + }, + { + "completion_length": 148.33334350585938, + "epoch": 0.4530485091232755, + "grad_norm": 0.7425003051757812, + "kl": 0.10510390996932983, + "learning_rate": 3.3306714975389742e-06, + "loss": 0.0042, + "reward": 0.2293333262205124, + "reward_std": 0.3592585027217865, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2293333262205124, + "step": 1018 + }, + { + "completion_length": 170.33334350585938, + "epoch": 0.45349354695149086, + "grad_norm": 0.804882287979126, + "kl": 0.04764273762702942, + "learning_rate": 3.3270069176864644e-06, + "loss": 0.0019, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1019 + }, + { + "completion_length": 128.5, + "epoch": 0.4539385847797063, + "grad_norm": 0.9536888599395752, + "kl": 0.06011280044913292, + "learning_rate": 3.3233403414388432e-06, + "loss": 0.0024, + "reward": 0.3341667056083679, + "reward_std": 0.064808689057827, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1020 + }, + { + "completion_length": 129.1666717529297, + "epoch": 0.4543836226079217, + "grad_norm": 0.025884155184030533, + "kl": 0.05302686616778374, + "learning_rate": 3.319671777647227e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1021 + }, + { + "completion_length": 182.33334350585938, + "epoch": 0.45482866043613707, + "grad_norm": 0.7451061606407166, + "kl": 0.05932309851050377, + "learning_rate": 3.3160012351675304e-06, + "loss": 0.0024, + "reward": 0.057500001043081284, + "reward_std": 0.39164361357688904, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.057500001043081284, + "step": 1022 + }, + { + "completion_length": 179.33334350585938, + "epoch": 0.4552736982643525, + "grad_norm": 0.7858273386955261, + "kl": 0.05125393718481064, + "learning_rate": 3.312328722860445e-06, + "loss": 0.0021, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 1023 + }, + { + "completion_length": 133.6666717529297, + "epoch": 0.45571873609256786, + "grad_norm": 0.837760329246521, + "kl": 0.15334957838058472, + "learning_rate": 3.3086542495914176e-06, + "loss": 0.0061, + "reward": 0.2828333377838135, + "reward_std": 0.22821079194545746, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2828333377838135, + "step": 1024 + }, + { + "completion_length": 169.0, + "epoch": 0.4561637739207833, + "grad_norm": 0.9212179183959961, + "kl": 0.11279284954071045, + "learning_rate": 3.304977824230628e-06, + "loss": 0.0045, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1025 + }, + { + "completion_length": 152.5, + "epoch": 0.45660881174899864, + "grad_norm": 0.6558517217636108, + "kl": 0.05286305397748947, + "learning_rate": 3.301299455652969e-06, + "loss": 0.0021, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1026 + }, + { + "completion_length": 197.5, + "epoch": 0.45705384957721407, + "grad_norm": 0.7066366076469421, + "kl": 0.04090268909931183, + "learning_rate": 3.297619152738025e-06, + "loss": 0.0016, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1027 + }, + { + "completion_length": 185.1666717529297, + "epoch": 0.4574988874054295, + "grad_norm": 1.0826621055603027, + "kl": 0.04789039492607117, + "learning_rate": 3.293936924370048e-06, + "loss": 0.0019, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1028 + }, + { + "completion_length": 165.1666717529297, + "epoch": 0.45794392523364486, + "grad_norm": 0.7362165451049805, + "kl": 0.05640339106321335, + "learning_rate": 3.290252779437939e-06, + "loss": 0.0023, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1029 + }, + { + "completion_length": 190.6666717529297, + "epoch": 0.4583889630618603, + "grad_norm": 0.993109941482544, + "kl": 0.06488848477602005, + "learning_rate": 3.286566726835227e-06, + "loss": 0.0026, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1030 + }, + { + "completion_length": 163.5, + "epoch": 0.45883400089007564, + "grad_norm": 0.01776033826172352, + "kl": 0.051738932728767395, + "learning_rate": 3.282878775460044e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1031 + }, + { + "completion_length": 192.33334350585938, + "epoch": 0.45927903871829107, + "grad_norm": 0.8004707098007202, + "kl": 0.043662603944540024, + "learning_rate": 3.2791889342151055e-06, + "loss": 0.0017, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1032 + }, + { + "completion_length": 158.6666717529297, + "epoch": 0.45972407654650643, + "grad_norm": 0.837901771068573, + "kl": 0.09061974287033081, + "learning_rate": 3.2754972120076918e-06, + "loss": 0.0036, + "reward": 0.2618333399295807, + "reward_std": 0.2796500623226166, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2618333101272583, + "step": 1033 + }, + { + "completion_length": 200.0, + "epoch": 0.46016911437472185, + "grad_norm": 0.01697075366973877, + "kl": 0.042512111365795135, + "learning_rate": 3.2718036177496217e-06, + "loss": 0.0017, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1034 + }, + { + "completion_length": 154.83334350585938, + "epoch": 0.4606141522029373, + "grad_norm": 0.8117488622665405, + "kl": 0.07002654671669006, + "learning_rate": 3.268108160357233e-06, + "loss": 0.0028, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1035 + }, + { + "completion_length": 140.1666717529297, + "epoch": 0.46105919003115264, + "grad_norm": 0.0247640497982502, + "kl": 0.05443256348371506, + "learning_rate": 3.2644108487513614e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1036 + }, + { + "completion_length": 197.83334350585938, + "epoch": 0.46150422785936807, + "grad_norm": 0.6665837168693542, + "kl": 0.07370376586914062, + "learning_rate": 3.2607116918573207e-06, + "loss": 0.0029, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1037 + }, + { + "completion_length": 189.1666717529297, + "epoch": 0.46194926568758343, + "grad_norm": 0.7120652794837952, + "kl": 0.043880194425582886, + "learning_rate": 3.2570106986048755e-06, + "loss": 0.0018, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1038 + }, + { + "completion_length": 172.33334350585938, + "epoch": 0.46239430351579885, + "grad_norm": 0.7578598260879517, + "kl": 0.0529780313372612, + "learning_rate": 3.253307877928227e-06, + "loss": 0.0021, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1039 + }, + { + "completion_length": 193.6666717529297, + "epoch": 0.4628393413440142, + "grad_norm": 0.7513720393180847, + "kl": 0.07827489823102951, + "learning_rate": 3.249603238765985e-06, + "loss": 0.0031, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 1040 + }, + { + "completion_length": 151.5, + "epoch": 0.46328437917222964, + "grad_norm": 0.02841741219162941, + "kl": 0.05697069317102432, + "learning_rate": 3.2458967900611504e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1041 + }, + { + "completion_length": 177.1666717529297, + "epoch": 0.46372941700044507, + "grad_norm": 0.7044739127159119, + "kl": 0.06733378767967224, + "learning_rate": 3.242188540761092e-06, + "loss": 0.0027, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1042 + }, + { + "completion_length": 152.1666717529297, + "epoch": 0.46417445482866043, + "grad_norm": 0.9891431927680969, + "kl": 0.06679210066795349, + "learning_rate": 3.2384784998175274e-06, + "loss": 0.0027, + "reward": 0.30433332920074463, + "reward_std": 0.1755467653274536, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.30433332920074463, + "step": 1043 + }, + { + "completion_length": 180.0, + "epoch": 0.46461949265687585, + "grad_norm": 0.7353585362434387, + "kl": 0.05097658559679985, + "learning_rate": 3.234766676186495e-06, + "loss": 0.002, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1044 + }, + { + "completion_length": 145.0, + "epoch": 0.4650645304850912, + "grad_norm": 0.023157954216003418, + "kl": 0.05559340864419937, + "learning_rate": 3.23105307882834e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1045 + }, + { + "completion_length": 200.0, + "epoch": 0.46550956831330664, + "grad_norm": 0.6369867324829102, + "kl": 0.06708483397960663, + "learning_rate": 3.2273377167076876e-06, + "loss": 0.0027, + "reward": 0.1081666648387909, + "reward_std": 0.3474193215370178, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1081666648387909, + "step": 1046 + }, + { + "completion_length": 186.0, + "epoch": 0.465954606141522, + "grad_norm": 0.8478872179985046, + "kl": 0.04902426898479462, + "learning_rate": 3.2236205987934237e-06, + "loss": 0.002, + "reward": 0.22949999570846558, + "reward_std": 0.12337382137775421, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22949999570846558, + "step": 1047 + }, + { + "completion_length": 169.83334350585938, + "epoch": 0.46639964396973743, + "grad_norm": 0.6993066072463989, + "kl": 0.05535086244344711, + "learning_rate": 3.219901734058675e-06, + "loss": 0.0022, + "reward": 0.31316667795181274, + "reward_std": 0.0688314288854599, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1048 + }, + { + "completion_length": 194.33334350585938, + "epoch": 0.4668446817979528, + "grad_norm": 0.8170168995857239, + "kl": 0.03591171279549599, + "learning_rate": 3.2161811314807794e-06, + "loss": 0.0014, + "reward": 0.2084999978542328, + "reward_std": 0.10238896310329437, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2084999978542328, + "step": 1049 + }, + { + "completion_length": 127.16667175292969, + "epoch": 0.4672897196261682, + "grad_norm": 0.022290347144007683, + "kl": 0.05225914716720581, + "learning_rate": 3.212458800041276e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1050 + }, + { + "completion_length": 195.83334350585938, + "epoch": 0.46773475745438364, + "grad_norm": 0.820202112197876, + "kl": 0.04448423907160759, + "learning_rate": 3.2087347487258735e-06, + "loss": 0.0018, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 1051 + }, + { + "completion_length": 141.33334350585938, + "epoch": 0.468179795282599, + "grad_norm": 0.767572283744812, + "kl": 0.0574708953499794, + "learning_rate": 3.2050089865244345e-06, + "loss": 0.0023, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1052 + }, + { + "completion_length": 141.6666717529297, + "epoch": 0.46862483311081443, + "grad_norm": 0.772443413734436, + "kl": 0.05985133349895477, + "learning_rate": 3.2012815224309495e-06, + "loss": 0.0024, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1053 + }, + { + "completion_length": 183.0, + "epoch": 0.4690698709390298, + "grad_norm": 0.7392573356628418, + "kl": 0.043959807604551315, + "learning_rate": 3.1975523654435204e-06, + "loss": 0.0018, + "reward": 0.27133333683013916, + "reward_std": 0.12340773642063141, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 1054 + }, + { + "completion_length": 130.83334350585938, + "epoch": 0.4695149087672452, + "grad_norm": 0.0171357449144125, + "kl": 0.053742073476314545, + "learning_rate": 3.1938215245643327e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1055 + }, + { + "completion_length": 197.83334350585938, + "epoch": 0.4699599465954606, + "grad_norm": 0.751876175403595, + "kl": 0.05212129279971123, + "learning_rate": 3.190089008799638e-06, + "loss": 0.0021, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1056 + }, + { + "completion_length": 186.6666717529297, + "epoch": 0.470404984423676, + "grad_norm": 0.8305873870849609, + "kl": 0.04781080409884453, + "learning_rate": 3.1863548271597333e-06, + "loss": 0.0019, + "reward": 0.29216668009757996, + "reward_std": 0.10255226492881775, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 1057 + }, + { + "completion_length": 174.5, + "epoch": 0.47085002225189143, + "grad_norm": 0.661457896232605, + "kl": 0.056177690625190735, + "learning_rate": 3.182618988658932e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1058 + }, + { + "completion_length": 185.6666717529297, + "epoch": 0.4712950600801068, + "grad_norm": 0.7157963514328003, + "kl": 0.05063026398420334, + "learning_rate": 3.178881502315552e-06, + "loss": 0.002, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1059 + }, + { + "completion_length": 96.83333587646484, + "epoch": 0.4717400979083222, + "grad_norm": 0.028320979326963425, + "kl": 0.07067437469959259, + "learning_rate": 3.1751423771518876e-06, + "loss": 0.0031, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1060 + }, + { + "completion_length": 108.0, + "epoch": 0.4721851357365376, + "grad_norm": 0.023467140272259712, + "kl": 0.06319691985845566, + "learning_rate": 3.171401622194187e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1061 + }, + { + "completion_length": 159.5, + "epoch": 0.472630173564753, + "grad_norm": 0.6847676634788513, + "kl": 0.06087712198495865, + "learning_rate": 3.1676592464726354e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1062 + }, + { + "completion_length": 115.16667175292969, + "epoch": 0.4730752113929684, + "grad_norm": 0.7515028715133667, + "kl": 0.08603694289922714, + "learning_rate": 3.1639152590213294e-06, + "loss": 0.0034, + "reward": 0.2993333339691162, + "reward_std": 0.18779420852661133, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2993333041667938, + "step": 1063 + }, + { + "completion_length": 141.83334350585938, + "epoch": 0.4735202492211838, + "grad_norm": 0.02573162131011486, + "kl": 0.05833118408918381, + "learning_rate": 3.1601696688782575e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1064 + }, + { + "completion_length": 153.1666717529297, + "epoch": 0.4739652870493992, + "grad_norm": 0.027337657287716866, + "kl": 0.06034916639328003, + "learning_rate": 3.1564224850852756e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1065 + }, + { + "completion_length": 195.1666717529297, + "epoch": 0.4744103248776146, + "grad_norm": 0.6953572630882263, + "kl": 0.04970608279109001, + "learning_rate": 3.152673716688087e-06, + "loss": 0.002, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1066 + }, + { + "completion_length": 165.5, + "epoch": 0.47485536270583, + "grad_norm": 0.6301455497741699, + "kl": 0.06556437909603119, + "learning_rate": 3.148923372736221e-06, + "loss": 0.0026, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1067 + }, + { + "completion_length": 151.1666717529297, + "epoch": 0.4753004005340454, + "grad_norm": 0.6602085828781128, + "kl": 0.0660819336771965, + "learning_rate": 3.14517146228301e-06, + "loss": 0.0026, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1068 + }, + { + "completion_length": 157.1666717529297, + "epoch": 0.4757454383622608, + "grad_norm": 0.06263867020606995, + "kl": 0.06300581991672516, + "learning_rate": 3.141417994385566e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1069 + }, + { + "completion_length": 167.33334350585938, + "epoch": 0.47619047619047616, + "grad_norm": 0.027167366817593575, + "kl": 0.060957591980695724, + "learning_rate": 3.1376629781047642e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1070 + }, + { + "completion_length": 118.16667175292969, + "epoch": 0.4766355140186916, + "grad_norm": 0.03084111399948597, + "kl": 0.07735933363437653, + "learning_rate": 3.1339064225052153e-06, + "loss": 0.0034, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1071 + }, + { + "completion_length": 145.6666717529297, + "epoch": 0.477080551846907, + "grad_norm": 0.04745664820075035, + "kl": 0.08085335791110992, + "learning_rate": 3.1301483366552455e-06, + "loss": 0.0035, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1072 + }, + { + "completion_length": 180.1666717529297, + "epoch": 0.4775255896751224, + "grad_norm": 0.7254480123519897, + "kl": 0.055918075144290924, + "learning_rate": 3.1263887296268767e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1073 + }, + { + "completion_length": 192.33334350585938, + "epoch": 0.4779706275033378, + "grad_norm": 0.6751211285591125, + "kl": 0.052202414721250534, + "learning_rate": 3.122627610495803e-06, + "loss": 0.0021, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1074 + }, + { + "completion_length": 136.83334350585938, + "epoch": 0.47841566533155316, + "grad_norm": 0.8842310309410095, + "kl": 0.05041445419192314, + "learning_rate": 3.1188649883413665e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1075 + }, + { + "completion_length": 148.6666717529297, + "epoch": 0.4788607031597686, + "grad_norm": 0.02766844630241394, + "kl": 0.07211090624332428, + "learning_rate": 3.1151008722465398e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1076 + }, + { + "completion_length": 200.0, + "epoch": 0.47930574098798395, + "grad_norm": 0.019513975828886032, + "kl": 0.04784202575683594, + "learning_rate": 3.1113352712979e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1077 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.4797507788161994, + "grad_norm": 0.9397275447845459, + "kl": 0.06918388605117798, + "learning_rate": 3.10756819458561e-06, + "loss": 0.0028, + "reward": 0.2759999930858612, + "reward_std": 0.24494898319244385, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2759999930858612, + "step": 1078 + }, + { + "completion_length": 153.1666717529297, + "epoch": 0.4801958166444148, + "grad_norm": 0.023615337908267975, + "kl": 0.05814233422279358, + "learning_rate": 3.1037996512033963e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1079 + }, + { + "completion_length": 200.0, + "epoch": 0.48064085447263016, + "grad_norm": 0.7462202310562134, + "kl": 0.04940864071249962, + "learning_rate": 3.1000296502485226e-06, + "loss": 0.002, + "reward": 0.18850001692771912, + "reward_std": 0.15064363181591034, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.18850001692771912, + "step": 1080 + }, + { + "completion_length": 153.83334350585938, + "epoch": 0.4810858923008456, + "grad_norm": 0.7381025552749634, + "kl": 0.05779881775379181, + "learning_rate": 3.096258200821774e-06, + "loss": 0.0023, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1081 + }, + { + "completion_length": 176.1666717529297, + "epoch": 0.48153093012906095, + "grad_norm": 0.7282528877258301, + "kl": 0.05618796870112419, + "learning_rate": 3.0924853120274313e-06, + "loss": 0.0022, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 1082 + }, + { + "completion_length": 159.5, + "epoch": 0.4819759679572764, + "grad_norm": 0.6911483407020569, + "kl": 0.06526792049407959, + "learning_rate": 3.0887109929732496e-06, + "loss": 0.0026, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1083 + }, + { + "completion_length": 165.33334350585938, + "epoch": 0.48242100578549174, + "grad_norm": 0.8593658208847046, + "kl": 0.05354367941617966, + "learning_rate": 3.084935252770437e-06, + "loss": 0.0021, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 1084 + }, + { + "completion_length": 133.83334350585938, + "epoch": 0.48286604361370716, + "grad_norm": 0.7801674604415894, + "kl": 0.0553756058216095, + "learning_rate": 3.081158100533633e-06, + "loss": 0.0022, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1085 + }, + { + "completion_length": 137.6666717529297, + "epoch": 0.4833110814419226, + "grad_norm": 0.020206518471240997, + "kl": 0.061163198202848434, + "learning_rate": 3.0773795453808842e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1086 + }, + { + "completion_length": 177.5, + "epoch": 0.48375611927013795, + "grad_norm": 0.655252993106842, + "kl": 0.05324871465563774, + "learning_rate": 3.073599596433624e-06, + "loss": 0.0021, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1087 + }, + { + "completion_length": 145.33334350585938, + "epoch": 0.4842011570983534, + "grad_norm": 0.02423451840877533, + "kl": 0.0650758221745491, + "learning_rate": 3.069818262816653e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1088 + }, + { + "completion_length": 192.6666717529297, + "epoch": 0.48464619492656874, + "grad_norm": 0.6816695928573608, + "kl": 0.05046912655234337, + "learning_rate": 3.0660355536581104e-06, + "loss": 0.002, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1089 + }, + { + "completion_length": 162.1666717529297, + "epoch": 0.48509123275478416, + "grad_norm": 0.033472057431936264, + "kl": 0.05764755234122276, + "learning_rate": 3.0622514780894592e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1090 + }, + { + "completion_length": 121.33333587646484, + "epoch": 0.48553627058299953, + "grad_norm": 0.028588904067873955, + "kl": 0.0573783814907074, + "learning_rate": 3.0584660452454596e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1091 + }, + { + "completion_length": 116.33333587646484, + "epoch": 0.48598130841121495, + "grad_norm": 0.9297807812690735, + "kl": 0.061338119208812714, + "learning_rate": 3.054679264264148e-06, + "loss": 0.0025, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1092 + }, + { + "completion_length": 157.5, + "epoch": 0.4864263462394304, + "grad_norm": 0.01856524497270584, + "kl": 0.05327138304710388, + "learning_rate": 3.0508911442868155e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1093 + }, + { + "completion_length": 181.1666717529297, + "epoch": 0.48687138406764574, + "grad_norm": 0.8838992714881897, + "kl": 0.06359528750181198, + "learning_rate": 3.047101694457987e-06, + "loss": 0.0025, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1094 + }, + { + "completion_length": 192.33334350585938, + "epoch": 0.48731642189586116, + "grad_norm": 0.7542251348495483, + "kl": 0.0404694564640522, + "learning_rate": 3.0433109239253937e-06, + "loss": 0.0016, + "reward": 0.20866666734218597, + "reward_std": 0.12961582839488983, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.20866666734218597, + "step": 1095 + }, + { + "completion_length": 168.33334350585938, + "epoch": 0.48776145972407653, + "grad_norm": 0.8963215947151184, + "kl": 0.05817902833223343, + "learning_rate": 3.0395188418399597e-06, + "loss": 0.0023, + "reward": 0.33416664600372314, + "reward_std": 0.10247032344341278, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1096 + }, + { + "completion_length": 139.5, + "epoch": 0.48820649755229195, + "grad_norm": 0.02465493232011795, + "kl": 0.06517083942890167, + "learning_rate": 3.035725457355773e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1097 + }, + { + "completion_length": 164.1666717529297, + "epoch": 0.4886515353805073, + "grad_norm": 0.7453930974006653, + "kl": 0.09313317388296127, + "learning_rate": 3.0319307796300634e-06, + "loss": 0.0037, + "reward": 0.26016664505004883, + "reward_std": 0.2837325930595398, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.26016664505004883, + "step": 1098 + }, + { + "completion_length": 194.1666717529297, + "epoch": 0.48909657320872274, + "grad_norm": 0.822679340839386, + "kl": 0.04898854345083237, + "learning_rate": 3.028134817823187e-06, + "loss": 0.002, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1099 + }, + { + "completion_length": 116.5, + "epoch": 0.48954161103693816, + "grad_norm": 0.04699001461267471, + "kl": 0.08001024276018143, + "learning_rate": 3.024337581098597e-06, + "loss": 0.0035, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1100 + }, + { + "completion_length": 108.5, + "epoch": 0.48998664886515353, + "grad_norm": 0.9832037687301636, + "kl": 0.05540832504630089, + "learning_rate": 3.0205390786228244e-06, + "loss": 0.0022, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1101 + }, + { + "completion_length": 157.1666717529297, + "epoch": 0.49043168669336895, + "grad_norm": 0.7491467595100403, + "kl": 0.04887532815337181, + "learning_rate": 3.016739319565457e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1102 + }, + { + "completion_length": 194.33334350585938, + "epoch": 0.4908767245215843, + "grad_norm": 0.6997105479240417, + "kl": 0.04166887700557709, + "learning_rate": 3.0129383130991142e-06, + "loss": 0.0017, + "reward": 0.25033333897590637, + "reward_std": 0.1122509092092514, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25033333897590637, + "step": 1103 + }, + { + "completion_length": 165.33334350585938, + "epoch": 0.49132176234979974, + "grad_norm": 0.8059443235397339, + "kl": 0.12227432429790497, + "learning_rate": 3.009136068399427e-06, + "loss": 0.0049, + "reward": 0.25866666436195374, + "reward_std": 0.2874068021774292, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25866663455963135, + "step": 1104 + }, + { + "completion_length": 194.33334350585938, + "epoch": 0.4917668001780151, + "grad_norm": 0.9285522103309631, + "kl": 0.05058509111404419, + "learning_rate": 3.005332594645018e-06, + "loss": 0.002, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1105 + }, + { + "completion_length": 189.1666717529297, + "epoch": 0.49221183800623053, + "grad_norm": 0.7666235566139221, + "kl": 0.07048574090003967, + "learning_rate": 3.0015279010174725e-06, + "loss": 0.0028, + "reward": 0.28333333134651184, + "reward_std": 0.17277345061302185, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.28333330154418945, + "step": 1106 + }, + { + "completion_length": 145.1666717529297, + "epoch": 0.49265687583444595, + "grad_norm": 0.020632104948163033, + "kl": 0.04825424402952194, + "learning_rate": 2.9977219967013237e-06, + "loss": 0.0022, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1107 + }, + { + "completion_length": 133.5, + "epoch": 0.4931019136626613, + "grad_norm": 0.04588095471262932, + "kl": 0.07726660370826721, + "learning_rate": 2.993914890884027e-06, + "loss": 0.0034, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1108 + }, + { + "completion_length": 160.33334350585938, + "epoch": 0.49354695149087674, + "grad_norm": 0.7883959412574768, + "kl": 0.0675104409456253, + "learning_rate": 2.990106592755937e-06, + "loss": 0.0027, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1109 + }, + { + "completion_length": 168.33334350585938, + "epoch": 0.4939919893190921, + "grad_norm": 0.7340433597564697, + "kl": 0.04423590004444122, + "learning_rate": 2.9862971115102877e-06, + "loss": 0.0018, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1110 + }, + { + "completion_length": 146.1666717529297, + "epoch": 0.49443702714730753, + "grad_norm": 0.04487235099077225, + "kl": 0.07285156100988388, + "learning_rate": 2.9824864563431684e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1111 + }, + { + "completion_length": 173.0, + "epoch": 0.4948820649755229, + "grad_norm": 0.7536811232566833, + "kl": 0.05195162817835808, + "learning_rate": 2.978674636453503e-06, + "loss": 0.0021, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1112 + }, + { + "completion_length": 178.33334350585938, + "epoch": 0.4953271028037383, + "grad_norm": 0.7448456287384033, + "kl": 0.054630979895591736, + "learning_rate": 2.9748616610430266e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1113 + }, + { + "completion_length": 174.0, + "epoch": 0.49577214063195374, + "grad_norm": 0.7480930089950562, + "kl": 0.07871096581220627, + "learning_rate": 2.971047539316263e-06, + "loss": 0.0031, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1114 + }, + { + "completion_length": 168.6666717529297, + "epoch": 0.4962171784601691, + "grad_norm": 0.030191617086529732, + "kl": 0.061308782547712326, + "learning_rate": 2.9672322804805048e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1115 + }, + { + "completion_length": 200.0, + "epoch": 0.49666221628838453, + "grad_norm": 0.031045343726873398, + "kl": 0.05266506224870682, + "learning_rate": 2.9634158937457886e-06, + "loss": 0.0021, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1116 + }, + { + "completion_length": 141.5, + "epoch": 0.4971072541165999, + "grad_norm": 0.029410747811198235, + "kl": 0.05735943466424942, + "learning_rate": 2.9595983883248736e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1117 + }, + { + "completion_length": 139.1666717529297, + "epoch": 0.4975522919448153, + "grad_norm": 0.7413390874862671, + "kl": 0.06353849172592163, + "learning_rate": 2.9557797734332196e-06, + "loss": 0.0025, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1118 + }, + { + "completion_length": 115.5, + "epoch": 0.4979973297730307, + "grad_norm": 0.023313088342547417, + "kl": 0.0640382319688797, + "learning_rate": 2.9519600582889657e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1119 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.4984423676012461, + "grad_norm": 0.023131540045142174, + "kl": 0.05825777351856232, + "learning_rate": 2.9481392521129047e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1120 + }, + { + "completion_length": 183.0, + "epoch": 0.49888740542946153, + "grad_norm": 0.733018159866333, + "kl": 0.0460851714015007, + "learning_rate": 2.9443173641284663e-06, + "loss": 0.0018, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1121 + }, + { + "completion_length": 186.0, + "epoch": 0.4993324432576769, + "grad_norm": 0.7658995985984802, + "kl": 0.039262425154447556, + "learning_rate": 2.9404944035616893e-06, + "loss": 0.0016, + "reward": 0.27133333683013916, + "reward_std": 0.12340772151947021, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 1122 + }, + { + "completion_length": 95.33333587646484, + "epoch": 0.4997774810858923, + "grad_norm": 0.9063270092010498, + "kl": 0.05967504531145096, + "learning_rate": 2.9366703796412022e-06, + "loss": 0.0024, + "reward": 0.39666664600372314, + "reward_std": 0.05062279850244522, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 1123 + }, + { + "completion_length": 190.6666717529297, + "epoch": 0.5002225189141077, + "grad_norm": 0.8011205792427063, + "kl": 0.07317472994327545, + "learning_rate": 2.9328453015982005e-06, + "loss": 0.0029, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1124 + }, + { + "completion_length": 193.1666717529297, + "epoch": 0.5006675567423231, + "grad_norm": 0.7241079211235046, + "kl": 0.2021249532699585, + "learning_rate": 2.9290191786664253e-06, + "loss": 0.0081, + "reward": 0.10083332657814026, + "reward_std": 0.3284243941307068, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.10083332657814026, + "step": 1125 + }, + { + "completion_length": 190.33334350585938, + "epoch": 0.5011125945705385, + "grad_norm": 0.8474998474121094, + "kl": 0.04898199066519737, + "learning_rate": 2.9251920200821383e-06, + "loss": 0.002, + "reward": 0.2711666524410248, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 1126 + }, + { + "completion_length": 183.6666717529297, + "epoch": 0.5015576323987538, + "grad_norm": 0.686153769493103, + "kl": 0.04468465596437454, + "learning_rate": 2.9213638350841026e-06, + "loss": 0.0018, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1127 + }, + { + "completion_length": 199.0, + "epoch": 0.5020026702269693, + "grad_norm": 0.7068986296653748, + "kl": 0.04341750591993332, + "learning_rate": 2.9175346329135596e-06, + "loss": 0.0017, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1128 + }, + { + "completion_length": 155.0, + "epoch": 0.5024477080551847, + "grad_norm": 0.03027581423521042, + "kl": 0.05036742985248566, + "learning_rate": 2.9137044228142035e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1129 + }, + { + "completion_length": 144.1666717529297, + "epoch": 0.5028927458834, + "grad_norm": 0.019624266773462296, + "kl": 0.05049632117152214, + "learning_rate": 2.909873214032165e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1130 + }, + { + "completion_length": 181.6666717529297, + "epoch": 0.5033377837116155, + "grad_norm": 0.7808805108070374, + "kl": 0.11239727586507797, + "learning_rate": 2.906041015815983e-06, + "loss": 0.0045, + "reward": 0.10783332586288452, + "reward_std": 0.32281166315078735, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.10783332586288452, + "step": 1131 + }, + { + "completion_length": 164.6666717529297, + "epoch": 0.5037828215398309, + "grad_norm": 0.7809410691261292, + "kl": 0.05724121630191803, + "learning_rate": 2.9022078374165863e-06, + "loss": 0.0023, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1132 + }, + { + "completion_length": 171.6666717529297, + "epoch": 0.5042278593680463, + "grad_norm": 0.7676877379417419, + "kl": 0.07378482818603516, + "learning_rate": 2.8983736880872697e-06, + "loss": 0.003, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1133 + }, + { + "completion_length": 189.33334350585938, + "epoch": 0.5046728971962616, + "grad_norm": 0.7471271753311157, + "kl": 0.04732378572225571, + "learning_rate": 2.894538577083671e-06, + "loss": 0.0019, + "reward": 0.29216665029525757, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216665029525757, + "step": 1134 + }, + { + "completion_length": 138.1666717529297, + "epoch": 0.5051179350244771, + "grad_norm": 0.7307435274124146, + "kl": 0.057880111038684845, + "learning_rate": 2.8907025136637505e-06, + "loss": 0.0023, + "reward": 0.3084999918937683, + "reward_std": 0.16534054279327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3084999918937683, + "step": 1135 + }, + { + "completion_length": 171.1666717529297, + "epoch": 0.5055629728526925, + "grad_norm": 0.6487751603126526, + "kl": 0.04655180498957634, + "learning_rate": 2.8868655070877676e-06, + "loss": 0.0019, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1136 + }, + { + "completion_length": 183.0, + "epoch": 0.5060080106809078, + "grad_norm": 0.9664366841316223, + "kl": 0.05927140638232231, + "learning_rate": 2.8830275666182566e-06, + "loss": 0.0024, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1137 + }, + { + "completion_length": 200.0, + "epoch": 0.5064530485091233, + "grad_norm": 0.03512910008430481, + "kl": 0.05313728377223015, + "learning_rate": 2.879188701520009e-06, + "loss": 0.0021, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1138 + }, + { + "completion_length": 154.1666717529297, + "epoch": 0.5068980863373387, + "grad_norm": 0.7382206320762634, + "kl": 0.051763903349637985, + "learning_rate": 2.875348921060047e-06, + "loss": 0.0021, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1139 + }, + { + "completion_length": 135.1666717529297, + "epoch": 0.507343124165554, + "grad_norm": 0.7762537598609924, + "kl": 0.07914315164089203, + "learning_rate": 2.8715082345076022e-06, + "loss": 0.0032, + "reward": 0.2789999842643738, + "reward_std": 0.23760050535202026, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2789999842643738, + "step": 1140 + }, + { + "completion_length": 119.5, + "epoch": 0.5077881619937694, + "grad_norm": 0.028253188356757164, + "kl": 0.0685911476612091, + "learning_rate": 2.8676666511340946e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1141 + }, + { + "completion_length": 180.0, + "epoch": 0.5082331998219849, + "grad_norm": 0.915734052658081, + "kl": 0.05604727193713188, + "learning_rate": 2.8638241802131068e-06, + "loss": 0.0022, + "reward": 0.16566666960716248, + "reward_std": 0.2729935050010681, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.16566666960716248, + "step": 1142 + }, + { + "completion_length": 181.83334350585938, + "epoch": 0.5086782376502003, + "grad_norm": 0.8028380870819092, + "kl": 0.05204429477453232, + "learning_rate": 2.8599808310203662e-06, + "loss": 0.0021, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1143 + }, + { + "completion_length": 106.66667175292969, + "epoch": 0.5091232754784156, + "grad_norm": 0.026731373742222786, + "kl": 0.0580553337931633, + "learning_rate": 2.8561366128337213e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1144 + }, + { + "completion_length": 166.83334350585938, + "epoch": 0.5095683133066311, + "grad_norm": 0.6618978381156921, + "kl": 0.04359781742095947, + "learning_rate": 2.852291534933114e-06, + "loss": 0.0017, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1145 + }, + { + "completion_length": 171.83334350585938, + "epoch": 0.5100133511348465, + "grad_norm": 0.7761538028717041, + "kl": 0.05730967968702316, + "learning_rate": 2.848445606600567e-06, + "loss": 0.0023, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1146 + }, + { + "completion_length": 171.33334350585938, + "epoch": 0.5104583889630618, + "grad_norm": 8.721156120300293, + "kl": 2.40242338180542, + "learning_rate": 2.844598837120151e-06, + "loss": 0.0961, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1147 + }, + { + "completion_length": 200.0, + "epoch": 0.5109034267912772, + "grad_norm": 0.7378061413764954, + "kl": 0.046708278357982635, + "learning_rate": 2.8407512357779703e-06, + "loss": 0.0019, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 1148 + }, + { + "completion_length": 162.5, + "epoch": 0.5113484646194927, + "grad_norm": 0.7555989623069763, + "kl": 0.049754880368709564, + "learning_rate": 2.836902811862136e-06, + "loss": 0.002, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1149 + }, + { + "completion_length": 133.33334350585938, + "epoch": 0.511793502447708, + "grad_norm": 0.022492585703730583, + "kl": 0.06045402213931084, + "learning_rate": 2.833053574662747e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1150 + }, + { + "completion_length": 193.0, + "epoch": 0.5122385402759234, + "grad_norm": 0.8129910230636597, + "kl": 0.03598358854651451, + "learning_rate": 2.8292035334718617e-06, + "loss": 0.0014, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 1151 + }, + { + "completion_length": 128.1666717529297, + "epoch": 0.5126835781041389, + "grad_norm": 0.8406050801277161, + "kl": 0.07259870320558548, + "learning_rate": 2.8253526975834824e-06, + "loss": 0.0029, + "reward": 0.33416664600372314, + "reward_std": 0.064808689057827, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1152 + }, + { + "completion_length": 182.1666717529297, + "epoch": 0.5131286159323543, + "grad_norm": 0.7992534041404724, + "kl": 0.04329013079404831, + "learning_rate": 2.821501076293529e-06, + "loss": 0.0017, + "reward": 0.27133333683013916, + "reward_std": 0.12340772151947021, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 1153 + }, + { + "completion_length": 157.5, + "epoch": 0.5135736537605696, + "grad_norm": 0.8890674710273743, + "kl": 0.07445360720157623, + "learning_rate": 2.8176486788998168e-06, + "loss": 0.003, + "reward": 0.31316667795181274, + "reward_std": 0.0688314288854599, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1154 + }, + { + "completion_length": 185.0, + "epoch": 0.514018691588785, + "grad_norm": 0.7826995849609375, + "kl": 0.05238564312458038, + "learning_rate": 2.813795514702036e-06, + "loss": 0.0021, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1155 + }, + { + "completion_length": 194.1666717529297, + "epoch": 0.5144637294170005, + "grad_norm": 0.6543252468109131, + "kl": 0.04138343781232834, + "learning_rate": 2.8099415930017254e-06, + "loss": 0.0017, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1156 + }, + { + "completion_length": 150.0, + "epoch": 0.5149087672452158, + "grad_norm": 0.022047659382224083, + "kl": 0.054192956537008286, + "learning_rate": 2.806086923102255e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1157 + }, + { + "completion_length": 195.6666717529297, + "epoch": 0.5153538050734312, + "grad_norm": 0.7176312804222107, + "kl": 0.036622267216444016, + "learning_rate": 2.802231514308799e-06, + "loss": 0.0015, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1158 + }, + { + "completion_length": 133.0, + "epoch": 0.5157988429016467, + "grad_norm": 0.024259619414806366, + "kl": 0.0638909861445427, + "learning_rate": 2.798375375928318e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1159 + }, + { + "completion_length": 162.5, + "epoch": 0.516243880729862, + "grad_norm": 0.6893110275268555, + "kl": 0.0569378100335598, + "learning_rate": 2.7945185172695295e-06, + "loss": 0.0023, + "reward": 0.33416664600372314, + "reward_std": 0.10247032344341278, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1160 + }, + { + "completion_length": 158.6666717529297, + "epoch": 0.5166889185580774, + "grad_norm": 0.028980251401662827, + "kl": 0.06851427257061005, + "learning_rate": 2.7906609476428935e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1161 + }, + { + "completion_length": 131.33334350585938, + "epoch": 0.5171339563862928, + "grad_norm": 1.0463398694992065, + "kl": 0.11661326885223389, + "learning_rate": 2.7868026763605854e-06, + "loss": 0.0047, + "reward": 0.32233333587646484, + "reward_std": 0.1314559429883957, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.32233333587646484, + "step": 1162 + }, + { + "completion_length": 166.1666717529297, + "epoch": 0.5175789942145083, + "grad_norm": 0.6688762307167053, + "kl": 0.05393125116825104, + "learning_rate": 2.782943712736473e-06, + "loss": 0.0022, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1163 + }, + { + "completion_length": 125.66667175292969, + "epoch": 0.5180240320427236, + "grad_norm": 0.7415357232093811, + "kl": 0.06435568630695343, + "learning_rate": 2.7790840660860973e-06, + "loss": 0.0026, + "reward": 0.29216665029525757, + "reward_std": 0.2053488940000534, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216665029525757, + "step": 1164 + }, + { + "completion_length": 153.6666717529297, + "epoch": 0.518469069870939, + "grad_norm": 0.7977253198623657, + "kl": 0.05409222096204758, + "learning_rate": 2.775223745726646e-06, + "loss": 0.0022, + "reward": 0.33416664600372314, + "reward_std": 0.064808689057827, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1165 + }, + { + "completion_length": 137.33334350585938, + "epoch": 0.5189141076991545, + "grad_norm": 0.8917381167411804, + "kl": 0.05699780583381653, + "learning_rate": 2.7713627609769363e-06, + "loss": 0.0023, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1166 + }, + { + "completion_length": 127.83333587646484, + "epoch": 0.5193591455273698, + "grad_norm": 0.026533007621765137, + "kl": 0.06946979463100433, + "learning_rate": 2.767501121157386e-06, + "loss": 0.0031, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1167 + }, + { + "completion_length": 151.6666717529297, + "epoch": 0.5198041833555852, + "grad_norm": 0.02679111249744892, + "kl": 0.058871589601039886, + "learning_rate": 2.763638835589995e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1168 + }, + { + "completion_length": 196.83334350585938, + "epoch": 0.5202492211838006, + "grad_norm": 0.6838176846504211, + "kl": 0.0421440526843071, + "learning_rate": 2.7597759135983237e-06, + "loss": 0.0017, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1169 + }, + { + "completion_length": 152.6666717529297, + "epoch": 0.520694259012016, + "grad_norm": 0.6762892007827759, + "kl": 0.0611237995326519, + "learning_rate": 2.755912364507468e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1170 + }, + { + "completion_length": 164.5, + "epoch": 0.5211392968402314, + "grad_norm": 0.025890666991472244, + "kl": 0.056531667709350586, + "learning_rate": 2.752048197644036e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1171 + }, + { + "completion_length": 154.83334350585938, + "epoch": 0.5215843346684468, + "grad_norm": 0.6817211508750916, + "kl": 0.0709918960928917, + "learning_rate": 2.7481834223361294e-06, + "loss": 0.0028, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1172 + }, + { + "completion_length": 153.33334350585938, + "epoch": 0.5220293724966623, + "grad_norm": 0.8292616605758667, + "kl": 0.0703635960817337, + "learning_rate": 2.744318047913318e-06, + "loss": 0.0028, + "reward": 0.21949999034404755, + "reward_std": 0.22260615229606628, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.21949999034404755, + "step": 1173 + }, + { + "completion_length": 143.0, + "epoch": 0.5224744103248776, + "grad_norm": 0.01640567183494568, + "kl": 0.054961517453193665, + "learning_rate": 2.7404520837066163e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1174 + }, + { + "completion_length": 191.1666717529297, + "epoch": 0.522919448153093, + "grad_norm": 0.7560374140739441, + "kl": 0.04697653651237488, + "learning_rate": 2.7365855390484646e-06, + "loss": 0.0019, + "reward": 0.27116668224334717, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 1175 + }, + { + "completion_length": 170.83334350585938, + "epoch": 0.5233644859813084, + "grad_norm": 0.651421070098877, + "kl": 0.1001739650964737, + "learning_rate": 2.7327184232727037e-06, + "loss": 0.004, + "reward": 0.2094999998807907, + "reward_std": 0.34976324439048767, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2094999998807907, + "step": 1176 + }, + { + "completion_length": 177.5, + "epoch": 0.5238095238095238, + "grad_norm": 0.6764176487922668, + "kl": 0.059600621461868286, + "learning_rate": 2.728850745714553e-06, + "loss": 0.0024, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1177 + }, + { + "completion_length": 147.33334350585938, + "epoch": 0.5242545616377392, + "grad_norm": 0.7848303318023682, + "kl": 0.05990871787071228, + "learning_rate": 2.724982515710588e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1178 + }, + { + "completion_length": 164.6666717529297, + "epoch": 0.5246995994659546, + "grad_norm": 0.6674816012382507, + "kl": 0.054695215076208115, + "learning_rate": 2.7211137425987178e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1179 + }, + { + "completion_length": 179.83334350585938, + "epoch": 0.52514463729417, + "grad_norm": 0.6498001217842102, + "kl": 0.05996771901845932, + "learning_rate": 2.7172444357181628e-06, + "loss": 0.0024, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1180 + }, + { + "completion_length": 186.5, + "epoch": 0.5255896751223854, + "grad_norm": 0.7126318216323853, + "kl": 0.06885448098182678, + "learning_rate": 2.7133746044094315e-06, + "loss": 0.0028, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1181 + }, + { + "completion_length": 165.6666717529297, + "epoch": 0.5260347129506008, + "grad_norm": 0.8802997469902039, + "kl": 0.066841721534729, + "learning_rate": 2.7095042580142984e-06, + "loss": 0.0027, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1182 + }, + { + "completion_length": 179.1666717529297, + "epoch": 0.5264797507788161, + "grad_norm": 0.7791231274604797, + "kl": 0.04690280556678772, + "learning_rate": 2.705633405875782e-06, + "loss": 0.0019, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1183 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.5269247886070316, + "grad_norm": 0.9081090092658997, + "kl": 0.07322704046964645, + "learning_rate": 2.701762057338122e-06, + "loss": 0.0029, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1184 + }, + { + "completion_length": 116.33333587646484, + "epoch": 0.527369826435247, + "grad_norm": 0.03892948850989342, + "kl": 0.07545986026525497, + "learning_rate": 2.697890221746754e-06, + "loss": 0.0033, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1185 + }, + { + "completion_length": 150.33334350585938, + "epoch": 0.5278148642634624, + "grad_norm": 0.03163018450140953, + "kl": 0.08749519288539886, + "learning_rate": 2.6940179084482927e-06, + "loss": 0.0038, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1186 + }, + { + "completion_length": 143.33334350585938, + "epoch": 0.5282599020916778, + "grad_norm": 0.02014567144215107, + "kl": 0.05517182871699333, + "learning_rate": 2.690145126790503e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1187 + }, + { + "completion_length": 116.16667175292969, + "epoch": 0.5287049399198932, + "grad_norm": 0.026103656738996506, + "kl": 0.06503650546073914, + "learning_rate": 2.686271886122283e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1188 + }, + { + "completion_length": 85.5, + "epoch": 0.5291499777481086, + "grad_norm": 1.503110647201538, + "kl": 0.07808747887611389, + "learning_rate": 2.6823981957936363e-06, + "loss": 0.0031, + "reward": 0.39666664600372314, + "reward_std": 0.05062279850244522, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 1189 + }, + { + "completion_length": 183.1666717529297, + "epoch": 0.5295950155763239, + "grad_norm": 0.7100231647491455, + "kl": 0.059634458273649216, + "learning_rate": 2.678524065155655e-06, + "loss": 0.0024, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1190 + }, + { + "completion_length": 120.0, + "epoch": 0.5300400534045394, + "grad_norm": 0.03600943461060524, + "kl": 0.0854596421122551, + "learning_rate": 2.67464950356049e-06, + "loss": 0.0037, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1191 + }, + { + "completion_length": 192.1666717529297, + "epoch": 0.5304850912327548, + "grad_norm": 0.761343777179718, + "kl": 0.15199977159500122, + "learning_rate": 2.670774520361337e-06, + "loss": 0.0061, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1192 + }, + { + "completion_length": 150.0, + "epoch": 0.5309301290609701, + "grad_norm": 0.7439923286437988, + "kl": 0.08151707053184509, + "learning_rate": 2.666899124912407e-06, + "loss": 0.0033, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1193 + }, + { + "completion_length": 184.6666717529297, + "epoch": 0.5313751668891856, + "grad_norm": 0.7151275277137756, + "kl": 0.08384295552968979, + "learning_rate": 2.6630233265689053e-06, + "loss": 0.0034, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1194 + }, + { + "completion_length": 157.0, + "epoch": 0.531820204717401, + "grad_norm": 0.8445621132850647, + "kl": 0.07981076091527939, + "learning_rate": 2.659147134687013e-06, + "loss": 0.0032, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1195 + }, + { + "completion_length": 171.1666717529297, + "epoch": 0.5322652425456164, + "grad_norm": 0.9164091348648071, + "kl": 0.05100735276937485, + "learning_rate": 2.6552705586238575e-06, + "loss": 0.002, + "reward": 0.33416664600372314, + "reward_std": 0.064808689057827, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1196 + }, + { + "completion_length": 200.0, + "epoch": 0.5327102803738317, + "grad_norm": 0.7608870267868042, + "kl": 0.06349802017211914, + "learning_rate": 2.651393607737496e-06, + "loss": 0.0025, + "reward": 0.12850001454353333, + "reward_std": 0.2976129949092865, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.12850001454353333, + "step": 1197 + }, + { + "completion_length": 182.6666717529297, + "epoch": 0.5331553182020472, + "grad_norm": 0.8513096570968628, + "kl": 0.06429284065961838, + "learning_rate": 2.6475162913868903e-06, + "loss": 0.0026, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1198 + }, + { + "completion_length": 169.6666717529297, + "epoch": 0.5336003560302626, + "grad_norm": 0.7032941579818726, + "kl": 0.05037330836057663, + "learning_rate": 2.643638618931883e-06, + "loss": 0.002, + "reward": 0.2561666667461395, + "reward_std": 0.29353052377700806, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2561666667461395, + "step": 1199 + }, + { + "completion_length": 184.33334350585938, + "epoch": 0.5340453938584779, + "grad_norm": 1.026814341545105, + "kl": 0.046553052961826324, + "learning_rate": 2.639760599733178e-06, + "loss": 0.0019, + "reward": 0.15850000083446503, + "reward_std": 0.10841356217861176, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.15850000083446503, + "step": 1200 + }, + { + "completion_length": 150.83334350585938, + "epoch": 0.5344904316866934, + "grad_norm": 0.8465564846992493, + "kl": 0.08807514607906342, + "learning_rate": 2.635882243152316e-06, + "loss": 0.0035, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1201 + }, + { + "completion_length": 200.0, + "epoch": 0.5349354695149088, + "grad_norm": 0.026606876403093338, + "kl": 0.047713205218315125, + "learning_rate": 2.6320035585516513e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1202 + }, + { + "completion_length": 164.0, + "epoch": 0.5353805073431241, + "grad_norm": 0.8893852233886719, + "kl": 0.04501502960920334, + "learning_rate": 2.6281245552943297e-06, + "loss": 0.0018, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1203 + }, + { + "completion_length": 179.1666717529297, + "epoch": 0.5358255451713395, + "grad_norm": 0.9132094979286194, + "kl": 0.0830874890089035, + "learning_rate": 2.624245242744268e-06, + "loss": 0.0033, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1204 + }, + { + "completion_length": 162.1666717529297, + "epoch": 0.536270582999555, + "grad_norm": 0.7258564829826355, + "kl": 0.050873756408691406, + "learning_rate": 2.6203656302661284e-06, + "loss": 0.002, + "reward": 0.3341667056083679, + "reward_std": 0.0648086816072464, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1205 + }, + { + "completion_length": 123.5, + "epoch": 0.5367156208277704, + "grad_norm": 0.9385852217674255, + "kl": 0.06338381767272949, + "learning_rate": 2.6164857272252975e-06, + "loss": 0.0025, + "reward": 0.39666664600372314, + "reward_std": 0.05062279850244522, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 1206 + }, + { + "completion_length": 176.1666717529297, + "epoch": 0.5371606586559857, + "grad_norm": 0.7586855888366699, + "kl": 0.05588201805949211, + "learning_rate": 2.6126055429878634e-06, + "loss": 0.0022, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1207 + }, + { + "completion_length": 139.1666717529297, + "epoch": 0.5376056964842012, + "grad_norm": 0.026297971606254578, + "kl": 0.06350646913051605, + "learning_rate": 2.608725086920591e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1208 + }, + { + "completion_length": 127.66667175292969, + "epoch": 0.5380507343124166, + "grad_norm": 1.102054238319397, + "kl": 0.16556766629219055, + "learning_rate": 2.6048443683909053e-06, + "loss": 0.0066, + "reward": 0.3264999985694885, + "reward_std": 0.1212497353553772, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3264999985694885, + "step": 1209 + }, + { + "completion_length": 152.6666717529297, + "epoch": 0.5384957721406319, + "grad_norm": 0.7634212374687195, + "kl": 0.09940703958272934, + "learning_rate": 2.6009633967668625e-06, + "loss": 0.004, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1210 + }, + { + "completion_length": 121.66667175292969, + "epoch": 0.5389408099688473, + "grad_norm": 0.07904371619224548, + "kl": 0.09018834680318832, + "learning_rate": 2.5970821814171287e-06, + "loss": 0.0039, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1211 + }, + { + "completion_length": 150.5, + "epoch": 0.5393858477970628, + "grad_norm": 0.06901979446411133, + "kl": 0.09860502183437347, + "learning_rate": 2.5932007317109607e-06, + "loss": 0.0042, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1212 + }, + { + "completion_length": 162.6666717529297, + "epoch": 0.5398308856252781, + "grad_norm": 0.030686013400554657, + "kl": 0.07230792939662933, + "learning_rate": 2.58931905701818e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1213 + }, + { + "completion_length": 194.5, + "epoch": 0.5402759234534935, + "grad_norm": 0.6610811352729797, + "kl": 0.04911746829748154, + "learning_rate": 2.585437166709151e-06, + "loss": 0.002, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1214 + }, + { + "completion_length": 137.5, + "epoch": 0.540720961281709, + "grad_norm": 0.04222337529063225, + "kl": 0.08580730855464935, + "learning_rate": 2.581555070154759e-06, + "loss": 0.0037, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1215 + }, + { + "completion_length": 161.1666717529297, + "epoch": 0.5411659991099244, + "grad_norm": 0.8334679007530212, + "kl": 0.05739726126194, + "learning_rate": 2.5776727767263878e-06, + "loss": 0.0023, + "reward": 0.29233333468437195, + "reward_std": 0.12961584329605103, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 1216 + }, + { + "completion_length": 134.33334350585938, + "epoch": 0.5416110369381397, + "grad_norm": 0.7179288864135742, + "kl": 0.1268427073955536, + "learning_rate": 2.5737902957958928e-06, + "loss": 0.0051, + "reward": 0.29783332347869873, + "reward_std": 0.19146844744682312, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29783332347869873, + "step": 1217 + }, + { + "completion_length": 148.33334350585938, + "epoch": 0.5420560747663551, + "grad_norm": 0.031933125108480453, + "kl": 0.06602717936038971, + "learning_rate": 2.5699076367355883e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1218 + }, + { + "completion_length": 154.1666717529297, + "epoch": 0.5425011125945706, + "grad_norm": 0.8263406157493591, + "kl": 0.0948314294219017, + "learning_rate": 2.5660248089182136e-06, + "loss": 0.0038, + "reward": 0.3316666781902313, + "reward_std": 0.0688670203089714, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3316666781902313, + "step": 1219 + }, + { + "completion_length": 161.1666717529297, + "epoch": 0.5429461504227859, + "grad_norm": 0.6751954555511475, + "kl": 0.07369253039360046, + "learning_rate": 2.5621418217169177e-06, + "loss": 0.0029, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1220 + }, + { + "completion_length": 138.0, + "epoch": 0.5433911882510013, + "grad_norm": 0.022621888667345047, + "kl": 0.06267684698104858, + "learning_rate": 2.5582586845052333e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1221 + }, + { + "completion_length": 168.0, + "epoch": 0.5438362260792168, + "grad_norm": 0.8740278482437134, + "kl": 0.05454988032579422, + "learning_rate": 2.554375406657054e-06, + "loss": 0.0022, + "reward": 0.29250001907348633, + "reward_std": 0.12935802340507507, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29250001907348633, + "step": 1222 + }, + { + "completion_length": 197.33334350585938, + "epoch": 0.5442812639074321, + "grad_norm": 0.6384437084197998, + "kl": 0.06101636588573456, + "learning_rate": 2.550491997546617e-06, + "loss": 0.0024, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1223 + }, + { + "completion_length": 126.0, + "epoch": 0.5447263017356475, + "grad_norm": 0.020438499748706818, + "kl": 0.06194034963846207, + "learning_rate": 2.5466084665484732e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1224 + }, + { + "completion_length": 179.0, + "epoch": 0.5451713395638629, + "grad_norm": 0.6573781967163086, + "kl": 0.0651036947965622, + "learning_rate": 2.542724823037467e-06, + "loss": 0.0026, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1225 + }, + { + "completion_length": 144.83334350585938, + "epoch": 0.5456163773920784, + "grad_norm": 0.04750434681773186, + "kl": 0.08231821656227112, + "learning_rate": 2.538841076388717e-06, + "loss": 0.0036, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1226 + }, + { + "completion_length": 133.0, + "epoch": 0.5460614152202937, + "grad_norm": 0.803379237651825, + "kl": 0.056590914726257324, + "learning_rate": 2.5349572359775894e-06, + "loss": 0.0023, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1227 + }, + { + "completion_length": 195.5, + "epoch": 0.5465064530485091, + "grad_norm": 1.080286979675293, + "kl": 0.0482148602604866, + "learning_rate": 2.5310733111796765e-06, + "loss": 0.0019, + "reward": 0.19833332300186157, + "reward_std": 0.21086929738521576, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.19833330810070038, + "step": 1228 + }, + { + "completion_length": 162.0, + "epoch": 0.5469514908767246, + "grad_norm": 0.02214636653661728, + "kl": 0.05421888083219528, + "learning_rate": 2.527189311370775e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1229 + }, + { + "completion_length": 182.33334350585938, + "epoch": 0.5473965287049399, + "grad_norm": 0.7057332992553711, + "kl": 0.07187958061695099, + "learning_rate": 2.523305245926862e-06, + "loss": 0.0029, + "reward": 0.27133333683013916, + "reward_std": 0.12340772151947021, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27133333683013916, + "step": 1230 + }, + { + "completion_length": 150.0, + "epoch": 0.5478415665331553, + "grad_norm": 0.019993474707007408, + "kl": 0.05282042548060417, + "learning_rate": 2.519421124224074e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1231 + }, + { + "completion_length": 131.33334350585938, + "epoch": 0.5482866043613707, + "grad_norm": 0.9813181757926941, + "kl": 0.06754153221845627, + "learning_rate": 2.5155369556386825e-06, + "loss": 0.0027, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1232 + }, + { + "completion_length": 174.0, + "epoch": 0.5487316421895861, + "grad_norm": 0.7997798919677734, + "kl": 0.08393299579620361, + "learning_rate": 2.5116527495470724e-06, + "loss": 0.0034, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1233 + }, + { + "completion_length": 188.5, + "epoch": 0.5491766800178015, + "grad_norm": 0.7237727642059326, + "kl": 0.049749068915843964, + "learning_rate": 2.5077685153257182e-06, + "loss": 0.002, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1234 + }, + { + "completion_length": 161.6666717529297, + "epoch": 0.5496217178460169, + "grad_norm": 0.9056436419487, + "kl": 0.04640985652804375, + "learning_rate": 2.503884262351165e-06, + "loss": 0.0019, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1235 + }, + { + "completion_length": 143.33334350585938, + "epoch": 0.5500667556742324, + "grad_norm": 0.8036392331123352, + "kl": 0.04953508824110031, + "learning_rate": 2.5e-06, + "loss": 0.002, + "reward": 0.37566667795181274, + "reward_std": 0.07905863225460052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 1236 + }, + { + "completion_length": 151.83334350585938, + "epoch": 0.5505117935024477, + "grad_norm": 0.7205859422683716, + "kl": 0.07808684557676315, + "learning_rate": 2.4961157376488352e-06, + "loss": 0.0031, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1237 + }, + { + "completion_length": 154.6666717529297, + "epoch": 0.5509568313306631, + "grad_norm": 0.017512843012809753, + "kl": 0.04894305393099785, + "learning_rate": 2.492231484674282e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1238 + }, + { + "completion_length": 175.0, + "epoch": 0.5514018691588785, + "grad_norm": 0.6931830644607544, + "kl": 0.05162414163351059, + "learning_rate": 2.488347250452929e-06, + "loss": 0.0021, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1239 + }, + { + "completion_length": 181.6666717529297, + "epoch": 0.5518469069870939, + "grad_norm": 0.6881082653999329, + "kl": 0.07192274183034897, + "learning_rate": 2.4844630443613183e-06, + "loss": 0.0029, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1240 + }, + { + "completion_length": 113.5, + "epoch": 0.5522919448153093, + "grad_norm": 0.023202555254101753, + "kl": 0.06096421927213669, + "learning_rate": 2.480578875775927e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1241 + }, + { + "completion_length": 179.0, + "epoch": 0.5527369826435247, + "grad_norm": 0.7318057417869568, + "kl": 0.09055332839488983, + "learning_rate": 2.4766947540731385e-06, + "loss": 0.0036, + "reward": 0.023666661232709885, + "reward_std": 0.36290258169174194, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.023666661232709885, + "step": 1242 + }, + { + "completion_length": 200.0, + "epoch": 0.5531820204717401, + "grad_norm": 0.6898524165153503, + "kl": 0.05797708034515381, + "learning_rate": 2.4728106886292257e-06, + "loss": 0.0023, + "reward": 0.14133334159851074, + "reward_std": 0.2661778926849365, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.14133334159851074, + "step": 1243 + }, + { + "completion_length": 200.0, + "epoch": 0.5536270582999555, + "grad_norm": 0.7336671352386475, + "kl": 0.05232497677206993, + "learning_rate": 2.4689266888203243e-06, + "loss": 0.0021, + "reward": 0.2083333432674408, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2083333432674408, + "step": 1244 + }, + { + "completion_length": 200.0, + "epoch": 0.5540720961281709, + "grad_norm": 0.024343255907297134, + "kl": 0.052714623510837555, + "learning_rate": 2.4650427640224115e-06, + "loss": 0.0021, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1245 + }, + { + "completion_length": 171.0, + "epoch": 0.5545171339563862, + "grad_norm": 0.6648960709571838, + "kl": 0.05184723436832428, + "learning_rate": 2.4611589236112834e-06, + "loss": 0.0021, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1246 + }, + { + "completion_length": 122.33333587646484, + "epoch": 0.5549621717846017, + "grad_norm": 0.7880151271820068, + "kl": 0.06251058727502823, + "learning_rate": 2.4572751769625334e-06, + "loss": 0.0025, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1247 + }, + { + "completion_length": 132.0, + "epoch": 0.5554072096128171, + "grad_norm": 0.025808745995163918, + "kl": 0.06488477438688278, + "learning_rate": 2.4533915334515276e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1248 + }, + { + "completion_length": 186.6666717529297, + "epoch": 0.5558522474410325, + "grad_norm": 0.8226630091667175, + "kl": 0.05653652548789978, + "learning_rate": 2.4495080024533833e-06, + "loss": 0.0023, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 1249 + }, + { + "completion_length": 139.33334350585938, + "epoch": 0.5562972852692479, + "grad_norm": 0.6765699982643127, + "kl": 0.0589500293135643, + "learning_rate": 2.4456245933429464e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1250 + }, + { + "completion_length": 182.5, + "epoch": 0.5567423230974633, + "grad_norm": 0.7177083492279053, + "kl": 0.054428085684776306, + "learning_rate": 2.441741315494768e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1251 + }, + { + "completion_length": 160.83334350585938, + "epoch": 0.5571873609256787, + "grad_norm": 0.907528817653656, + "kl": 0.054479341953992844, + "learning_rate": 2.437858178283083e-06, + "loss": 0.0022, + "reward": 0.3551666736602783, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35516664385795593, + "step": 1252 + }, + { + "completion_length": 165.83334350585938, + "epoch": 0.557632398753894, + "grad_norm": 0.7852663397789001, + "kl": 0.06126983463764191, + "learning_rate": 2.4339751910817868e-06, + "loss": 0.0025, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1253 + }, + { + "completion_length": 154.5, + "epoch": 0.5580774365821095, + "grad_norm": 0.6789471507072449, + "kl": 0.06228331848978996, + "learning_rate": 2.430092363264412e-06, + "loss": 0.0025, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1254 + }, + { + "completion_length": 195.33334350585938, + "epoch": 0.5585224744103249, + "grad_norm": 0.697045087814331, + "kl": 0.05558575317263603, + "learning_rate": 2.4262097042041076e-06, + "loss": 0.0022, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1255 + }, + { + "completion_length": 140.33334350585938, + "epoch": 0.5589675122385402, + "grad_norm": 0.03193692862987518, + "kl": 0.06003742665052414, + "learning_rate": 2.422327223273614e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1256 + }, + { + "completion_length": 200.0, + "epoch": 0.5594125500667557, + "grad_norm": 0.6868168115615845, + "kl": 0.052136264741420746, + "learning_rate": 2.4184449298452417e-06, + "loss": 0.0021, + "reward": 0.18816666305065155, + "reward_std": 0.17143210768699646, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.18816666305065155, + "step": 1257 + }, + { + "completion_length": 160.5, + "epoch": 0.5598575878949711, + "grad_norm": 0.7360550165176392, + "kl": 0.06848403811454773, + "learning_rate": 2.41456283329085e-06, + "loss": 0.0027, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1258 + }, + { + "completion_length": 170.5, + "epoch": 0.5603026257231865, + "grad_norm": 0.7767543196678162, + "kl": 0.06690746545791626, + "learning_rate": 2.410680942981821e-06, + "loss": 0.0027, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1259 + }, + { + "completion_length": 200.0, + "epoch": 0.5607476635514018, + "grad_norm": 0.7363775372505188, + "kl": 0.05513622611761093, + "learning_rate": 2.40679926828904e-06, + "loss": 0.0022, + "reward": 0.2291666716337204, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 1260 + }, + { + "completion_length": 195.5, + "epoch": 0.5611927013796173, + "grad_norm": 0.7677130699157715, + "kl": 0.0692647248506546, + "learning_rate": 2.4029178185828725e-06, + "loss": 0.0028, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1261 + }, + { + "completion_length": 160.0, + "epoch": 0.5616377392078327, + "grad_norm": 0.7026461958885193, + "kl": 0.0852893814444542, + "learning_rate": 2.3990366032331388e-06, + "loss": 0.0034, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1262 + }, + { + "completion_length": 197.1666717529297, + "epoch": 0.562082777036048, + "grad_norm": 0.9166952967643738, + "kl": 0.0652032420039177, + "learning_rate": 2.3951556316090955e-06, + "loss": 0.0026, + "reward": 0.27116668224334717, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27116668224334717, + "step": 1263 + }, + { + "completion_length": 138.83334350585938, + "epoch": 0.5625278148642635, + "grad_norm": 0.02755793184041977, + "kl": 0.058620356023311615, + "learning_rate": 2.39127491307941e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1264 + }, + { + "completion_length": 196.6666717529297, + "epoch": 0.5629728526924789, + "grad_norm": 0.7676947712898254, + "kl": 0.055359989404678345, + "learning_rate": 2.3873944570121383e-06, + "loss": 0.0022, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1265 + }, + { + "completion_length": 136.1666717529297, + "epoch": 0.5634178905206942, + "grad_norm": 0.8367419242858887, + "kl": 0.1252003312110901, + "learning_rate": 2.3835142727747033e-06, + "loss": 0.005, + "reward": 0.3186666667461395, + "reward_std": 0.14043740928173065, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31866663694381714, + "step": 1266 + }, + { + "completion_length": 147.33334350585938, + "epoch": 0.5638629283489096, + "grad_norm": 0.6919192671775818, + "kl": 0.06637275218963623, + "learning_rate": 2.3796343697338724e-06, + "loss": 0.0027, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1267 + }, + { + "completion_length": 115.0, + "epoch": 0.5643079661771251, + "grad_norm": 0.8813140988349915, + "kl": 0.06479702889919281, + "learning_rate": 2.375754757255733e-06, + "loss": 0.0026, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1268 + }, + { + "completion_length": 176.1666717529297, + "epoch": 0.5647530040053405, + "grad_norm": 0.7938532829284668, + "kl": 0.08625823259353638, + "learning_rate": 2.371875444705671e-06, + "loss": 0.0035, + "reward": 0.20500001311302185, + "reward_std": 0.2416965216398239, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.20500001311302185, + "step": 1269 + }, + { + "completion_length": 145.5, + "epoch": 0.5651980418335558, + "grad_norm": 0.030022282153367996, + "kl": 0.06400606036186218, + "learning_rate": 2.3679964414483504e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1270 + }, + { + "completion_length": 159.33334350585938, + "epoch": 0.5656430796617713, + "grad_norm": 0.6740466952323914, + "kl": 0.05308264493942261, + "learning_rate": 2.364117756847685e-06, + "loss": 0.0021, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1271 + }, + { + "completion_length": 182.6666717529297, + "epoch": 0.5660881174899867, + "grad_norm": 0.7299838066101074, + "kl": 0.06719101965427399, + "learning_rate": 2.3602394002668235e-06, + "loss": 0.0027, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1272 + }, + { + "completion_length": 164.1666717529297, + "epoch": 0.566533155318202, + "grad_norm": 0.029490042477846146, + "kl": 0.06731471419334412, + "learning_rate": 2.3563613810681184e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1273 + }, + { + "completion_length": 139.6666717529297, + "epoch": 0.5669781931464174, + "grad_norm": 0.03775608167052269, + "kl": 0.07644303143024445, + "learning_rate": 2.352483708613111e-06, + "loss": 0.0034, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1274 + }, + { + "completion_length": 149.6666717529297, + "epoch": 0.5674232309746329, + "grad_norm": 0.03325009346008301, + "kl": 0.06889767944812775, + "learning_rate": 2.3486063922625046e-06, + "loss": 0.0031, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1275 + }, + { + "completion_length": 166.6666717529297, + "epoch": 0.5678682688028482, + "grad_norm": 0.6968271136283875, + "kl": 0.05943942815065384, + "learning_rate": 2.344729441376143e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1276 + }, + { + "completion_length": 153.33334350585938, + "epoch": 0.5683133066310636, + "grad_norm": 0.6684544682502747, + "kl": 0.06784616410732269, + "learning_rate": 2.340852865312988e-06, + "loss": 0.0027, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1277 + }, + { + "completion_length": 155.83334350585938, + "epoch": 0.5687583444592791, + "grad_norm": 0.08216793835163116, + "kl": 0.0659741461277008, + "learning_rate": 2.3369766734310947e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1278 + }, + { + "completion_length": 146.33334350585938, + "epoch": 0.5692033822874945, + "grad_norm": 0.7616736888885498, + "kl": 0.07293137907981873, + "learning_rate": 2.3331008750875934e-06, + "loss": 0.0029, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1279 + }, + { + "completion_length": 175.33334350585938, + "epoch": 0.5696484201157098, + "grad_norm": 0.716716468334198, + "kl": 0.05208544060587883, + "learning_rate": 2.329225479638663e-06, + "loss": 0.0021, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1280 + }, + { + "completion_length": 181.6666717529297, + "epoch": 0.5700934579439252, + "grad_norm": 0.804816484451294, + "kl": 0.06358016282320023, + "learning_rate": 2.32535049643951e-06, + "loss": 0.0025, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1281 + }, + { + "completion_length": 120.16667175292969, + "epoch": 0.5705384957721407, + "grad_norm": 0.018244080245494843, + "kl": 0.05629762262105942, + "learning_rate": 2.3214759348443456e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1282 + }, + { + "completion_length": 183.6666717529297, + "epoch": 0.570983533600356, + "grad_norm": 0.7986438870429993, + "kl": 0.07024630904197693, + "learning_rate": 2.3176018042063637e-06, + "loss": 0.0028, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1283 + }, + { + "completion_length": 188.83334350585938, + "epoch": 0.5714285714285714, + "grad_norm": 0.7718678116798401, + "kl": 0.04382052272558212, + "learning_rate": 2.3137281138777173e-06, + "loss": 0.0018, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1284 + }, + { + "completion_length": 147.5, + "epoch": 0.5718736092567869, + "grad_norm": 0.05693955719470978, + "kl": 0.09006796777248383, + "learning_rate": 2.3098548732094964e-06, + "loss": 0.0039, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1285 + }, + { + "completion_length": 180.83334350585938, + "epoch": 0.5723186470850022, + "grad_norm": 0.7249354720115662, + "kl": 0.09484501928091049, + "learning_rate": 2.3059820915517077e-06, + "loss": 0.0038, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1286 + }, + { + "completion_length": 150.33334350585938, + "epoch": 0.5727636849132176, + "grad_norm": 0.039812684059143066, + "kl": 0.07181607186794281, + "learning_rate": 2.302109778253246e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1287 + }, + { + "completion_length": 161.6666717529297, + "epoch": 0.573208722741433, + "grad_norm": 0.7592692375183105, + "kl": 0.06504487991333008, + "learning_rate": 2.298237942661879e-06, + "loss": 0.0026, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1288 + }, + { + "completion_length": 142.5, + "epoch": 0.5736537605696485, + "grad_norm": 0.021916421130299568, + "kl": 0.05927393585443497, + "learning_rate": 2.294366594124218e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1289 + }, + { + "completion_length": 109.66667175292969, + "epoch": 0.5740987983978638, + "grad_norm": 0.030946621671319008, + "kl": 0.06689797341823578, + "learning_rate": 2.2904957419857016e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1290 + }, + { + "completion_length": 200.0, + "epoch": 0.5745438362260792, + "grad_norm": 0.015107125975191593, + "kl": 0.03902910649776459, + "learning_rate": 2.2866253955905693e-06, + "loss": 0.0016, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1291 + }, + { + "completion_length": 161.83334350585938, + "epoch": 0.5749888740542947, + "grad_norm": 0.9576203227043152, + "kl": 0.06322360038757324, + "learning_rate": 2.2827555642818377e-06, + "loss": 0.0025, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1292 + }, + { + "completion_length": 176.6666717529297, + "epoch": 0.57543391188251, + "grad_norm": 0.7536025047302246, + "kl": 0.057025518268346786, + "learning_rate": 2.278886257401282e-06, + "loss": 0.0023, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1293 + }, + { + "completion_length": 121.16667175292969, + "epoch": 0.5758789497107254, + "grad_norm": 0.017764268442988396, + "kl": 0.058683812618255615, + "learning_rate": 2.2750174842894127e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1294 + }, + { + "completion_length": 113.66667175292969, + "epoch": 0.5763239875389408, + "grad_norm": 1.1381512880325317, + "kl": 0.09195034205913544, + "learning_rate": 2.2711492542854475e-06, + "loss": 0.0037, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1295 + }, + { + "completion_length": 145.0, + "epoch": 0.5767690253671562, + "grad_norm": 0.6938797831535339, + "kl": 0.05431237071752548, + "learning_rate": 2.2672815767272968e-06, + "loss": 0.0022, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1296 + }, + { + "completion_length": 158.5, + "epoch": 0.5772140631953716, + "grad_norm": 0.7317979335784912, + "kl": 0.08583417534828186, + "learning_rate": 2.2634144609515362e-06, + "loss": 0.0034, + "reward": 0.33416664600372314, + "reward_std": 0.10247032344341278, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1297 + }, + { + "completion_length": 200.0, + "epoch": 0.577659101023587, + "grad_norm": 0.030720453709363937, + "kl": 0.05192447826266289, + "learning_rate": 2.2595479162933846e-06, + "loss": 0.0021, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1298 + }, + { + "completion_length": 176.6666717529297, + "epoch": 0.5781041388518025, + "grad_norm": 0.6601949334144592, + "kl": 0.068632572889328, + "learning_rate": 2.255681952086683e-06, + "loss": 0.0027, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1299 + }, + { + "completion_length": 173.6666717529297, + "epoch": 0.5785491766800178, + "grad_norm": 0.688745379447937, + "kl": 0.04240253195166588, + "learning_rate": 2.2518165776638715e-06, + "loss": 0.0017, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 1300 + }, + { + "completion_length": 145.0, + "epoch": 0.5789942145082332, + "grad_norm": 0.021718116477131844, + "kl": 0.0497608482837677, + "learning_rate": 2.2479518023559645e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1301 + }, + { + "completion_length": 159.1666717529297, + "epoch": 0.5794392523364486, + "grad_norm": 0.7606553435325623, + "kl": 0.05341101810336113, + "learning_rate": 2.2440876354925327e-06, + "loss": 0.0021, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1302 + }, + { + "completion_length": 137.1666717529297, + "epoch": 0.579884290164664, + "grad_norm": 0.031692203134298325, + "kl": 0.07368038594722748, + "learning_rate": 2.240224086401677e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1303 + }, + { + "completion_length": 152.33334350585938, + "epoch": 0.5803293279928794, + "grad_norm": 0.8328744769096375, + "kl": 0.085344098508358, + "learning_rate": 2.2363611644100055e-06, + "loss": 0.0034, + "reward": 0.2528333365917206, + "reward_std": 0.18862704932689667, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2528333365917206, + "step": 1304 + }, + { + "completion_length": 178.33334350585938, + "epoch": 0.5807743658210948, + "grad_norm": 0.7432076930999756, + "kl": 0.04911906644701958, + "learning_rate": 2.232498878842615e-06, + "loss": 0.002, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1305 + }, + { + "completion_length": 123.5, + "epoch": 0.5812194036493102, + "grad_norm": 0.8634981513023376, + "kl": 0.09652163088321686, + "learning_rate": 2.2286372390230645e-06, + "loss": 0.0039, + "reward": 0.2605000138282776, + "reward_std": 0.1890542209148407, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2605000138282776, + "step": 1306 + }, + { + "completion_length": 114.66667175292969, + "epoch": 0.5816644414775256, + "grad_norm": 0.03657658025622368, + "kl": 0.06502784043550491, + "learning_rate": 2.2247762542733544e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1307 + }, + { + "completion_length": 197.0, + "epoch": 0.582109479305741, + "grad_norm": 0.8336066603660583, + "kl": 0.06351786851882935, + "learning_rate": 2.220915933913903e-06, + "loss": 0.0025, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 1308 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.5825545171339563, + "grad_norm": 0.882077693939209, + "kl": 0.05575979873538017, + "learning_rate": 2.217056287263528e-06, + "loss": 0.0022, + "reward": 0.35483333468437195, + "reward_std": 0.12303563207387924, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27149999141693115, + "step": 1309 + }, + { + "completion_length": 194.6666717529297, + "epoch": 0.5829995549621718, + "grad_norm": 0.6482923030853271, + "kl": 0.04915091395378113, + "learning_rate": 2.2131973236394154e-06, + "loss": 0.002, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 1310 + }, + { + "completion_length": 126.83333587646484, + "epoch": 0.5834445927903872, + "grad_norm": 0.025585202500224113, + "kl": 0.0684896856546402, + "learning_rate": 2.209339052357107e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1311 + }, + { + "completion_length": 161.83334350585938, + "epoch": 0.5838896306186026, + "grad_norm": 0.017367461696267128, + "kl": 0.04990217089653015, + "learning_rate": 2.2054814827304713e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1312 + }, + { + "completion_length": 145.1666717529297, + "epoch": 0.584334668446818, + "grad_norm": 0.6588000655174255, + "kl": 0.06359747052192688, + "learning_rate": 2.201624624071683e-06, + "loss": 0.0025, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1313 + }, + { + "completion_length": 190.33334350585938, + "epoch": 0.5847797062750334, + "grad_norm": 0.5984198451042175, + "kl": 0.05125384032726288, + "learning_rate": 2.1977684856912016e-06, + "loss": 0.0021, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1314 + }, + { + "completion_length": 166.5, + "epoch": 0.5852247441032488, + "grad_norm": 0.018499819561839104, + "kl": 0.05266015976667404, + "learning_rate": 2.1939130768977455e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1315 + }, + { + "completion_length": 151.6666717529297, + "epoch": 0.5856697819314641, + "grad_norm": 0.6672574281692505, + "kl": 0.0633692592382431, + "learning_rate": 2.190058406998275e-06, + "loss": 0.0025, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1316 + }, + { + "completion_length": 200.0, + "epoch": 0.5861148197596796, + "grad_norm": 0.02820000983774662, + "kl": 0.051216401159763336, + "learning_rate": 2.1862044852979654e-06, + "loss": 0.002, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1317 + }, + { + "completion_length": 192.83334350585938, + "epoch": 0.586559857587895, + "grad_norm": 0.8144605755805969, + "kl": 0.05723793804645538, + "learning_rate": 2.1823513211001836e-06, + "loss": 0.0023, + "reward": 0.2711666524410248, + "reward_std": 0.09453975409269333, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2711666524410248, + "step": 1318 + }, + { + "completion_length": 155.5, + "epoch": 0.5870048954161103, + "grad_norm": 0.011627976782619953, + "kl": 0.03995111584663391, + "learning_rate": 2.1784989237064716e-06, + "loss": 0.0019, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1319 + }, + { + "completion_length": 185.33334350585938, + "epoch": 0.5874499332443258, + "grad_norm": 0.705923318862915, + "kl": 0.04204287752509117, + "learning_rate": 2.1746473024165185e-06, + "loss": 0.0017, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1320 + }, + { + "completion_length": 188.83334350585938, + "epoch": 0.5878949710725412, + "grad_norm": 0.743707537651062, + "kl": 0.05525606870651245, + "learning_rate": 2.170796466528139e-06, + "loss": 0.0022, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1321 + }, + { + "completion_length": 162.33334350585938, + "epoch": 0.5883400089007566, + "grad_norm": 0.6818633079528809, + "kl": 0.08869786560535431, + "learning_rate": 2.166946425337254e-06, + "loss": 0.0035, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1322 + }, + { + "completion_length": 178.1666717529297, + "epoch": 0.5887850467289719, + "grad_norm": 0.8434653878211975, + "kl": 0.05428377911448479, + "learning_rate": 2.1630971881378644e-06, + "loss": 0.0022, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1323 + }, + { + "completion_length": 143.6666717529297, + "epoch": 0.5892300845571874, + "grad_norm": 0.01933359168469906, + "kl": 0.05959298461675644, + "learning_rate": 2.1592487642220305e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1324 + }, + { + "completion_length": 172.6666717529297, + "epoch": 0.5896751223854028, + "grad_norm": 0.8821998238563538, + "kl": 0.06853225827217102, + "learning_rate": 2.1554011628798495e-06, + "loss": 0.0027, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1325 + }, + { + "completion_length": 136.83334350585938, + "epoch": 0.5901201602136181, + "grad_norm": 0.022718112915754318, + "kl": 0.05797460302710533, + "learning_rate": 2.1515543933994344e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1326 + }, + { + "completion_length": 187.6666717529297, + "epoch": 0.5905651980418336, + "grad_norm": 0.6120337247848511, + "kl": 0.061057232320308685, + "learning_rate": 2.1477084650668863e-06, + "loss": 0.0024, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1327 + }, + { + "completion_length": 131.33334350585938, + "epoch": 0.591010235870049, + "grad_norm": 0.9323371052742004, + "kl": 0.05380024015903473, + "learning_rate": 2.1438633871662795e-06, + "loss": 0.0022, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1328 + }, + { + "completion_length": 140.6666717529297, + "epoch": 0.5914552736982643, + "grad_norm": 0.026489408686757088, + "kl": 0.06107024475932121, + "learning_rate": 2.140019168979634e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1329 + }, + { + "completion_length": 121.5, + "epoch": 0.5919003115264797, + "grad_norm": 0.02367200143635273, + "kl": 0.056515760719776154, + "learning_rate": 2.136175819786894e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1330 + }, + { + "completion_length": 152.0, + "epoch": 0.5923453493546952, + "grad_norm": 0.7418760657310486, + "kl": 0.08832807838916779, + "learning_rate": 2.1323333488659063e-06, + "loss": 0.0035, + "reward": 0.29350000619888306, + "reward_std": 0.20208291709423065, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29350000619888306, + "step": 1331 + }, + { + "completion_length": 190.0, + "epoch": 0.5927903871829105, + "grad_norm": 0.765781819820404, + "kl": 0.05727764964103699, + "learning_rate": 2.1284917654923986e-06, + "loss": 0.0023, + "reward": 0.27116668224334717, + "reward_std": 0.09453975409269333, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27116668224334717, + "step": 1332 + }, + { + "completion_length": 132.33334350585938, + "epoch": 0.5932354250111259, + "grad_norm": 0.03234122321009636, + "kl": 0.07051411271095276, + "learning_rate": 2.1246510789399537e-06, + "loss": 0.0031, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1333 + }, + { + "completion_length": 156.1666717529297, + "epoch": 0.5936804628393414, + "grad_norm": 0.7347729802131653, + "kl": 0.05940014123916626, + "learning_rate": 2.1208112984799913e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1334 + }, + { + "completion_length": 158.1666717529297, + "epoch": 0.5941255006675568, + "grad_norm": 0.7142928242683411, + "kl": 0.07999814301729202, + "learning_rate": 2.1169724333817443e-06, + "loss": 0.0032, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1335 + }, + { + "completion_length": 151.83334350585938, + "epoch": 0.5945705384957721, + "grad_norm": 0.0350709892809391, + "kl": 0.058371301740407944, + "learning_rate": 2.1131344929122336e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1336 + }, + { + "completion_length": 127.33333587646484, + "epoch": 0.5950155763239875, + "grad_norm": 0.01668655313551426, + "kl": 0.054240863770246506, + "learning_rate": 2.1092974863362508e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1337 + }, + { + "completion_length": 174.83334350585938, + "epoch": 0.595460614152203, + "grad_norm": 0.6639335751533508, + "kl": 0.048784345388412476, + "learning_rate": 2.10546142291633e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1338 + }, + { + "completion_length": 162.83334350585938, + "epoch": 0.5959056519804183, + "grad_norm": 0.6551691889762878, + "kl": 0.11608364433050156, + "learning_rate": 2.1016263119127315e-06, + "loss": 0.0046, + "reward": 0.2409999966621399, + "reward_std": 0.2736355662345886, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2409999817609787, + "step": 1339 + }, + { + "completion_length": 144.33334350585938, + "epoch": 0.5963506898086337, + "grad_norm": 0.01642470248043537, + "kl": 0.0515790730714798, + "learning_rate": 2.097792162583415e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1340 + }, + { + "completion_length": 86.83333587646484, + "epoch": 0.5967957276368492, + "grad_norm": 0.02316073514521122, + "kl": 0.06323958933353424, + "learning_rate": 2.093958984184018e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1341 + }, + { + "completion_length": 158.1666717529297, + "epoch": 0.5972407654650645, + "grad_norm": 0.6790428757667542, + "kl": 0.06524261832237244, + "learning_rate": 2.090126785967836e-06, + "loss": 0.0026, + "reward": 0.3516666889190674, + "reward_std": 0.05044468492269516, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.351666659116745, + "step": 1342 + }, + { + "completion_length": 140.6666717529297, + "epoch": 0.5976858032932799, + "grad_norm": 0.7449166178703308, + "kl": 0.13734249770641327, + "learning_rate": 2.0862955771857977e-06, + "loss": 0.0055, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1343 + }, + { + "completion_length": 152.33334350585938, + "epoch": 0.5981308411214953, + "grad_norm": 0.8124682903289795, + "kl": 0.05861344933509827, + "learning_rate": 2.082465367086442e-06, + "loss": 0.0023, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1344 + }, + { + "completion_length": 171.33334350585938, + "epoch": 0.5985758789497108, + "grad_norm": 0.6527742743492126, + "kl": 0.050010886043310165, + "learning_rate": 2.078636164915898e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1345 + }, + { + "completion_length": 143.83334350585938, + "epoch": 0.5990209167779261, + "grad_norm": 0.7979593276977539, + "kl": 0.06243058666586876, + "learning_rate": 2.074807979917863e-06, + "loss": 0.0025, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1346 + }, + { + "completion_length": 103.83333587646484, + "epoch": 0.5994659546061415, + "grad_norm": 0.020012816414237022, + "kl": 0.0676102340221405, + "learning_rate": 2.070980821333576e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1347 + }, + { + "completion_length": 200.0, + "epoch": 0.599910992434357, + "grad_norm": 0.04530826210975647, + "kl": 0.054757438600063324, + "learning_rate": 2.0671546984018003e-06, + "loss": 0.0022, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1348 + }, + { + "completion_length": 175.6666717529297, + "epoch": 0.6003560302625723, + "grad_norm": 0.8745347857475281, + "kl": 0.08706136047840118, + "learning_rate": 2.0633296203587994e-06, + "loss": 0.0035, + "reward": 0.31333333253860474, + "reward_std": 0.10494124889373779, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 1349 + }, + { + "completion_length": 179.6666717529297, + "epoch": 0.6008010680907877, + "grad_norm": 0.719700038433075, + "kl": 0.056766610592603683, + "learning_rate": 2.059505596438312e-06, + "loss": 0.0023, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1350 + }, + { + "completion_length": 177.6666717529297, + "epoch": 0.6012461059190031, + "grad_norm": 0.6509082317352295, + "kl": 0.055832430720329285, + "learning_rate": 2.0556826358715345e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1351 + }, + { + "completion_length": 186.6666717529297, + "epoch": 0.6016911437472185, + "grad_norm": 0.7251918315887451, + "kl": 0.08503405749797821, + "learning_rate": 2.0518607478870966e-06, + "loss": 0.0034, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1352 + }, + { + "completion_length": 170.1666717529297, + "epoch": 0.6021361815754339, + "grad_norm": 0.8101000189781189, + "kl": 0.07102101296186447, + "learning_rate": 2.048039941711035e-06, + "loss": 0.0028, + "reward": 0.2056666612625122, + "reward_std": 0.3590574860572815, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2056666612625122, + "step": 1353 + }, + { + "completion_length": 190.0, + "epoch": 0.6025812194036493, + "grad_norm": 0.7210485935211182, + "kl": 0.05284285545349121, + "learning_rate": 2.044220226566781e-06, + "loss": 0.0021, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1354 + }, + { + "completion_length": 179.0, + "epoch": 0.6030262572318648, + "grad_norm": 0.7402732372283936, + "kl": 0.05912027508020401, + "learning_rate": 2.0404016116751268e-06, + "loss": 0.0024, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1355 + }, + { + "completion_length": 166.33334350585938, + "epoch": 0.6034712950600801, + "grad_norm": 0.8056141138076782, + "kl": 0.06315895915031433, + "learning_rate": 2.0365841062542122e-06, + "loss": 0.0025, + "reward": 0.29216668009757996, + "reward_std": 0.10255226492881775, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 1356 + }, + { + "completion_length": 177.1666717529297, + "epoch": 0.6039163328882955, + "grad_norm": 0.678676187992096, + "kl": 0.04996780678629875, + "learning_rate": 2.0327677195194956e-06, + "loss": 0.002, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1357 + }, + { + "completion_length": 174.1666717529297, + "epoch": 0.6043613707165109, + "grad_norm": 0.7291713953018188, + "kl": 0.04707088693976402, + "learning_rate": 2.028952460683737e-06, + "loss": 0.0019, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1358 + }, + { + "completion_length": 169.5, + "epoch": 0.6048064085447263, + "grad_norm": 0.8036746382713318, + "kl": 0.06635730713605881, + "learning_rate": 2.0251383389569743e-06, + "loss": 0.0027, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1359 + }, + { + "completion_length": 162.5, + "epoch": 0.6052514463729417, + "grad_norm": 0.6609824299812317, + "kl": 0.05208190530538559, + "learning_rate": 2.0213253635464974e-06, + "loss": 0.0021, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1360 + }, + { + "completion_length": 135.6666717529297, + "epoch": 0.6056964842011571, + "grad_norm": 0.7489309906959534, + "kl": 0.0696021094918251, + "learning_rate": 2.0175135436568315e-06, + "loss": 0.0028, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1361 + }, + { + "completion_length": 187.83334350585938, + "epoch": 0.6061415220293725, + "grad_norm": 0.8159146904945374, + "kl": 0.045086465775966644, + "learning_rate": 2.013702888489713e-06, + "loss": 0.0018, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1362 + }, + { + "completion_length": 175.33334350585938, + "epoch": 0.6065865598575879, + "grad_norm": 0.7238532304763794, + "kl": 0.08709849417209625, + "learning_rate": 2.0098934072440636e-06, + "loss": 0.0035, + "reward": 0.1850000023841858, + "reward_std": 0.24054770171642303, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.1850000023841858, + "step": 1363 + }, + { + "completion_length": 161.6666717529297, + "epoch": 0.6070315976858033, + "grad_norm": 0.9293068647384644, + "kl": 0.06978730857372284, + "learning_rate": 2.0060851091159735e-06, + "loss": 0.0028, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1364 + }, + { + "completion_length": 136.33334350585938, + "epoch": 0.6074766355140186, + "grad_norm": 0.020378444343805313, + "kl": 0.06543242931365967, + "learning_rate": 2.0022780032986767e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1365 + }, + { + "completion_length": 163.6666717529297, + "epoch": 0.6079216733422341, + "grad_norm": 0.7763041853904724, + "kl": 0.07946053147315979, + "learning_rate": 1.998472098982528e-06, + "loss": 0.0032, + "reward": 0.26850003004074097, + "reward_std": 0.15288132429122925, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.26850003004074097, + "step": 1366 + }, + { + "completion_length": 193.5, + "epoch": 0.6083667111704495, + "grad_norm": 0.826087474822998, + "kl": 0.053222887217998505, + "learning_rate": 1.9946674053549826e-06, + "loss": 0.0021, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1367 + }, + { + "completion_length": 132.0, + "epoch": 0.6088117489986649, + "grad_norm": 0.9280709624290466, + "kl": 0.062463387846946716, + "learning_rate": 1.990863931600573e-06, + "loss": 0.0025, + "reward": 0.3341667056083679, + "reward_std": 0.064808689057827, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1368 + }, + { + "completion_length": 185.0, + "epoch": 0.6092567868268803, + "grad_norm": 0.749100923538208, + "kl": 0.06078839302062988, + "learning_rate": 1.987061686900886e-06, + "loss": 0.0024, + "reward": 0.29233333468437195, + "reward_std": 0.12961584329605103, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 1369 + }, + { + "completion_length": 147.83334350585938, + "epoch": 0.6097018246550957, + "grad_norm": 0.034440264105796814, + "kl": 0.06485289335250854, + "learning_rate": 1.983260680434543e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1370 + }, + { + "completion_length": 104.5, + "epoch": 0.6101468624833111, + "grad_norm": 0.01646091602742672, + "kl": 0.06164836511015892, + "learning_rate": 1.9794609213771756e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1371 + }, + { + "completion_length": 157.6666717529297, + "epoch": 0.6105919003115264, + "grad_norm": 0.749934196472168, + "kl": 0.0504591129720211, + "learning_rate": 1.975662418901403e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1372 + }, + { + "completion_length": 163.5, + "epoch": 0.6110369381397419, + "grad_norm": 0.7931237816810608, + "kl": 0.08825130760669708, + "learning_rate": 1.9718651821768133e-06, + "loss": 0.0035, + "reward": 0.25316667556762695, + "reward_std": 0.24440494179725647, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25316664576530457, + "step": 1373 + }, + { + "completion_length": 160.33334350585938, + "epoch": 0.6114819759679573, + "grad_norm": 0.034321513026952744, + "kl": 0.07206878066062927, + "learning_rate": 1.9680692203699374e-06, + "loss": 0.0032, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1374 + }, + { + "completion_length": 155.6666717529297, + "epoch": 0.6119270137961726, + "grad_norm": 0.7601866722106934, + "kl": 0.07652454823255539, + "learning_rate": 1.9642745426442284e-06, + "loss": 0.0031, + "reward": 0.31333333253860474, + "reward_std": 0.15350136160850525, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 1375 + }, + { + "completion_length": 196.83334350585938, + "epoch": 0.6123720516243881, + "grad_norm": 0.78342205286026, + "kl": 0.08061586320400238, + "learning_rate": 1.9604811581600415e-06, + "loss": 0.0032, + "reward": 0.2709999978542328, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1376 + }, + { + "completion_length": 153.83334350585938, + "epoch": 0.6128170894526035, + "grad_norm": 0.6514473557472229, + "kl": 0.05775187537074089, + "learning_rate": 1.956689076074607e-06, + "loss": 0.0023, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1377 + }, + { + "completion_length": 157.33334350585938, + "epoch": 0.6132621272808189, + "grad_norm": 0.7332454919815063, + "kl": 0.05692437291145325, + "learning_rate": 1.9528983055420143e-06, + "loss": 0.0023, + "reward": 0.3343333601951599, + "reward_std": 0.10206206887960434, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33433330059051514, + "step": 1378 + }, + { + "completion_length": 189.6666717529297, + "epoch": 0.6137071651090342, + "grad_norm": 0.7741665840148926, + "kl": 0.07697466015815735, + "learning_rate": 1.949108855713185e-06, + "loss": 0.0031, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1379 + }, + { + "completion_length": 126.66667175292969, + "epoch": 0.6141522029372497, + "grad_norm": 0.021553704515099525, + "kl": 0.05840783566236496, + "learning_rate": 1.945320735735853e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1380 + }, + { + "completion_length": 156.33334350585938, + "epoch": 0.6145972407654651, + "grad_norm": 0.6324295997619629, + "kl": 0.04569033905863762, + "learning_rate": 1.941533954754541e-06, + "loss": 0.0018, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1381 + }, + { + "completion_length": 89.66667175292969, + "epoch": 0.6150422785936804, + "grad_norm": 1.134950876235962, + "kl": 0.11393502354621887, + "learning_rate": 1.9377485219105416e-06, + "loss": 0.0046, + "reward": 0.312333345413208, + "reward_std": 0.15595084428787231, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3123333156108856, + "step": 1382 + }, + { + "completion_length": 182.33334350585938, + "epoch": 0.6154873164218959, + "grad_norm": 0.7212135195732117, + "kl": 0.0604676827788353, + "learning_rate": 1.93396444634189e-06, + "loss": 0.0024, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1383 + }, + { + "completion_length": 156.0, + "epoch": 0.6159323542501113, + "grad_norm": 0.02910018339753151, + "kl": 0.07041066884994507, + "learning_rate": 1.930181737183348e-06, + "loss": 0.0031, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1384 + }, + { + "completion_length": 181.33334350585938, + "epoch": 0.6163773920783266, + "grad_norm": 0.7267289161682129, + "kl": 0.051712047308683395, + "learning_rate": 1.926400403566377e-06, + "loss": 0.0021, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1385 + }, + { + "completion_length": 95.66667175292969, + "epoch": 0.616822429906542, + "grad_norm": 0.020063413307070732, + "kl": 0.06864848732948303, + "learning_rate": 1.922620454619117e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1386 + }, + { + "completion_length": 149.33334350585938, + "epoch": 0.6172674677347575, + "grad_norm": 0.7748768329620361, + "kl": 0.05772966891527176, + "learning_rate": 1.9188418994663677e-06, + "loss": 0.0023, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1387 + }, + { + "completion_length": 190.83334350585938, + "epoch": 0.6177125055629729, + "grad_norm": 0.8829987645149231, + "kl": 0.06954911351203918, + "learning_rate": 1.9150647472295635e-06, + "loss": 0.0028, + "reward": 0.14366666972637177, + "reward_std": 0.2381979525089264, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.14366665482521057, + "step": 1388 + }, + { + "completion_length": 144.6666717529297, + "epoch": 0.6181575433911882, + "grad_norm": 0.7912724018096924, + "kl": 0.053366199135780334, + "learning_rate": 1.9112890070267513e-06, + "loss": 0.0021, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1389 + }, + { + "completion_length": 197.0, + "epoch": 0.6186025812194037, + "grad_norm": 0.7742429375648499, + "kl": 0.060363225638866425, + "learning_rate": 1.907514687972569e-06, + "loss": 0.0024, + "reward": 0.27116668224334717, + "reward_std": 0.09453976154327393, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27116668224334717, + "step": 1390 + }, + { + "completion_length": 173.6666717529297, + "epoch": 0.6190476190476191, + "grad_norm": 0.7186877131462097, + "kl": 0.07181936502456665, + "learning_rate": 1.903741799178227e-06, + "loss": 0.0029, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1391 + }, + { + "completion_length": 143.1666717529297, + "epoch": 0.6194926568758344, + "grad_norm": 0.025034688413143158, + "kl": 0.055738165974617004, + "learning_rate": 1.8999703497514782e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1392 + }, + { + "completion_length": 180.1666717529297, + "epoch": 0.6199376947040498, + "grad_norm": 0.7350578308105469, + "kl": 0.07966147363185883, + "learning_rate": 1.8962003487966044e-06, + "loss": 0.0032, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1393 + }, + { + "completion_length": 127.0, + "epoch": 0.6203827325322653, + "grad_norm": 0.017875896766781807, + "kl": 0.054014332592487335, + "learning_rate": 1.8924318054143903e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1394 + }, + { + "completion_length": 162.6666717529297, + "epoch": 0.6208277703604806, + "grad_norm": 0.06926420331001282, + "kl": 0.059310950338840485, + "learning_rate": 1.888664728702101e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1395 + }, + { + "completion_length": 200.0, + "epoch": 0.621272808188696, + "grad_norm": 0.647271454334259, + "kl": 0.058838214725255966, + "learning_rate": 1.8848991277534609e-06, + "loss": 0.0024, + "reward": 0.2291666716337204, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2291666716337204, + "step": 1396 + }, + { + "completion_length": 143.33334350585938, + "epoch": 0.6217178460169115, + "grad_norm": 0.7888451814651489, + "kl": 0.1036970466375351, + "learning_rate": 1.8811350116586341e-06, + "loss": 0.0041, + "reward": 0.2548333406448364, + "reward_std": 0.2967965006828308, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25483331084251404, + "step": 1397 + }, + { + "completion_length": 157.6666717529297, + "epoch": 0.6221628838451269, + "grad_norm": 0.02223152481019497, + "kl": 0.057576995342969894, + "learning_rate": 1.8773723895041975e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1398 + }, + { + "completion_length": 193.83334350585938, + "epoch": 0.6226079216733422, + "grad_norm": 0.6927502751350403, + "kl": 0.05820035561919212, + "learning_rate": 1.8736112703731235e-06, + "loss": 0.0023, + "reward": 0.2709999978542328, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2709999978542328, + "step": 1399 + }, + { + "completion_length": 130.0, + "epoch": 0.6230529595015576, + "grad_norm": 0.9275290966033936, + "kl": 0.06016760692000389, + "learning_rate": 1.869851663344755e-06, + "loss": 0.0024, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1400 + }, + { + "completion_length": 135.1666717529297, + "epoch": 0.6234979973297731, + "grad_norm": 0.028401155024766922, + "kl": 0.05090929567813873, + "learning_rate": 1.8660935774947857e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1401 + }, + { + "completion_length": 163.83334350585938, + "epoch": 0.6239430351579884, + "grad_norm": 1.0099029541015625, + "kl": 0.067303866147995, + "learning_rate": 1.8623370218952368e-06, + "loss": 0.0027, + "reward": 0.2775000035762787, + "reward_std": 0.24127474427223206, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2775000035762787, + "step": 1402 + }, + { + "completion_length": 133.83334350585938, + "epoch": 0.6243880729862038, + "grad_norm": 1.1334388256072998, + "kl": 0.06404712796211243, + "learning_rate": 1.8585820056144349e-06, + "loss": 0.0026, + "reward": 0.3551666736602783, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35516664385795593, + "step": 1403 + }, + { + "completion_length": 193.0, + "epoch": 0.6248331108144193, + "grad_norm": 0.7618772387504578, + "kl": 0.05527910590171814, + "learning_rate": 1.854828537716991e-06, + "loss": 0.0022, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1404 + }, + { + "completion_length": 172.1666717529297, + "epoch": 0.6252781486426346, + "grad_norm": 0.7541539669036865, + "kl": 0.05784829705953598, + "learning_rate": 1.8510766272637798e-06, + "loss": 0.0023, + "reward": 0.37566667795181274, + "reward_std": 0.07905863225460052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 1405 + }, + { + "completion_length": 182.6666717529297, + "epoch": 0.62572318647085, + "grad_norm": 0.7231999039649963, + "kl": 0.05078327655792236, + "learning_rate": 1.8473262833119138e-06, + "loss": 0.002, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1406 + }, + { + "completion_length": 179.1666717529297, + "epoch": 0.6261682242990654, + "grad_norm": 0.7264639139175415, + "kl": 0.06620623916387558, + "learning_rate": 1.843577514914725e-06, + "loss": 0.0026, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1407 + }, + { + "completion_length": 157.1666717529297, + "epoch": 0.6266132621272809, + "grad_norm": 0.6942959427833557, + "kl": 0.05550370737910271, + "learning_rate": 1.8398303311217436e-06, + "loss": 0.0022, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1408 + }, + { + "completion_length": 146.33334350585938, + "epoch": 0.6270582999554962, + "grad_norm": 0.8330131769180298, + "kl": 0.0588110089302063, + "learning_rate": 1.8360847409786714e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1409 + }, + { + "completion_length": 187.83334350585938, + "epoch": 0.6275033377837116, + "grad_norm": 0.6956566572189331, + "kl": 0.06108691915869713, + "learning_rate": 1.8323407535273658e-06, + "loss": 0.0024, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1410 + }, + { + "completion_length": 167.5, + "epoch": 0.6279483756119271, + "grad_norm": 0.6863263249397278, + "kl": 0.061490681022405624, + "learning_rate": 1.8285983778058147e-06, + "loss": 0.0025, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1411 + }, + { + "completion_length": 134.83334350585938, + "epoch": 0.6283934134401424, + "grad_norm": 0.7065067291259766, + "kl": 0.08277900516986847, + "learning_rate": 1.824857622848114e-06, + "loss": 0.0033, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1412 + }, + { + "completion_length": 127.83333587646484, + "epoch": 0.6288384512683578, + "grad_norm": 0.711338460445404, + "kl": 0.1219438910484314, + "learning_rate": 1.8211184976844487e-06, + "loss": 0.0049, + "reward": 0.25733333826065063, + "reward_std": 0.29067277908325195, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25733333826065063, + "step": 1413 + }, + { + "completion_length": 161.6666717529297, + "epoch": 0.6292834890965732, + "grad_norm": 0.8949511051177979, + "kl": 0.07294663786888123, + "learning_rate": 1.8173810113410688e-06, + "loss": 0.0029, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1414 + }, + { + "completion_length": 113.83333587646484, + "epoch": 0.6297285269247886, + "grad_norm": 0.03835580497980118, + "kl": 0.07534816116094589, + "learning_rate": 1.8136451728402682e-06, + "loss": 0.0033, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1415 + }, + { + "completion_length": 112.33333587646484, + "epoch": 0.630173564753004, + "grad_norm": 0.022022180259227753, + "kl": 0.057372257113456726, + "learning_rate": 1.8099109912003624e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1416 + }, + { + "completion_length": 144.0, + "epoch": 0.6306186025812194, + "grad_norm": 0.7620473504066467, + "kl": 0.06752672046422958, + "learning_rate": 1.8061784754356688e-06, + "loss": 0.0027, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1417 + }, + { + "completion_length": 171.0, + "epoch": 0.6310636404094349, + "grad_norm": 0.7567469477653503, + "kl": 0.06102651357650757, + "learning_rate": 1.8024476345564806e-06, + "loss": 0.0024, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1418 + }, + { + "completion_length": 137.83334350585938, + "epoch": 0.6315086782376502, + "grad_norm": 1.0179473161697388, + "kl": 0.050918303430080414, + "learning_rate": 1.7987184775690512e-06, + "loss": 0.002, + "reward": 0.39633333683013916, + "reward_std": 0.09396524727344513, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.1666666716337204, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.22966668009757996, + "step": 1419 + }, + { + "completion_length": 162.6666717529297, + "epoch": 0.6319537160658656, + "grad_norm": 0.8701192140579224, + "kl": 0.17964184284210205, + "learning_rate": 1.7949910134755672e-06, + "loss": 0.0072, + "reward": 0.2861666679382324, + "reward_std": 0.22004583477973938, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2861666679382324, + "step": 1420 + }, + { + "completion_length": 200.0, + "epoch": 0.632398753894081, + "grad_norm": 0.17582067847251892, + "kl": 0.08828055113554001, + "learning_rate": 1.7912652512741273e-06, + "loss": 0.0035, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1421 + }, + { + "completion_length": 197.1666717529297, + "epoch": 0.6328437917222964, + "grad_norm": 0.6580331325531006, + "kl": 0.038402341306209564, + "learning_rate": 1.7875411999587256e-06, + "loss": 0.0015, + "reward": 0.250166654586792, + "reward_std": 0.07937358319759369, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.250166654586792, + "step": 1422 + }, + { + "completion_length": 148.1666717529297, + "epoch": 0.6332888295505118, + "grad_norm": 0.8078848719596863, + "kl": 0.06011554226279259, + "learning_rate": 1.7838188685192217e-06, + "loss": 0.0024, + "reward": 0.3343333601951599, + "reward_std": 0.06454972922801971, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33433330059051514, + "step": 1423 + }, + { + "completion_length": 144.33334350585938, + "epoch": 0.6337338673787272, + "grad_norm": 0.06990113109350204, + "kl": 0.10113313794136047, + "learning_rate": 1.7800982659413268e-06, + "loss": 0.0043, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1424 + }, + { + "completion_length": 171.5, + "epoch": 0.6341789052069426, + "grad_norm": 0.7644363641738892, + "kl": 0.08610030263662338, + "learning_rate": 1.7763794012065771e-06, + "loss": 0.0034, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1425 + }, + { + "completion_length": 184.1666717529297, + "epoch": 0.634623943035158, + "grad_norm": 0.7638565301895142, + "kl": 0.052418213337659836, + "learning_rate": 1.772662283292314e-06, + "loss": 0.0021, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 1426 + }, + { + "completion_length": 144.83334350585938, + "epoch": 0.6350689808633734, + "grad_norm": 0.7163295745849609, + "kl": 0.06099975109100342, + "learning_rate": 1.7689469211716614e-06, + "loss": 0.0024, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1427 + }, + { + "completion_length": 129.33334350585938, + "epoch": 0.6355140186915887, + "grad_norm": 0.020992541685700417, + "kl": 0.06372680515050888, + "learning_rate": 1.7652333238135067e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1428 + }, + { + "completion_length": 147.6666717529297, + "epoch": 0.6359590565198042, + "grad_norm": 0.06478970497846603, + "kl": 0.0694139301776886, + "learning_rate": 1.761521500182474e-06, + "loss": 0.0031, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1429 + }, + { + "completion_length": 165.83334350585938, + "epoch": 0.6364040943480196, + "grad_norm": 0.8788062930107117, + "kl": 0.08362124860286713, + "learning_rate": 1.7578114592389085e-06, + "loss": 0.0033, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1430 + }, + { + "completion_length": 162.5, + "epoch": 0.636849132176235, + "grad_norm": 0.7626572251319885, + "kl": 0.07676111161708832, + "learning_rate": 1.75410320993885e-06, + "loss": 0.0031, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1431 + }, + { + "completion_length": 161.83334350585938, + "epoch": 0.6372941700044504, + "grad_norm": 0.6593140959739685, + "kl": 0.05127331614494324, + "learning_rate": 1.7503967612340153e-06, + "loss": 0.0021, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1432 + }, + { + "completion_length": 136.5, + "epoch": 0.6377392078326658, + "grad_norm": 0.8421521782875061, + "kl": 0.05608572065830231, + "learning_rate": 1.7466921220717737e-06, + "loss": 0.0022, + "reward": 0.31316667795181274, + "reward_std": 0.10506077855825424, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31316667795181274, + "step": 1433 + }, + { + "completion_length": 127.83333587646484, + "epoch": 0.6381842456608812, + "grad_norm": 0.032311953604221344, + "kl": 0.06359237432479858, + "learning_rate": 1.7429893013951243e-06, + "loss": 0.0028, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1434 + }, + { + "completion_length": 141.0, + "epoch": 0.6386292834890965, + "grad_norm": 0.02765144221484661, + "kl": 0.06800812482833862, + "learning_rate": 1.7392883081426793e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1435 + }, + { + "completion_length": 156.83334350585938, + "epoch": 0.639074321317312, + "grad_norm": 0.03926115483045578, + "kl": 0.07100366055965424, + "learning_rate": 1.7355891512486384e-06, + "loss": 0.0031, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1436 + }, + { + "completion_length": 136.6666717529297, + "epoch": 0.6395193591455274, + "grad_norm": 0.9405933022499084, + "kl": 0.07274708151817322, + "learning_rate": 1.7318918396427676e-06, + "loss": 0.0029, + "reward": 0.3551666736602783, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35516664385795593, + "step": 1437 + }, + { + "completion_length": 171.5, + "epoch": 0.6399643969737427, + "grad_norm": 0.6927393674850464, + "kl": 0.06484430283308029, + "learning_rate": 1.728196382250379e-06, + "loss": 0.0026, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1438 + }, + { + "completion_length": 185.5, + "epoch": 0.6404094348019582, + "grad_norm": 0.7905606031417847, + "kl": 0.05562688410282135, + "learning_rate": 1.7245027879923082e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1439 + }, + { + "completion_length": 121.5, + "epoch": 0.6408544726301736, + "grad_norm": 0.03622419387102127, + "kl": 0.07538799941539764, + "learning_rate": 1.7208110657848945e-06, + "loss": 0.0033, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1440 + }, + { + "completion_length": 179.6666717529297, + "epoch": 0.641299510458389, + "grad_norm": 0.7712323069572449, + "kl": 0.07858553528785706, + "learning_rate": 1.7171212245399572e-06, + "loss": 0.0031, + "reward": 0.31299999356269836, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.312999963760376, + "step": 1441 + }, + { + "completion_length": 200.0, + "epoch": 0.6417445482866043, + "grad_norm": 0.715948760509491, + "kl": 0.06623165309429169, + "learning_rate": 1.7134332731647734e-06, + "loss": 0.0026, + "reward": 0.11950000375509262, + "reward_std": 0.31965839862823486, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.11950000375509262, + "step": 1442 + }, + { + "completion_length": 143.5, + "epoch": 0.6421895861148198, + "grad_norm": 0.900661289691925, + "kl": 0.06195361539721489, + "learning_rate": 1.7097472205620607e-06, + "loss": 0.0025, + "reward": 0.3551666736602783, + "reward_std": 0.05103103443980217, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35516664385795593, + "step": 1443 + }, + { + "completion_length": 120.33333587646484, + "epoch": 0.6426346239430352, + "grad_norm": 0.01926054246723652, + "kl": 0.05608893558382988, + "learning_rate": 1.7060630756299529e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1444 + }, + { + "completion_length": 169.1666717529297, + "epoch": 0.6430796617712505, + "grad_norm": 0.014926317147910595, + "kl": 0.050173163414001465, + "learning_rate": 1.7023808472619755e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1445 + }, + { + "completion_length": 161.0, + "epoch": 0.6435246995994659, + "grad_norm": 0.02021980844438076, + "kl": 0.05474698543548584, + "learning_rate": 1.6987005443470309e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1446 + }, + { + "completion_length": 131.83334350585938, + "epoch": 0.6439697374276814, + "grad_norm": 0.7747736573219299, + "kl": 0.06740984320640564, + "learning_rate": 1.6950221757693725e-06, + "loss": 0.0027, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1447 + }, + { + "completion_length": 146.33334350585938, + "epoch": 0.6444147752558967, + "grad_norm": 0.8941154479980469, + "kl": 0.06897615641355515, + "learning_rate": 1.6913457504085828e-06, + "loss": 0.0028, + "reward": 0.3581666648387909, + "reward_std": 0.11561040580272675, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2748333215713501, + "step": 1448 + }, + { + "completion_length": 139.0, + "epoch": 0.6448598130841121, + "grad_norm": 0.8403373956680298, + "kl": 0.06838244199752808, + "learning_rate": 1.6876712771395553e-06, + "loss": 0.0027, + "reward": 0.29233333468437195, + "reward_std": 0.12961584329605103, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29233333468437195, + "step": 1449 + }, + { + "completion_length": 177.83334350585938, + "epoch": 0.6453048509123276, + "grad_norm": 0.6254804730415344, + "kl": 0.05985492095351219, + "learning_rate": 1.6839987648324702e-06, + "loss": 0.0024, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1450 + }, + { + "completion_length": 200.0, + "epoch": 0.645749888740543, + "grad_norm": 0.017869267612695694, + "kl": 0.038964178413152695, + "learning_rate": 1.6803282223527737e-06, + "loss": 0.0016, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1451 + }, + { + "completion_length": 115.83333587646484, + "epoch": 0.6461949265687583, + "grad_norm": 0.0266580693423748, + "kl": 0.06849268078804016, + "learning_rate": 1.6766596585611572e-06, + "loss": 0.003, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1452 + }, + { + "completion_length": 200.0, + "epoch": 0.6466399643969737, + "grad_norm": 0.035756222903728485, + "kl": 0.05660893768072128, + "learning_rate": 1.672993082313536e-06, + "loss": 0.0023, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1453 + }, + { + "completion_length": 126.5, + "epoch": 0.6470850022251892, + "grad_norm": 0.7655314207077026, + "kl": 0.05805585905909538, + "learning_rate": 1.6693285024610264e-06, + "loss": 0.0023, + "reward": 0.3966667056083679, + "reward_std": 0.05062280222773552, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 1454 + }, + { + "completion_length": 126.66667175292969, + "epoch": 0.6475300400534045, + "grad_norm": 0.028626590967178345, + "kl": 0.05400149151682854, + "learning_rate": 1.6656659278499262e-06, + "loss": 0.0025, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1455 + }, + { + "completion_length": 179.6666717529297, + "epoch": 0.6479750778816199, + "grad_norm": 0.7383698225021362, + "kl": 0.0543503537774086, + "learning_rate": 1.662005367321693e-06, + "loss": 0.0022, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1456 + }, + { + "completion_length": 184.83334350585938, + "epoch": 0.6484201157098354, + "grad_norm": 0.7595195770263672, + "kl": 0.07299184054136276, + "learning_rate": 1.6583468297129207e-06, + "loss": 0.0029, + "reward": 0.17650000751018524, + "reward_std": 0.25984129309654236, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.17650000751018524, + "step": 1457 + }, + { + "completion_length": 178.83334350585938, + "epoch": 0.6488651535380507, + "grad_norm": 0.7613358497619629, + "kl": 0.05207335948944092, + "learning_rate": 1.6546903238553211e-06, + "loss": 0.0021, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1458 + }, + { + "completion_length": 137.33334350585938, + "epoch": 0.6493101913662661, + "grad_norm": 0.02273043617606163, + "kl": 0.05129929631948471, + "learning_rate": 1.6510358585757018e-06, + "loss": 0.0023, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1459 + }, + { + "completion_length": 200.0, + "epoch": 0.6497552291944815, + "grad_norm": 0.03293275833129883, + "kl": 0.04631292074918747, + "learning_rate": 1.6473834426959434e-06, + "loss": 0.0019, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1460 + }, + { + "completion_length": 191.1666717529297, + "epoch": 0.650200267022697, + "grad_norm": 0.8067001700401306, + "kl": 0.04392020031809807, + "learning_rate": 1.6437330850329793e-06, + "loss": 0.0018, + "reward": 0.2084999978542328, + "reward_std": 0.10238896310329437, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2084999978542328, + "step": 1461 + }, + { + "completion_length": 165.6666717529297, + "epoch": 0.6506453048509123, + "grad_norm": 0.8360378742218018, + "kl": 0.07717983424663544, + "learning_rate": 1.6400847943987758e-06, + "loss": 0.0031, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1462 + }, + { + "completion_length": 186.6666717529297, + "epoch": 0.6510903426791277, + "grad_norm": 0.7348068952560425, + "kl": 0.05224481225013733, + "learning_rate": 1.636438579600307e-06, + "loss": 0.0021, + "reward": 0.31300002336502075, + "reward_std": 0.06901304423809052, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31300002336502075, + "step": 1463 + }, + { + "completion_length": 155.1666717529297, + "epoch": 0.6515353805073432, + "grad_norm": 0.6557543277740479, + "kl": 0.0562002956867218, + "learning_rate": 1.6327944494395387e-06, + "loss": 0.0022, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1464 + }, + { + "completion_length": 144.0, + "epoch": 0.6519804183355585, + "grad_norm": 1.051000952720642, + "kl": 0.05340130627155304, + "learning_rate": 1.6291524127134012e-06, + "loss": 0.0021, + "reward": 0.3966667056083679, + "reward_std": 0.05062280222773552, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.31333333253860474, + "step": 1465 + }, + { + "completion_length": 177.6666717529297, + "epoch": 0.6524254561637739, + "grad_norm": 0.7785902619361877, + "kl": 0.09893165528774261, + "learning_rate": 1.6255124782137738e-06, + "loss": 0.004, + "reward": 0.3341667056083679, + "reward_std": 0.10247030854225159, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1466 + }, + { + "completion_length": 168.33334350585938, + "epoch": 0.6528704939919893, + "grad_norm": 0.7352219223976135, + "kl": 0.07749474048614502, + "learning_rate": 1.6218746547274612e-06, + "loss": 0.0031, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1467 + }, + { + "completion_length": 146.1666717529297, + "epoch": 0.6533155318202047, + "grad_norm": 0.03056454285979271, + "kl": 0.07416104525327682, + "learning_rate": 1.618238951036169e-06, + "loss": 0.0033, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1468 + }, + { + "completion_length": 162.1666717529297, + "epoch": 0.6537605696484201, + "grad_norm": 0.7109426259994507, + "kl": 0.04952998086810112, + "learning_rate": 1.6146053759164895e-06, + "loss": 0.002, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1469 + }, + { + "completion_length": 174.83334350585938, + "epoch": 0.6542056074766355, + "grad_norm": 0.7508170008659363, + "kl": 0.06910863518714905, + "learning_rate": 1.6109739381398746e-06, + "loss": 0.0028, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1470 + }, + { + "completion_length": 105.0, + "epoch": 0.654650645304851, + "grad_norm": 0.024463405832648277, + "kl": 0.06437089294195175, + "learning_rate": 1.6073446464726158e-06, + "loss": 0.0029, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1471 + }, + { + "completion_length": 132.6666717529297, + "epoch": 0.6550956831330663, + "grad_norm": 0.043828994035720825, + "kl": 0.06118755787611008, + "learning_rate": 1.6037175096758259e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1472 + }, + { + "completion_length": 150.0, + "epoch": 0.6555407209612817, + "grad_norm": 0.7977015376091003, + "kl": 0.10141167044639587, + "learning_rate": 1.6000925365054154e-06, + "loss": 0.0041, + "reward": 0.2861666679382324, + "reward_std": 0.22004583477973938, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2861666679382324, + "step": 1473 + }, + { + "completion_length": 156.33334350585938, + "epoch": 0.655985758789497, + "grad_norm": 0.9616900086402893, + "kl": 0.052790567278862, + "learning_rate": 1.59646973571207e-06, + "loss": 0.0021, + "reward": 0.3148333430290222, + "reward_std": 0.10149761289358139, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.3148333430290222, + "step": 1474 + }, + { + "completion_length": 153.0, + "epoch": 0.6564307966177125, + "grad_norm": 0.7333883047103882, + "kl": 0.07777030766010284, + "learning_rate": 1.5928491160412335e-06, + "loss": 0.0031, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1475 + }, + { + "completion_length": 138.83334350585938, + "epoch": 0.6568758344459279, + "grad_norm": 0.8540945053100586, + "kl": 0.08184421062469482, + "learning_rate": 1.5892306862330837e-06, + "loss": 0.0033, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1476 + }, + { + "completion_length": 159.1666717529297, + "epoch": 0.6573208722741433, + "grad_norm": 0.7016175389289856, + "kl": 0.05608559399843216, + "learning_rate": 1.5856144550225113e-06, + "loss": 0.0022, + "reward": 0.33399999141693115, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33399999141693115, + "step": 1477 + }, + { + "completion_length": 145.6666717529297, + "epoch": 0.6577659101023587, + "grad_norm": 0.01697458140552044, + "kl": 0.052540235221385956, + "learning_rate": 1.5820004311391006e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1478 + }, + { + "completion_length": 126.0, + "epoch": 0.6582109479305741, + "grad_norm": 1.1132007837295532, + "kl": 0.08661770820617676, + "learning_rate": 1.5783886233071078e-06, + "loss": 0.0035, + "reward": 0.3551666736602783, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35516664385795593, + "step": 1479 + }, + { + "completion_length": 153.83334350585938, + "epoch": 0.6586559857587895, + "grad_norm": 0.5954561829566956, + "kl": 0.07186748832464218, + "learning_rate": 1.5747790402454377e-06, + "loss": 0.0029, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1480 + }, + { + "completion_length": 133.5, + "epoch": 0.6591010235870048, + "grad_norm": 0.8292099833488464, + "kl": 0.04337434098124504, + "learning_rate": 1.5711716906676258e-06, + "loss": 0.0017, + "reward": 0.35499998927116394, + "reward_std": 0.12266214936971664, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0833333358168602, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.27166664600372314, + "step": 1481 + }, + { + "completion_length": 151.5, + "epoch": 0.6595460614152203, + "grad_norm": 0.8073120713233948, + "kl": 0.059761784970760345, + "learning_rate": 1.5675665832818166e-06, + "loss": 0.0024, + "reward": 0.3551666736602783, + "reward_std": 0.05103103816509247, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35516664385795593, + "step": 1482 + }, + { + "completion_length": 161.5, + "epoch": 0.6599910992434357, + "grad_norm": 0.7773087620735168, + "kl": 0.048801522701978683, + "learning_rate": 1.5639637267907399e-06, + "loss": 0.002, + "reward": 0.3341667056083679, + "reward_std": 0.0648086816072464, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1483 + }, + { + "completion_length": 182.0, + "epoch": 0.660436137071651, + "grad_norm": 0.5983509421348572, + "kl": 0.08513455092906952, + "learning_rate": 1.5603631298916937e-06, + "loss": 0.0034, + "reward": 0.17266666889190674, + "reward_std": 0.3189123272895813, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.17266666889190674, + "step": 1484 + }, + { + "completion_length": 156.33334350585938, + "epoch": 0.6608811748998665, + "grad_norm": 0.030569666996598244, + "kl": 0.0781131312251091, + "learning_rate": 1.5567648012765213e-06, + "loss": 0.0034, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1485 + }, + { + "completion_length": 193.33334350585938, + "epoch": 0.6613262127280819, + "grad_norm": 0.7831483483314514, + "kl": 0.04813634976744652, + "learning_rate": 1.5531687496315887e-06, + "loss": 0.0019, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1486 + }, + { + "completion_length": 177.6666717529297, + "epoch": 0.6617712505562973, + "grad_norm": 0.6230365633964539, + "kl": 0.04865188151597977, + "learning_rate": 1.549574983637767e-06, + "loss": 0.0019, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1487 + }, + { + "completion_length": 176.5, + "epoch": 0.6622162883845126, + "grad_norm": 0.6940677762031555, + "kl": 0.07798059284687042, + "learning_rate": 1.545983511970409e-06, + "loss": 0.0031, + "reward": 0.29216668009757996, + "reward_std": 0.10255225747823715, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.29216668009757996, + "step": 1488 + }, + { + "completion_length": 171.83334350585938, + "epoch": 0.6626613262127281, + "grad_norm": 0.6483290195465088, + "kl": 0.06438718736171722, + "learning_rate": 1.5423943432993287e-06, + "loss": 0.0026, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1489 + }, + { + "completion_length": 200.0, + "epoch": 0.6631063640409435, + "grad_norm": 0.019243627786636353, + "kl": 0.0421595573425293, + "learning_rate": 1.538807486288782e-06, + "loss": 0.0017, + "reward": 0.25, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.25, + "step": 1490 + }, + { + "completion_length": 143.6666717529297, + "epoch": 0.6635514018691588, + "grad_norm": 0.024592779576778412, + "kl": 0.052703164517879486, + "learning_rate": 1.5352229495974425e-06, + "loss": 0.0024, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1491 + }, + { + "completion_length": 137.33334350585938, + "epoch": 0.6639964396973743, + "grad_norm": 0.02066526561975479, + "kl": 0.06004131957888603, + "learning_rate": 1.5316407418783835e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1492 + }, + { + "completion_length": 144.6666717529297, + "epoch": 0.6644414775255897, + "grad_norm": 0.8184164762496948, + "kl": 0.05917416140437126, + "learning_rate": 1.528060871779058e-06, + "loss": 0.0024, + "reward": 0.33416664600372314, + "reward_std": 0.064808689057827, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.33416664600372314, + "step": 1493 + }, + { + "completion_length": 166.33334350585938, + "epoch": 0.664886515353805, + "grad_norm": 0.5798844695091248, + "kl": 0.06790559738874435, + "learning_rate": 1.5244833479412717e-06, + "loss": 0.0027, + "reward": 0.21583333611488342, + "reward_std": 0.39232656359672546, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.21583333611488342, + "step": 1494 + }, + { + "completion_length": 163.83334350585938, + "epoch": 0.6653315531820204, + "grad_norm": 0.0290197916328907, + "kl": 0.05775400996208191, + "learning_rate": 1.5209081790011704e-06, + "loss": 0.0026, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1495 + }, + { + "completion_length": 127.66667175292969, + "epoch": 0.6657765910102359, + "grad_norm": 0.0205070823431015, + "kl": 0.060368895530700684, + "learning_rate": 1.5173353735892139e-06, + "loss": 0.0027, + "reward": 0.3760000169277191, + "reward_std": 0.0, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.37599998712539673, + "step": 1496 + }, + { + "completion_length": 156.83334350585938, + "epoch": 0.6662216288384513, + "grad_norm": 0.6863259673118591, + "kl": 0.049120690673589706, + "learning_rate": 1.5137649403301551e-06, + "loss": 0.002, + "reward": 0.35500001907348633, + "reward_std": 0.051439277827739716, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1497 + }, + { + "completion_length": 188.0, + "epoch": 0.6666666666666666, + "grad_norm": 0.6774608492851257, + "kl": 0.04528352618217468, + "learning_rate": 1.5101968878430229e-06, + "loss": 0.0018, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1498 + }, + { + "completion_length": 187.6666717529297, + "epoch": 0.6671117044948821, + "grad_norm": 0.7849301695823669, + "kl": 0.09143184125423431, + "learning_rate": 1.5066312247410974e-06, + "loss": 0.0037, + "reward": 0.2919999957084656, + "reward_std": 0.06506611406803131, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.2919999957084656, + "step": 1499 + }, + { + "completion_length": 147.0, + "epoch": 0.6675567423230975, + "grad_norm": 0.8740885853767395, + "kl": 0.06229601427912712, + "learning_rate": 1.5030679596318904e-06, + "loss": 0.0025, + "reward": 0.35499998927116394, + "reward_std": 0.051439281553030014, + "rewards/correctness_reward_func": 0.0, + "rewards/int_reward_func": 0.0, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.35499998927116394, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 2247, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}