{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9893390191897654, "eval_steps": 100, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 605.8158782958984, "epoch": 0.08528784648187633, "grad_norm": 0.792238175868988, "kl": 0.00026590824127197265, "learning_rate": 2.5e-06, "loss": 0.0376, "reward": 0.6513393133878708, "reward_std": 0.3232782337814569, "rewards/accuracy_reward": 0.6511160984635354, "rewards/format_reward": 0.00022321429569274187, "step": 5 }, { "completion_length": 608.7732391357422, "epoch": 0.17057569296375266, "grad_norm": 4.090766906738281, "kl": 1868.8025625228881, "learning_rate": 2.956412726139078e-06, "loss": 92.3117, "reward": 0.7147321745753288, "reward_std": 0.27289656847715377, "rewards/accuracy_reward": 0.714285746216774, "rewards/format_reward": 0.00044642859138548373, "step": 10 }, { "completion_length": 596.5480163574218, "epoch": 0.255863539445629, "grad_norm": 0.19806408882141113, "kl": 0.0032785415649414064, "learning_rate": 2.7836719084521715e-06, "loss": 0.0645, "reward": 0.7618303894996643, "reward_std": 0.20630390886217356, "rewards/accuracy_reward": 0.7618303894996643, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 579.8969009399414, "epoch": 0.3411513859275053, "grad_norm": 0.12550625205039978, "kl": 0.0029325485229492188, "learning_rate": 2.4946839873611927e-06, "loss": 0.0465, "reward": 0.7792411088943482, "reward_std": 0.18276286944746972, "rewards/accuracy_reward": 0.7792411088943482, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 587.1710090637207, "epoch": 0.42643923240938164, "grad_norm": 0.18487855792045593, "kl": 0.004139328002929687, "learning_rate": 2.1156192081791355e-06, "loss": 0.0452, "reward": 0.7696428924798966, "reward_std": 0.17610765052959323, "rewards/accuracy_reward": 0.7696428924798966, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 593.3131958007813, "epoch": 0.511727078891258, "grad_norm": 0.1600646674633026, "kl": 0.004147720336914062, "learning_rate": 1.6808050203829845e-06, "loss": 0.0428, "reward": 0.7513393238186836, "reward_std": 0.17458505025133492, "rewards/accuracy_reward": 0.7513393238186836, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 582.7629692077637, "epoch": 0.5970149253731343, "grad_norm": 0.1672583967447281, "kl": 0.004720306396484375, "learning_rate": 1.2296174432791415e-06, "loss": 0.0355, "reward": 0.7491071790456771, "reward_std": 0.17336862673982978, "rewards/accuracy_reward": 0.7491071790456771, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 575.660513305664, "epoch": 0.6823027718550106, "grad_norm": 0.1312212347984314, "kl": 0.004215621948242187, "learning_rate": 8.029152419343472e-07, "loss": 0.0387, "reward": 0.7794643193483353, "reward_std": 0.16920664254575968, "rewards/accuracy_reward": 0.7794643193483353, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 588.4040451049805, "epoch": 0.767590618336887, "grad_norm": 0.18345855176448822, "kl": 0.004205322265625, "learning_rate": 4.3933982822017883e-07, "loss": 0.036, "reward": 0.7571428939700127, "reward_std": 0.18678655978292227, "rewards/accuracy_reward": 0.7571428939700127, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 590.6636367797852, "epoch": 0.8528784648187633, "grad_norm": 0.17137926816940308, "kl": 0.004085159301757813, "learning_rate": 1.718159615201853e-07, "loss": 0.0329, "reward": 0.759151816368103, "reward_std": 0.17406445033848286, "rewards/accuracy_reward": 0.759151816368103, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 580.3337303161621, "epoch": 0.9381663113006397, "grad_norm": 0.3547234833240509, "kl": 0.0044841766357421875, "learning_rate": 2.4570139579284723e-08, "loss": 0.0341, "reward": 0.7834821850061416, "reward_std": 0.17811276931315662, "rewards/accuracy_reward": 0.7834821850061416, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 572.3296394348145, "epoch": 0.9893390191897654, "kl": 0.0043417612711588545, "reward": 0.771205392976602, "reward_std": 0.1717855976894498, "rewards/accuracy_reward": 0.771205392976602, "rewards/format_reward": 0.0, "step": 58, "total_flos": 0.0, "train_loss": 7.9953470033561365, "train_runtime": 16497.7515, "train_samples_per_second": 0.455, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }