{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9893390191897654, "eval_steps": 100, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 606.7053756713867, "epoch": 0.017057569296375266, "grad_norm": 0.37430307269096375, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0331, "reward": 0.623883955180645, "reward_std": 0.34485295601189137, "rewards/accuracy_reward": 0.623883955180645, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 616.130612373352, "epoch": 0.08528784648187633, "grad_norm": 1.1676669120788574, "kl": 0.000269472599029541, "learning_rate": 2.5e-06, "loss": 0.0261, "reward": 0.593191989697516, "reward_std": 0.36569571029394865, "rewards/accuracy_reward": 0.593191989697516, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 614.5042640686036, "epoch": 0.17057569296375266, "grad_norm": 0.9632540941238403, "kl": 0.009089851379394531, "learning_rate": 2.956412726139078e-06, "loss": 0.0512, "reward": 0.6821428880095481, "reward_std": 0.30126961767673494, "rewards/accuracy_reward": 0.6821428880095481, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 599.6660972595215, "epoch": 0.255863539445629, "grad_norm": 0.14580217003822327, "kl": 0.004628181457519531, "learning_rate": 2.7836719084521715e-06, "loss": 0.0639, "reward": 0.765178607404232, "reward_std": 0.2118765540421009, "rewards/accuracy_reward": 0.765178607404232, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 587.9198951721191, "epoch": 0.3411513859275053, "grad_norm": 0.4212934970855713, "kl": 0.053232765197753905, "learning_rate": 2.4946839873611927e-06, "loss": 0.0517, "reward": 0.7651786103844642, "reward_std": 0.19744985159486533, "rewards/accuracy_reward": 0.7651786103844642, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 592.8506958007813, "epoch": 0.42643923240938164, "grad_norm": 0.18875020742416382, "kl": 0.004035186767578125, "learning_rate": 2.1156192081791355e-06, "loss": 0.0467, "reward": 0.7680803954601287, "reward_std": 0.1883797325193882, "rewards/accuracy_reward": 0.7680803954601287, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 596.9201133728027, "epoch": 0.511727078891258, "grad_norm": 0.11470279842615128, "kl": 0.00387420654296875, "learning_rate": 1.6808050203829845e-06, "loss": 0.0429, "reward": 0.7524553909897804, "reward_std": 0.17808201136067509, "rewards/accuracy_reward": 0.7524553909897804, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 590.4598487854004, "epoch": 0.5970149253731343, "grad_norm": 0.2764819860458374, "kl": 0.0034837722778320312, "learning_rate": 1.2296174432791415e-06, "loss": 0.0316, "reward": 0.7455357491970063, "reward_std": 0.1789832441136241, "rewards/accuracy_reward": 0.7455357491970063, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 577.9904289245605, "epoch": 0.6823027718550106, "grad_norm": 0.10016006976366043, "kl": 0.004136276245117187, "learning_rate": 8.029152419343472e-07, "loss": 0.0423, "reward": 0.7671875342726707, "reward_std": 0.18496839571744203, "rewards/accuracy_reward": 0.7671875342726707, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 596.3118553161621, "epoch": 0.767590618336887, "grad_norm": 0.9257429242134094, "kl": 0.00593109130859375, "learning_rate": 4.3933982822017883e-07, "loss": 0.0291, "reward": 0.7488839641213417, "reward_std": 0.19455855945125222, "rewards/accuracy_reward": 0.7488839641213417, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 603.4419929504395, "epoch": 0.8528784648187633, "grad_norm": 0.24372941255569458, "kl": 0.004146194458007813, "learning_rate": 1.718159615201853e-07, "loss": 0.0398, "reward": 0.7470982506871223, "reward_std": 0.17433594232425093, "rewards/accuracy_reward": 0.7470982506871223, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 583.7348480224609, "epoch": 0.9381663113006397, "grad_norm": 0.08758081495761871, "kl": 0.007670402526855469, "learning_rate": 2.4570139579284723e-08, "loss": 0.035, "reward": 0.783928607404232, "reward_std": 0.18272702796384693, "rewards/accuracy_reward": 0.783928607404232, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 580.6320686340332, "epoch": 0.9893390191897654, "kl": 0.0042463938395182295, "reward": 0.7734375397364298, "reward_std": 0.18152949400246143, "rewards/accuracy_reward": 0.7734375397364298, "rewards/format_reward": 0.0, "step": 58, "total_flos": 0.0, "train_loss": 0.04230335579606993, "train_runtime": 7223.2167, "train_samples_per_second": 1.038, "train_steps_per_second": 0.008 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }