{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.971563981042654, "eval_steps": 100, "global_step": 104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018957345971563982, "grad_norm": 134.91035482329545, "learning_rate": 4.545454545454545e-08, "logits/chosen": 117.67350769042969, "logits/rejected": 126.90988159179688, "logps/chosen": -336.5020751953125, "logps/rejected": -438.0943298339844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.1895734597156398, "grad_norm": 136.70946628723402, "learning_rate": 4.545454545454545e-07, "logits/chosen": 134.7197723388672, "logits/rejected": 138.20950317382812, "logps/chosen": -395.6691589355469, "logps/rejected": -439.5714111328125, "loss": 0.6989, "rewards/accuracies": 0.4375, "rewards/chosen": 0.009534548036754131, "rewards/margins": -0.0016968999989330769, "rewards/rejected": 0.011231447570025921, "step": 10 }, { "epoch": 0.3791469194312796, "grad_norm": 131.61830677294216, "learning_rate": 4.885348141000122e-07, "logits/chosen": 121.12580871582031, "logits/rejected": 124.68989562988281, "logps/chosen": -371.40533447265625, "logps/rejected": -424.2557678222656, "loss": 0.6373, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0666501596570015, "rewards/margins": 0.27632588148117065, "rewards/rejected": -0.20967569947242737, "step": 20 }, { "epoch": 0.5687203791469194, "grad_norm": 106.84429986615133, "learning_rate": 4.5025027361734613e-07, "logits/chosen": 140.8944091796875, "logits/rejected": 134.4232177734375, "logps/chosen": -420.9166564941406, "logps/rejected": -467.222900390625, "loss": 0.5663, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4017441272735596, "rewards/margins": 0.9039627909660339, "rewards/rejected": -2.3057069778442383, "step": 30 }, { "epoch": 0.7582938388625592, "grad_norm": 103.77783353366277, "learning_rate": 3.893311157806091e-07, "logits/chosen": 123.17936706542969, "logits/rejected": 112.06998443603516, "logps/chosen": -389.9255676269531, "logps/rejected": -417.8564453125, "loss": 0.537, "rewards/accuracies": 0.75, "rewards/chosen": -1.825113296508789, "rewards/margins": 1.1491353511810303, "rewards/rejected": -2.9742486476898193, "step": 40 }, { "epoch": 0.9478672985781991, "grad_norm": 111.71527698813779, "learning_rate": 3.126631330646801e-07, "logits/chosen": 136.4942169189453, "logits/rejected": 140.7519073486328, "logps/chosen": -462.2588806152344, "logps/rejected": -544.9989624023438, "loss": 0.4903, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.164924144744873, "rewards/margins": 1.2726519107818604, "rewards/rejected": -3.4375758171081543, "step": 50 }, { "epoch": 1.1374407582938388, "grad_norm": 47.62030824870374, "learning_rate": 2.2891223348923882e-07, "logits/chosen": 127.03621673583984, "logits/rejected": 130.60501098632812, "logps/chosen": -459.1299743652344, "logps/rejected": -547.1569213867188, "loss": 0.2948, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.0450351238250732, "rewards/margins": 2.4868345260620117, "rewards/rejected": -5.531870365142822, "step": 60 }, { "epoch": 1.3270142180094786, "grad_norm": 40.05134350853385, "learning_rate": 1.4754491880085317e-07, "logits/chosen": 116.989013671875, "logits/rejected": 119.484130859375, "logps/chosen": -432.87384033203125, "logps/rejected": -542.761474609375, "loss": 0.1734, "rewards/accuracies": 0.96875, "rewards/chosen": -3.288933515548706, "rewards/margins": 3.1884524822235107, "rewards/rejected": -6.477385520935059, "step": 70 }, { "epoch": 1.5165876777251186, "grad_norm": 173.76861933524862, "learning_rate": 7.775827023107834e-08, "logits/chosen": 100.70893859863281, "logits/rejected": 117.65828704833984, "logps/chosen": -442.055419921875, "logps/rejected": -570.5432739257812, "loss": 0.1466, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.423954963684082, "rewards/margins": 3.4382846355438232, "rewards/rejected": -7.862239837646484, "step": 80 }, { "epoch": 1.7061611374407581, "grad_norm": 46.767267802887574, "learning_rate": 2.7440387297912122e-08, "logits/chosen": 98.97454071044922, "logits/rejected": 111.58085632324219, "logps/chosen": -476.39898681640625, "logps/rejected": -606.3544311523438, "loss": 0.1352, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5173468589782715, "rewards/margins": 3.891281843185425, "rewards/rejected": -8.408628463745117, "step": 90 }, { "epoch": 1.8957345971563981, "grad_norm": 47.930873499541235, "learning_rate": 2.27878296044029e-09, "logits/chosen": 105.6420669555664, "logits/rejected": 105.06365966796875, "logps/chosen": -465.14501953125, "logps/rejected": -568.013916015625, "loss": 0.1404, "rewards/accuracies": 0.9375, "rewards/chosen": -4.164032936096191, "rewards/margins": 3.4092299938201904, "rewards/rejected": -7.573262691497803, "step": 100 }, { "epoch": 1.8957345971563981, "eval_logits/chosen": 85.3946533203125, "eval_logits/rejected": 79.68743133544922, "eval_logps/chosen": -464.6122131347656, "eval_logps/rejected": -499.045654296875, "eval_loss": 0.480917751789093, "eval_rewards/accuracies": 0.7083333134651184, "eval_rewards/chosen": -5.093915939331055, "eval_rewards/margins": 1.7607501745224, "eval_rewards/rejected": -6.854665756225586, "eval_runtime": 35.5196, "eval_samples_per_second": 21.115, "eval_steps_per_second": 0.676, "step": 100 }, { "epoch": 1.971563981042654, "step": 104, "total_flos": 0.0, "train_loss": 0.37812595069408417, "train_runtime": 1139.5405, "train_samples_per_second": 11.847, "train_steps_per_second": 0.091 } ], "logging_steps": 10, "max_steps": 104, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }