{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008, "grad_norm": 0.8209380560893507, "learning_rate": 4e-09, "logits/chosen": 0.90625, "logits/rejected": 0.90625, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.691, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0016, "grad_norm": 1.6440299235562053, "learning_rate": 8e-09, "logits/chosen": 0.75390625, "logits/rejected": 0.89453125, "logps/chosen": -51.0, "logps/rejected": -42.75, "loss": 0.6923, "loss/demonstration_loss": -298.0, "loss/preference_loss": -298.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.00030517578125, "rewards/rejected": -0.00592041015625, "step": 2 }, { "epoch": 0.0024, "grad_norm": 1.1043452323080547, "learning_rate": 1.2e-08, "logits/chosen": 0.671875, "logits/rejected": 0.65625, "logps/chosen": -33.0, "logps/rejected": -33.25, "loss": 0.691, "loss/demonstration_loss": -528.0, "loss/preference_loss": -524.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.006256103515625, "step": 3 }, { "epoch": 0.0032, "grad_norm": 0.8061901336989212, "learning_rate": 1.6e-08, "logits/chosen": 0.640625, "logits/rejected": 0.6640625, "logps/chosen": -24.0, "logps/rejected": -23.875, "loss": 0.6912, "loss/demonstration_loss": -382.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.00439453125, "step": 4 }, { "epoch": 0.004, "grad_norm": 1.4441787075317971, "learning_rate": 2e-08, "logits/chosen": 0.828125, "logits/rejected": 0.78515625, "logps/chosen": -56.0, "logps/rejected": -65.5, "loss": 0.6935, "loss/demonstration_loss": -640.0, "loss/preference_loss": -648.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.0087890625, "rewards/rejected": -0.0050048828125, "step": 5 }, { "epoch": 0.0048, "grad_norm": 1.050640679839386, "learning_rate": 2.4e-08, "logits/chosen": 0.7734375, "logits/rejected": 0.8046875, "logps/chosen": -35.75, "logps/rejected": -37.5, "loss": 0.6931, "loss/demonstration_loss": -390.0, "loss/preference_loss": -390.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.00156402587890625, "rewards/rejected": -0.00093841552734375, "step": 6 }, { "epoch": 0.0056, "grad_norm": 1.647796609281657, "learning_rate": 2.8e-08, "logits/chosen": 0.6796875, "logits/rejected": 0.5078125, "logps/chosen": -84.5, "logps/rejected": -96.5, "loss": 0.6952, "loss/demonstration_loss": -472.0, "loss/preference_loss": -478.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.034912109375, "rewards/margins": -0.016845703125, "rewards/rejected": -0.0181884765625, "step": 7 }, { "epoch": 0.0064, "grad_norm": 1.742786624192571, "learning_rate": 3.2e-08, "logits/chosen": 1.0625, "logits/rejected": 0.9921875, "logps/chosen": -19.5, "logps/rejected": -22.125, "loss": 0.6924, "loss/demonstration_loss": -220.0, "loss/preference_loss": -219.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00390625, "rewards/margins": 0.00201416015625, "rewards/rejected": -0.00592041015625, "step": 8 }, { "epoch": 0.0072, "grad_norm": 1.0960223846081436, "learning_rate": 3.6e-08, "logits/chosen": 0.7734375, "logits/rejected": 0.7578125, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6919, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9 }, { "epoch": 0.008, "grad_norm": 1.3435606973859948, "learning_rate": 4e-08, "logits/chosen": 0.58984375, "logits/rejected": 0.7421875, "logps/chosen": -50.5, "logps/rejected": -31.125, "loss": 0.6964, "loss/demonstration_loss": -434.0, "loss/preference_loss": -438.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.0078125, "rewards/rejected": 0.002197265625, "step": 10 }, { "epoch": 0.0088, "grad_norm": 1.5565361962215234, "learning_rate": 4.4e-08, "logits/chosen": 0.9765625, "logits/rejected": 0.87109375, "logps/chosen": -60.0, "logps/rejected": -75.5, "loss": 0.6893, "loss/demonstration_loss": -432.0, "loss/preference_loss": -430.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.0037689208984375, "rewards/rejected": -0.0118408203125, "step": 11 }, { "epoch": 0.0096, "grad_norm": 1.4978263868845951, "learning_rate": 4.8e-08, "logits/chosen": 0.52734375, "logits/rejected": 0.59375, "logps/chosen": -66.5, "logps/rejected": -55.0, "loss": 0.6887, "loss/demonstration_loss": -384.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0150146484375, "rewards/margins": 0.009765625, "rewards/rejected": -0.024658203125, "step": 12 }, { "epoch": 0.0104, "grad_norm": 1.085349802445053, "learning_rate": 5.1999999999999996e-08, "logits/chosen": 0.625, "logits/rejected": 0.70703125, "logps/chosen": -57.5, "logps/rejected": -41.5, "loss": 0.6921, "loss/demonstration_loss": -524.0, "loss/preference_loss": -524.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": 0.00030517578125, "rewards/rejected": -0.00909423828125, "step": 13 }, { "epoch": 0.0112, "grad_norm": 1.8575096156895132, "learning_rate": 5.6e-08, "logits/chosen": 1.046875, "logits/rejected": 0.8515625, "logps/chosen": -42.25, "logps/rejected": -77.0, "loss": 0.6893, "loss/demonstration_loss": -476.0, "loss/preference_loss": -472.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.01190185546875, "rewards/rejected": -0.0137939453125, "step": 14 }, { "epoch": 0.012, "grad_norm": 1.216806796769409, "learning_rate": 6e-08, "logits/chosen": 0.86328125, "logits/rejected": 0.80859375, "logps/chosen": -6.90625, "logps/rejected": -9.875, "loss": 0.6938, "loss/demonstration_loss": -134.0, "loss/preference_loss": -134.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.000782012939453125, "rewards/margins": -0.000156402587890625, "rewards/rejected": -0.0006256103515625, "step": 15 }, { "epoch": 0.0128, "grad_norm": 1.526972680150784, "learning_rate": 6.4e-08, "logits/chosen": 0.640625, "logits/rejected": 0.73046875, "logps/chosen": -63.75, "logps/rejected": -59.0, "loss": 0.6962, "loss/demonstration_loss": -386.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.02001953125, "rewards/margins": -0.009033203125, "rewards/rejected": -0.01092529296875, "step": 16 }, { "epoch": 0.0136, "grad_norm": 1.111122975369239, "learning_rate": 6.8e-08, "logits/chosen": 0.625, "logits/rejected": 0.60546875, "logps/chosen": -51.5, "logps/rejected": -50.5, "loss": 0.6942, "loss/demonstration_loss": -402.0, "loss/preference_loss": -406.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01434326171875, "rewards/margins": -0.01153564453125, "rewards/rejected": -0.0028076171875, "step": 17 }, { "epoch": 0.0144, "grad_norm": 1.266410458337516, "learning_rate": 7.2e-08, "logits/chosen": 0.69140625, "logits/rejected": 0.6328125, "logps/chosen": -35.5, "logps/rejected": -32.5, "loss": 0.6937, "loss/demonstration_loss": -360.0, "loss/preference_loss": -360.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0062255859375, "rewards/margins": -0.002471923828125, "rewards/rejected": -0.003753662109375, "step": 18 }, { "epoch": 0.0152, "grad_norm": 1.5994122644654925, "learning_rate": 7.599999999999999e-08, "logits/chosen": 0.86328125, "logits/rejected": 0.83203125, "logps/chosen": -95.0, "logps/rejected": -94.5, "loss": 0.688, "loss/demonstration_loss": -432.0, "loss/preference_loss": -428.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.007354736328125, "rewards/margins": 0.01806640625, "rewards/rejected": -0.0255126953125, "step": 19 }, { "epoch": 0.016, "grad_norm": 1.6927386457809033, "learning_rate": 8e-08, "logits/chosen": 0.7890625, "logits/rejected": 0.8671875, "logps/chosen": -14.3125, "logps/rejected": -15.4375, "loss": 0.6898, "loss/demonstration_loss": -240.0, "loss/preference_loss": -238.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0021820068359375, "rewards/margins": 0.0021820068359375, "rewards/rejected": 0.0, "step": 20 }, { "epoch": 0.0168, "grad_norm": 0.2794042715361866, "learning_rate": 8.4e-08, "logits/chosen": 0.51171875, "logits/rejected": 0.490234375, "logps/chosen": -14.5, "logps/rejected": -14.25, "loss": 0.6921, "loss/demonstration_loss": -226.0, "loss/preference_loss": -228.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005157470703125, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.002655029296875, "step": 21 }, { "epoch": 0.0176, "grad_norm": 1.6977704386860895, "learning_rate": 8.8e-08, "logits/chosen": 0.57421875, "logits/rejected": 0.78515625, "logps/chosen": -80.0, "logps/rejected": -63.75, "loss": 0.6915, "loss/demonstration_loss": -384.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.0106201171875, "step": 22 }, { "epoch": 0.0184, "grad_norm": 2.1173310593035466, "learning_rate": 9.199999999999999e-08, "logits/chosen": 0.73828125, "logits/rejected": 0.7734375, "logps/chosen": -61.5, "logps/rejected": -64.0, "loss": 0.6929, "loss/demonstration_loss": -398.0, "loss/preference_loss": -400.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01220703125, "rewards/margins": -0.005950927734375, "rewards/rejected": -0.006256103515625, "step": 23 }, { "epoch": 0.0192, "grad_norm": 1.572484914762249, "learning_rate": 9.6e-08, "logits/chosen": 0.6953125, "logits/rejected": 0.60546875, "logps/chosen": -67.5, "logps/rejected": -75.0, "loss": 0.6901, "loss/demonstration_loss": -568.0, "loss/preference_loss": -564.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0078125, "rewards/margins": 0.00531005859375, "rewards/rejected": -0.01312255859375, "step": 24 }, { "epoch": 0.02, "grad_norm": 1.199028538220455, "learning_rate": 1e-07, "logits/chosen": 1.03125, "logits/rejected": 1.015625, "logps/chosen": -22.875, "logps/rejected": -22.875, "loss": 0.6925, "loss/demonstration_loss": -728.0, "loss/preference_loss": -728.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0, "rewards/rejected": -0.00250244140625, "step": 25 }, { "epoch": 0.0208, "grad_norm": 1.0837316278288727, "learning_rate": 1.0399999999999999e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.73046875, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6907, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 26 }, { "epoch": 0.0216, "grad_norm": 1.5434481457115024, "learning_rate": 1.0799999999999999e-07, "logits/chosen": 1.0078125, "logits/rejected": 0.87109375, "logps/chosen": -47.5, "logps/rejected": -60.25, "loss": 0.6895, "loss/demonstration_loss": -342.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.00439453125, "rewards/rejected": -0.01123046875, "step": 27 }, { "epoch": 0.0224, "grad_norm": 1.721953411291839, "learning_rate": 1.12e-07, "logits/chosen": 0.5625, "logits/rejected": 0.546875, "logps/chosen": -4.96875, "logps/rejected": -7.71875, "loss": 0.6907, "loss/demonstration_loss": -204.0, "loss/preference_loss": -203.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00031280517578125, "rewards/margins": 0.00031280517578125, "rewards/rejected": 0.0, "step": 28 }, { "epoch": 0.0232, "grad_norm": 1.786367442578249, "learning_rate": 1.16e-07, "logits/chosen": 0.69921875, "logits/rejected": 0.90234375, "logps/chosen": -77.5, "logps/rejected": -62.0, "loss": 0.6934, "loss/demonstration_loss": -552.0, "loss/preference_loss": -556.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0084228515625, "rewards/margins": -0.00439453125, "rewards/rejected": -0.004058837890625, "step": 29 }, { "epoch": 0.024, "grad_norm": 0.8378269882448623, "learning_rate": 1.2e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.9140625, "logps/chosen": -51.0, "logps/rejected": -42.75, "loss": 0.6931, "loss/demonstration_loss": -372.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0078125, "rewards/margins": -0.00421142578125, "rewards/rejected": -0.00360107421875, "step": 30 }, { "epoch": 0.0248, "grad_norm": 0.8738754417371163, "learning_rate": 1.24e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.67578125, "logps/chosen": -22.0, "logps/rejected": -23.5, "loss": 0.6931, "loss/demonstration_loss": -358.0, "loss/preference_loss": -362.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00531005859375, "rewards/margins": -0.002349853515625, "rewards/rejected": -0.0029754638671875, "step": 31 }, { "epoch": 0.0256, "grad_norm": 0.8509769800116922, "learning_rate": 1.28e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.75390625, "logps/chosen": -26.75, "logps/rejected": -29.5, "loss": 0.6929, "loss/demonstration_loss": -296.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0084228515625, "rewards/margins": -0.00970458984375, "rewards/rejected": 0.001251220703125, "step": 32 }, { "epoch": 0.0264, "grad_norm": 1.462569932281536, "learning_rate": 1.32e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.5859375, "logps/chosen": -63.0, "logps/rejected": -60.75, "loss": 0.693, "loss/demonstration_loss": -490.0, "loss/preference_loss": -492.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.00531005859375, "rewards/rejected": -0.0078125, "step": 33 }, { "epoch": 0.0272, "grad_norm": 1.2177658389350623, "learning_rate": 1.36e-07, "logits/chosen": 0.53515625, "logits/rejected": 0.515625, "logps/chosen": -45.5, "logps/rejected": -44.75, "loss": 0.6926, "loss/demonstration_loss": -476.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01220703125, "rewards/margins": -0.00689697265625, "rewards/rejected": -0.00531005859375, "step": 34 }, { "epoch": 0.028, "grad_norm": 1.5869321769385532, "learning_rate": 1.4e-07, "logits/chosen": 0.703125, "logits/rejected": 0.55078125, "logps/chosen": -50.0, "logps/rejected": -76.0, "loss": 0.6917, "loss/demonstration_loss": -500.0, "loss/preference_loss": -500.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01092529296875, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.00811767578125, "step": 35 }, { "epoch": 0.0288, "grad_norm": 1.6375657929926535, "learning_rate": 1.44e-07, "logits/chosen": 0.94140625, "logits/rejected": 0.921875, "logps/chosen": -28.0, "logps/rejected": -40.75, "loss": 0.6903, "loss/demonstration_loss": -274.0, "loss/preference_loss": -272.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.00592041015625, "rewards/rejected": -0.00970458984375, "step": 36 }, { "epoch": 0.0296, "grad_norm": 0.7681532593454634, "learning_rate": 1.4799999999999998e-07, "logits/chosen": 0.5625, "logits/rejected": 0.5234375, "logps/chosen": -13.75, "logps/rejected": -17.25, "loss": 0.6919, "loss/demonstration_loss": -245.0, "loss/preference_loss": -246.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003448486328125, "rewards/margins": -0.00156402587890625, "rewards/rejected": -0.0018768310546875, "step": 37 }, { "epoch": 0.0304, "grad_norm": 1.2333386551576944, "learning_rate": 1.5199999999999998e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.62109375, "logps/chosen": -20.5, "logps/rejected": -26.25, "loss": 0.6926, "loss/demonstration_loss": -249.0, "loss/preference_loss": -249.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.002197265625, "rewards/margins": -0.0028076171875, "rewards/rejected": 0.0006256103515625, "step": 38 }, { "epoch": 0.0312, "grad_norm": 1.9072928978163763, "learning_rate": 1.56e-07, "logits/chosen": 0.84765625, "logits/rejected": 0.91796875, "logps/chosen": -24.5, "logps/rejected": -17.25, "loss": 0.6921, "loss/demonstration_loss": -220.0, "loss/preference_loss": -221.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00469970703125, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.00156402587890625, "step": 39 }, { "epoch": 0.032, "grad_norm": 1.0214934634872623, "learning_rate": 1.6e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.78515625, "logps/chosen": -3.53125, "logps/rejected": -9.875, "loss": 0.6899, "loss/demonstration_loss": -214.0, "loss/preference_loss": -214.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.000469207763671875, "rewards/margins": -0.000469207763671875, "rewards/rejected": 0.0, "step": 40 }, { "epoch": 0.0328, "grad_norm": 0.9368247594445012, "learning_rate": 1.64e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.80859375, "logps/chosen": -34.25, "logps/rejected": -34.5, "loss": 0.6912, "loss/demonstration_loss": -364.0, "loss/preference_loss": -364.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00531005859375, "rewards/margins": 0.0015869140625, "rewards/rejected": -0.00689697265625, "step": 41 }, { "epoch": 0.0336, "grad_norm": 1.609620786077252, "learning_rate": 1.68e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.59375, "logps/chosen": -34.5, "logps/rejected": -41.25, "loss": 0.6901, "loss/demonstration_loss": -400.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.00567626953125, "rewards/rejected": -0.0137939453125, "step": 42 }, { "epoch": 0.0344, "grad_norm": 0.7383073838571603, "learning_rate": 1.7199999999999998e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.78515625, "logps/chosen": -14.0, "logps/rejected": -14.1875, "loss": 0.6921, "loss/demonstration_loss": -448.0, "loss/preference_loss": -446.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.0031280517578125, "step": 43 }, { "epoch": 0.0352, "grad_norm": 1.8112552587094783, "learning_rate": 1.76e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.74609375, "logps/chosen": -51.0, "logps/rejected": -60.5, "loss": 0.6907, "loss/demonstration_loss": -356.0, "loss/preference_loss": -352.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.00124359130859375, "rewards/margins": 0.010009765625, "rewards/rejected": -0.01123046875, "step": 44 }, { "epoch": 0.036, "grad_norm": 1.2397436726110618, "learning_rate": 1.8e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.69921875, "logps/chosen": -52.0, "logps/rejected": -54.0, "loss": 0.6898, "loss/demonstration_loss": -560.0, "loss/preference_loss": -556.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01123046875, "rewards/margins": 0.006927490234375, "rewards/rejected": -0.0181884765625, "step": 45 }, { "epoch": 0.0368, "grad_norm": 0.9454999175726874, "learning_rate": 1.8399999999999998e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.84375, "logps/chosen": -12.8125, "logps/rejected": -12.9375, "loss": 0.6925, "loss/demonstration_loss": -408.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0031280517578125, "step": 46 }, { "epoch": 0.0376, "grad_norm": 0.9891235578015929, "learning_rate": 1.88e-07, "logits/chosen": 0.7578125, "logits/rejected": 0.75, "logps/chosen": -2.59375, "logps/rejected": -5.21875, "loss": 0.6915, "loss/demonstration_loss": -125.0, "loss/preference_loss": -121.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.000156402587890625, "rewards/margins": 0.002655029296875, "rewards/rejected": -0.00250244140625, "step": 47 }, { "epoch": 0.0384, "grad_norm": 1.8705786109410005, "learning_rate": 1.92e-07, "logits/chosen": 0.8984375, "logits/rejected": 1.1328125, "logps/chosen": -77.0, "logps/rejected": -57.0, "loss": 0.6925, "loss/demonstration_loss": -426.0, "loss/preference_loss": -430.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00531005859375, "rewards/margins": -0.005950927734375, "rewards/rejected": 0.0006256103515625, "step": 48 }, { "epoch": 0.0392, "grad_norm": 1.6710761825634408, "learning_rate": 1.96e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.76953125, "logps/chosen": -56.25, "logps/rejected": -47.75, "loss": 0.6925, "loss/demonstration_loss": -328.0, "loss/preference_loss": -330.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01434326171875, "rewards/margins": -0.0067138671875, "rewards/rejected": -0.007659912109375, "step": 49 }, { "epoch": 0.04, "grad_norm": 1.3662039337117682, "learning_rate": 2e-07, "logits/chosen": 0.4375, "logits/rejected": 0.44140625, "logps/chosen": -39.5, "logps/rejected": -39.5, "loss": 0.6934, "loss/demonstration_loss": -624.0, "loss/preference_loss": -624.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0087890625, "rewards/margins": 0.0, "rewards/rejected": -0.0087890625, "step": 50 }, { "epoch": 0.0408, "grad_norm": 1.1870509466254304, "learning_rate": 2.0399999999999997e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.890625, "logps/chosen": -66.0, "logps/rejected": -48.5, "loss": 0.691, "loss/demonstration_loss": -364.0, "loss/preference_loss": -364.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0098876953125, "rewards/margins": -0.000823974609375, "rewards/rejected": -0.009033203125, "step": 51 }, { "epoch": 0.0416, "grad_norm": 1.5799262187181753, "learning_rate": 2.0799999999999998e-07, "logits/chosen": 1.046875, "logits/rejected": 0.87109375, "logps/chosen": -43.5, "logps/rejected": -56.75, "loss": 0.6914, "loss/demonstration_loss": -398.0, "loss/preference_loss": -398.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0078125, "rewards/margins": 0.0, "rewards/rejected": -0.0078125, "step": 52 }, { "epoch": 0.0424, "grad_norm": 1.32336733229503, "learning_rate": 2.12e-07, "logits/chosen": 0.859375, "logits/rejected": 0.9140625, "logps/chosen": -68.0, "logps/rejected": -62.25, "loss": 0.6931, "loss/demonstration_loss": -414.0, "loss/preference_loss": -414.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.010009765625, "rewards/margins": -0.000782012939453125, "rewards/rejected": -0.00921630859375, "step": 53 }, { "epoch": 0.0432, "grad_norm": 1.017483775757399, "learning_rate": 2.1599999999999998e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.671875, "logps/chosen": -4.125, "logps/rejected": -4.03125, "loss": 0.6923, "loss/demonstration_loss": -129.0, "loss/preference_loss": -130.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00093841552734375, "rewards/margins": -0.00093841552734375, "rewards/rejected": 0.0, "step": 54 }, { "epoch": 0.044, "grad_norm": 2.169341093306222, "learning_rate": 2.1999999999999998e-07, "logits/chosen": 0.94921875, "logits/rejected": 0.97265625, "logps/chosen": -7.46875, "logps/rejected": -7.03125, "loss": 0.6919, "loss/demonstration_loss": -234.0, "loss/preference_loss": -229.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.0028076171875, "rewards/rejected": -0.0018768310546875, "step": 55 }, { "epoch": 0.0448, "grad_norm": 1.8828861775944417, "learning_rate": 2.24e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.76953125, "logps/chosen": -44.5, "logps/rejected": -45.5, "loss": 0.694, "loss/demonstration_loss": -478.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0017242431640625, "rewards/margins": -0.00093841552734375, "rewards/rejected": -0.000782012939453125, "step": 56 }, { "epoch": 0.0456, "grad_norm": 1.1525111979796472, "learning_rate": 2.28e-07, "logits/chosen": 0.57421875, "logits/rejected": 0.66796875, "logps/chosen": -10.5625, "logps/rejected": -3.234375, "loss": 0.6957, "loss/demonstration_loss": -213.0, "loss/preference_loss": -223.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.006256103515625, "rewards/rejected": 0.001251220703125, "step": 57 }, { "epoch": 0.0464, "grad_norm": 1.2269553760950345, "learning_rate": 2.32e-07, "logits/chosen": 0.5625, "logits/rejected": 0.62890625, "logps/chosen": -47.25, "logps/rejected": -46.5, "loss": 0.6956, "loss/demonstration_loss": -494.0, "loss/preference_loss": -500.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.010009765625, "rewards/rejected": 0.0006256103515625, "step": 58 }, { "epoch": 0.0472, "grad_norm": 0.9855459625896333, "learning_rate": 2.3599999999999997e-07, "logits/chosen": 0.97265625, "logits/rejected": 0.875, "logps/chosen": -39.0, "logps/rejected": -42.5, "loss": 0.6924, "loss/demonstration_loss": -258.0, "loss/preference_loss": -260.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.004547119140625, "rewards/rejected": -0.00360107421875, "step": 59 }, { "epoch": 0.048, "grad_norm": 1.2449316926147886, "learning_rate": 2.4e-07, "logits/chosen": 0.78125, "logits/rejected": 0.64453125, "logps/chosen": -29.625, "logps/rejected": -34.5, "loss": 0.6932, "loss/demonstration_loss": -334.0, "loss/preference_loss": -336.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0172119140625, "rewards/margins": -0.006591796875, "rewards/rejected": -0.0106201171875, "step": 60 }, { "epoch": 0.0488, "grad_norm": 1.3971478293331656, "learning_rate": 2.4399999999999996e-07, "logits/chosen": 0.859375, "logits/rejected": 0.8125, "logps/chosen": -56.5, "logps/rejected": -61.75, "loss": 0.6898, "loss/demonstration_loss": -470.0, "loss/preference_loss": -466.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.010009765625, "rewards/margins": 0.006866455078125, "rewards/rejected": -0.016845703125, "step": 61 }, { "epoch": 0.0496, "grad_norm": 1.0982642699121732, "learning_rate": 2.48e-07, "logits/chosen": 0.78125, "logits/rejected": 0.73046875, "logps/chosen": -14.3125, "logps/rejected": -19.625, "loss": 0.6923, "loss/demonstration_loss": -544.0, "loss/preference_loss": -536.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.003753662109375, "step": 62 }, { "epoch": 0.0504, "grad_norm": 0.9495899598730796, "learning_rate": 2.52e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.8203125, "logps/chosen": -22.25, "logps/rejected": -21.75, "loss": 0.6936, "loss/demonstration_loss": -344.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006866455078125, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.0018768310546875, "step": 63 }, { "epoch": 0.0512, "grad_norm": 1.6254169902993139, "learning_rate": 2.56e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.625, "logps/chosen": -40.5, "logps/rejected": -70.5, "loss": 0.6893, "loss/demonstration_loss": -442.0, "loss/preference_loss": -440.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0029754638671875, "rewards/margins": 0.005157470703125, "rewards/rejected": -0.00811767578125, "step": 64 }, { "epoch": 0.052, "grad_norm": 1.2487694353971655, "learning_rate": 2.6e-07, "logits/chosen": 0.65625, "logits/rejected": 0.67578125, "logps/chosen": -31.0, "logps/rejected": -32.0, "loss": 0.6888, "loss/demonstration_loss": -336.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.009033203125, "rewards/rejected": -0.0084228515625, "step": 65 }, { "epoch": 0.0528, "grad_norm": 1.6985901315312775, "learning_rate": 2.64e-07, "logits/chosen": 0.84375, "logits/rejected": 0.94921875, "logps/chosen": -44.0, "logps/rejected": -31.75, "loss": 0.6882, "loss/demonstration_loss": -398.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01251220703125, "rewards/margins": 0.01025390625, "rewards/rejected": -0.022705078125, "step": 66 }, { "epoch": 0.0536, "grad_norm": 1.4672361545407757, "learning_rate": 2.68e-07, "logits/chosen": 0.59765625, "logits/rejected": 0.6171875, "logps/chosen": -29.0, "logps/rejected": -29.25, "loss": 0.6901, "loss/demonstration_loss": -460.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.00153350830078125, "rewards/rejected": -0.007781982421875, "step": 67 }, { "epoch": 0.0544, "grad_norm": 1.800880366741341, "learning_rate": 2.72e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.73828125, "logps/chosen": -31.75, "logps/rejected": -24.0, "loss": 0.692, "loss/demonstration_loss": -442.0, "loss/preference_loss": -440.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": 0.001220703125, "rewards/rejected": -0.0068359375, "step": 68 }, { "epoch": 0.0552, "grad_norm": 1.4488776918762911, "learning_rate": 2.7600000000000004e-07, "logits/chosen": 0.984375, "logits/rejected": 1.0859375, "logps/chosen": -48.5, "logps/rejected": -27.25, "loss": 0.6895, "loss/demonstration_loss": -402.0, "loss/preference_loss": -402.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.006561279296875, "step": 69 }, { "epoch": 0.056, "grad_norm": 0.6296325465307354, "learning_rate": 2.8e-07, "logits/chosen": 0.58203125, "logits/rejected": 0.546875, "logps/chosen": -31.5, "logps/rejected": -31.5, "loss": 0.6915, "loss/demonstration_loss": -498.0, "loss/preference_loss": -500.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0, "rewards/rejected": -0.00439453125, "step": 70 }, { "epoch": 0.0568, "grad_norm": 1.4416479670403033, "learning_rate": 2.8399999999999995e-07, "logits/chosen": 0.86328125, "logits/rejected": 0.81640625, "logps/chosen": -43.75, "logps/rejected": -49.25, "loss": 0.6909, "loss/demonstration_loss": -370.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00250244140625, "step": 71 }, { "epoch": 0.0576, "grad_norm": 0.9149307208835594, "learning_rate": 2.88e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.85546875, "logps/chosen": -42.0, "logps/rejected": -40.0, "loss": 0.6923, "loss/demonstration_loss": -436.0, "loss/preference_loss": -436.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00439453125, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.00250244140625, "step": 72 }, { "epoch": 0.0584, "grad_norm": 2.1537184337707904, "learning_rate": 2.9199999999999997e-07, "logits/chosen": 0.91015625, "logits/rejected": 1.0625, "logps/chosen": -58.25, "logps/rejected": -44.5, "loss": 0.6932, "loss/demonstration_loss": -408.0, "loss/preference_loss": -410.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": -0.00439453125, "rewards/rejected": -0.00439453125, "step": 73 }, { "epoch": 0.0592, "grad_norm": 1.0453387463011836, "learning_rate": 2.9599999999999995e-07, "logits/chosen": 0.953125, "logits/rejected": 0.80078125, "logps/chosen": -11.25, "logps/rejected": -22.25, "loss": 0.6927, "loss/demonstration_loss": -264.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.004364013671875, "rewards/margins": -0.00154876708984375, "rewards/rejected": -0.0028076171875, "step": 74 }, { "epoch": 0.06, "grad_norm": 1.6942018368777387, "learning_rate": 3e-07, "logits/chosen": 0.828125, "logits/rejected": 0.88671875, "logps/chosen": -102.0, "logps/rejected": -82.5, "loss": 0.6938, "loss/demonstration_loss": -364.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.02294921875, "rewards/margins": -0.004669189453125, "rewards/rejected": -0.018310546875, "step": 75 }, { "epoch": 0.0608, "grad_norm": 1.1758791366529964, "learning_rate": 3.0399999999999997e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.51171875, "logps/chosen": -29.75, "logps/rejected": -30.75, "loss": 0.6914, "loss/demonstration_loss": -322.0, "loss/preference_loss": -318.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.006256103515625, "rewards/rejected": -0.00811767578125, "step": 76 }, { "epoch": 0.0616, "grad_norm": 1.376488471008137, "learning_rate": 3.08e-07, "logits/chosen": 0.5078125, "logits/rejected": 0.5234375, "logps/chosen": -38.0, "logps/rejected": -39.0, "loss": 0.6906, "loss/demonstration_loss": -616.0, "loss/preference_loss": -608.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.00872802734375, "rewards/rejected": -0.01123046875, "step": 77 }, { "epoch": 0.0624, "grad_norm": 1.1175161767053294, "learning_rate": 3.12e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.8203125, "logps/chosen": -3.125, "logps/rejected": -4.40625, "loss": 0.6921, "loss/demonstration_loss": -121.5, "loss/preference_loss": -121.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.0, "rewards/rejected": 0.0006256103515625, "step": 78 }, { "epoch": 0.0632, "grad_norm": 1.10197521289134, "learning_rate": 3.1599999999999997e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.72265625, "logps/chosen": -27.75, "logps/rejected": -35.0, "loss": 0.6927, "loss/demonstration_loss": -334.0, "loss/preference_loss": -336.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.002349853515625, "rewards/margins": -0.00360107421875, "rewards/rejected": 0.001251220703125, "step": 79 }, { "epoch": 0.064, "grad_norm": 0.9172757821978966, "learning_rate": 3.2e-07, "logits/chosen": 0.578125, "logits/rejected": 0.5625, "logps/chosen": -11.6875, "logps/rejected": -11.625, "loss": 0.693, "loss/demonstration_loss": -185.0, "loss/preference_loss": -186.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00156402587890625, "rewards/margins": -0.00093841552734375, "rewards/rejected": -0.0006256103515625, "step": 80 }, { "epoch": 0.0648, "grad_norm": 1.3729118127157203, "learning_rate": 3.24e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.6484375, "logps/chosen": -12.0625, "logps/rejected": -12.25, "loss": 0.6914, "loss/demonstration_loss": -384.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.001861572265625, "rewards/rejected": -0.004364013671875, "step": 81 }, { "epoch": 0.0656, "grad_norm": 1.4357386792130993, "learning_rate": 3.28e-07, "logits/chosen": 1.140625, "logits/rejected": 0.96875, "logps/chosen": -13.6875, "logps/rejected": -41.5, "loss": 0.6895, "loss/demonstration_loss": -294.0, "loss/preference_loss": -290.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00156402587890625, "rewards/margins": 0.006103515625, "rewards/rejected": -0.007659912109375, "step": 82 }, { "epoch": 0.0664, "grad_norm": 1.1675794414624217, "learning_rate": 3.32e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.94921875, "logps/chosen": -44.5, "logps/rejected": -51.25, "loss": 0.6918, "loss/demonstration_loss": -508.0, "loss/preference_loss": -508.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.005615234375, "step": 83 }, { "epoch": 0.0672, "grad_norm": 17.364907628875546, "learning_rate": 3.36e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.7421875, "logps/chosen": -38.0, "logps/rejected": -34.0, "loss": 0.6903, "loss/demonstration_loss": -384.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0028076171875, "rewards/margins": 0.005615234375, "rewards/rejected": -0.0084228515625, "step": 84 }, { "epoch": 0.068, "grad_norm": 1.1349257316301735, "learning_rate": 3.4000000000000003e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.71875, "logps/chosen": -35.5, "logps/rejected": -31.75, "loss": 0.6918, "loss/demonstration_loss": -532.0, "loss/preference_loss": -532.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.0072021484375, "step": 85 }, { "epoch": 0.0688, "grad_norm": 0.867517897537488, "learning_rate": 3.4399999999999996e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.85546875, "logps/chosen": -43.5, "logps/rejected": -36.0, "loss": 0.6897, "loss/demonstration_loss": -422.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.004058837890625, "rewards/margins": 0.0034332275390625, "rewards/rejected": -0.00750732421875, "step": 86 }, { "epoch": 0.0696, "grad_norm": 6.584689000818033, "learning_rate": 3.4799999999999994e-07, "logits/chosen": 0.578125, "logits/rejected": 0.578125, "logps/chosen": -31.75, "logps/rejected": -31.75, "loss": 0.6914, "loss/demonstration_loss": -334.0, "loss/preference_loss": -334.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.00128173828125, "rewards/rejected": -0.0087890625, "step": 87 }, { "epoch": 0.0704, "grad_norm": 1.3274912755115518, "learning_rate": 3.52e-07, "logits/chosen": 0.86328125, "logits/rejected": 0.83203125, "logps/chosen": -25.5, "logps/rejected": -32.0, "loss": 0.6912, "loss/demonstration_loss": -456.0, "loss/preference_loss": -454.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.005615234375, "step": 88 }, { "epoch": 0.0712, "grad_norm": 1.4414207869963513, "learning_rate": 3.5599999999999996e-07, "logits/chosen": 0.75, "logits/rejected": 0.90625, "logps/chosen": -65.5, "logps/rejected": -45.25, "loss": 0.6898, "loss/demonstration_loss": -352.0, "loss/preference_loss": -352.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.0050048828125, "step": 89 }, { "epoch": 0.072, "grad_norm": 0.6800437351634878, "learning_rate": 3.6e-07, "logits/chosen": 0.466796875, "logits/rejected": 0.47265625, "logps/chosen": -25.125, "logps/rejected": -25.125, "loss": 0.6913, "loss/demonstration_loss": -800.0, "loss/preference_loss": -800.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0, "rewards/rejected": -0.003753662109375, "step": 90 }, { "epoch": 0.0728, "grad_norm": 1.1870565382964298, "learning_rate": 3.64e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.6640625, "logps/chosen": -20.125, "logps/rejected": -20.5, "loss": 0.6913, "loss/demonstration_loss": -324.0, "loss/preference_loss": -320.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0028076171875, "rewards/margins": 0.00421142578125, "rewards/rejected": -0.007049560546875, "step": 91 }, { "epoch": 0.0736, "grad_norm": 0.9365367697254688, "learning_rate": 3.6799999999999996e-07, "logits/chosen": 0.4765625, "logits/rejected": 0.498046875, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6899, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 92 }, { "epoch": 0.0744, "grad_norm": 1.781922363164748, "learning_rate": 3.72e-07, "logits/chosen": 0.9375, "logits/rejected": 0.98046875, "logps/chosen": -39.25, "logps/rejected": -33.25, "loss": 0.6896, "loss/demonstration_loss": -386.0, "loss/preference_loss": -384.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.000469207763671875, "rewards/margins": 0.00189971923828125, "rewards/rejected": -0.002349853515625, "step": 93 }, { "epoch": 0.0752, "grad_norm": 0.9741059685196195, "learning_rate": 3.76e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.546875, "logps/chosen": -4.46875, "logps/rejected": -4.40625, "loss": 0.6913, "loss/demonstration_loss": -141.0, "loss/preference_loss": -142.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0006256103515625, "rewards/margins": -0.0006256103515625, "rewards/rejected": 0.0, "step": 94 }, { "epoch": 0.076, "grad_norm": 0.8718042408555279, "learning_rate": 3.7999999999999996e-07, "logits/chosen": 0.765625, "logits/rejected": 0.75390625, "logps/chosen": -13.1875, "logps/rejected": -12.9375, "loss": 0.6932, "loss/demonstration_loss": -204.0, "loss/preference_loss": -207.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004852294921875, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.002349853515625, "step": 95 }, { "epoch": 0.0768, "grad_norm": 0.7110577371171516, "learning_rate": 3.84e-07, "logits/chosen": 0.7578125, "logits/rejected": 0.75, "logps/chosen": -21.75, "logps/rejected": -21.75, "loss": 0.6907, "loss/demonstration_loss": -344.0, "loss/preference_loss": -344.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005462646484375, "rewards/margins": -0.0010986328125, "rewards/rejected": -0.00439453125, "step": 96 }, { "epoch": 0.0776, "grad_norm": 0.8946144146680481, "learning_rate": 3.88e-07, "logits/chosen": 0.65625, "logits/rejected": 0.65625, "logps/chosen": -39.0, "logps/rejected": -39.0, "loss": 0.6923, "loss/demonstration_loss": -616.0, "loss/preference_loss": -616.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.010009765625, "rewards/margins": 0.0, "rewards/rejected": -0.010009765625, "step": 97 }, { "epoch": 0.0784, "grad_norm": 0.8781400684825423, "learning_rate": 3.92e-07, "logits/chosen": 0.5859375, "logits/rejected": 0.58984375, "logps/chosen": -28.625, "logps/rejected": -29.25, "loss": 0.6926, "loss/demonstration_loss": -458.0, "loss/preference_loss": -452.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.00689697265625, "rewards/rejected": -0.01312255859375, "step": 98 }, { "epoch": 0.0792, "grad_norm": 0.6587745671548039, "learning_rate": 3.96e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.640625, "logps/chosen": -21.375, "logps/rejected": -21.25, "loss": 0.6929, "loss/demonstration_loss": -336.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.0012664794921875, "rewards/rejected": -0.0037384033203125, "step": 99 }, { "epoch": 0.08, "grad_norm": 1.0130955315751637, "learning_rate": 4e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.6875, "logps/chosen": -44.75, "logps/rejected": -37.0, "loss": 0.6938, "loss/demonstration_loss": -432.0, "loss/preference_loss": -434.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00689697265625, "rewards/margins": -0.0034332275390625, "rewards/rejected": -0.003448486328125, "step": 100 }, { "epoch": 0.0808, "grad_norm": 1.3907440334009287, "learning_rate": 4.04e-07, "logits/chosen": 1.078125, "logits/rejected": 1.15625, "logps/chosen": -62.5, "logps/rejected": -48.75, "loss": 0.6907, "loss/demonstration_loss": -588.0, "loss/preference_loss": -588.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.01251220703125, "step": 101 }, { "epoch": 0.0816, "grad_norm": 1.4041496150601482, "learning_rate": 4.0799999999999995e-07, "logits/chosen": 1.078125, "logits/rejected": 0.984375, "logps/chosen": -52.75, "logps/rejected": -57.5, "loss": 0.6913, "loss/demonstration_loss": -438.0, "loss/preference_loss": -436.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.0028076171875, "rewards/rejected": -0.00970458984375, "step": 102 }, { "epoch": 0.0824, "grad_norm": 2.79887572759493, "learning_rate": 4.12e-07, "logits/chosen": 0.859375, "logits/rejected": 0.9609375, "logps/chosen": -79.5, "logps/rejected": -78.0, "loss": 0.6917, "loss/demonstration_loss": -356.0, "loss/preference_loss": -356.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.014404296875, "rewards/margins": 0.001495361328125, "rewards/rejected": -0.015869140625, "step": 103 }, { "epoch": 0.0832, "grad_norm": 1.4649790249566117, "learning_rate": 4.1599999999999997e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.57421875, "logps/chosen": -13.4375, "logps/rejected": -15.75, "loss": 0.6925, "loss/demonstration_loss": -232.0, "loss/preference_loss": -233.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0006256103515625, "step": 104 }, { "epoch": 0.084, "grad_norm": 0.6561169016405004, "learning_rate": 4.1999999999999995e-07, "logits/chosen": 0.9140625, "logits/rejected": 0.9609375, "logps/chosen": -7.71875, "logps/rejected": -2.40625, "loss": 0.6919, "loss/demonstration_loss": -162.0, "loss/preference_loss": -163.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": -0.000469207763671875, "rewards/rejected": 0.000469207763671875, "step": 105 }, { "epoch": 0.0848, "grad_norm": 1.139413302841845, "learning_rate": 4.24e-07, "logits/chosen": 0.85546875, "logits/rejected": 0.8984375, "logps/chosen": -21.5, "logps/rejected": -11.75, "loss": 0.6912, "loss/demonstration_loss": -262.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.004058837890625, "rewards/rejected": -0.00093841552734375, "step": 106 }, { "epoch": 0.0856, "grad_norm": 1.3734086334155666, "learning_rate": 4.2799999999999997e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.69921875, "logps/chosen": -17.25, "logps/rejected": -16.25, "loss": 0.6943, "loss/demonstration_loss": -262.0, "loss/preference_loss": -270.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.00970458984375, "rewards/rejected": 0.0021820068359375, "step": 107 }, { "epoch": 0.0864, "grad_norm": 1.350355869961073, "learning_rate": 4.3199999999999995e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.73046875, "logps/chosen": -40.0, "logps/rejected": -42.0, "loss": 0.6904, "loss/demonstration_loss": -326.0, "loss/preference_loss": -324.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.0093994140625, "step": 108 }, { "epoch": 0.0872, "grad_norm": 1.3386762615950862, "learning_rate": 4.36e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.87890625, "logps/chosen": -61.0, "logps/rejected": -58.25, "loss": 0.6929, "loss/demonstration_loss": -380.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.00046539306640625, "rewards/rejected": -0.00799560546875, "step": 109 }, { "epoch": 0.088, "grad_norm": 1.690081659441462, "learning_rate": 4.3999999999999997e-07, "logits/chosen": 0.59765625, "logits/rejected": 0.76171875, "logps/chosen": -69.0, "logps/rejected": -61.75, "loss": 0.6941, "loss/demonstration_loss": -516.0, "loss/preference_loss": -520.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01220703125, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.0093994140625, "step": 110 }, { "epoch": 0.0888, "grad_norm": 0.8766809705584249, "learning_rate": 4.44e-07, "logits/chosen": 0.6875, "logits/rejected": 0.69921875, "logps/chosen": -48.75, "logps/rejected": -48.5, "loss": 0.6929, "loss/demonstration_loss": -384.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.010009765625, "rewards/margins": -0.0040283203125, "rewards/rejected": -0.005950927734375, "step": 111 }, { "epoch": 0.0896, "grad_norm": 1.9676259756514616, "learning_rate": 4.48e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.95703125, "logps/chosen": -45.5, "logps/rejected": -45.0, "loss": 0.6947, "loss/demonstration_loss": -478.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0062255859375, "rewards/margins": -0.001861572265625, "rewards/rejected": -0.00439453125, "step": 112 }, { "epoch": 0.0904, "grad_norm": 1.9990828168116606, "learning_rate": 4.5199999999999997e-07, "logits/chosen": 0.71875, "logits/rejected": 0.8125, "logps/chosen": -91.5, "logps/rejected": -81.0, "loss": 0.6909, "loss/demonstration_loss": -392.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.00092315673828125, "rewards/rejected": -0.01220703125, "step": 113 }, { "epoch": 0.0912, "grad_norm": 0.9098292339017582, "learning_rate": 4.56e-07, "logits/chosen": 0.7734375, "logits/rejected": 0.76953125, "logps/chosen": -30.625, "logps/rejected": -33.5, "loss": 0.6907, "loss/demonstration_loss": -342.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.0050048828125, "step": 114 }, { "epoch": 0.092, "grad_norm": 1.1198797693043432, "learning_rate": 4.6e-07, "logits/chosen": 0.7578125, "logits/rejected": 0.78515625, "logps/chosen": -39.25, "logps/rejected": -41.5, "loss": 0.6935, "loss/demonstration_loss": -318.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01092529296875, "rewards/margins": -0.00848388671875, "rewards/rejected": -0.00250244140625, "step": 115 }, { "epoch": 0.0928, "grad_norm": 1.482656601304879, "learning_rate": 4.64e-07, "logits/chosen": 0.546875, "logits/rejected": 0.5703125, "logps/chosen": -58.75, "logps/rejected": -62.25, "loss": 0.693, "loss/demonstration_loss": -476.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.018798828125, "rewards/margins": -0.005615234375, "rewards/rejected": -0.01312255859375, "step": 116 }, { "epoch": 0.0936, "grad_norm": 1.4354266146408057, "learning_rate": 4.68e-07, "logits/chosen": 0.83203125, "logits/rejected": 0.734375, "logps/chosen": -34.0, "logps/rejected": -46.5, "loss": 0.6943, "loss/demonstration_loss": -428.0, "loss/preference_loss": -428.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": -1.52587890625e-05, "rewards/rejected": -0.005615234375, "step": 117 }, { "epoch": 0.0944, "grad_norm": 1.1849483770842528, "learning_rate": 4.7199999999999994e-07, "logits/chosen": 0.765625, "logits/rejected": 0.7734375, "logps/chosen": -5.6875, "logps/rejected": -5.8125, "loss": 0.6932, "loss/demonstration_loss": -186.0, "loss/preference_loss": -184.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00156402587890625, "rewards/margins": 0.001251220703125, "rewards/rejected": 0.00031280517578125, "step": 118 }, { "epoch": 0.0952, "grad_norm": 1.2452098858148142, "learning_rate": 4.76e-07, "logits/chosen": 1.203125, "logits/rejected": 1.0703125, "logps/chosen": -62.0, "logps/rejected": -61.5, "loss": 0.6918, "loss/demonstration_loss": -392.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0093994140625, "step": 119 }, { "epoch": 0.096, "grad_norm": 1.1628255666326697, "learning_rate": 4.8e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.703125, "logps/chosen": -9.75, "logps/rejected": -6.84375, "loss": 0.6932, "loss/demonstration_loss": -264.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0, "rewards/rejected": -0.001251220703125, "step": 120 }, { "epoch": 0.0968, "grad_norm": 21.592919848495512, "learning_rate": 4.839999999999999e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.6953125, "logps/chosen": -15.0625, "logps/rejected": -6.25, "loss": 0.6886, "loss/demonstration_loss": -340.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0009307861328125, "rewards/rejected": -0.0021820068359375, "step": 121 }, { "epoch": 0.0976, "grad_norm": 1.1734165889081358, "learning_rate": 4.879999999999999e-07, "logits/chosen": 0.9296875, "logits/rejected": 0.83984375, "logps/chosen": -19.25, "logps/rejected": -29.5, "loss": 0.691, "loss/demonstration_loss": -388.0, "loss/preference_loss": -384.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.006256103515625, "rewards/rejected": -0.00750732421875, "step": 122 }, { "epoch": 0.0984, "grad_norm": 0.8396578276796183, "learning_rate": 4.92e-07, "logits/chosen": 0.75, "logits/rejected": 0.78125, "logps/chosen": -14.75, "logps/rejected": -20.625, "loss": 0.6912, "loss/demonstration_loss": -564.0, "loss/preference_loss": -568.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.0031280517578125, "rewards/rejected": 0.001251220703125, "step": 123 }, { "epoch": 0.0992, "grad_norm": 2.1507209982339788, "learning_rate": 4.96e-07, "logits/chosen": 0.578125, "logits/rejected": 0.671875, "logps/chosen": -55.5, "logps/rejected": -48.25, "loss": 0.6926, "loss/demonstration_loss": -412.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01031494140625, "rewards/margins": 0.0062255859375, "rewards/rejected": -0.0166015625, "step": 124 }, { "epoch": 0.1, "grad_norm": 0.7688287544744891, "learning_rate": 5e-07, "logits/chosen": 0.83203125, "logits/rejected": 0.81640625, "logps/chosen": -21.875, "logps/rejected": -21.875, "loss": 0.6919, "loss/demonstration_loss": -688.0, "loss/preference_loss": -688.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.0, "rewards/rejected": -0.00750732421875, "step": 125 }, { "epoch": 0.1008, "grad_norm": 1.1515780923544978, "learning_rate": 4.999990252248901e-07, "logits/chosen": 0.59375, "logits/rejected": 0.60546875, "logps/chosen": -63.75, "logps/rejected": -64.0, "loss": 0.6901, "loss/demonstration_loss": -508.0, "loss/preference_loss": -506.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.007659912109375, "rewards/margins": 0.004241943359375, "rewards/rejected": -0.01190185546875, "step": 126 }, { "epoch": 0.1016, "grad_norm": 1.3646989811966215, "learning_rate": 4.99996100907162e-07, "logits/chosen": 0.8671875, "logits/rejected": 0.84765625, "logps/chosen": -28.625, "logps/rejected": -30.125, "loss": 0.6908, "loss/demonstration_loss": -466.0, "loss/preference_loss": -466.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003448486328125, "rewards/margins": 0.000640869140625, "rewards/rejected": -0.00408935546875, "step": 127 }, { "epoch": 0.1024, "grad_norm": 1.2527887087527243, "learning_rate": 4.999912270696202e-07, "logits/chosen": 0.87109375, "logits/rejected": 0.73828125, "logps/chosen": -79.0, "logps/rejected": -88.0, "loss": 0.6926, "loss/demonstration_loss": -440.0, "loss/preference_loss": -440.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0162353515625, "rewards/margins": -0.00372314453125, "rewards/rejected": -0.01251220703125, "step": 128 }, { "epoch": 0.1032, "grad_norm": 1.6491684784413825, "learning_rate": 4.999844037502717e-07, "logits/chosen": 0.490234375, "logits/rejected": 0.36328125, "logps/chosen": -41.0, "logps/rejected": -46.5, "loss": 0.6935, "loss/demonstration_loss": -348.0, "loss/preference_loss": -348.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0037384033203125, "rewards/rejected": -0.00872802734375, "step": 129 }, { "epoch": 0.104, "grad_norm": 0.9344445054092613, "learning_rate": 4.99975631002326e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.6328125, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.692, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 130 }, { "epoch": 0.1048, "grad_norm": 1.5674061103878776, "learning_rate": 4.999649088941951e-07, "logits/chosen": 0.79296875, "logits/rejected": 0.921875, "logps/chosen": -82.0, "logps/rejected": -63.0, "loss": 0.6935, "loss/demonstration_loss": -382.0, "loss/preference_loss": -384.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.017822265625, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.0128173828125, "step": 131 }, { "epoch": 0.1056, "grad_norm": 1.0210209238573607, "learning_rate": 4.999522375094918e-07, "logits/chosen": 0.953125, "logits/rejected": 0.8984375, "logps/chosen": -6.21875, "logps/rejected": -13.5, "loss": 0.6908, "loss/demonstration_loss": -314.0, "loss/preference_loss": -308.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.00439453125, "rewards/rejected": -0.0050048828125, "step": 132 }, { "epoch": 0.1064, "grad_norm": 1.6718839601404938, "learning_rate": 4.999376169470305e-07, "logits/chosen": 1.03125, "logits/rejected": 0.9609375, "logps/chosen": -79.0, "logps/rejected": -96.5, "loss": 0.6956, "loss/demonstration_loss": -398.0, "loss/preference_loss": -400.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0205078125, "rewards/margins": -0.01080322265625, "rewards/rejected": -0.00970458984375, "step": 133 }, { "epoch": 0.1072, "grad_norm": 1.3321982686524356, "learning_rate": 4.999210473208249e-07, "logits/chosen": 0.76171875, "logits/rejected": 0.828125, "logps/chosen": -67.5, "logps/rejected": -69.0, "loss": 0.6927, "loss/demonstration_loss": -540.0, "loss/preference_loss": -540.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.013427734375, "rewards/margins": -0.0018463134765625, "rewards/rejected": -0.0115966796875, "step": 134 }, { "epoch": 0.108, "grad_norm": 1.7404360020627443, "learning_rate": 4.999025287600885e-07, "logits/chosen": 0.875, "logits/rejected": 0.72265625, "logps/chosen": -98.5, "logps/rejected": -139.0, "loss": 0.6929, "loss/demonstration_loss": -540.0, "loss/preference_loss": -540.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01470947265625, "rewards/margins": -0.0028228759765625, "rewards/rejected": -0.01190185546875, "step": 135 }, { "epoch": 0.1088, "grad_norm": 1.1336337789145086, "learning_rate": 4.998820614092328e-07, "logits/chosen": 0.796875, "logits/rejected": 0.625, "logps/chosen": -23.625, "logps/rejected": -34.0, "loss": 0.6921, "loss/demonstration_loss": -304.0, "loss/preference_loss": -306.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004058837890625, "rewards/margins": -0.002197265625, "rewards/rejected": -0.0018768310546875, "step": 136 }, { "epoch": 0.1096, "grad_norm": 1.121268812418991, "learning_rate": 4.998596454278661e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.94140625, "logps/chosen": -58.5, "logps/rejected": -41.0, "loss": 0.6934, "loss/demonstration_loss": -394.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01123046875, "rewards/margins": -0.00531005859375, "rewards/rejected": -0.005950927734375, "step": 137 }, { "epoch": 0.1104, "grad_norm": 1.7286892371743692, "learning_rate": 4.998352809907928e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.67578125, "logps/chosen": -67.0, "logps/rejected": -83.0, "loss": 0.6926, "loss/demonstration_loss": -478.0, "loss/preference_loss": -478.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.010009765625, "rewards/margins": -0.00311279296875, "rewards/rejected": -0.00689697265625, "step": 138 }, { "epoch": 0.1112, "grad_norm": 2.1549446701229447, "learning_rate": 4.998089682880116e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.96875, "logps/chosen": -78.5, "logps/rejected": -64.0, "loss": 0.6948, "loss/demonstration_loss": -452.0, "loss/preference_loss": -454.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.00811767578125, "rewards/rejected": -0.00469970703125, "step": 139 }, { "epoch": 0.112, "grad_norm": 0.941886597946166, "learning_rate": 4.997807075247145e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.6015625, "logps/chosen": -12.0, "logps/rejected": -23.5, "loss": 0.6918, "loss/demonstration_loss": -284.0, "loss/preference_loss": -282.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0017242431640625, "rewards/margins": 0.0017242431640625, "rewards/rejected": -0.003448486328125, "step": 140 }, { "epoch": 0.1128, "grad_norm": 1.5321314572804496, "learning_rate": 4.997504989212846e-07, "logits/chosen": 0.89453125, "logits/rejected": 0.77734375, "logps/chosen": -19.625, "logps/rejected": -30.375, "loss": 0.6948, "loss/demonstration_loss": -264.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00439453125, "rewards/margins": -0.0045166015625, "rewards/rejected": 0.000156402587890625, "step": 141 }, { "epoch": 0.1136, "grad_norm": 1.2367172233061043, "learning_rate": 4.997183427132942e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.6640625, "logps/chosen": -45.5, "logps/rejected": -44.5, "loss": 0.6927, "loss/demonstration_loss": -286.0, "loss/preference_loss": -288.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.0032806396484375, "rewards/rejected": -0.0029754638671875, "step": 142 }, { "epoch": 0.1144, "grad_norm": 0.5872506947759352, "learning_rate": 4.996842391515044e-07, "logits/chosen": 0.75, "logits/rejected": 0.7109375, "logps/chosen": -40.0, "logps/rejected": -39.5, "loss": 0.6925, "loss/demonstration_loss": -420.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.00439453125, "step": 143 }, { "epoch": 0.1152, "grad_norm": 1.3550417475061063, "learning_rate": 4.996481885018614e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.92578125, "logps/chosen": -46.75, "logps/rejected": -47.5, "loss": 0.692, "loss/demonstration_loss": -500.0, "loss/preference_loss": -498.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.0062255859375, "rewards/rejected": -0.0093994140625, "step": 144 }, { "epoch": 0.116, "grad_norm": 1.9671816333796321, "learning_rate": 4.996101910454953e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.6015625, "logps/chosen": -23.25, "logps/rejected": -23.125, "loss": 0.6927, "loss/demonstration_loss": -736.0, "loss/preference_loss": -736.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.003753662109375, "step": 145 }, { "epoch": 0.1168, "grad_norm": 1.0443393004547656, "learning_rate": 4.99570247078718e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.63671875, "logps/chosen": -10.3125, "logps/rejected": -8.25, "loss": 0.691, "loss/demonstration_loss": -99.0, "loss/preference_loss": -99.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0010986328125, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.000156402587890625, "step": 146 }, { "epoch": 0.1176, "grad_norm": 1.4450995653473429, "learning_rate": 4.995283569130206e-07, "logits/chosen": 0.796875, "logits/rejected": 0.72265625, "logps/chosen": -58.5, "logps/rejected": -45.0, "loss": 0.6904, "loss/demonstration_loss": -328.0, "loss/preference_loss": -330.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.002197265625, "rewards/rejected": -0.00531005859375, "step": 147 }, { "epoch": 0.1184, "grad_norm": 1.3092179562756041, "learning_rate": 4.994845208750711e-07, "logits/chosen": 0.380859375, "logits/rejected": 0.314453125, "logps/chosen": -28.75, "logps/rejected": -44.25, "loss": 0.6904, "loss/demonstration_loss": -292.0, "loss/preference_loss": -290.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0029754638671875, "rewards/margins": 0.002655029296875, "rewards/rejected": -0.005615234375, "step": 148 }, { "epoch": 0.1192, "grad_norm": 1.3249631274267244, "learning_rate": 4.994387393067116e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.6015625, "logps/chosen": -22.875, "logps/rejected": -22.625, "loss": 0.6934, "loss/demonstration_loss": -720.0, "loss/preference_loss": -724.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.00250244140625, "step": 149 }, { "epoch": 0.12, "grad_norm": 0.8390335082373886, "learning_rate": 4.99391012564956e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.859375, "logps/chosen": -29.5, "logps/rejected": -30.25, "loss": 0.6893, "loss/demonstration_loss": -318.0, "loss/preference_loss": -316.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.00531005859375, "rewards/rejected": -0.006561279296875, "step": 150 }, { "epoch": 0.1208, "grad_norm": 0.9624695954449461, "learning_rate": 4.993413410219871e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.66015625, "logps/chosen": -21.75, "logps/rejected": -21.75, "loss": 0.6918, "loss/demonstration_loss": -228.0, "loss/preference_loss": -228.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.0006103515625, "rewards/rejected": -0.006866455078125, "step": 151 }, { "epoch": 0.1216, "grad_norm": 1.8765605239120946, "learning_rate": 4.992897250651535e-07, "logits/chosen": 0.8125, "logits/rejected": 0.66796875, "logps/chosen": -40.5, "logps/rejected": -69.0, "loss": 0.6888, "loss/demonstration_loss": -436.0, "loss/preference_loss": -436.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0028076171875, "rewards/rejected": -0.006561279296875, "step": 152 }, { "epoch": 0.1224, "grad_norm": 0.7856896436526142, "learning_rate": 4.992361650969668e-07, "logits/chosen": 0.84765625, "logits/rejected": 0.8515625, "logps/chosen": -4.4375, "logps/rejected": -4.59375, "loss": 0.6912, "loss/demonstration_loss": -145.0, "loss/preference_loss": -142.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00031280517578125, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.001251220703125, "step": 153 }, { "epoch": 0.1232, "grad_norm": 1.3238108906429966, "learning_rate": 4.991806615350983e-07, "logits/chosen": 0.86328125, "logits/rejected": 0.796875, "logps/chosen": -5.15625, "logps/rejected": -11.375, "loss": 0.6914, "loss/demonstration_loss": -264.0, "loss/preference_loss": -260.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.00250244140625, "step": 154 }, { "epoch": 0.124, "grad_norm": 1.847097353441759, "learning_rate": 4.991232148123761e-07, "logits/chosen": 0.890625, "logits/rejected": 0.98828125, "logps/chosen": -71.0, "logps/rejected": -61.5, "loss": 0.6924, "loss/demonstration_loss": -420.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.009033203125, "rewards/margins": -0.0034332275390625, "rewards/rejected": -0.005615234375, "step": 155 }, { "epoch": 0.1248, "grad_norm": 1.756528045317591, "learning_rate": 4.990638253767812e-07, "logits/chosen": 0.87890625, "logits/rejected": 0.91796875, "logps/chosen": -65.0, "logps/rejected": -66.0, "loss": 0.6923, "loss/demonstration_loss": -412.0, "loss/preference_loss": -414.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0174560546875, "rewards/margins": 0.000965118408203125, "rewards/rejected": -0.0184326171875, "step": 156 }, { "epoch": 0.1256, "grad_norm": 1.281325549724317, "learning_rate": 4.990024936914443e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.7890625, "logps/chosen": -38.5, "logps/rejected": -44.75, "loss": 0.6923, "loss/demonstration_loss": -332.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0028076171875, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.00250244140625, "step": 157 }, { "epoch": 0.1264, "grad_norm": 1.2944647949214905, "learning_rate": 4.989392202346423e-07, "logits/chosen": 0.796875, "logits/rejected": 0.65625, "logps/chosen": -15.1875, "logps/rejected": -37.0, "loss": 0.6896, "loss/demonstration_loss": -412.0, "loss/preference_loss": -410.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00469970703125, "rewards/margins": 0.0034332275390625, "rewards/rejected": -0.00811767578125, "step": 158 }, { "epoch": 0.1272, "grad_norm": 1.282469175033407, "learning_rate": 4.988740054997942e-07, "logits/chosen": 0.94921875, "logits/rejected": 0.8828125, "logps/chosen": -46.5, "logps/rejected": -56.0, "loss": 0.692, "loss/demonstration_loss": -410.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.00439453125, "step": 159 }, { "epoch": 0.128, "grad_norm": 1.5232927940243022, "learning_rate": 4.988068499954577e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.78125, "logps/chosen": -31.625, "logps/rejected": -25.5, "loss": 0.6926, "loss/demonstration_loss": -302.0, "loss/preference_loss": -304.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.0006256103515625, "step": 160 }, { "epoch": 0.1288, "grad_norm": 1.6186997258448714, "learning_rate": 4.98737754245325e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.62109375, "logps/chosen": -38.0, "logps/rejected": -52.75, "loss": 0.6936, "loss/demonstration_loss": -358.0, "loss/preference_loss": -360.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.013427734375, "rewards/margins": -0.00653076171875, "rewards/rejected": -0.00689697265625, "step": 161 }, { "epoch": 0.1296, "grad_norm": 0.8487637127592945, "learning_rate": 4.986667187882185e-07, "logits/chosen": 0.75, "logits/rejected": 0.8359375, "logps/chosen": -32.25, "logps/rejected": -24.25, "loss": 0.6917, "loss/demonstration_loss": -302.0, "loss/preference_loss": -302.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00031280517578125, "rewards/margins": -0.000469207763671875, "rewards/rejected": 0.000782012939453125, "step": 162 }, { "epoch": 0.1304, "grad_norm": 0.9791012762019397, "learning_rate": 4.985937441780869e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.703125, "logps/chosen": -11.125, "logps/rejected": -10.375, "loss": 0.6945, "loss/demonstration_loss": -173.0, "loss/preference_loss": -173.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0006256103515625, "rewards/margins": -0.00031280517578125, "rewards/rejected": 0.00093841552734375, "step": 163 }, { "epoch": 0.1312, "grad_norm": 1.0204287748952576, "learning_rate": 4.985188309840011e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.89453125, "logps/chosen": -32.75, "logps/rejected": -17.25, "loss": 0.6924, "loss/demonstration_loss": -264.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006561279296875, "rewards/margins": -0.003265380859375, "rewards/rejected": -0.0032806396484375, "step": 164 }, { "epoch": 0.132, "grad_norm": 1.2512277673663745, "learning_rate": 4.984419797901491e-07, "logits/chosen": 0.50390625, "logits/rejected": 0.4921875, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6919, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 165 }, { "epoch": 0.1328, "grad_norm": 1.4354889452954591, "learning_rate": 4.983631911958319e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.7734375, "logps/chosen": -100.0, "logps/rejected": -87.0, "loss": 0.692, "loss/demonstration_loss": -496.0, "loss/preference_loss": -498.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01470947265625, "rewards/margins": -0.00360107421875, "rewards/rejected": -0.0111083984375, "step": 166 }, { "epoch": 0.1336, "grad_norm": 0.8859121500271574, "learning_rate": 4.982824658154588e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.79296875, "logps/chosen": -44.0, "logps/rejected": -44.5, "loss": 0.6893, "loss/demonstration_loss": -352.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005615234375, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.00811767578125, "step": 167 }, { "epoch": 0.1344, "grad_norm": 1.5977016455577444, "learning_rate": 4.981998042785426e-07, "logits/chosen": 1.046875, "logits/rejected": 0.90625, "logps/chosen": -41.75, "logps/rejected": -63.25, "loss": 0.6919, "loss/demonstration_loss": -556.0, "loss/preference_loss": -556.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.001861572265625, "rewards/rejected": -0.00750732421875, "step": 168 }, { "epoch": 0.1352, "grad_norm": 1.0939397532919861, "learning_rate": 4.981152072296946e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.30859375, "logps/chosen": -20.125, "logps/rejected": -48.75, "loss": 0.6914, "loss/demonstration_loss": -366.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.0015716552734375, "rewards/rejected": -0.003448486328125, "step": 169 }, { "epoch": 0.136, "grad_norm": 1.3077405452404038, "learning_rate": 4.980286753286194e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.7578125, "logps/chosen": -26.5, "logps/rejected": -16.75, "loss": 0.694, "loss/demonstration_loss": -340.0, "loss/preference_loss": -344.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00689697265625, "rewards/margins": -0.0031280517578125, "rewards/rejected": -0.003753662109375, "step": 170 }, { "epoch": 0.1368, "grad_norm": 1.3031825533459338, "learning_rate": 4.979402092501104e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.84765625, "logps/chosen": -18.5, "logps/rejected": -6.875, "loss": 0.6948, "loss/demonstration_loss": -394.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.006256103515625, "rewards/rejected": -0.001251220703125, "step": 171 }, { "epoch": 0.1376, "grad_norm": 1.3935714795996876, "learning_rate": 4.978498096840437e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.69921875, "logps/chosen": -66.0, "logps/rejected": -69.0, "loss": 0.6887, "loss/demonstration_loss": -712.0, "loss/preference_loss": -712.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01251220703125, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.017578125, "step": 172 }, { "epoch": 0.1384, "grad_norm": 0.8801800986147548, "learning_rate": 4.977574773353732e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.58203125, "logps/chosen": -20.5, "logps/rejected": -20.25, "loss": 0.6909, "loss/demonstration_loss": -322.0, "loss/preference_loss": -320.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": 0.0025177001953125, "rewards/rejected": -0.00811767578125, "step": 173 }, { "epoch": 0.1392, "grad_norm": 1.7251182731085488, "learning_rate": 4.976632129241252e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.8125, "logps/chosen": -50.5, "logps/rejected": -40.5, "loss": 0.6952, "loss/demonstration_loss": -720.0, "loss/preference_loss": -728.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0087890625, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.005615234375, "step": 174 }, { "epoch": 0.14, "grad_norm": 4.308051915706349, "learning_rate": 4.975670171853925e-07, "logits/chosen": 0.87109375, "logits/rejected": 0.796875, "logps/chosen": -22.375, "logps/rejected": -39.25, "loss": 0.6919, "loss/demonstration_loss": -246.0, "loss/preference_loss": -244.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0017242431640625, "rewards/margins": 0.005462646484375, "rewards/rejected": -0.007171630859375, "step": 175 }, { "epoch": 0.1408, "grad_norm": 1.4115770643852785, "learning_rate": 4.974688908693289e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.65234375, "logps/chosen": -65.5, "logps/rejected": -73.5, "loss": 0.6919, "loss/demonstration_loss": -740.0, "loss/preference_loss": -740.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.006256103515625, "rewards/rejected": -0.01123046875, "step": 176 }, { "epoch": 0.1416, "grad_norm": 1.4597969355841371, "learning_rate": 4.97368834741143e-07, "logits/chosen": 0.84765625, "logits/rejected": 0.85546875, "logps/chosen": -33.5, "logps/rejected": -41.5, "loss": 0.6907, "loss/demonstration_loss": -398.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003448486328125, "rewards/margins": 0.003448486328125, "rewards/rejected": -0.00689697265625, "step": 177 }, { "epoch": 0.1424, "grad_norm": 0.615318117008448, "learning_rate": 4.972668495810926e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.62109375, "logps/chosen": -30.0, "logps/rejected": -25.0, "loss": 0.6927, "loss/demonstration_loss": -434.0, "loss/preference_loss": -436.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.002197265625, "rewards/rejected": -0.004058837890625, "step": 178 }, { "epoch": 0.1432, "grad_norm": 1.377169055689529, "learning_rate": 4.971629361844785e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.6796875, "logps/chosen": -42.5, "logps/rejected": -56.25, "loss": 0.687, "loss/demonstration_loss": -314.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.00494384765625, "rewards/rejected": -0.012451171875, "step": 179 }, { "epoch": 0.144, "grad_norm": 1.6512971699306982, "learning_rate": 4.970570953616382e-07, "logits/chosen": 0.88671875, "logits/rejected": 0.94921875, "logps/chosen": -89.5, "logps/rejected": -85.0, "loss": 0.693, "loss/demonstration_loss": -552.0, "loss/preference_loss": -556.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01092529296875, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.01220703125, "step": 180 }, { "epoch": 0.1448, "grad_norm": 1.3569028429661734, "learning_rate": 4.969493279379397e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.859375, "logps/chosen": -64.0, "logps/rejected": -44.5, "loss": 0.6942, "loss/demonstration_loss": -288.0, "loss/preference_loss": -288.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01031494140625, "rewards/margins": -0.00701904296875, "rewards/rejected": -0.0032958984375, "step": 181 }, { "epoch": 0.1456, "grad_norm": 1.4447894777008616, "learning_rate": 4.968396347537751e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.65234375, "logps/chosen": -48.25, "logps/rejected": -43.5, "loss": 0.6937, "loss/demonstration_loss": -244.0, "loss/preference_loss": -243.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.004058837890625, "step": 182 }, { "epoch": 0.1464, "grad_norm": 1.6419782922799655, "learning_rate": 4.967280166645538e-07, "logits/chosen": 0.828125, "logits/rejected": 0.71875, "logps/chosen": -58.0, "logps/rejected": -62.5, "loss": 0.6915, "loss/demonstration_loss": -384.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0037384033203125, "rewards/rejected": 0.001251220703125, "step": 183 }, { "epoch": 0.1472, "grad_norm": 1.4900649541850786, "learning_rate": 4.966144745406961e-07, "logits/chosen": 0.91015625, "logits/rejected": 0.8984375, "logps/chosen": -23.625, "logps/rejected": -22.25, "loss": 0.6952, "loss/demonstration_loss": -364.0, "loss/preference_loss": -364.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003448486328125, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.00439453125, "step": 184 }, { "epoch": 0.148, "grad_norm": 1.7434677841986461, "learning_rate": 4.964990092676262e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.6484375, "logps/chosen": -58.5, "logps/rejected": -58.25, "loss": 0.6921, "loss/demonstration_loss": -460.0, "loss/preference_loss": -466.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0150146484375, "rewards/margins": -0.0106201171875, "rewards/rejected": -0.00439453125, "step": 185 }, { "epoch": 0.1488, "grad_norm": 1.188831627499812, "learning_rate": 4.963816217457657e-07, "logits/chosen": 0.73046875, "logits/rejected": 0.78125, "logps/chosen": -21.25, "logps/rejected": -11.0, "loss": 0.6919, "loss/demonstration_loss": -256.0, "loss/preference_loss": -254.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00140380859375, "rewards/margins": 0.002349853515625, "rewards/rejected": -0.003753662109375, "step": 186 }, { "epoch": 0.1496, "grad_norm": 1.976430435632196, "learning_rate": 4.962623128905259e-07, "logits/chosen": 0.99609375, "logits/rejected": 0.95703125, "logps/chosen": -42.25, "logps/rejected": -65.5, "loss": 0.688, "loss/demonstration_loss": -568.0, "loss/preference_loss": -572.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0084228515625, "rewards/margins": -0.000335693359375, "rewards/rejected": -0.00811767578125, "step": 187 }, { "epoch": 0.1504, "grad_norm": 1.5041412382118642, "learning_rate": 4.961410836323014e-07, "logits/chosen": 0.439453125, "logits/rejected": 0.52734375, "logps/chosen": -33.5, "logps/rejected": -21.125, "loss": 0.6936, "loss/demonstration_loss": -290.0, "loss/preference_loss": -294.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003448486328125, "rewards/margins": -0.005615234375, "rewards/rejected": 0.002197265625, "step": 188 }, { "epoch": 0.1512, "grad_norm": 1.2915291010578005, "learning_rate": 4.960179349164621e-07, "logits/chosen": 0.640625, "logits/rejected": 0.734375, "logps/chosen": -27.75, "logps/rejected": -31.375, "loss": 0.6935, "loss/demonstration_loss": -464.0, "loss/preference_loss": -470.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.007476806640625, "rewards/rejected": -0.0031280517578125, "step": 189 }, { "epoch": 0.152, "grad_norm": 0.8839358868535954, "learning_rate": 4.958928677033465e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.66015625, "logps/chosen": -32.25, "logps/rejected": -26.25, "loss": 0.6908, "loss/demonstration_loss": -464.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.005615234375, "step": 190 }, { "epoch": 0.1528, "grad_norm": 1.3761201437895834, "learning_rate": 4.957658829682538e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.64453125, "logps/chosen": -48.0, "logps/rejected": -41.75, "loss": 0.6917, "loss/demonstration_loss": -284.0, "loss/preference_loss": -286.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.007354736328125, "rewards/rejected": -0.004547119140625, "step": 191 }, { "epoch": 0.1536, "grad_norm": 1.526608055107953, "learning_rate": 4.956369817014366e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.83203125, "logps/chosen": -48.75, "logps/rejected": -49.5, "loss": 0.6927, "loss/demonstration_loss": -392.0, "loss/preference_loss": -390.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005462646484375, "rewards/margins": -0.00125885009765625, "rewards/rejected": -0.00421142578125, "step": 192 }, { "epoch": 0.1544, "grad_norm": 1.6604263053119912, "learning_rate": 4.95506164908093e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.85546875, "logps/chosen": -77.5, "logps/rejected": -70.5, "loss": 0.6949, "loss/demonstration_loss": -584.0, "loss/preference_loss": -588.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0224609375, "rewards/margins": -0.0118408203125, "rewards/rejected": -0.0106201171875, "step": 193 }, { "epoch": 0.1552, "grad_norm": 1.9092622347786705, "learning_rate": 4.953734336083582e-07, "logits/chosen": 0.96484375, "logits/rejected": 0.83203125, "logps/chosen": -72.5, "logps/rejected": -81.5, "loss": 0.6949, "loss/demonstration_loss": -408.0, "loss/preference_loss": -410.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0140380859375, "rewards/margins": -0.0072021484375, "rewards/rejected": -0.00689697265625, "step": 194 }, { "epoch": 0.156, "grad_norm": 1.335577160057032, "learning_rate": 4.952387888372978e-07, "logits/chosen": 0.486328125, "logits/rejected": 0.56640625, "logps/chosen": -95.5, "logps/rejected": -87.0, "loss": 0.6946, "loss/demonstration_loss": -414.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0159912109375, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0172119140625, "step": 195 }, { "epoch": 0.1568, "grad_norm": 1.1882075337237872, "learning_rate": 4.951022316448989e-07, "logits/chosen": 0.69140625, "logits/rejected": 0.609375, "logps/chosen": -21.75, "logps/rejected": -35.75, "loss": 0.6906, "loss/demonstration_loss": -306.0, "loss/preference_loss": -302.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002960205078125, "rewards/margins": 0.003936767578125, "rewards/rejected": -0.00689697265625, "step": 196 }, { "epoch": 0.1576, "grad_norm": 1.2606905744735677, "learning_rate": 4.949637630960617e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.890625, "logps/chosen": -92.0, "logps/rejected": -96.0, "loss": 0.6882, "loss/demonstration_loss": -498.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0140380859375, "rewards/margins": 0.010986328125, "rewards/rejected": -0.0250244140625, "step": 197 }, { "epoch": 0.1584, "grad_norm": 0.7230655843643436, "learning_rate": 4.948233842705918e-07, "logits/chosen": 0.412109375, "logits/rejected": 0.380859375, "logps/chosen": -32.75, "logps/rejected": -32.25, "loss": 0.6921, "loss/demonstration_loss": -512.0, "loss/preference_loss": -516.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": -0.0031280517578125, "rewards/rejected": -0.005615234375, "step": 198 }, { "epoch": 0.1592, "grad_norm": 1.90532208692897, "learning_rate": 4.946810962631915e-07, "logits/chosen": 1.1640625, "logits/rejected": 1.09375, "logps/chosen": -45.0, "logps/rejected": -61.0, "loss": 0.6903, "loss/demonstration_loss": -338.0, "loss/preference_loss": -336.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0093994140625, "step": 199 }, { "epoch": 0.16, "grad_norm": 0.959544075051194, "learning_rate": 4.945369001834514e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.8671875, "logps/chosen": -28.75, "logps/rejected": -33.25, "loss": 0.6926, "loss/demonstration_loss": -330.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00469970703125, "rewards/margins": -0.00311279296875, "rewards/rejected": -0.00156402587890625, "step": 200 }, { "epoch": 0.1608, "grad_norm": 1.8520563560659085, "learning_rate": 4.943907971558413e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.6328125, "logps/chosen": -67.0, "logps/rejected": -80.5, "loss": 0.6891, "loss/demonstration_loss": -390.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01153564453125, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.0162353515625, "step": 201 }, { "epoch": 0.1616, "grad_norm": 1.2815008375672092, "learning_rate": 4.94242788319702e-07, "logits/chosen": 0.8671875, "logits/rejected": 0.80078125, "logps/chosen": -55.0, "logps/rejected": -66.0, "loss": 0.6898, "loss/demonstration_loss": -484.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00140380859375, "rewards/margins": 0.00579833984375, "rewards/rejected": -0.0072021484375, "step": 202 }, { "epoch": 0.1624, "grad_norm": 1.2475241116759455, "learning_rate": 4.940928748292362e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.6484375, "logps/chosen": -52.25, "logps/rejected": -55.75, "loss": 0.6919, "loss/demonstration_loss": -284.0, "loss/preference_loss": -286.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.00140380859375, "rewards/rejected": -0.0067138671875, "step": 203 }, { "epoch": 0.1632, "grad_norm": 1.1177976998038468, "learning_rate": 4.939410578534994e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.6640625, "logps/chosen": -21.625, "logps/rejected": -24.375, "loss": 0.6917, "loss/demonstration_loss": -245.0, "loss/preference_loss": -244.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": -0.000156402587890625, "rewards/rejected": -0.0010986328125, "step": 204 }, { "epoch": 0.164, "grad_norm": 1.244381554066115, "learning_rate": 4.937873385763907e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.8125, "logps/chosen": -35.0, "logps/rejected": -22.25, "loss": 0.6896, "loss/demonstration_loss": -456.0, "loss/preference_loss": -456.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.001251220703125, "rewards/margins": -0.00156402587890625, "rewards/rejected": 0.00031280517578125, "step": 205 }, { "epoch": 0.1648, "grad_norm": 1.279603045701344, "learning_rate": 4.936317181966443e-07, "logits/chosen": 0.53125, "logits/rejected": 0.48046875, "logps/chosen": -48.0, "logps/rejected": -56.75, "loss": 0.6923, "loss/demonstration_loss": -414.0, "loss/preference_loss": -414.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0140380859375, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.01031494140625, "step": 206 }, { "epoch": 0.1656, "grad_norm": 1.2687165769211979, "learning_rate": 4.934741979278187e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.78125, "logps/chosen": -21.5, "logps/rejected": -21.375, "loss": 0.693, "loss/demonstration_loss": -340.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0018768310546875, "step": 207 }, { "epoch": 0.1664, "grad_norm": 1.7250313720849693, "learning_rate": 4.93314778998289e-07, "logits/chosen": 1.0703125, "logits/rejected": 1.078125, "logps/chosen": -33.0, "logps/rejected": -43.25, "loss": 0.6893, "loss/demonstration_loss": -242.0, "loss/preference_loss": -240.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005462646484375, "rewards/margins": 0.0078125, "rewards/rejected": -0.01324462890625, "step": 208 }, { "epoch": 0.1672, "grad_norm": 1.2585647191918998, "learning_rate": 4.931534626512358e-07, "logits/chosen": 0.515625, "logits/rejected": 0.55859375, "logps/chosen": -37.75, "logps/rejected": -33.75, "loss": 0.6897, "loss/demonstration_loss": -568.0, "loss/preference_loss": -568.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.006256103515625, "step": 209 }, { "epoch": 0.168, "grad_norm": 1.0196946838925856, "learning_rate": 4.929902501446366e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.71484375, "logps/chosen": -28.25, "logps/rejected": -30.5, "loss": 0.6917, "loss/demonstration_loss": -464.0, "loss/preference_loss": -468.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.0031280517578125, "step": 210 }, { "epoch": 0.1688, "grad_norm": 1.1121918750432715, "learning_rate": 4.92825142751255e-07, "logits/chosen": 0.796875, "logits/rejected": 0.8125, "logps/chosen": -36.0, "logps/rejected": -40.0, "loss": 0.6923, "loss/demonstration_loss": -402.0, "loss/preference_loss": -402.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0067138671875, "rewards/margins": -0.001708984375, "rewards/rejected": -0.0050048828125, "step": 211 }, { "epoch": 0.1696, "grad_norm": 1.200217537919589, "learning_rate": 4.926581417586318e-07, "logits/chosen": 0.62890625, "logits/rejected": 0.578125, "logps/chosen": -7.5625, "logps/rejected": -19.375, "loss": 0.6912, "loss/demonstration_loss": -430.0, "loss/preference_loss": -428.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.001251220703125, "step": 212 }, { "epoch": 0.1704, "grad_norm": 0.9225360639866745, "learning_rate": 4.924892484690743e-07, "logits/chosen": 0.578125, "logits/rejected": 0.5625, "logps/chosen": -14.875, "logps/rejected": -17.0, "loss": 0.6921, "loss/demonstration_loss": -252.0, "loss/preference_loss": -254.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0028076171875, "rewards/margins": -0.002197265625, "rewards/rejected": -0.0006256103515625, "step": 213 }, { "epoch": 0.1712, "grad_norm": 1.7035227997038747, "learning_rate": 4.923184641996463e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.8515625, "logps/chosen": -40.5, "logps/rejected": -30.875, "loss": 0.6954, "loss/demonstration_loss": -374.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01092529296875, "rewards/margins": -0.0096435546875, "rewards/rejected": -0.001251220703125, "step": 214 }, { "epoch": 0.172, "grad_norm": 1.1391561115638107, "learning_rate": 4.921457902821578e-07, "logits/chosen": 0.90234375, "logits/rejected": 0.9609375, "logps/chosen": -63.5, "logps/rejected": -60.25, "loss": 0.6921, "loss/demonstration_loss": -392.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01043701171875, "rewards/margins": -0.002960205078125, "rewards/rejected": -0.00750732421875, "step": 215 }, { "epoch": 0.1728, "grad_norm": 0.9543568057829036, "learning_rate": 4.919712280631546e-07, "logits/chosen": 0.64453125, "logits/rejected": 0.625, "logps/chosen": -37.25, "logps/rejected": -38.0, "loss": 0.6909, "loss/demonstration_loss": -396.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0078125, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.00909423828125, "step": 216 }, { "epoch": 0.1736, "grad_norm": 1.5437486598281511, "learning_rate": 4.917947789039081e-07, "logits/chosen": 0.9921875, "logits/rejected": 1.1640625, "logps/chosen": -60.75, "logps/rejected": -35.5, "loss": 0.6929, "loss/demonstration_loss": -384.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": -0.0050048828125, "rewards/rejected": 0.0006256103515625, "step": 217 }, { "epoch": 0.1744, "grad_norm": 1.6973995996130167, "learning_rate": 4.916164441804044e-07, "logits/chosen": 0.8046875, "logits/rejected": 1.0546875, "logps/chosen": -139.0, "logps/rejected": -109.5, "loss": 0.6948, "loss/demonstration_loss": -492.0, "loss/preference_loss": -492.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.02587890625, "rewards/margins": -0.009033203125, "rewards/rejected": -0.016845703125, "step": 218 }, { "epoch": 0.1752, "grad_norm": 1.121581769427685, "learning_rate": 4.914362252833331e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.76171875, "logps/chosen": -24.0, "logps/rejected": -28.25, "loss": 0.6914, "loss/demonstration_loss": -416.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0028076171875, "rewards/margins": 0.0003204345703125, "rewards/rejected": -0.0031280517578125, "step": 219 }, { "epoch": 0.176, "grad_norm": 0.9270904127224919, "learning_rate": 4.912541236180778e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.83984375, "logps/chosen": -20.625, "logps/rejected": -19.5, "loss": 0.6929, "loss/demonstration_loss": -213.0, "loss/preference_loss": -214.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": -0.001251220703125, "rewards/rejected": 0.0, "step": 220 }, { "epoch": 0.1768, "grad_norm": 1.341540671329504, "learning_rate": 4.910701406047036e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.9921875, "logps/chosen": -53.25, "logps/rejected": -37.0, "loss": 0.6901, "loss/demonstration_loss": -480.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.0028076171875, "step": 221 }, { "epoch": 0.1776, "grad_norm": 1.6289051821427234, "learning_rate": 4.908842776779471e-07, "logits/chosen": 0.97265625, "logits/rejected": 0.7421875, "logps/chosen": -66.0, "logps/rejected": -88.5, "loss": 0.6896, "loss/demonstration_loss": -612.0, "loss/preference_loss": -612.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01220703125, "rewards/margins": 0.00653076171875, "rewards/rejected": -0.018798828125, "step": 222 }, { "epoch": 0.1784, "grad_norm": 1.21536611461486, "learning_rate": 4.906965362872047e-07, "logits/chosen": 0.8125, "logits/rejected": 0.76171875, "logps/chosen": -22.5, "logps/rejected": -24.5, "loss": 0.6926, "loss/demonstration_loss": -372.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.00469970703125, "step": 223 }, { "epoch": 0.1792, "grad_norm": 1.386032438732633, "learning_rate": 4.905069178965215e-07, "logits/chosen": 0.53125, "logits/rejected": 0.578125, "logps/chosen": -65.0, "logps/rejected": -64.5, "loss": 0.6932, "loss/demonstration_loss": -408.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0177001953125, "rewards/margins": -0.00640869140625, "rewards/rejected": -0.01123046875, "step": 224 }, { "epoch": 0.18, "grad_norm": 1.013286063260643, "learning_rate": 4.903154239845797e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.828125, "logps/chosen": -38.25, "logps/rejected": -34.0, "loss": 0.6924, "loss/demonstration_loss": -286.0, "loss/preference_loss": -286.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.007049560546875, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.00640869140625, "step": 225 }, { "epoch": 0.1808, "grad_norm": 1.6340388140937157, "learning_rate": 4.901220560446874e-07, "logits/chosen": 0.640625, "logits/rejected": 0.6328125, "logps/chosen": -17.0, "logps/rejected": -18.5, "loss": 0.6954, "loss/demonstration_loss": -276.0, "loss/preference_loss": -282.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0096435546875, "rewards/margins": -0.007476806640625, "rewards/rejected": -0.002197265625, "step": 226 }, { "epoch": 0.1816, "grad_norm": 1.928850065072846, "learning_rate": 4.899268155847667e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.9140625, "logps/chosen": -38.5, "logps/rejected": -34.5, "loss": 0.6902, "loss/demonstration_loss": -580.0, "loss/preference_loss": -576.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.00750732421875, "step": 227 }, { "epoch": 0.1824, "grad_norm": 1.7484462675798296, "learning_rate": 4.897297041273417e-07, "logits/chosen": 0.80859375, "logits/rejected": 0.80859375, "logps/chosen": -48.25, "logps/rejected": -47.5, "loss": 0.6947, "loss/demonstration_loss": -502.0, "loss/preference_loss": -506.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0150146484375, "rewards/margins": -0.00750732421875, "rewards/rejected": -0.00750732421875, "step": 228 }, { "epoch": 0.1832, "grad_norm": 1.6058217706921616, "learning_rate": 4.895307232095274e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.6875, "logps/chosen": -55.0, "logps/rejected": -53.0, "loss": 0.6921, "loss/demonstration_loss": -422.0, "loss/preference_loss": -424.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0250244140625, "rewards/margins": -0.0031585693359375, "rewards/rejected": -0.0218505859375, "step": 229 }, { "epoch": 0.184, "grad_norm": 1.317730663413701, "learning_rate": 4.893298743830167e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.59375, "logps/chosen": -62.0, "logps/rejected": -62.75, "loss": 0.6918, "loss/demonstration_loss": -394.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01312255859375, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.01470947265625, "step": 230 }, { "epoch": 0.1848, "grad_norm": 1.0949689738979564, "learning_rate": 4.891271592140694e-07, "logits/chosen": 0.6171875, "logits/rejected": 0.7421875, "logps/chosen": -36.5, "logps/rejected": -33.25, "loss": 0.6913, "loss/demonstration_loss": -280.0, "loss/preference_loss": -276.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.00701904296875, "rewards/rejected": -0.0089111328125, "step": 231 }, { "epoch": 0.1856, "grad_norm": 1.1145900069538335, "learning_rate": 4.88922579283499e-07, "logits/chosen": 1.0234375, "logits/rejected": 0.7421875, "logps/chosen": -29.25, "logps/rejected": -56.5, "loss": 0.6888, "loss/demonstration_loss": -344.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.00250244140625, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.00689697265625, "step": 232 }, { "epoch": 0.1864, "grad_norm": 1.4059932818844059, "learning_rate": 4.887161361866607e-07, "logits/chosen": 0.87109375, "logits/rejected": 0.87890625, "logps/chosen": -8.5, "logps/rejected": -10.5, "loss": 0.6893, "loss/demonstration_loss": -308.0, "loss/preference_loss": -304.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0018768310546875, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.0006256103515625, "step": 233 }, { "epoch": 0.1872, "grad_norm": 1.1574995176668748, "learning_rate": 4.885078315334394e-07, "logits/chosen": 0.6640625, "logits/rejected": 0.6640625, "logps/chosen": -22.125, "logps/rejected": -22.25, "loss": 0.6907, "loss/demonstration_loss": -704.0, "loss/preference_loss": -704.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0050048828125, "step": 234 }, { "epoch": 0.188, "grad_norm": 1.1670913594841277, "learning_rate": 4.882976669482367e-07, "logits/chosen": 0.86328125, "logits/rejected": 0.890625, "logps/chosen": -21.0, "logps/rejected": -21.125, "loss": 0.6941, "loss/demonstration_loss": -222.0, "loss/preference_loss": -222.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003448486328125, "rewards/margins": 0.00029754638671875, "rewards/rejected": -0.0037384033203125, "step": 235 }, { "epoch": 0.1888, "grad_norm": 1.2014494617904863, "learning_rate": 4.880856440699582e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.9921875, "logps/chosen": -41.75, "logps/rejected": -22.25, "loss": 0.693, "loss/demonstration_loss": -338.0, "loss/preference_loss": -342.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.0072021484375, "rewards/rejected": 0.00156402587890625, "step": 236 }, { "epoch": 0.1896, "grad_norm": 1.3621259205554854, "learning_rate": 4.878717645520008e-07, "logits/chosen": 0.671875, "logits/rejected": 0.828125, "logps/chosen": -44.75, "logps/rejected": -30.625, "loss": 0.6931, "loss/demonstration_loss": -398.0, "loss/preference_loss": -400.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.0007781982421875, "rewards/rejected": -0.005462646484375, "step": 237 }, { "epoch": 0.1904, "grad_norm": 0.5698628047787532, "learning_rate": 4.876560300622399e-07, "logits/chosen": 0.64453125, "logits/rejected": 0.66015625, "logps/chosen": -10.4375, "logps/rejected": -10.375, "loss": 0.692, "loss/demonstration_loss": -330.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.001251220703125, "step": 238 }, { "epoch": 0.1912, "grad_norm": 1.1392971568702668, "learning_rate": 4.874384422830167e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.5546875, "logps/chosen": -12.625, "logps/rejected": -25.5, "loss": 0.6917, "loss/demonstration_loss": -304.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00031280517578125, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.004058837890625, "step": 239 }, { "epoch": 0.192, "grad_norm": 0.771168693376665, "learning_rate": 4.872190029111241e-07, "logits/chosen": 0.890625, "logits/rejected": 0.87890625, "logps/chosen": -8.4375, "logps/rejected": -11.25, "loss": 0.6908, "loss/demonstration_loss": -157.0, "loss/preference_loss": -157.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.000152587890625, "rewards/margins": 7.62939453125e-06, "rewards/rejected": -0.000156402587890625, "step": 240 }, { "epoch": 0.1928, "grad_norm": 23.42069676182683, "learning_rate": 4.869977136577945e-07, "logits/chosen": 0.9765625, "logits/rejected": 0.8359375, "logps/chosen": -34.0, "logps/rejected": -43.5, "loss": 0.693, "loss/demonstration_loss": -406.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0159912109375, "rewards/margins": -0.006256103515625, "rewards/rejected": -0.00970458984375, "step": 241 }, { "epoch": 0.1936, "grad_norm": 0.9664250108305303, "learning_rate": 4.867745762486861e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.7421875, "logps/chosen": -28.0, "logps/rejected": -32.25, "loss": 0.6921, "loss/demonstration_loss": -480.0, "loss/preference_loss": -476.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.005615234375, "step": 242 }, { "epoch": 0.1944, "grad_norm": 1.0194191035125064, "learning_rate": 4.86549592423869e-07, "logits/chosen": 0.91796875, "logits/rejected": 0.921875, "logps/chosen": -45.25, "logps/rejected": -51.5, "loss": 0.6888, "loss/demonstration_loss": -310.0, "loss/preference_loss": -306.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0020294189453125, "rewards/margins": 0.00909423828125, "rewards/rejected": -0.0111083984375, "step": 243 }, { "epoch": 0.1952, "grad_norm": 1.1207130324490238, "learning_rate": 4.863227639378123e-07, "logits/chosen": 0.94140625, "logits/rejected": 0.94140625, "logps/chosen": -34.0, "logps/rejected": -16.375, "loss": 0.6927, "loss/demonstration_loss": -400.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.004058837890625, "rewards/margins": -0.004364013671875, "rewards/rejected": 0.00031280517578125, "step": 244 }, { "epoch": 0.196, "grad_norm": 0.7377761261483771, "learning_rate": 4.860940925593702e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.53515625, "logps/chosen": -40.0, "logps/rejected": -40.0, "loss": 0.6919, "loss/demonstration_loss": -636.0, "loss/preference_loss": -632.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00750732421875, "step": 245 }, { "epoch": 0.1968, "grad_norm": 1.1476622159077519, "learning_rate": 4.85863580071768e-07, "logits/chosen": 0.59765625, "logits/rejected": 0.59375, "logps/chosen": -29.5, "logps/rejected": -29.125, "loss": 0.6927, "loss/demonstration_loss": -464.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00592041015625, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.0031280517578125, "step": 246 }, { "epoch": 0.1976, "grad_norm": 1.2617836139770693, "learning_rate": 4.856312282725885e-07, "logits/chosen": 0.78125, "logits/rejected": 0.82421875, "logps/chosen": -12.375, "logps/rejected": -10.875, "loss": 0.692, "loss/demonstration_loss": -185.0, "loss/preference_loss": -187.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00156402587890625, "rewards/margins": -0.002349853515625, "rewards/rejected": 0.000782012939453125, "step": 247 }, { "epoch": 0.1984, "grad_norm": 1.4279943484650042, "learning_rate": 4.853970389737575e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.70703125, "logps/chosen": -69.0, "logps/rejected": -68.5, "loss": 0.6918, "loss/demonstration_loss": -548.0, "loss/preference_loss": -548.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": -0.004058837890625, "rewards/rejected": -0.00469970703125, "step": 248 }, { "epoch": 0.1992, "grad_norm": 1.045502406270416, "learning_rate": 4.851610140015303e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.65625, "logps/chosen": -22.25, "logps/rejected": -21.875, "loss": 0.6956, "loss/demonstration_loss": -700.0, "loss/preference_loss": -704.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.003753662109375, "rewards/rejected": 0.0, "step": 249 }, { "epoch": 0.2, "grad_norm": 1.2954391142515858, "learning_rate": 4.849231551964771e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.91015625, "logps/chosen": -43.25, "logps/rejected": -26.5, "loss": 0.6908, "loss/demonstration_loss": -278.0, "loss/preference_loss": -278.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002197265625, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.0031280517578125, "step": 250 }, { "epoch": 0.2008, "grad_norm": 1.5329064831433938, "learning_rate": 4.846834644134685e-07, "logits/chosen": 0.88671875, "logits/rejected": 0.84375, "logps/chosen": -30.75, "logps/rejected": -33.5, "loss": 0.6921, "loss/demonstration_loss": -340.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.005950927734375, "rewards/rejected": -0.0072021484375, "step": 251 }, { "epoch": 0.2016, "grad_norm": 1.2692942667415024, "learning_rate": 4.844419435216614e-07, "logits/chosen": 0.8203125, "logits/rejected": 0.71875, "logps/chosen": -55.75, "logps/rejected": -58.75, "loss": 0.6935, "loss/demonstration_loss": -456.0, "loss/preference_loss": -456.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0012359619140625, "rewards/rejected": -0.006256103515625, "step": 252 }, { "epoch": 0.2024, "grad_norm": 0.8827185233611434, "learning_rate": 4.841985944044844e-07, "logits/chosen": 0.796875, "logits/rejected": 0.828125, "logps/chosen": -12.25, "logps/rejected": -12.1875, "loss": 0.693, "loss/demonstration_loss": -388.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0018768310546875, "step": 253 }, { "epoch": 0.2032, "grad_norm": 1.2527691460655053, "learning_rate": 4.839534189596227e-07, "logits/chosen": 1.09375, "logits/rejected": 1.171875, "logps/chosen": -42.25, "logps/rejected": -25.375, "loss": 0.6908, "loss/demonstration_loss": -358.0, "loss/preference_loss": -360.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.0006256103515625, "step": 254 }, { "epoch": 0.204, "grad_norm": 0.854512508315575, "learning_rate": 4.837064190990036e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.67578125, "logps/chosen": -32.75, "logps/rejected": -42.0, "loss": 0.6896, "loss/demonstration_loss": -400.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.00811767578125, "rewards/rejected": -0.00933837890625, "step": 255 }, { "epoch": 0.2048, "grad_norm": 0.7132393819778908, "learning_rate": 4.834575967487817e-07, "logits/chosen": 0.71484375, "logits/rejected": 0.7578125, "logps/chosen": -33.5, "logps/rejected": -28.125, "loss": 0.6906, "loss/demonstration_loss": -492.0, "loss/preference_loss": -490.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.004058837890625, "rewards/rejected": -0.00469970703125, "step": 256 }, { "epoch": 0.2056, "grad_norm": 1.3869204231074956, "learning_rate": 4.832069538493237e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.625, "logps/chosen": -39.75, "logps/rejected": -48.75, "loss": 0.6918, "loss/demonstration_loss": -468.0, "loss/preference_loss": -468.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.0006103515625, "rewards/rejected": -0.00750732421875, "step": 257 }, { "epoch": 0.2064, "grad_norm": 1.2713999765171817, "learning_rate": 4.829544923551931e-07, "logits/chosen": 0.578125, "logits/rejected": 0.484375, "logps/chosen": -50.0, "logps/rejected": -56.5, "loss": 0.692, "loss/demonstration_loss": -340.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.004852294921875, "rewards/margins": 0.005157470703125, "rewards/rejected": -0.010009765625, "step": 258 }, { "epoch": 0.2072, "grad_norm": 1.9712682869830773, "learning_rate": 4.827002142351355e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.734375, "logps/chosen": -67.0, "logps/rejected": -62.25, "loss": 0.692, "loss/demonstration_loss": -414.0, "loss/preference_loss": -410.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.0, "rewards/margins": 0.01092529296875, "rewards/rejected": -0.01092529296875, "step": 259 }, { "epoch": 0.208, "grad_norm": 1.3398334859393155, "learning_rate": 4.824441214720628e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.58984375, "logps/chosen": -31.25, "logps/rejected": -27.375, "loss": 0.6891, "loss/demonstration_loss": -235.0, "loss/preference_loss": -234.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.0028228759765625, "rewards/rejected": -0.002197265625, "step": 260 }, { "epoch": 0.2088, "grad_norm": 1.1513796223442145, "learning_rate": 4.821862160630378e-07, "logits/chosen": 0.640625, "logits/rejected": 0.6484375, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6913, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 261 }, { "epoch": 0.2096, "grad_norm": 2.7168304819020643, "learning_rate": 4.819265000192585e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.6953125, "logps/chosen": -16.5, "logps/rejected": -31.375, "loss": 0.6877, "loss/demonstration_loss": -256.0, "loss/preference_loss": -252.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.0010986328125, "rewards/margins": 0.007354736328125, "rewards/rejected": -0.006256103515625, "step": 262 }, { "epoch": 0.2104, "grad_norm": 0.9824921063038419, "learning_rate": 4.81664975366043e-07, "logits/chosen": 0.87890625, "logits/rejected": 0.8203125, "logps/chosen": -33.25, "logps/rejected": -31.5, "loss": 0.6919, "loss/demonstration_loss": -340.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.00091552734375, "rewards/rejected": -0.0072021484375, "step": 263 }, { "epoch": 0.2112, "grad_norm": 1.0178893018573538, "learning_rate": 4.81401644142813e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.82421875, "logps/chosen": -25.0, "logps/rejected": -37.0, "loss": 0.6912, "loss/demonstration_loss": -328.0, "loss/preference_loss": -328.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.005950927734375, "step": 264 }, { "epoch": 0.212, "grad_norm": 1.5281967861600199, "learning_rate": 4.811365084030783e-07, "logits/chosen": 0.83984375, "logits/rejected": 1.078125, "logps/chosen": -58.25, "logps/rejected": -38.25, "loss": 0.693, "loss/demonstration_loss": -380.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.0037689208984375, "rewards/rejected": -0.009033203125, "step": 265 }, { "epoch": 0.2128, "grad_norm": 1.4201788013758394, "learning_rate": 4.808695702144205e-07, "logits/chosen": 0.76953125, "logits/rejected": 0.8125, "logps/chosen": -51.0, "logps/rejected": -49.5, "loss": 0.6906, "loss/demonstration_loss": -400.0, "loss/preference_loss": -402.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00830078125, "rewards/margins": -0.00579833984375, "rewards/rejected": -0.00250244140625, "step": 266 }, { "epoch": 0.2136, "grad_norm": 1.0943844602891215, "learning_rate": 4.806008316584774e-07, "logits/chosen": 0.9296875, "logits/rejected": 0.88671875, "logps/chosen": -5.09375, "logps/rejected": -5.84375, "loss": 0.6936, "loss/demonstration_loss": -167.0, "loss/preference_loss": -174.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.004058837890625, "rewards/rejected": -0.00093841552734375, "step": 267 }, { "epoch": 0.2144, "grad_norm": 1.1053481045828282, "learning_rate": 4.803302948309264e-07, "logits/chosen": 0.9453125, "logits/rejected": 0.875, "logps/chosen": -22.0, "logps/rejected": -27.5, "loss": 0.6901, "loss/demonstration_loss": -394.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002197265625, "rewards/margins": 0.003448486328125, "rewards/rejected": -0.005615234375, "step": 268 }, { "epoch": 0.2152, "grad_norm": 0.6499072198851044, "learning_rate": 4.800579618414676e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.76953125, "logps/chosen": -25.75, "logps/rejected": -10.5, "loss": 0.6919, "loss/demonstration_loss": -290.0, "loss/preference_loss": -292.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": -0.00156402587890625, "rewards/rejected": 0.00093841552734375, "step": 269 }, { "epoch": 0.216, "grad_norm": 0.7201713102719576, "learning_rate": 4.797838348138086e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.71484375, "logps/chosen": -30.125, "logps/rejected": -24.75, "loss": 0.6917, "loss/demonstration_loss": -432.0, "loss/preference_loss": -436.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.00250244140625, "step": 270 }, { "epoch": 0.2168, "grad_norm": 0.8162742778493564, "learning_rate": 4.79507915885647e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.75390625, "logps/chosen": -14.25, "logps/rejected": -18.0, "loss": 0.6921, "loss/demonstration_loss": -258.0, "loss/preference_loss": -258.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.000782012939453125, "rewards/margins": -0.0010986328125, "rewards/rejected": 0.00031280517578125, "step": 271 }, { "epoch": 0.2176, "grad_norm": 0.8385058227760824, "learning_rate": 4.792302072086541e-07, "logits/chosen": 0.609375, "logits/rejected": 0.62109375, "logps/chosen": -26.5, "logps/rejected": -28.625, "loss": 0.6901, "loss/demonstration_loss": -436.0, "loss/preference_loss": -432.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": 0.005615234375, "rewards/rejected": -0.010009765625, "step": 272 }, { "epoch": 0.2184, "grad_norm": 1.6761143462881491, "learning_rate": 4.789507109484579e-07, "logits/chosen": 0.59375, "logits/rejected": 0.7109375, "logps/chosen": -70.5, "logps/rejected": -71.0, "loss": 0.696, "loss/demonstration_loss": -372.0, "loss/preference_loss": -376.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0205078125, "rewards/margins": -0.0118408203125, "rewards/rejected": -0.00860595703125, "step": 273 }, { "epoch": 0.2192, "grad_norm": 0.7326694999272815, "learning_rate": 4.786694292846262e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.59765625, "logps/chosen": -17.25, "logps/rejected": -24.5, "loss": 0.6907, "loss/demonstration_loss": -336.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0018768310546875, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.0028076171875, "step": 274 }, { "epoch": 0.22, "grad_norm": 1.3698914066104824, "learning_rate": 4.783863644106502e-07, "logits/chosen": 0.92578125, "logits/rejected": 0.8984375, "logps/chosen": -12.6875, "logps/rejected": -12.4375, "loss": 0.6915, "loss/demonstration_loss": -396.0, "loss/preference_loss": -400.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004364013671875, "rewards/margins": -0.0024871826171875, "rewards/rejected": -0.0018768310546875, "step": 275 }, { "epoch": 0.2208, "grad_norm": 1.702632932344196, "learning_rate": 4.781015185339265e-07, "logits/chosen": 0.29296875, "logits/rejected": 0.3203125, "logps/chosen": -46.5, "logps/rejected": -45.0, "loss": 0.688, "loss/demonstration_loss": -486.0, "loss/preference_loss": -486.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.0006103515625, "rewards/rejected": -0.006866455078125, "step": 276 }, { "epoch": 0.2216, "grad_norm": 1.374763731512369, "learning_rate": 4.778148938757405e-07, "logits/chosen": 0.9375, "logits/rejected": 0.87890625, "logps/chosen": -32.0, "logps/rejected": -45.5, "loss": 0.6895, "loss/demonstration_loss": -410.0, "loss/preference_loss": -406.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00640869140625, "rewards/margins": 0.00860595703125, "rewards/rejected": -0.0150146484375, "step": 277 }, { "epoch": 0.2224, "grad_norm": 2.114274077938515, "learning_rate": 4.775264926712489e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.66796875, "logps/chosen": -73.5, "logps/rejected": -76.5, "loss": 0.6943, "loss/demonstration_loss": -476.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.014404296875, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.0150146484375, "step": 278 }, { "epoch": 0.2232, "grad_norm": 1.3199603016958925, "learning_rate": 4.772363171694622e-07, "logits/chosen": 0.9375, "logits/rejected": 0.9296875, "logps/chosen": -15.0, "logps/rejected": -15.1875, "loss": 0.6932, "loss/demonstration_loss": -482.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.00250244140625, "step": 279 }, { "epoch": 0.224, "grad_norm": 1.5957049737087075, "learning_rate": 4.769443696332272e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.84375, "logps/chosen": -38.75, "logps/rejected": -43.5, "loss": 0.6898, "loss/demonstration_loss": -328.0, "loss/preference_loss": -328.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.00439453125, "rewards/rejected": -0.006256103515625, "step": 280 }, { "epoch": 0.2248, "grad_norm": 1.2504546179689064, "learning_rate": 4.7665065233920944e-07, "logits/chosen": 0.91796875, "logits/rejected": 0.82421875, "logps/chosen": -10.125, "logps/rejected": -22.0, "loss": 0.6899, "loss/demonstration_loss": -254.0, "loss/preference_loss": -252.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0023345947265625, "rewards/margins": 0.00390625, "rewards/rejected": -0.006256103515625, "step": 281 }, { "epoch": 0.2256, "grad_norm": 1.219369024460613, "learning_rate": 4.763551675778754e-07, "logits/chosen": 0.65625, "logits/rejected": 0.6796875, "logps/chosen": -30.75, "logps/rejected": -30.625, "loss": 0.6914, "loss/demonstration_loss": -244.0, "loss/preference_loss": -244.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0028076171875, "rewards/margins": 0.000461578369140625, "rewards/rejected": -0.0032806396484375, "step": 282 }, { "epoch": 0.2264, "grad_norm": 1.4153714263420083, "learning_rate": 4.7605791765347465e-07, "logits/chosen": 0.6171875, "logits/rejected": 0.71875, "logps/chosen": -73.0, "logps/rejected": -85.0, "loss": 0.6937, "loss/demonstration_loss": -358.0, "loss/preference_loss": -360.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01434326171875, "rewards/margins": -0.00592041015625, "rewards/rejected": -0.0084228515625, "step": 283 }, { "epoch": 0.2272, "grad_norm": 1.4305470327301162, "learning_rate": 4.7575890488402183e-07, "logits/chosen": 0.75, "logits/rejected": 0.75, "logps/chosen": -37.25, "logps/rejected": -36.5, "loss": 0.6938, "loss/demonstration_loss": -386.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.0062255859375, "rewards/rejected": -0.004364013671875, "step": 284 }, { "epoch": 0.228, "grad_norm": 1.234627588735063, "learning_rate": 4.7545813160127845e-07, "logits/chosen": 0.96484375, "logits/rejected": 0.9296875, "logps/chosen": -55.5, "logps/rejected": -52.25, "loss": 0.6921, "loss/demonstration_loss": -430.0, "loss/preference_loss": -428.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0015716552734375, "rewards/rejected": -0.004058837890625, "step": 285 }, { "epoch": 0.2288, "grad_norm": 0.8765472247770683, "learning_rate": 4.7515560015073507e-07, "logits/chosen": 0.97265625, "logits/rejected": 1.0625, "logps/chosen": -34.0, "logps/rejected": -19.0, "loss": 0.6925, "loss/demonstration_loss": -422.0, "loss/preference_loss": -426.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.00469970703125, "rewards/rejected": 0.0028076171875, "step": 286 }, { "epoch": 0.2296, "grad_norm": 1.1899936279338132, "learning_rate": 4.7485131289159274e-07, "logits/chosen": 0.6171875, "logits/rejected": 0.65234375, "logps/chosen": -29.125, "logps/rejected": -17.875, "loss": 0.6914, "loss/demonstration_loss": -376.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00171661376953125, "rewards/margins": 0.00390625, "rewards/rejected": -0.005615234375, "step": 287 }, { "epoch": 0.2304, "grad_norm": 1.1871377168399015, "learning_rate": 4.7454527219674455e-07, "logits/chosen": 0.91796875, "logits/rejected": 0.73828125, "logps/chosen": -24.125, "logps/rejected": -47.0, "loss": 0.6897, "loss/demonstration_loss": -380.0, "loss/preference_loss": -376.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.009033203125, "rewards/rejected": -0.00811767578125, "step": 288 }, { "epoch": 0.2312, "grad_norm": 1.6416738130695843, "learning_rate": 4.742374804527575e-07, "logits/chosen": 1.140625, "logits/rejected": 1.0859375, "logps/chosen": -29.25, "logps/rejected": -35.5, "loss": 0.6924, "loss/demonstration_loss": -256.0, "loss/preference_loss": -256.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00469970703125, "rewards/margins": 0.000629425048828125, "rewards/rejected": -0.00531005859375, "step": 289 }, { "epoch": 0.232, "grad_norm": 1.2592511439481786, "learning_rate": 4.739279400598532e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.81640625, "logps/chosen": -38.0, "logps/rejected": -38.0, "loss": 0.6914, "loss/demonstration_loss": -604.0, "loss/preference_loss": -604.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00469970703125, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.00439453125, "step": 290 }, { "epoch": 0.2328, "grad_norm": 1.3307263047204292, "learning_rate": 4.7361665343188993e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.72265625, "logps/chosen": -46.0, "logps/rejected": -41.0, "loss": 0.6937, "loss/demonstration_loss": -458.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.00811767578125, "rewards/rejected": -0.001251220703125, "step": 291 }, { "epoch": 0.2336, "grad_norm": 1.7661803223029706, "learning_rate": 4.733036229963434e-07, "logits/chosen": 0.78125, "logits/rejected": 0.90234375, "logps/chosen": -51.5, "logps/rejected": -23.25, "loss": 0.6956, "loss/demonstration_loss": -592.0, "loss/preference_loss": -596.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.001251220703125, "step": 292 }, { "epoch": 0.2344, "grad_norm": 1.3827252332684694, "learning_rate": 4.7298885119428767e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.8203125, "logps/chosen": -89.0, "logps/rejected": -70.0, "loss": 0.6923, "loss/demonstration_loss": -632.0, "loss/preference_loss": -632.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.0106201171875, "step": 293 }, { "epoch": 0.2352, "grad_norm": 1.3306579666554852, "learning_rate": 4.726723404803766e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.671875, "logps/chosen": -54.75, "logps/rejected": -63.75, "loss": 0.6906, "loss/demonstration_loss": -470.0, "loss/preference_loss": -466.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01031494140625, "rewards/margins": 0.00872802734375, "rewards/rejected": -0.01904296875, "step": 294 }, { "epoch": 0.236, "grad_norm": 2.3821738484407375, "learning_rate": 4.7235409332282436e-07, "logits/chosen": 0.9140625, "logits/rejected": 0.89453125, "logps/chosen": -69.5, "logps/rejected": -75.0, "loss": 0.6969, "loss/demonstration_loss": -380.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.018798828125, "rewards/margins": -0.00811767578125, "rewards/rejected": -0.0106201171875, "step": 295 }, { "epoch": 0.2368, "grad_norm": 1.1219400161663977, "learning_rate": 4.720341122033861e-07, "logits/chosen": 0.6875, "logits/rejected": 0.828125, "logps/chosen": -46.25, "logps/rejected": -37.5, "loss": 0.6899, "loss/demonstration_loss": -444.0, "loss/preference_loss": -442.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00439453125, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.00689697265625, "step": 296 }, { "epoch": 0.2376, "grad_norm": 1.2800023514026913, "learning_rate": 4.7171239961733895e-07, "logits/chosen": 0.828125, "logits/rejected": 0.71875, "logps/chosen": -49.0, "logps/rejected": -64.0, "loss": 0.6895, "loss/demonstration_loss": -360.0, "loss/preference_loss": -358.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.00750732421875, "rewards/rejected": -0.0093994140625, "step": 297 }, { "epoch": 0.2384, "grad_norm": 1.1919925174333041, "learning_rate": 4.7138895807346223e-07, "logits/chosen": 0.6171875, "logits/rejected": 0.6484375, "logps/chosen": -51.0, "logps/rejected": -50.0, "loss": 0.696, "loss/demonstration_loss": -528.0, "loss/preference_loss": -536.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0166015625, "rewards/margins": -0.0106201171875, "rewards/rejected": -0.005950927734375, "step": 298 }, { "epoch": 0.2392, "grad_norm": 0.9893903113584179, "learning_rate": 4.710637900940181e-07, "logits/chosen": 0.7734375, "logits/rejected": 0.83203125, "logps/chosen": -25.5, "logps/rejected": -29.125, "loss": 0.692, "loss/demonstration_loss": -434.0, "loss/preference_loss": -436.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.00250244140625, "step": 299 }, { "epoch": 0.24, "grad_norm": 1.402501128785907, "learning_rate": 4.707368982147317e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.9296875, "logps/chosen": -14.875, "logps/rejected": -14.4375, "loss": 0.6927, "loss/demonstration_loss": -460.0, "loss/preference_loss": -468.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00439453125, "rewards/rejected": -0.0006256103515625, "step": 300 }, { "epoch": 0.2408, "grad_norm": 1.0839364962148086, "learning_rate": 4.704082849847717e-07, "logits/chosen": 0.859375, "logits/rejected": 1.140625, "logps/chosen": -31.75, "logps/rejected": -12.125, "loss": 0.6926, "loss/demonstration_loss": -348.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.00439453125, "rewards/rejected": -0.001251220703125, "step": 301 }, { "epoch": 0.2416, "grad_norm": 1.1578235391290441, "learning_rate": 4.7007795296673005e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.46484375, "logps/chosen": -22.75, "logps/rejected": -41.5, "loss": 0.6912, "loss/demonstration_loss": -256.0, "loss/preference_loss": -256.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.003448486328125, "rewards/rejected": -0.004058837890625, "step": 302 }, { "epoch": 0.2424, "grad_norm": 1.5375599705433967, "learning_rate": 4.6974590473660214e-07, "logits/chosen": 0.90234375, "logits/rejected": 0.96875, "logps/chosen": -37.5, "logps/rejected": -28.25, "loss": 0.6918, "loss/demonstration_loss": -350.0, "loss/preference_loss": -348.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.0050048828125, "step": 303 }, { "epoch": 0.2432, "grad_norm": 1.012411824853166, "learning_rate": 4.6941214288376676e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.671875, "logps/chosen": -22.25, "logps/rejected": -21.75, "loss": 0.6924, "loss/demonstration_loss": -348.0, "loss/preference_loss": -352.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.004547119140625, "rewards/rejected": 0.000782012939453125, "step": 304 }, { "epoch": 0.244, "grad_norm": 1.7360240788762478, "learning_rate": 4.6907667001096585e-07, "logits/chosen": 1.1640625, "logits/rejected": 1.1796875, "logps/chosen": -39.25, "logps/rejected": -36.5, "loss": 0.6901, "loss/demonstration_loss": -302.0, "loss/preference_loss": -302.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00469970703125, "rewards/margins": -0.0024871826171875, "rewards/rejected": -0.002197265625, "step": 305 }, { "epoch": 0.2448, "grad_norm": 1.3259070825465398, "learning_rate": 4.6873948873428444e-07, "logits/chosen": 1.0234375, "logits/rejected": 0.9453125, "logps/chosen": -15.5, "logps/rejected": -21.625, "loss": 0.6888, "loss/demonstration_loss": -200.0, "loss/preference_loss": -195.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.0031280517578125, "rewards/margins": 0.00848388671875, "rewards/rejected": -0.00531005859375, "step": 306 }, { "epoch": 0.2456, "grad_norm": 1.4412986575162055, "learning_rate": 4.684006016831297e-07, "logits/chosen": 0.703125, "logits/rejected": 0.9375, "logps/chosen": -69.5, "logps/rejected": -47.0, "loss": 0.6964, "loss/demonstration_loss": -368.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.009033203125, "rewards/rejected": -0.00469970703125, "step": 307 }, { "epoch": 0.2464, "grad_norm": 1.9401065938153328, "learning_rate": 4.680600115002109e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.609375, "logps/chosen": -32.0, "logps/rejected": -59.25, "loss": 0.6844, "loss/demonstration_loss": -486.0, "loss/preference_loss": -478.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00093841552734375, "rewards/margins": 0.01708984375, "rewards/rejected": -0.01806640625, "step": 308 }, { "epoch": 0.2472, "grad_norm": 1.6380779116154347, "learning_rate": 4.677177208415188e-07, "logits/chosen": 0.65625, "logits/rejected": 0.75, "logps/chosen": -66.0, "logps/rejected": -59.0, "loss": 0.6927, "loss/demonstration_loss": -330.0, "loss/preference_loss": -330.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.01092529296875, "rewards/margins": 0.000797271728515625, "rewards/rejected": -0.01171875, "step": 309 }, { "epoch": 0.248, "grad_norm": 1.9141573098570759, "learning_rate": 4.6737373237630473e-07, "logits/chosen": 0.98828125, "logits/rejected": 0.95703125, "logps/chosen": -74.5, "logps/rejected": -87.5, "loss": 0.691, "loss/demonstration_loss": -368.0, "loss/preference_loss": -368.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.002349853515625, "rewards/rejected": -0.00921630859375, "step": 310 }, { "epoch": 0.2488, "grad_norm": 1.6647308048427472, "learning_rate": 4.670280487870598e-07, "logits/chosen": 0.7578125, "logits/rejected": 0.81640625, "logps/chosen": -92.0, "logps/rejected": -94.0, "loss": 0.691, "loss/demonstration_loss": -424.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0084228515625, "rewards/margins": 0.006866455078125, "rewards/rejected": -0.01531982421875, "step": 311 }, { "epoch": 0.2496, "grad_norm": 1.9351324415175855, "learning_rate": 4.6668067276949407e-07, "logits/chosen": 0.76171875, "logits/rejected": 0.625, "logps/chosen": -63.0, "logps/rejected": -88.5, "loss": 0.6886, "loss/demonstration_loss": -402.0, "loss/preference_loss": -398.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.01031494140625, "rewards/margins": 0.0150146484375, "rewards/rejected": -0.025390625, "step": 312 }, { "epoch": 0.2504, "grad_norm": 1.0509336480693503, "learning_rate": 4.663316070325155e-07, "logits/chosen": 1.265625, "logits/rejected": 1.1171875, "logps/chosen": -32.75, "logps/rejected": -51.0, "loss": 0.6899, "loss/demonstration_loss": -334.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.000469207763671875, "rewards/margins": 0.006439208984375, "rewards/rejected": -0.00689697265625, "step": 313 }, { "epoch": 0.2512, "grad_norm": 1.3327472787951804, "learning_rate": 4.6598085429820877e-07, "logits/chosen": 0.90625, "logits/rejected": 0.828125, "logps/chosen": -27.875, "logps/rejected": -40.0, "loss": 0.6912, "loss/demonstration_loss": -544.0, "loss/preference_loss": -540.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.0050048828125, "step": 314 }, { "epoch": 0.252, "grad_norm": 1.2689071138782009, "learning_rate": 4.6562841730181435e-07, "logits/chosen": 0.578125, "logits/rejected": 0.51171875, "logps/chosen": -15.1875, "logps/rejected": -16.5, "loss": 0.6906, "loss/demonstration_loss": -506.0, "loss/preference_loss": -484.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.01312255859375, "rewards/rejected": -0.01373291015625, "step": 315 }, { "epoch": 0.2528, "grad_norm": 0.9256901971823556, "learning_rate": 4.6527429879170657e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.68359375, "logps/chosen": -12.6875, "logps/rejected": -10.0625, "loss": 0.6906, "loss/demonstration_loss": -184.0, "loss/preference_loss": -182.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00156402587890625, "rewards/margins": 0.00093841552734375, "rewards/rejected": 0.0006256103515625, "step": 316 }, { "epoch": 0.2536, "grad_norm": 1.1395009710775603, "learning_rate": 4.6491850152937275e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.79296875, "logps/chosen": -21.625, "logps/rejected": -20.75, "loss": 0.6929, "loss/demonstration_loss": -334.0, "loss/preference_loss": -336.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.00439453125, "rewards/rejected": -0.0018768310546875, "step": 317 }, { "epoch": 0.2544, "grad_norm": 1.345867160017009, "learning_rate": 4.645610282893914e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.73828125, "logps/chosen": -44.0, "logps/rejected": -48.25, "loss": 0.6859, "loss/demonstration_loss": -296.0, "loss/preference_loss": -292.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.004058837890625, "rewards/margins": 0.0140380859375, "rewards/rejected": -0.010009765625, "step": 318 }, { "epoch": 0.2552, "grad_norm": 1.0883961182201558, "learning_rate": 4.642018818594107e-07, "logits/chosen": 0.44921875, "logits/rejected": 0.5078125, "logps/chosen": -52.0, "logps/rejected": -40.5, "loss": 0.6918, "loss/demonstration_loss": -490.0, "loss/preference_loss": -490.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0072021484375, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.005950927734375, "step": 319 }, { "epoch": 0.256, "grad_norm": 1.0884831361772467, "learning_rate": 4.6384106504012665e-07, "logits/chosen": 0.515625, "logits/rejected": 0.51953125, "logps/chosen": -11.0, "logps/rejected": -10.75, "loss": 0.6893, "loss/demonstration_loss": -172.0, "loss/preference_loss": -175.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0017242431640625, "rewards/margins": -0.002960205078125, "rewards/rejected": 0.001251220703125, "step": 320 }, { "epoch": 0.2568, "grad_norm": 1.3588109512029405, "learning_rate": 4.6347858064526123e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.68359375, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6927, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 321 }, { "epoch": 0.2576, "grad_norm": 1.064148423789147, "learning_rate": 4.631144315015406e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.5859375, "logps/chosen": -9.375, "logps/rejected": -8.5625, "loss": 0.6934, "loss/demonstration_loss": -278.0, "loss/preference_loss": -292.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.00811767578125, "rewards/rejected": 0.00250244140625, "step": 322 }, { "epoch": 0.2584, "grad_norm": 1.0285446341611966, "learning_rate": 4.62748620448673e-07, "logits/chosen": 0.84765625, "logits/rejected": 0.8515625, "logps/chosen": -31.625, "logps/rejected": -31.25, "loss": 0.6932, "loss/demonstration_loss": -496.0, "loss/preference_loss": -498.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.004058837890625, "rewards/rejected": -0.004058837890625, "step": 323 }, { "epoch": 0.2592, "grad_norm": 1.3041574897433876, "learning_rate": 4.6238115033932635e-07, "logits/chosen": 0.69140625, "logits/rejected": 0.765625, "logps/chosen": -31.875, "logps/rejected": -29.25, "loss": 0.6926, "loss/demonstration_loss": -480.0, "loss/preference_loss": -484.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.0009765625, "rewards/rejected": -0.0084228515625, "step": 324 }, { "epoch": 0.26, "grad_norm": 0.27039822948571135, "learning_rate": 4.6201202403910643e-07, "logits/chosen": 0.8671875, "logits/rejected": 0.8671875, "logps/chosen": -11.8125, "logps/rejected": -12.125, "loss": 0.6907, "loss/demonstration_loss": -384.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.001251220703125, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0018768310546875, "step": 325 }, { "epoch": 0.2608, "grad_norm": 1.4044764636295741, "learning_rate": 4.616412444265344e-07, "logits/chosen": 1.1015625, "logits/rejected": 1.1171875, "logps/chosen": -47.25, "logps/rejected": -50.25, "loss": 0.694, "loss/demonstration_loss": -512.0, "loss/preference_loss": -516.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01123046875, "rewards/margins": -0.006561279296875, "rewards/rejected": -0.00469970703125, "step": 326 }, { "epoch": 0.2616, "grad_norm": 1.8749042639530937, "learning_rate": 4.612688143930241e-07, "logits/chosen": 1.078125, "logits/rejected": 1.078125, "logps/chosen": -28.5, "logps/rejected": -31.875, "loss": 0.6924, "loss/demonstration_loss": -240.0, "loss/preference_loss": -240.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.001861572265625, "rewards/rejected": -0.004364013671875, "step": 327 }, { "epoch": 0.2624, "grad_norm": 1.9791032400870452, "learning_rate": 4.608947368428597e-07, "logits/chosen": 0.94140625, "logits/rejected": 0.88671875, "logps/chosen": -29.75, "logps/rejected": -38.5, "loss": 0.6935, "loss/demonstration_loss": -362.0, "loss/preference_loss": -362.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.004547119140625, "rewards/margins": -0.00141143798828125, "rewards/rejected": -0.0031280517578125, "step": 328 }, { "epoch": 0.2632, "grad_norm": 5.015060330487301, "learning_rate": 4.6051901469317303e-07, "logits/chosen": 0.671875, "logits/rejected": 0.73828125, "logps/chosen": -14.3125, "logps/rejected": -10.6875, "loss": 0.691, "loss/demonstration_loss": -396.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0018768310546875, "step": 329 }, { "epoch": 0.264, "grad_norm": 1.128702610209299, "learning_rate": 4.6014165087392105e-07, "logits/chosen": 0.671875, "logits/rejected": 0.70703125, "logps/chosen": -21.0, "logps/rejected": -20.5, "loss": 0.6935, "loss/demonstration_loss": -656.0, "loss/preference_loss": -664.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.0050048828125, "rewards/rejected": 0.0, "step": 330 }, { "epoch": 0.2648, "grad_norm": 1.518773355371053, "learning_rate": 4.597626483278625e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.9453125, "logps/chosen": -82.5, "logps/rejected": -60.5, "loss": 0.6948, "loss/demonstration_loss": -378.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01080322265625, "rewards/margins": -0.01019287109375, "rewards/rejected": -0.0006256103515625, "step": 331 }, { "epoch": 0.2656, "grad_norm": 1.2399269659858536, "learning_rate": 4.5938201001053546e-07, "logits/chosen": 0.625, "logits/rejected": 0.578125, "logps/chosen": -25.0, "logps/rejected": -30.875, "loss": 0.6924, "loss/demonstration_loss": -222.0, "loss/preference_loss": -221.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002349853515625, "rewards/margins": 0.002655029296875, "rewards/rejected": -0.0050048828125, "step": 332 }, { "epoch": 0.2664, "grad_norm": 1.2989605394577444, "learning_rate": 4.589997388902338e-07, "logits/chosen": 1.078125, "logits/rejected": 1.2578125, "logps/chosen": -54.5, "logps/rejected": -27.75, "loss": 0.6958, "loss/demonstration_loss": -324.0, "loss/preference_loss": -330.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01031494140625, "rewards/margins": -0.01312255859375, "rewards/rejected": 0.0028076171875, "step": 333 }, { "epoch": 0.2672, "grad_norm": 1.0673896097022135, "learning_rate": 4.5861583794798477e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.625, "logps/chosen": -35.0, "logps/rejected": -46.0, "loss": 0.6897, "loss/demonstration_loss": -432.0, "loss/preference_loss": -426.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.01251220703125, "rewards/rejected": -0.0115966796875, "step": 334 }, { "epoch": 0.268, "grad_norm": 1.7706132299631088, "learning_rate": 4.582303101775248e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.66015625, "logps/chosen": -58.5, "logps/rejected": -67.5, "loss": 0.6953, "loss/demonstration_loss": -400.0, "loss/preference_loss": -402.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01220703125, "rewards/margins": -0.00970458984375, "rewards/rejected": -0.00250244140625, "step": 335 }, { "epoch": 0.2688, "grad_norm": 1.7726573094800455, "learning_rate": 4.578431585852771e-07, "logits/chosen": 0.85546875, "logits/rejected": 1.0, "logps/chosen": -59.25, "logps/rejected": -45.0, "loss": 0.6938, "loss/demonstration_loss": -332.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0067138671875, "rewards/margins": 0.000782012939453125, "rewards/rejected": -0.00750732421875, "step": 336 }, { "epoch": 0.2696, "grad_norm": 1.3203596028006965, "learning_rate": 4.574543861903274e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.72265625, "logps/chosen": -21.5, "logps/rejected": -33.75, "loss": 0.6921, "loss/demonstration_loss": -438.0, "loss/preference_loss": -438.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.00156402587890625, "rewards/rejected": -0.004058837890625, "step": 337 }, { "epoch": 0.2704, "grad_norm": 1.358223052679719, "learning_rate": 4.5706399602440104e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.5546875, "logps/chosen": -52.5, "logps/rejected": -52.0, "loss": 0.6924, "loss/demonstration_loss": -556.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.005950927734375, "rewards/rejected": -0.00909423828125, "step": 338 }, { "epoch": 0.2712, "grad_norm": 1.0603072195916445, "learning_rate": 4.5667199113183887e-07, "logits/chosen": 0.765625, "logits/rejected": 0.71875, "logps/chosen": -50.75, "logps/rejected": -58.0, "loss": 0.692, "loss/demonstration_loss": -576.0, "loss/preference_loss": -580.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.0024871826171875, "rewards/rejected": -0.0050048828125, "step": 339 }, { "epoch": 0.272, "grad_norm": 1.3069669497487042, "learning_rate": 4.5627837456957374e-07, "logits/chosen": 1.0859375, "logits/rejected": 0.94140625, "logps/chosen": -77.0, "logps/rejected": -97.5, "loss": 0.6907, "loss/demonstration_loss": -556.0, "loss/preference_loss": -556.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.01123046875, "step": 340 }, { "epoch": 0.2728, "grad_norm": 1.3451888977280693, "learning_rate": 4.558831494071068e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.6640625, "logps/chosen": -19.125, "logps/rejected": -19.25, "loss": 0.6924, "loss/demonstration_loss": -298.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0106201171875, "rewards/margins": 0.0006103515625, "rewards/rejected": -0.01123046875, "step": 341 }, { "epoch": 0.2736, "grad_norm": 1.5763551119693042, "learning_rate": 4.5548631872648327e-07, "logits/chosen": 0.85546875, "logits/rejected": 0.890625, "logps/chosen": -48.5, "logps/rejected": -49.25, "loss": 0.6887, "loss/demonstration_loss": -388.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.005950927734375, "step": 342 }, { "epoch": 0.2744, "grad_norm": 0.9649534367005548, "learning_rate": 4.550878856222684e-07, "logits/chosen": 0.55078125, "logits/rejected": 0.703125, "logps/chosen": -33.0, "logps/rejected": -26.75, "loss": 0.6924, "loss/demonstration_loss": -318.0, "loss/preference_loss": -316.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.00439453125, "step": 343 }, { "epoch": 0.2752, "grad_norm": 0.919509412561667, "learning_rate": 4.546878532015236e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.734375, "logps/chosen": -24.5, "logps/rejected": -23.875, "loss": 0.6903, "loss/demonstration_loss": -156.0, "loss/preference_loss": -153.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.0045166015625, "rewards/margins": 0.01080322265625, "rewards/rejected": -0.006256103515625, "step": 344 }, { "epoch": 0.276, "grad_norm": 0.9694163307990481, "learning_rate": 4.542862245837821e-07, "logits/chosen": 0.58203125, "logits/rejected": 0.60546875, "logps/chosen": -40.0, "logps/rejected": -40.5, "loss": 0.6919, "loss/demonstration_loss": -320.0, "loss/preference_loss": -320.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0078125, "rewards/margins": -0.00156402587890625, "rewards/rejected": -0.006256103515625, "step": 345 }, { "epoch": 0.2768, "grad_norm": 1.289815949894835, "learning_rate": 4.5388300290102454e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.58203125, "logps/chosen": -24.5, "logps/rejected": -30.875, "loss": 0.689, "loss/demonstration_loss": -220.0, "loss/preference_loss": -219.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.002655029296875, "rewards/rejected": -0.00640869140625, "step": 346 }, { "epoch": 0.2776, "grad_norm": 1.2787477591105885, "learning_rate": 4.5347819129765454e-07, "logits/chosen": 0.671875, "logits/rejected": 0.71484375, "logps/chosen": -44.25, "logps/rejected": -40.5, "loss": 0.6929, "loss/demonstration_loss": -336.0, "loss/preference_loss": -334.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": 0.0021820068359375, "rewards/rejected": -0.01092529296875, "step": 347 }, { "epoch": 0.2784, "grad_norm": 1.018165955849104, "learning_rate": 4.530717929304743e-07, "logits/chosen": 0.59375, "logits/rejected": 0.62890625, "logps/chosen": -13.375, "logps/rejected": -7.3125, "loss": 0.6932, "loss/demonstration_loss": -320.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.006561279296875, "rewards/rejected": 0.00031280517578125, "step": 348 }, { "epoch": 0.2792, "grad_norm": 0.8379010816475899, "learning_rate": 4.5266381096866e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.7734375, "logps/chosen": -30.25, "logps/rejected": -18.5, "loss": 0.6926, "loss/demonstration_loss": -388.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0018768310546875, "step": 349 }, { "epoch": 0.28, "grad_norm": 4.612268314735285, "learning_rate": 4.5225424859373684e-07, "logits/chosen": 0.96484375, "logits/rejected": 1.09375, "logps/chosen": -56.0, "logps/rejected": -43.5, "loss": 0.6931, "loss/demonstration_loss": -394.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.010009765625, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0093994140625, "step": 350 }, { "epoch": 0.2808, "grad_norm": 0.9612682423187542, "learning_rate": 4.5184310899955457e-07, "logits/chosen": 0.828125, "logits/rejected": 0.86328125, "logps/chosen": -10.75, "logps/rejected": -9.0, "loss": 0.6912, "loss/demonstration_loss": -316.0, "loss/preference_loss": -316.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 351 }, { "epoch": 0.2816, "grad_norm": 1.1542928893291644, "learning_rate": 4.514303953922623e-07, "logits/chosen": 0.62890625, "logits/rejected": 0.6328125, "logps/chosen": -40.5, "logps/rejected": -40.5, "loss": 0.6902, "loss/demonstration_loss": -640.0, "loss/preference_loss": -640.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.0, "rewards/rejected": -0.00811767578125, "step": 352 }, { "epoch": 0.2824, "grad_norm": 0.5058573576234423, "learning_rate": 4.5101611099028363e-07, "logits/chosen": 0.65625, "logits/rejected": 0.6875, "logps/chosen": -12.1875, "logps/rejected": -16.125, "loss": 0.6919, "loss/demonstration_loss": -452.0, "loss/preference_loss": -456.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0006256103515625, "rewards/margins": -0.0018768310546875, "rewards/rejected": 0.001251220703125, "step": 353 }, { "epoch": 0.2832, "grad_norm": 1.827138992953967, "learning_rate": 4.5060025902429165e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.796875, "logps/chosen": -98.5, "logps/rejected": -104.0, "loss": 0.6895, "loss/demonstration_loss": -536.0, "loss/preference_loss": -536.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01092529296875, "rewards/margins": 0.00909423828125, "rewards/rejected": -0.02001953125, "step": 354 }, { "epoch": 0.284, "grad_norm": 1.1600821899829108, "learning_rate": 4.501828427371833e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.859375, "logps/chosen": -34.25, "logps/rejected": -24.875, "loss": 0.694, "loss/demonstration_loss": -233.0, "loss/preference_loss": -237.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.008544921875, "rewards/margins": -0.0089111328125, "rewards/rejected": 0.00031280517578125, "step": 355 }, { "epoch": 0.2848, "grad_norm": 1.652359434054351, "learning_rate": 4.497638653840549e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.61328125, "logps/chosen": -37.0, "logps/rejected": -50.5, "loss": 0.6913, "loss/demonstration_loss": -350.0, "loss/preference_loss": -344.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.0118408203125, "step": 356 }, { "epoch": 0.2856, "grad_norm": 1.2676454944177549, "learning_rate": 4.4934333023217584e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.5078125, "logps/chosen": -46.25, "logps/rejected": -54.75, "loss": 0.6932, "loss/demonstration_loss": -400.0, "loss/preference_loss": -402.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.005950927734375, "rewards/rejected": -0.003753662109375, "step": 357 }, { "epoch": 0.2864, "grad_norm": 1.59862476057095, "learning_rate": 4.489212405609638e-07, "logits/chosen": 0.75, "logits/rejected": 0.76171875, "logps/chosen": -9.75, "logps/rejected": -8.5625, "loss": 0.6937, "loss/demonstration_loss": -292.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.001251220703125, "rewards/margins": -0.00250244140625, "rewards/rejected": 0.001251220703125, "step": 358 }, { "epoch": 0.2872, "grad_norm": 1.625552855625269, "learning_rate": 4.4849759966195884e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.73828125, "logps/chosen": -25.625, "logps/rejected": -47.5, "loss": 0.6897, "loss/demonstration_loss": -388.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002655029296875, "rewards/margins": 0.00421142578125, "rewards/rejected": -0.00689697265625, "step": 359 }, { "epoch": 0.288, "grad_norm": 1.3276654580621152, "learning_rate": 4.4807241083879764e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.4609375, "logps/chosen": -13.625, "logps/rejected": -25.375, "loss": 0.6915, "loss/demonstration_loss": -310.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.002197265625, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00093841552734375, "step": 360 }, { "epoch": 0.2888, "grad_norm": 1.116770847632795, "learning_rate": 4.476456774071882e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.734375, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6906, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 361 }, { "epoch": 0.2896, "grad_norm": 1.2822943144627854, "learning_rate": 4.472174026948835e-07, "logits/chosen": 0.94140625, "logits/rejected": 0.94140625, "logps/chosen": -74.0, "logps/rejected": -73.0, "loss": 0.6893, "loss/demonstration_loss": -584.0, "loss/preference_loss": -584.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01092529296875, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.015625, "step": 362 }, { "epoch": 0.2904, "grad_norm": 1.513390250316614, "learning_rate": 4.467875900416558e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.69921875, "logps/chosen": -79.5, "logps/rejected": -82.5, "loss": 0.6888, "loss/demonstration_loss": -516.0, "loss/preference_loss": -516.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.00592041015625, "rewards/rejected": -0.01220703125, "step": 363 }, { "epoch": 0.2912, "grad_norm": 1.5727182595635445, "learning_rate": 4.463562427992704e-07, "logits/chosen": 0.59375, "logits/rejected": 0.58984375, "logps/chosen": -39.25, "logps/rejected": -41.75, "loss": 0.6908, "loss/demonstration_loss": -256.0, "loss/preference_loss": -258.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00689697265625, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.0031280517578125, "step": 364 }, { "epoch": 0.292, "grad_norm": 1.0576673204309532, "learning_rate": 4.459233643314599e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.7265625, "logps/chosen": -41.0, "logps/rejected": -40.75, "loss": 0.6912, "loss/demonstration_loss": -430.0, "loss/preference_loss": -432.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00872802734375, "rewards/margins": -0.0005950927734375, "rewards/rejected": -0.00811767578125, "step": 365 }, { "epoch": 0.2928, "grad_norm": 1.5847880690737732, "learning_rate": 4.454889580138975e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.78125, "logps/chosen": -8.375, "logps/rejected": -8.6875, "loss": 0.6903, "loss/demonstration_loss": -276.0, "loss/preference_loss": -272.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00250244140625, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0006256103515625, "step": 366 }, { "epoch": 0.2936, "grad_norm": 1.1367595566005535, "learning_rate": 4.450530272341709e-07, "logits/chosen": 0.79296875, "logits/rejected": 0.73828125, "logps/chosen": -52.75, "logps/rejected": -67.0, "loss": 0.6898, "loss/demonstration_loss": -476.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.01312255859375, "step": 367 }, { "epoch": 0.2944, "grad_norm": 1.57544981129692, "learning_rate": 4.4461557539175587e-07, "logits/chosen": 0.9375, "logits/rejected": 0.8359375, "logps/chosen": -70.5, "logps/rejected": -79.0, "loss": 0.6919, "loss/demonstration_loss": -472.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01409912109375, "rewards/margins": -0.0003204345703125, "rewards/rejected": -0.01373291015625, "step": 368 }, { "epoch": 0.2952, "grad_norm": 1.1240318882946225, "learning_rate": 4.441766058979898e-07, "logits/chosen": 0.86328125, "logits/rejected": 0.81640625, "logps/chosen": -65.5, "logps/rejected": -63.25, "loss": 0.6912, "loss/demonstration_loss": -406.0, "loss/preference_loss": -406.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0150146484375, "rewards/margins": 0.0019073486328125, "rewards/rejected": -0.016845703125, "step": 369 }, { "epoch": 0.296, "grad_norm": 1.2927674133193081, "learning_rate": 4.437361221760449e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.5859375, "logps/chosen": -27.0, "logps/rejected": -34.75, "loss": 0.692, "loss/demonstration_loss": -330.0, "loss/preference_loss": -328.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.0028076171875, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.002197265625, "step": 370 }, { "epoch": 0.2968, "grad_norm": 0.9807115765737817, "learning_rate": 4.432941276609018e-07, "logits/chosen": 0.75, "logits/rejected": 0.921875, "logps/chosen": -33.0, "logps/rejected": -20.25, "loss": 0.6935, "loss/demonstration_loss": -416.0, "loss/preference_loss": -418.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0118408203125, "rewards/margins": -0.003082275390625, "rewards/rejected": -0.0087890625, "step": 371 }, { "epoch": 0.2976, "grad_norm": 1.076968077562448, "learning_rate": 4.428506257993225e-07, "logits/chosen": 0.8984375, "logits/rejected": 0.98828125, "logps/chosen": -52.75, "logps/rejected": -49.0, "loss": 0.6901, "loss/demonstration_loss": -540.0, "loss/preference_loss": -540.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.00439453125, "rewards/rejected": -0.006256103515625, "step": 372 }, { "epoch": 0.2984, "grad_norm": 1.7609733761958084, "learning_rate": 4.4240562004982364e-07, "logits/chosen": 0.8671875, "logits/rejected": 0.70703125, "logps/chosen": -51.25, "logps/rejected": -61.75, "loss": 0.6903, "loss/demonstration_loss": -448.0, "loss/preference_loss": -446.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0084228515625, "rewards/margins": 0.0084228515625, "rewards/rejected": -0.016845703125, "step": 373 }, { "epoch": 0.2992, "grad_norm": 1.7423078809211383, "learning_rate": 4.419591138826494e-07, "logits/chosen": 0.87109375, "logits/rejected": 0.93359375, "logps/chosen": -69.0, "logps/rejected": -66.5, "loss": 0.6929, "loss/demonstration_loss": -426.0, "loss/preference_loss": -428.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.020263671875, "rewards/margins": -0.0040283203125, "rewards/rejected": -0.0162353515625, "step": 374 }, { "epoch": 0.3, "grad_norm": 1.077725233763043, "learning_rate": 4.415111107797445e-07, "logits/chosen": 0.59375, "logits/rejected": 0.6796875, "logps/chosen": -34.0, "logps/rejected": -43.0, "loss": 0.6904, "loss/demonstration_loss": -410.0, "loss/preference_loss": -410.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002197265625, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.003753662109375, "step": 375 }, { "epoch": 0.3008, "grad_norm": 1.1619991209190434, "learning_rate": 4.410616142347272e-07, "logits/chosen": 0.875, "logits/rejected": 0.734375, "logps/chosen": -11.3125, "logps/rejected": -23.375, "loss": 0.6893, "loss/demonstration_loss": -276.0, "loss/preference_loss": -268.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00156402587890625, "rewards/margins": 0.00970458984375, "rewards/rejected": -0.01129150390625, "step": 376 }, { "epoch": 0.3016, "grad_norm": 1.0758447880321658, "learning_rate": 4.4061062775286197e-07, "logits/chosen": 0.57421875, "logits/rejected": 0.6875, "logps/chosen": -22.375, "logps/rejected": -12.0625, "loss": 0.6898, "loss/demonstration_loss": -552.0, "loss/preference_loss": -548.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.0018768310546875, "step": 377 }, { "epoch": 0.3024, "grad_norm": 1.0790536874968466, "learning_rate": 4.401581548510318e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.671875, "logps/chosen": -12.3125, "logps/rejected": -22.25, "loss": 0.6924, "loss/demonstration_loss": -276.0, "loss/preference_loss": -276.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00360107421875, "rewards/margins": -0.0010986328125, "rewards/rejected": -0.00250244140625, "step": 378 }, { "epoch": 0.3032, "grad_norm": 1.6978505283585232, "learning_rate": 4.397041990577114e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.9375, "logps/chosen": -3.09375, "logps/rejected": -3.3125, "loss": 0.6927, "loss/demonstration_loss": -105.5, "loss/preference_loss": -102.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0018768310546875, "rewards/margins": 0.002197265625, "rewards/rejected": -0.00031280517578125, "step": 379 }, { "epoch": 0.304, "grad_norm": 1.4727456964611132, "learning_rate": 4.392487639129391e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.5625, "logps/chosen": -28.75, "logps/rejected": -30.75, "loss": 0.6892, "loss/demonstration_loss": -474.0, "loss/preference_loss": -472.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0050048828125, "step": 380 }, { "epoch": 0.3048, "grad_norm": 1.2443713120237103, "learning_rate": 4.3879185296828976e-07, "logits/chosen": 0.734375, "logits/rejected": 0.625, "logps/chosen": -23.875, "logps/rejected": -44.5, "loss": 0.6887, "loss/demonstration_loss": -544.0, "loss/preference_loss": -536.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0010986328125, "rewards/margins": 0.0101318359375, "rewards/rejected": -0.01123046875, "step": 381 }, { "epoch": 0.3056, "grad_norm": 1.8924304357276873, "learning_rate": 4.383334697868467e-07, "logits/chosen": 0.85546875, "logits/rejected": 0.8046875, "logps/chosen": -58.5, "logps/rejected": -66.0, "loss": 0.6903, "loss/demonstration_loss": -496.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.0, "rewards/rejected": -0.00811767578125, "step": 382 }, { "epoch": 0.3064, "grad_norm": 1.421684199452123, "learning_rate": 4.3787361794317403e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.44921875, "logps/chosen": -57.25, "logps/rejected": -70.5, "loss": 0.6919, "loss/demonstration_loss": -404.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.014404296875, "rewards/margins": 0.00311279296875, "rewards/rejected": -0.0174560546875, "step": 383 }, { "epoch": 0.3072, "grad_norm": 1.3863602115880276, "learning_rate": 4.3741230102328873e-07, "logits/chosen": 0.65625, "logits/rejected": 0.71484375, "logps/chosen": -48.5, "logps/rejected": -42.75, "loss": 0.6913, "loss/demonstration_loss": -362.0, "loss/preference_loss": -360.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0062255859375, "rewards/margins": 0.004058837890625, "rewards/rejected": -0.01031494140625, "step": 384 }, { "epoch": 0.308, "grad_norm": 1.1791749056050824, "learning_rate": 4.36949522624633e-07, "logits/chosen": 0.58203125, "logits/rejected": 0.54296875, "logps/chosen": -12.6875, "logps/rejected": -12.5625, "loss": 0.6941, "loss/demonstration_loss": -398.0, "loss/preference_loss": -400.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00250244140625, "step": 385 }, { "epoch": 0.3088, "grad_norm": 1.186197941353783, "learning_rate": 4.3648528635604555e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.63671875, "logps/chosen": -41.0, "logps/rejected": -41.5, "loss": 0.6903, "loss/demonstration_loss": -652.0, "loss/preference_loss": -648.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01123046875, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.0150146484375, "step": 386 }, { "epoch": 0.3096, "grad_norm": 1.3487759752843138, "learning_rate": 4.3601959583773414e-07, "logits/chosen": 0.89453125, "logits/rejected": 0.7734375, "logps/chosen": -18.25, "logps/rejected": -33.5, "loss": 0.6909, "loss/demonstration_loss": -276.0, "loss/preference_loss": -274.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0018768310546875, "step": 387 }, { "epoch": 0.3104, "grad_norm": 1.402740321316714, "learning_rate": 4.35552454701247e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.51171875, "logps/chosen": -32.25, "logps/rejected": -34.0, "loss": 0.6923, "loss/demonstration_loss": -264.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00390625, "rewards/margins": 0.0004730224609375, "rewards/rejected": -0.00439453125, "step": 388 }, { "epoch": 0.3112, "grad_norm": 1.8272607616978984, "learning_rate": 4.350838665894445e-07, "logits/chosen": 0.369140625, "logits/rejected": 0.36328125, "logps/chosen": -45.0, "logps/rejected": -44.0, "loss": 0.6934, "loss/demonstration_loss": -348.0, "loss/preference_loss": -352.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.017822265625, "rewards/margins": -0.01068115234375, "rewards/rejected": -0.0072021484375, "step": 389 }, { "epoch": 0.312, "grad_norm": 1.4240484999885559, "learning_rate": 4.34613835156471e-07, "logits/chosen": 0.515625, "logits/rejected": 0.578125, "logps/chosen": -52.5, "logps/rejected": -44.25, "loss": 0.6918, "loss/demonstration_loss": -512.0, "loss/preference_loss": -512.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": 0.0012359619140625, "rewards/rejected": -0.006866455078125, "step": 390 }, { "epoch": 0.3128, "grad_norm": 0.7717999619111521, "learning_rate": 4.341423640677258e-07, "logits/chosen": 0.515625, "logits/rejected": 0.4765625, "logps/chosen": -34.75, "logps/rejected": -34.75, "loss": 0.692, "loss/demonstration_loss": -366.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.0, "rewards/rejected": -0.00811767578125, "step": 391 }, { "epoch": 0.3136, "grad_norm": 1.5150129447296707, "learning_rate": 4.336694569998354e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.8359375, "logps/chosen": -55.25, "logps/rejected": -58.0, "loss": 0.6907, "loss/demonstration_loss": -600.0, "loss/preference_loss": -600.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.006256103515625, "step": 392 }, { "epoch": 0.3144, "grad_norm": 1.1926329740581627, "learning_rate": 4.331951176406239e-07, "logits/chosen": 0.59765625, "logits/rejected": 0.75390625, "logps/chosen": -82.0, "logps/rejected": -67.0, "loss": 0.6936, "loss/demonstration_loss": -468.0, "loss/preference_loss": -472.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0216064453125, "rewards/margins": -0.0084228515625, "rewards/rejected": -0.01312255859375, "step": 393 }, { "epoch": 0.3152, "grad_norm": 1.257261278878758, "learning_rate": 4.3271934968908507e-07, "logits/chosen": 0.546875, "logits/rejected": 0.640625, "logps/chosen": -37.25, "logps/rejected": -24.25, "loss": 0.6957, "loss/demonstration_loss": -488.0, "loss/preference_loss": -490.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.001251220703125, "step": 394 }, { "epoch": 0.316, "grad_norm": 1.3761716478509598, "learning_rate": 4.3224215685535287e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.5703125, "logps/chosen": -17.875, "logps/rejected": -17.5, "loss": 0.6934, "loss/demonstration_loss": -560.0, "loss/preference_loss": -564.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.001251220703125, "step": 395 }, { "epoch": 0.3168, "grad_norm": 1.2983460385791294, "learning_rate": 4.3176354286067296e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.73046875, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 396 }, { "epoch": 0.3176, "grad_norm": 1.3032821767100937, "learning_rate": 4.312835114373733e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.80078125, "logps/chosen": -77.5, "logps/rejected": -54.5, "loss": 0.6953, "loss/demonstration_loss": -520.0, "loss/preference_loss": -524.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0162353515625, "rewards/margins": -0.004547119140625, "rewards/rejected": -0.01171875, "step": 397 }, { "epoch": 0.3184, "grad_norm": 1.2851897499414342, "learning_rate": 4.308020663288355e-07, "logits/chosen": 0.73046875, "logits/rejected": 0.6484375, "logps/chosen": -74.0, "logps/rejected": -78.0, "loss": 0.6917, "loss/demonstration_loss": -482.0, "loss/preference_loss": -482.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01470947265625, "rewards/margins": -0.0012664794921875, "rewards/rejected": -0.013427734375, "step": 398 }, { "epoch": 0.3192, "grad_norm": 1.0161707468958951, "learning_rate": 4.3031921128946515e-07, "logits/chosen": 0.9375, "logits/rejected": 0.86328125, "logps/chosen": -19.5, "logps/rejected": -29.75, "loss": 0.6898, "loss/demonstration_loss": -390.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.010009765625, "rewards/rejected": -0.0150146484375, "step": 399 }, { "epoch": 0.32, "grad_norm": 1.6726074557405786, "learning_rate": 4.2983495008466273e-07, "logits/chosen": 1.109375, "logits/rejected": 1.0625, "logps/chosen": -33.5, "logps/rejected": -37.5, "loss": 0.6907, "loss/demonstration_loss": -374.0, "loss/preference_loss": -378.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0078125, "rewards/margins": -0.005950927734375, "rewards/rejected": -0.0018768310546875, "step": 400 }, { "epoch": 0.3208, "grad_norm": 1.5918198301149398, "learning_rate": 4.293492864907947e-07, "logits/chosen": 0.84765625, "logits/rejected": 0.8515625, "logps/chosen": -79.0, "logps/rejected": -90.5, "loss": 0.6868, "loss/demonstration_loss": -672.0, "loss/preference_loss": -672.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01312255859375, "rewards/margins": 0.009033203125, "rewards/rejected": -0.022216796875, "step": 401 }, { "epoch": 0.3216, "grad_norm": 1.6989153608469563, "learning_rate": 4.2886222429516294e-07, "logits/chosen": 0.91796875, "logits/rejected": 0.96875, "logps/chosen": -71.0, "logps/rejected": -66.5, "loss": 0.6917, "loss/demonstration_loss": -544.0, "loss/preference_loss": -544.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0137939453125, "rewards/margins": 0.0034332275390625, "rewards/rejected": -0.0172119140625, "step": 402 }, { "epoch": 0.3224, "grad_norm": 0.6016309077705547, "learning_rate": 4.283737672959766e-07, "logits/chosen": 0.75, "logits/rejected": 0.7109375, "logps/chosen": -20.5, "logps/rejected": -21.875, "loss": 0.692, "loss/demonstration_loss": -336.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.004058837890625, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.002197265625, "step": 403 }, { "epoch": 0.3232, "grad_norm": 0.9370188763548133, "learning_rate": 4.278839193023214e-07, "logits/chosen": 0.765625, "logits/rejected": 0.78125, "logps/chosen": -2.453125, "logps/rejected": -5.15625, "loss": 0.6895, "loss/demonstration_loss": -122.0, "loss/preference_loss": -122.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.000156402587890625, "rewards/margins": 0.000156402587890625, "rewards/rejected": 0.0, "step": 404 }, { "epoch": 0.324, "grad_norm": 1.2456118102586333, "learning_rate": 4.273926841341302e-07, "logits/chosen": 0.9609375, "logits/rejected": 1.109375, "logps/chosen": -34.0, "logps/rejected": -24.875, "loss": 0.692, "loss/demonstration_loss": -310.0, "loss/preference_loss": -310.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0072021484375, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.006561279296875, "step": 405 }, { "epoch": 0.3248, "grad_norm": 1.3505512009895988, "learning_rate": 4.269000656221538e-07, "logits/chosen": 0.91015625, "logits/rejected": 0.80859375, "logps/chosen": -29.75, "logps/rejected": -31.875, "loss": 0.6957, "loss/demonstration_loss": -324.0, "loss/preference_loss": -326.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.005950927734375, "rewards/rejected": -0.0037384033203125, "step": 406 }, { "epoch": 0.3256, "grad_norm": 1.3816895524905575, "learning_rate": 4.264060676079302e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.89453125, "logps/chosen": -81.0, "logps/rejected": -73.5, "loss": 0.6906, "loss/demonstration_loss": -490.0, "loss/preference_loss": -488.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0137939453125, "rewards/margins": 0.0040283203125, "rewards/rejected": -0.017822265625, "step": 407 }, { "epoch": 0.3264, "grad_norm": 1.0622647400882534, "learning_rate": 4.2591069394375506e-07, "logits/chosen": 0.734375, "logits/rejected": 0.86328125, "logps/chosen": -29.75, "logps/rejected": -14.375, "loss": 0.694, "loss/demonstration_loss": -350.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003448486328125, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.00156402587890625, "step": 408 }, { "epoch": 0.3272, "grad_norm": 1.8989487596040777, "learning_rate": 4.2541394849265186e-07, "logits/chosen": 1.0546875, "logits/rejected": 1.1640625, "logps/chosen": -60.0, "logps/rejected": -56.5, "loss": 0.6882, "loss/demonstration_loss": -372.0, "loss/preference_loss": -368.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0028076171875, "rewards/margins": 0.010009765625, "rewards/rejected": -0.0128173828125, "step": 409 }, { "epoch": 0.328, "grad_norm": 0.5916966633354532, "learning_rate": 4.249158351283413e-07, "logits/chosen": 0.75, "logits/rejected": 0.6953125, "logps/chosen": -16.625, "logps/rejected": -24.375, "loss": 0.6904, "loss/demonstration_loss": -218.0, "loss/preference_loss": -216.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.00439453125, "rewards/rejected": -0.006256103515625, "step": 410 }, { "epoch": 0.3288, "grad_norm": 1.2184969920433846, "learning_rate": 4.244163577352116e-07, "logits/chosen": 0.53515625, "logits/rejected": 0.671875, "logps/chosen": -93.5, "logps/rejected": -85.0, "loss": 0.6954, "loss/demonstration_loss": -468.0, "loss/preference_loss": -472.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0228271484375, "rewards/margins": -0.01092529296875, "rewards/rejected": -0.01190185546875, "step": 411 }, { "epoch": 0.3296, "grad_norm": 0.9929363418449132, "learning_rate": 4.239155202082877e-07, "logits/chosen": 0.73046875, "logits/rejected": 0.68359375, "logps/chosen": -11.75, "logps/rejected": -13.5, "loss": 0.6884, "loss/demonstration_loss": -203.0, "loss/preference_loss": -200.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.004058837890625, "rewards/rejected": -0.00311279296875, "step": 412 }, { "epoch": 0.3304, "grad_norm": 1.9514722891309297, "learning_rate": 4.234133264532012e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.8828125, "logps/chosen": -65.5, "logps/rejected": -42.0, "loss": 0.6962, "loss/demonstration_loss": -424.0, "loss/preference_loss": -430.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.016845703125, "rewards/margins": -0.01373291015625, "rewards/rejected": -0.0031280517578125, "step": 413 }, { "epoch": 0.3312, "grad_norm": 1.6335931213311732, "learning_rate": 4.2290978038616e-07, "logits/chosen": 1.140625, "logits/rejected": 1.0546875, "logps/chosen": -37.0, "logps/rejected": -41.5, "loss": 0.6932, "loss/demonstration_loss": -312.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.001251220703125, "step": 414 }, { "epoch": 0.332, "grad_norm": 1.2085125310594949, "learning_rate": 4.224048859339174e-07, "logits/chosen": 0.99609375, "logits/rejected": 0.83203125, "logps/chosen": -46.25, "logps/rejected": -71.0, "loss": 0.6926, "loss/demonstration_loss": -464.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0093994140625, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0106201171875, "step": 415 }, { "epoch": 0.3328, "grad_norm": 0.8786991886780942, "learning_rate": 4.218986470337418e-07, "logits/chosen": 0.88671875, "logits/rejected": 0.87890625, "logps/chosen": -14.375, "logps/rejected": -20.25, "loss": 0.6924, "loss/demonstration_loss": -548.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.00250244140625, "step": 416 }, { "epoch": 0.3336, "grad_norm": 0.7324786485147345, "learning_rate": 4.213910676333859e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.5703125, "logps/chosen": -5.6875, "logps/rejected": -5.5625, "loss": 0.6912, "loss/demonstration_loss": -179.0, "loss/preference_loss": -181.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0006256103515625, "rewards/margins": -0.001251220703125, "rewards/rejected": 0.0006256103515625, "step": 417 }, { "epoch": 0.3344, "grad_norm": 1.3745743166814353, "learning_rate": 4.2088215169105566e-07, "logits/chosen": 0.546875, "logits/rejected": 0.5, "logps/chosen": -79.5, "logps/rejected": -92.0, "loss": 0.689, "loss/demonstration_loss": -544.0, "loss/preference_loss": -536.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.019287109375, "rewards/margins": 0.01202392578125, "rewards/rejected": -0.03125, "step": 418 }, { "epoch": 0.3352, "grad_norm": 1.512782969835043, "learning_rate": 4.2037190317537994e-07, "logits/chosen": 0.88671875, "logits/rejected": 0.90234375, "logps/chosen": -38.0, "logps/rejected": -37.5, "loss": 0.6945, "loss/demonstration_loss": -400.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.006256103515625, "rewards/rejected": -0.001251220703125, "step": 419 }, { "epoch": 0.336, "grad_norm": 1.243538828179209, "learning_rate": 4.1986032606537916e-07, "logits/chosen": 0.703125, "logits/rejected": 0.75390625, "logps/chosen": -58.75, "logps/rejected": -55.5, "loss": 0.693, "loss/demonstration_loss": -448.0, "loss/preference_loss": -452.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0162353515625, "rewards/margins": -0.00811767578125, "rewards/rejected": -0.00811767578125, "step": 420 }, { "epoch": 0.3368, "grad_norm": 1.13505346866085, "learning_rate": 4.193474243504343e-07, "logits/chosen": 0.8203125, "logits/rejected": 1.1015625, "logps/chosen": -67.0, "logps/rejected": -36.0, "loss": 0.6919, "loss/demonstration_loss": -544.0, "loss/preference_loss": -544.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.010009765625, "rewards/margins": -0.00093841552734375, "rewards/rejected": -0.00909423828125, "step": 421 }, { "epoch": 0.3376, "grad_norm": 1.070382584148918, "learning_rate": 4.188332020302561e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.87890625, "logps/chosen": -28.625, "logps/rejected": -20.625, "loss": 0.6923, "loss/demonstration_loss": -390.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0062255859375, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0050048828125, "step": 422 }, { "epoch": 0.3384, "grad_norm": 1.4027405581435324, "learning_rate": 4.183176631148534e-07, "logits/chosen": 0.84765625, "logits/rejected": 1.0546875, "logps/chosen": -31.75, "logps/rejected": -21.5, "loss": 0.6919, "loss/demonstration_loss": -282.0, "loss/preference_loss": -284.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00531005859375, "rewards/margins": -0.0034332275390625, "rewards/rejected": -0.0018768310546875, "step": 423 }, { "epoch": 0.3392, "grad_norm": 1.8546088713665443, "learning_rate": 4.1780081162450233e-07, "logits/chosen": 0.94921875, "logits/rejected": 1.109375, "logps/chosen": -66.5, "logps/rejected": -58.5, "loss": 0.6927, "loss/demonstration_loss": -328.0, "loss/preference_loss": -330.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01251220703125, "rewards/margins": -0.005645751953125, "rewards/rejected": -0.006866455078125, "step": 424 }, { "epoch": 0.34, "grad_norm": 2.0146919924086752, "learning_rate": 4.172826515897145e-07, "logits/chosen": 0.84375, "logits/rejected": 0.75390625, "logps/chosen": -100.5, "logps/rejected": -107.5, "loss": 0.6946, "loss/demonstration_loss": -468.0, "loss/preference_loss": -472.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.030029296875, "rewards/margins": -0.01312255859375, "rewards/rejected": -0.016845703125, "step": 425 }, { "epoch": 0.3408, "grad_norm": 1.1435390083004564, "learning_rate": 4.167631870512061e-07, "logits/chosen": 0.76953125, "logits/rejected": 0.79296875, "logps/chosen": -60.0, "logps/rejected": -52.75, "loss": 0.6908, "loss/demonstration_loss": -596.0, "loss/preference_loss": -596.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01251220703125, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.014404296875, "step": 426 }, { "epoch": 0.3416, "grad_norm": 1.6140357195314738, "learning_rate": 4.162424220598658e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.6640625, "logps/chosen": -51.75, "logps/rejected": -51.0, "loss": 0.6931, "loss/demonstration_loss": -808.0, "loss/preference_loss": -816.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0162353515625, "rewards/margins": -0.00750732421875, "rewards/rejected": -0.0087890625, "step": 427 }, { "epoch": 0.3424, "grad_norm": 1.7573048921852543, "learning_rate": 4.157203606767238e-07, "logits/chosen": 0.84765625, "logits/rejected": 0.7890625, "logps/chosen": -42.5, "logps/rejected": -44.25, "loss": 0.6927, "loss/demonstration_loss": -344.0, "loss/preference_loss": -344.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0087890625, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00750732421875, "step": 428 }, { "epoch": 0.3432, "grad_norm": 1.1698179073118053, "learning_rate": 4.151970069729194e-07, "logits/chosen": 0.765625, "logits/rejected": 0.859375, "logps/chosen": -37.5, "logps/rejected": -33.0, "loss": 0.693, "loss/demonstration_loss": -278.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.00469970703125, "rewards/rejected": -0.00469970703125, "step": 429 }, { "epoch": 0.344, "grad_norm": 0.46320638740125686, "learning_rate": 4.146723650296701e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.6484375, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 430 }, { "epoch": 0.3448, "grad_norm": 1.280990130493377, "learning_rate": 4.141464389382391e-07, "logits/chosen": 0.58203125, "logits/rejected": 0.546875, "logps/chosen": -89.5, "logps/rejected": -85.5, "loss": 0.6909, "loss/demonstration_loss": -696.0, "loss/preference_loss": -692.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0150146484375, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.02001953125, "step": 431 }, { "epoch": 0.3456, "grad_norm": 1.2710873760351025, "learning_rate": 4.136192327999037e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.4296875, "logps/chosen": -31.625, "logps/rejected": -56.0, "loss": 0.6896, "loss/demonstration_loss": -466.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0029754638671875, "rewards/margins": 0.01202392578125, "rewards/rejected": -0.0150146484375, "step": 432 }, { "epoch": 0.3464, "grad_norm": 1.4194521853423492, "learning_rate": 4.1309075072592325e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.60546875, "logps/chosen": -92.5, "logps/rejected": -86.0, "loss": 0.6958, "loss/demonstration_loss": -564.0, "loss/preference_loss": -568.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.026611328125, "rewards/margins": -0.01373291015625, "rewards/rejected": -0.0128173828125, "step": 433 }, { "epoch": 0.3472, "grad_norm": 1.3215711969523125, "learning_rate": 4.1256099683750716e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.7890625, "logps/chosen": -44.25, "logps/rejected": -47.0, "loss": 0.692, "loss/demonstration_loss": -480.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.01251220703125, "step": 434 }, { "epoch": 0.348, "grad_norm": 1.3068273467392595, "learning_rate": 4.120299752657827e-07, "logits/chosen": 0.640625, "logits/rejected": 0.65234375, "logps/chosen": -30.75, "logps/rejected": -31.0, "loss": 0.6927, "loss/demonstration_loss": -492.0, "loss/preference_loss": -492.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.00439453125, "step": 435 }, { "epoch": 0.3488, "grad_norm": 1.6447945332385425, "learning_rate": 4.114976901517627e-07, "logits/chosen": 0.578125, "logits/rejected": 0.72265625, "logps/chosen": -104.0, "logps/rejected": -86.5, "loss": 0.6917, "loss/demonstration_loss": -604.0, "loss/preference_loss": -604.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.016845703125, "rewards/margins": 0.002777099609375, "rewards/rejected": -0.0196533203125, "step": 436 }, { "epoch": 0.3496, "grad_norm": 1.2758149958197171, "learning_rate": 4.109641456463134e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.6953125, "logps/chosen": -47.0, "logps/rejected": -48.75, "loss": 0.6912, "loss/demonstration_loss": -380.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0101318359375, "rewards/margins": -0.0024871826171875, "rewards/rejected": -0.007659912109375, "step": 437 }, { "epoch": 0.3504, "grad_norm": 1.5336710000717877, "learning_rate": 4.1042934591012214e-07, "logits/chosen": 0.76171875, "logits/rejected": 0.9609375, "logps/chosen": -83.0, "logps/rejected": -63.75, "loss": 0.6948, "loss/demonstration_loss": -388.0, "loss/preference_loss": -390.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01373291015625, "rewards/margins": -0.005615234375, "rewards/rejected": -0.00811767578125, "step": 438 }, { "epoch": 0.3512, "grad_norm": 2.0676027072805496, "learning_rate": 4.098932951136645e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.578125, "logps/chosen": -8.0, "logps/rejected": -3.4375, "loss": 0.6915, "loss/demonstration_loss": -182.0, "loss/preference_loss": -183.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0006256103515625, "rewards/margins": -0.000782012939453125, "rewards/rejected": 0.000156402587890625, "step": 439 }, { "epoch": 0.352, "grad_norm": 1.5749703065223475, "learning_rate": 4.0935599743717244e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.77734375, "logps/chosen": -38.25, "logps/rejected": -34.75, "loss": 0.693, "loss/demonstration_loss": -288.0, "loss/preference_loss": -294.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01031494140625, "rewards/margins": -0.01470947265625, "rewards/rejected": 0.00439453125, "step": 440 }, { "epoch": 0.3528, "grad_norm": 13.423840077189428, "learning_rate": 4.0881745707060106e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.87890625, "logps/chosen": -35.5, "logps/rejected": -13.0625, "loss": 0.692, "loss/demonstration_loss": -384.0, "loss/preference_loss": -384.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0015716552734375, "rewards/rejected": -0.005950927734375, "step": 441 }, { "epoch": 0.3536, "grad_norm": 1.1168160294847511, "learning_rate": 4.082776782135964e-07, "logits/chosen": 0.79296875, "logits/rejected": 0.796875, "logps/chosen": -18.0, "logps/rejected": -19.0, "loss": 0.6887, "loss/demonstration_loss": -296.0, "loss/preference_loss": -290.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.006866455078125, "rewards/rejected": -0.00592041015625, "step": 442 }, { "epoch": 0.3544, "grad_norm": 1.4875042872958055, "learning_rate": 4.0773666507546237e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.6484375, "logps/chosen": -86.5, "logps/rejected": -78.5, "loss": 0.6927, "loss/demonstration_loss": -520.0, "loss/preference_loss": -520.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.023681640625, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.018798828125, "step": 443 }, { "epoch": 0.3552, "grad_norm": 0.6173076325472531, "learning_rate": 4.071944218751282e-07, "logits/chosen": 0.6875, "logits/rejected": 0.6640625, "logps/chosen": -18.125, "logps/rejected": -17.5, "loss": 0.6931, "loss/demonstration_loss": -560.0, "loss/preference_loss": -568.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.006256103515625, "rewards/rejected": -0.001251220703125, "step": 444 }, { "epoch": 0.356, "grad_norm": 0.7415876389466017, "learning_rate": 4.066509528411151e-07, "logits/chosen": 0.65625, "logits/rejected": 0.76171875, "logps/chosen": -24.0, "logps/rejected": -14.25, "loss": 0.6912, "loss/demonstration_loss": -302.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.00811767578125, "step": 445 }, { "epoch": 0.3568, "grad_norm": 1.0370923153183138, "learning_rate": 4.061062622115039e-07, "logits/chosen": 0.78125, "logits/rejected": 0.8203125, "logps/chosen": -17.125, "logps/rejected": -19.25, "loss": 0.691, "loss/demonstration_loss": -290.0, "loss/preference_loss": -288.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0020294189453125, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.00390625, "step": 446 }, { "epoch": 0.3576, "grad_norm": 1.2571336781123872, "learning_rate": 4.0556035423390164e-07, "logits/chosen": 0.89453125, "logits/rejected": 0.7421875, "logps/chosen": -36.5, "logps/rejected": -37.0, "loss": 0.692, "loss/demonstration_loss": -390.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00701904296875, "rewards/margins": 0.00238037109375, "rewards/rejected": -0.0093994140625, "step": 447 }, { "epoch": 0.3584, "grad_norm": 1.2429781989862558, "learning_rate": 4.0501323316540814e-07, "logits/chosen": 0.84765625, "logits/rejected": 1.0546875, "logps/chosen": -39.5, "logps/rejected": -12.625, "loss": 0.6938, "loss/demonstration_loss": -276.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.00469970703125, "rewards/rejected": 0.00156402587890625, "step": 448 }, { "epoch": 0.3592, "grad_norm": 1.3926466855447885, "learning_rate": 4.044649032725836e-07, "logits/chosen": 0.6875, "logits/rejected": 0.578125, "logps/chosen": -49.0, "logps/rejected": -55.5, "loss": 0.696, "loss/demonstration_loss": -552.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.010009765625, "rewards/margins": -0.00439453125, "rewards/rejected": -0.005615234375, "step": 449 }, { "epoch": 0.36, "grad_norm": 1.8012708242224371, "learning_rate": 4.039153688314145e-07, "logits/chosen": 0.87890625, "logits/rejected": 1.09375, "logps/chosen": -45.25, "logps/rejected": -26.25, "loss": 0.6932, "loss/demonstration_loss": -282.0, "loss/preference_loss": -286.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01031494140625, "rewards/margins": -0.007659912109375, "rewards/rejected": -0.002655029296875, "step": 450 }, { "epoch": 0.3608, "grad_norm": 1.2321963037726271, "learning_rate": 4.0336463412728106e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.80859375, "logps/chosen": -66.0, "logps/rejected": -53.5, "loss": 0.6906, "loss/demonstration_loss": -636.0, "loss/preference_loss": -632.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.00970458984375, "step": 451 }, { "epoch": 0.3616, "grad_norm": 0.772265387230259, "learning_rate": 4.028127034549229e-07, "logits/chosen": 0.486328125, "logits/rejected": 0.486328125, "logps/chosen": -27.5, "logps/rejected": -27.75, "loss": 0.6942, "loss/demonstration_loss": -432.0, "loss/preference_loss": -442.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.0093994140625, "rewards/rejected": -0.001251220703125, "step": 452 }, { "epoch": 0.3624, "grad_norm": 1.833601680735281, "learning_rate": 4.0225958111840633e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.6328125, "logps/chosen": -44.25, "logps/rejected": -46.75, "loss": 0.6945, "loss/demonstration_loss": -354.0, "loss/preference_loss": -360.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.023681640625, "rewards/margins": -0.0118408203125, "rewards/rejected": -0.0118408203125, "step": 453 }, { "epoch": 0.3632, "grad_norm": 1.6187427951558109, "learning_rate": 4.0170527143109055e-07, "logits/chosen": 0.76953125, "logits/rejected": 0.71875, "logps/chosen": -15.375, "logps/rejected": -23.25, "loss": 0.6891, "loss/demonstration_loss": -608.0, "loss/preference_loss": -600.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": 0.00439453125, "rewards/rejected": -0.010009765625, "step": 454 }, { "epoch": 0.364, "grad_norm": 1.1481992995613, "learning_rate": 4.0114977871559377e-07, "logits/chosen": 0.87109375, "logits/rejected": 0.87890625, "logps/chosen": -23.0, "logps/rejected": -23.0, "loss": 0.6919, "loss/demonstration_loss": -728.0, "loss/preference_loss": -728.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.0, "rewards/rejected": -0.006256103515625, "step": 455 }, { "epoch": 0.3648, "grad_norm": 1.0772066386480434, "learning_rate": 4.0059310730375963e-07, "logits/chosen": 0.80859375, "logits/rejected": 0.734375, "logps/chosen": -55.5, "logps/rejected": -66.0, "loss": 0.6935, "loss/demonstration_loss": -640.0, "loss/preference_loss": -644.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.00689697265625, "rewards/rejected": -0.0050048828125, "step": 456 }, { "epoch": 0.3656, "grad_norm": 1.5262284089301976, "learning_rate": 4.000352615366239e-07, "logits/chosen": 0.703125, "logits/rejected": 0.71484375, "logps/chosen": -66.5, "logps/rejected": -62.0, "loss": 0.691, "loss/demonstration_loss": -508.0, "loss/preference_loss": -508.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01531982421875, "rewards/margins": -0.00093841552734375, "rewards/rejected": -0.014404296875, "step": 457 }, { "epoch": 0.3664, "grad_norm": 0.9753270684598278, "learning_rate": 3.994762457643797e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.7109375, "logps/chosen": -38.25, "logps/rejected": -45.75, "loss": 0.6915, "loss/demonstration_loss": -664.0, "loss/preference_loss": -664.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.00750732421875, "step": 458 }, { "epoch": 0.3672, "grad_norm": 1.0182919445651348, "learning_rate": 3.989160643463445e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.84375, "logps/chosen": -22.375, "logps/rejected": -37.75, "loss": 0.6918, "loss/demonstration_loss": -240.0, "loss/preference_loss": -238.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00093841552734375, "rewards/margins": 0.005157470703125, "rewards/rejected": -0.006103515625, "step": 459 }, { "epoch": 0.368, "grad_norm": 1.4871547898633903, "learning_rate": 3.983547216509254e-07, "logits/chosen": 1.0546875, "logits/rejected": 0.83984375, "logps/chosen": -72.5, "logps/rejected": -91.0, "loss": 0.6901, "loss/demonstration_loss": -432.0, "loss/preference_loss": -432.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01409912109375, "rewards/margins": 0.00341796875, "rewards/rejected": -0.0174560546875, "step": 460 }, { "epoch": 0.3688, "grad_norm": 1.6337519746346696, "learning_rate": 3.977922220555855e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.76953125, "logps/chosen": -33.25, "logps/rejected": -36.5, "loss": 0.6895, "loss/demonstration_loss": -276.0, "loss/preference_loss": -274.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.004852294921875, "rewards/margins": 0.0034332275390625, "rewards/rejected": -0.00830078125, "step": 461 }, { "epoch": 0.3696, "grad_norm": 1.6911452667878526, "learning_rate": 3.9722856994680963e-07, "logits/chosen": 0.62890625, "logits/rejected": 0.69921875, "logps/chosen": -98.0, "logps/rejected": -89.0, "loss": 0.6875, "loss/demonstration_loss": -424.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0150146484375, "rewards/margins": 0.01470947265625, "rewards/rejected": -0.0296630859375, "step": 462 }, { "epoch": 0.3704, "grad_norm": 1.4068798132610099, "learning_rate": 3.966637697200703e-07, "logits/chosen": 0.76953125, "logits/rejected": 0.83984375, "logps/chosen": -45.5, "logps/rejected": -42.5, "loss": 0.6913, "loss/demonstration_loss": -464.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.00433349609375, "rewards/rejected": -0.01123046875, "step": 463 }, { "epoch": 0.3712, "grad_norm": 1.464647900119817, "learning_rate": 3.96097825779793e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.82421875, "logps/chosen": -34.25, "logps/rejected": -27.75, "loss": 0.6927, "loss/demonstration_loss": -490.0, "loss/preference_loss": -494.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.004364013671875, "rewards/rejected": -0.003753662109375, "step": 464 }, { "epoch": 0.372, "grad_norm": 2.3782000616017704, "learning_rate": 3.9553074253932233e-07, "logits/chosen": 0.703125, "logits/rejected": 0.65625, "logps/chosen": -26.0, "logps/rejected": -30.75, "loss": 0.692, "loss/demonstration_loss": -225.0, "loss/preference_loss": -224.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.00933837890625, "step": 465 }, { "epoch": 0.3728, "grad_norm": 1.3695395620700137, "learning_rate": 3.9496252442088727e-07, "logits/chosen": 0.6640625, "logits/rejected": 0.75390625, "logps/chosen": -35.75, "logps/rejected": -26.25, "loss": 0.6937, "loss/demonstration_loss": -488.0, "loss/preference_loss": -488.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01123046875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.010009765625, "step": 466 }, { "epoch": 0.3736, "grad_norm": 1.3758637850860425, "learning_rate": 3.9439317585556686e-07, "logits/chosen": 0.921875, "logits/rejected": 0.859375, "logps/chosen": -7.125, "logps/rejected": -18.25, "loss": 0.6942, "loss/demonstration_loss": -406.0, "loss/preference_loss": -398.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00031280517578125, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.0050048828125, "step": 467 }, { "epoch": 0.3744, "grad_norm": 1.3058413866632603, "learning_rate": 3.9382270128325567e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.7578125, "logps/chosen": -77.5, "logps/rejected": -78.0, "loss": 0.6912, "loss/demonstration_loss": -492.0, "loss/preference_loss": -492.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0159912109375, "rewards/margins": 0.0009307861328125, "rewards/rejected": -0.016845703125, "step": 468 }, { "epoch": 0.3752, "grad_norm": 1.457689008688699, "learning_rate": 3.932511051526288e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.84375, "logps/chosen": -56.25, "logps/rejected": -47.5, "loss": 0.696, "loss/demonstration_loss": -272.0, "loss/preference_loss": -276.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.019287109375, "rewards/margins": -0.01416015625, "rewards/rejected": -0.005157470703125, "step": 469 }, { "epoch": 0.376, "grad_norm": 1.1861104469400794, "learning_rate": 3.9267839192110797e-07, "logits/chosen": 1.15625, "logits/rejected": 0.96875, "logps/chosen": -27.125, "logps/rejected": -34.5, "loss": 0.6906, "loss/demonstration_loss": -245.0, "loss/preference_loss": -244.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00421142578125, "rewards/margins": 0.0021820068359375, "rewards/rejected": -0.00640869140625, "step": 470 }, { "epoch": 0.3768, "grad_norm": 1.7220396333874886, "learning_rate": 3.921045660548257e-07, "logits/chosen": 0.41796875, "logits/rejected": 0.53515625, "logps/chosen": -24.75, "logps/rejected": -10.375, "loss": 0.6927, "loss/demonstration_loss": -280.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.00250244140625, "rewards/rejected": 0.0, "step": 471 }, { "epoch": 0.3776, "grad_norm": 1.2399351316746563, "learning_rate": 3.9152963202859165e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.8671875, "logps/chosen": -16.625, "logps/rejected": -9.0, "loss": 0.6915, "loss/demonstration_loss": -404.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.00439453125, "rewards/rejected": 0.0006256103515625, "step": 472 }, { "epoch": 0.3784, "grad_norm": 1.0876830194244838, "learning_rate": 3.9095359432585665e-07, "logits/chosen": 0.78125, "logits/rejected": 0.80859375, "logps/chosen": -29.75, "logps/rejected": -30.25, "loss": 0.6919, "loss/demonstration_loss": -476.0, "loss/preference_loss": -476.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006103515625, "rewards/margins": 0.000782012939453125, "rewards/rejected": -0.00689697265625, "step": 473 }, { "epoch": 0.3792, "grad_norm": 1.711127412550184, "learning_rate": 3.9037645743867857e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.96875, "logps/chosen": -77.5, "logps/rejected": -64.0, "loss": 0.692, "loss/demonstration_loss": -372.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0194091796875, "rewards/margins": 0.0037078857421875, "rewards/rejected": -0.0230712890625, "step": 474 }, { "epoch": 0.38, "grad_norm": 1.1972062186153736, "learning_rate": 3.8979822586768666e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.8125, "logps/chosen": -51.75, "logps/rejected": -56.5, "loss": 0.6927, "loss/demonstration_loss": -572.0, "loss/preference_loss": -572.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0118408203125, "rewards/margins": -0.0018310546875, "rewards/rejected": -0.010009765625, "step": 475 }, { "epoch": 0.3808, "grad_norm": 0.4826917332537818, "learning_rate": 3.89218904122047e-07, "logits/chosen": 0.78125, "logits/rejected": 0.75390625, "logps/chosen": -15.375, "logps/rejected": -15.125, "loss": 0.692, "loss/demonstration_loss": -242.0, "loss/preference_loss": -242.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.0028076171875, "step": 476 }, { "epoch": 0.3816, "grad_norm": 1.340805392017326, "learning_rate": 3.886384967194268e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.734375, "logps/chosen": -62.75, "logps/rejected": -48.5, "loss": 0.692, "loss/demonstration_loss": -588.0, "loss/preference_loss": -584.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0162353515625, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.018798828125, "step": 477 }, { "epoch": 0.3824, "grad_norm": 1.8821968110774714, "learning_rate": 3.8805700818595967e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.68359375, "logps/chosen": -32.5, "logps/rejected": -35.25, "loss": 0.6948, "loss/demonstration_loss": -268.0, "loss/preference_loss": -268.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00799560546875, "rewards/margins": -0.0001373291015625, "rewards/rejected": -0.0078125, "step": 478 }, { "epoch": 0.3832, "grad_norm": 1.4776457059771932, "learning_rate": 3.8747444305621e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.796875, "logps/chosen": -35.0, "logps/rejected": -18.375, "loss": 0.6908, "loss/demonstration_loss": -282.0, "loss/preference_loss": -284.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.002197265625, "step": 479 }, { "epoch": 0.384, "grad_norm": 1.526218451974321, "learning_rate": 3.8689080587313755e-07, "logits/chosen": 0.8984375, "logits/rejected": 0.9375, "logps/chosen": -62.5, "logps/rejected": -53.75, "loss": 0.6919, "loss/demonstration_loss": -460.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.0017242431640625, "rewards/rejected": -0.0089111328125, "step": 480 }, { "epoch": 0.3848, "grad_norm": 1.5205428301404866, "learning_rate": 3.863061011880625e-07, "logits/chosen": 0.703125, "logits/rejected": 0.796875, "logps/chosen": -33.75, "logps/rejected": -24.0, "loss": 0.6906, "loss/demonstration_loss": -230.0, "loss/preference_loss": -230.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.003143310546875, "rewards/margins": 0.00046539306640625, "rewards/rejected": -0.00360107421875, "step": 481 }, { "epoch": 0.3856, "grad_norm": 1.3663501877937707, "learning_rate": 3.857203335606294e-07, "logits/chosen": 0.88671875, "logits/rejected": 1.1015625, "logps/chosen": -40.5, "logps/rejected": -26.5, "loss": 0.6934, "loss/demonstration_loss": -346.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.017578125, "rewards/margins": -0.007232666015625, "rewards/rejected": -0.01031494140625, "step": 482 }, { "epoch": 0.3864, "grad_norm": 1.1397728549890154, "learning_rate": 3.851335075587717e-07, "logits/chosen": 0.796875, "logits/rejected": 0.79296875, "logps/chosen": -48.75, "logps/rejected": -49.5, "loss": 0.691, "loss/demonstration_loss": -390.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.00799560546875, "rewards/margins": 0.006103515625, "rewards/rejected": -0.01409912109375, "step": 483 }, { "epoch": 0.3872, "grad_norm": 1.216555682295879, "learning_rate": 3.845456277586768e-07, "logits/chosen": 0.4140625, "logits/rejected": 0.443359375, "logps/chosen": -26.0, "logps/rejected": -20.0, "loss": 0.6902, "loss/demonstration_loss": -364.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.002197265625, "rewards/rejected": -0.00156402587890625, "step": 484 }, { "epoch": 0.388, "grad_norm": 3.144104601760776, "learning_rate": 3.839566987447491e-07, "logits/chosen": 0.62890625, "logits/rejected": 0.57421875, "logps/chosen": -83.5, "logps/rejected": -92.0, "loss": 0.6967, "loss/demonstration_loss": -464.0, "loss/preference_loss": -466.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01470947265625, "rewards/margins": -0.00439453125, "rewards/rejected": -0.01031494140625, "step": 485 }, { "epoch": 0.3888, "grad_norm": 0.9726586755359833, "learning_rate": 3.833667251095757e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.5703125, "logps/chosen": -47.5, "logps/rejected": -48.0, "loss": 0.6893, "loss/demonstration_loss": -506.0, "loss/preference_loss": -502.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.00592041015625, "rewards/rejected": -0.01092529296875, "step": 486 }, { "epoch": 0.3896, "grad_norm": 0.4460682566747493, "learning_rate": 3.8277571145388917e-07, "logits/chosen": 0.87109375, "logits/rejected": 0.953125, "logps/chosen": -30.0, "logps/rejected": -27.125, "loss": 0.6899, "loss/demonstration_loss": -456.0, "loss/preference_loss": -450.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.0072021484375, "rewards/rejected": -0.0072021484375, "step": 487 }, { "epoch": 0.3904, "grad_norm": 1.5517196762397327, "learning_rate": 3.821836623865329e-07, "logits/chosen": 0.5234375, "logits/rejected": 0.6015625, "logps/chosen": -67.5, "logps/rejected": -53.0, "loss": 0.6893, "loss/demonstration_loss": -320.0, "loss/preference_loss": -318.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.00469970703125, "rewards/margins": 0.01312255859375, "rewards/rejected": -0.017822265625, "step": 488 }, { "epoch": 0.3912, "grad_norm": 0.5655881548041418, "learning_rate": 3.815905825244244e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.78515625, "logps/chosen": -2.28125, "logps/rejected": -2.296875, "loss": 0.6912, "loss/demonstration_loss": -74.0, "loss/preference_loss": -74.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.000156402587890625, "rewards/rejected": 0.000469207763671875, "step": 489 }, { "epoch": 0.392, "grad_norm": 1.475736150390553, "learning_rate": 3.809964764925198e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.85546875, "logps/chosen": -41.0, "logps/rejected": -38.75, "loss": 0.6937, "loss/demonstration_loss": -420.0, "loss/preference_loss": -424.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.004364013671875, "rewards/rejected": -0.006256103515625, "step": 490 }, { "epoch": 0.3928, "grad_norm": 1.2742879944827494, "learning_rate": 3.8040134892377695e-07, "logits/chosen": 0.91796875, "logits/rejected": 0.99609375, "logps/chosen": -35.0, "logps/rejected": -20.25, "loss": 0.6954, "loss/demonstration_loss": -432.0, "loss/preference_loss": -440.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01123046875, "rewards/margins": -0.00970458984375, "rewards/rejected": -0.00156402587890625, "step": 491 }, { "epoch": 0.3936, "grad_norm": 1.792738540470516, "learning_rate": 3.7980520445912037e-07, "logits/chosen": 0.58984375, "logits/rejected": 0.71875, "logps/chosen": -67.5, "logps/rejected": -58.75, "loss": 0.6938, "loss/demonstration_loss": -402.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0067138671875, "rewards/margins": -0.00640869140625, "rewards/rejected": -0.00031280517578125, "step": 492 }, { "epoch": 0.3944, "grad_norm": 1.5128855282120852, "learning_rate": 3.792080477474042e-07, "logits/chosen": 0.91015625, "logits/rejected": 0.9921875, "logps/chosen": -37.0, "logps/rejected": -33.25, "loss": 0.6927, "loss/demonstration_loss": -278.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00592041015625, "rewards/margins": -0.001708984375, "rewards/rejected": -0.00421142578125, "step": 493 }, { "epoch": 0.3952, "grad_norm": 2.2892832846706375, "learning_rate": 3.7860988344537657e-07, "logits/chosen": 1.03125, "logits/rejected": 1.0, "logps/chosen": -26.25, "logps/rejected": -25.875, "loss": 0.6917, "loss/demonstration_loss": -414.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003448486328125, "rewards/margins": -0.00439453125, "rewards/rejected": 0.00093841552734375, "step": 494 }, { "epoch": 0.396, "grad_norm": 0.7527985046808181, "learning_rate": 3.780107162176429e-07, "logits/chosen": 0.640625, "logits/rejected": 0.625, "logps/chosen": -37.75, "logps/rejected": -37.5, "loss": 0.6901, "loss/demonstration_loss": -302.0, "loss/preference_loss": -298.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.0018768310546875, "rewards/margins": 0.00689697265625, "rewards/rejected": -0.0050048828125, "step": 495 }, { "epoch": 0.3968, "grad_norm": 1.1644598997130817, "learning_rate": 3.7741055073662943e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.73828125, "logps/chosen": -40.25, "logps/rejected": -23.625, "loss": 0.6907, "loss/demonstration_loss": -506.0, "loss/preference_loss": -504.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": 0.000946044921875, "rewards/rejected": -0.006561279296875, "step": 496 }, { "epoch": 0.3976, "grad_norm": 1.6725054075181125, "learning_rate": 3.7680939168254726e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.70703125, "logps/chosen": -39.75, "logps/rejected": -48.25, "loss": 0.6915, "loss/demonstration_loss": -352.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.00390625, "rewards/rejected": -0.00579833984375, "step": 497 }, { "epoch": 0.3984, "grad_norm": 1.4176815668039304, "learning_rate": 3.7620724374335544e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.62109375, "logps/chosen": -3.265625, "logps/rejected": -3.265625, "loss": 0.692, "loss/demonstration_loss": -104.0, "loss/preference_loss": -104.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00031280517578125, "rewards/margins": 0.0, "rewards/rejected": -0.00031280517578125, "step": 498 }, { "epoch": 0.3992, "grad_norm": 1.4380779626082882, "learning_rate": 3.756041116147245e-07, "logits/chosen": 0.828125, "logits/rejected": 0.7734375, "logps/chosen": -10.125, "logps/rejected": -17.375, "loss": 0.689, "loss/demonstration_loss": -219.0, "loss/preference_loss": -211.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0010986328125, "rewards/margins": 0.0101318359375, "rewards/rejected": -0.01123046875, "step": 499 }, { "epoch": 0.4, "grad_norm": 1.5741205377268566, "learning_rate": 3.75e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.92578125, "logps/chosen": -39.0, "logps/rejected": -33.0, "loss": 0.6904, "loss/demonstration_loss": -382.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0015716552734375, "rewards/rejected": -0.005950927734375, "step": 500 }, { "epoch": 0.4008, "grad_norm": 1.465914983427328, "learning_rate": 3.743949136101656e-07, "logits/chosen": 0.85546875, "logits/rejected": 0.78515625, "logps/chosen": -57.25, "logps/rejected": -50.0, "loss": 0.6941, "loss/demonstration_loss": -282.0, "loss/preference_loss": -284.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0140380859375, "rewards/margins": -0.009033203125, "rewards/rejected": -0.0050048828125, "step": 501 }, { "epoch": 0.4016, "grad_norm": 1.1245911703585663, "learning_rate": 3.737888571638066e-07, "logits/chosen": 0.734375, "logits/rejected": 0.58203125, "logps/chosen": -25.375, "logps/rejected": -40.25, "loss": 0.6906, "loss/demonstration_loss": -350.0, "loss/preference_loss": -346.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.0078125, "rewards/rejected": -0.0084228515625, "step": 502 }, { "epoch": 0.4024, "grad_norm": 0.7011458329303424, "learning_rate": 3.731818353870729e-07, "logits/chosen": 0.671875, "logits/rejected": 0.70703125, "logps/chosen": -22.0, "logps/rejected": -22.125, "loss": 0.6912, "loss/demonstration_loss": -234.0, "loss/preference_loss": -233.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0020294189453125, "rewards/margins": 0.0010986328125, "rewards/rejected": -0.0031280517578125, "step": 503 }, { "epoch": 0.4032, "grad_norm": 1.4513326863631262, "learning_rate": 3.7257385301364216e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.828125, "logps/chosen": -30.25, "logps/rejected": -16.875, "loss": 0.6931, "loss/demonstration_loss": -376.0, "loss/preference_loss": -374.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.004058837890625, "step": 504 }, { "epoch": 0.404, "grad_norm": 1.1883007131457544, "learning_rate": 3.7196491478468316e-07, "logits/chosen": 0.828125, "logits/rejected": 0.8125, "logps/chosen": -19.0, "logps/rejected": -19.0, "loss": 0.6896, "loss/demonstration_loss": -608.0, "loss/preference_loss": -608.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0, "rewards/rejected": -0.001251220703125, "step": 505 }, { "epoch": 0.4048, "grad_norm": 1.2877301283963853, "learning_rate": 3.713550254488185e-07, "logits/chosen": 0.703125, "logits/rejected": 0.66796875, "logps/chosen": -19.625, "logps/rejected": -22.625, "loss": 0.6902, "loss/demonstration_loss": -672.0, "loss/preference_loss": -672.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0, "rewards/rejected": -0.003753662109375, "step": 506 }, { "epoch": 0.4056, "grad_norm": 4.666329278789684, "learning_rate": 3.7074418976208766e-07, "logits/chosen": 0.408203125, "logits/rejected": 0.45703125, "logps/chosen": -48.5, "logps/rejected": -34.0, "loss": 0.6952, "loss/demonstration_loss": -648.0, "loss/preference_loss": -656.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01251220703125, "rewards/margins": -0.01123046875, "rewards/rejected": -0.001251220703125, "step": 507 }, { "epoch": 0.4064, "grad_norm": 1.33411371148728, "learning_rate": 3.7013241248791015e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.609375, "logps/chosen": -50.75, "logps/rejected": -68.0, "loss": 0.6898, "loss/demonstration_loss": -472.0, "loss/preference_loss": -470.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0072021484375, "rewards/margins": 0.00750732421875, "rewards/rejected": -0.01470947265625, "step": 508 }, { "epoch": 0.4072, "grad_norm": 1.1548585167042724, "learning_rate": 3.695196983970481e-07, "logits/chosen": 0.76953125, "logits/rejected": 0.76171875, "logps/chosen": -29.875, "logps/rejected": -28.625, "loss": 0.6924, "loss/demonstration_loss": -308.0, "loss/preference_loss": -308.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0084228515625, "rewards/margins": -0.0032806396484375, "rewards/rejected": -0.005157470703125, "step": 509 }, { "epoch": 0.408, "grad_norm": 2.4420004797974832, "learning_rate": 3.689060522675688e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.73046875, "logps/chosen": -68.0, "logps/rejected": -65.0, "loss": 0.6875, "loss/demonstration_loss": -528.0, "loss/preference_loss": -528.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.0017242431640625, "rewards/rejected": -0.0111083984375, "step": 510 }, { "epoch": 0.4088, "grad_norm": 1.834891469165879, "learning_rate": 3.6829147888480827e-07, "logits/chosen": 1.078125, "logits/rejected": 0.8828125, "logps/chosen": -59.5, "logps/rejected": -70.0, "loss": 0.6903, "loss/demonstration_loss": -294.0, "loss/preference_loss": -294.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.006561279296875, "rewards/margins": 0.003448486328125, "rewards/rejected": -0.010009765625, "step": 511 }, { "epoch": 0.4096, "grad_norm": 0.6751369123857776, "learning_rate": 3.676759830413332e-07, "logits/chosen": 0.59375, "logits/rejected": 0.54296875, "logps/chosen": -17.125, "logps/rejected": -18.875, "loss": 0.6907, "loss/demonstration_loss": -191.0, "loss/preference_loss": -190.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00140380859375, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.002349853515625, "step": 512 }, { "epoch": 0.4104, "grad_norm": 1.1625965434509367, "learning_rate": 3.670595695369036e-07, "logits/chosen": 0.65625, "logits/rejected": 0.53125, "logps/chosen": -55.25, "logps/rejected": -65.5, "loss": 0.6891, "loss/demonstration_loss": -640.0, "loss/preference_loss": -636.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.0087890625, "rewards/rejected": -0.0162353515625, "step": 513 }, { "epoch": 0.4112, "grad_norm": 1.513907774969443, "learning_rate": 3.6644224317843604e-07, "logits/chosen": 0.88671875, "logits/rejected": 0.83984375, "logps/chosen": -28.375, "logps/rejected": -42.0, "loss": 0.6909, "loss/demonstration_loss": -282.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00063323974609375, "rewards/margins": 0.001708984375, "rewards/rejected": -0.002349853515625, "step": 514 }, { "epoch": 0.412, "grad_norm": 1.2585320541703313, "learning_rate": 3.658240087799654e-07, "logits/chosen": 0.83984375, "logits/rejected": 1.0, "logps/chosen": -64.5, "logps/rejected": -52.25, "loss": 0.6898, "loss/demonstration_loss": -462.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01123046875, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.00750732421875, "step": 515 }, { "epoch": 0.4128, "grad_norm": 1.7351676613015685, "learning_rate": 3.652048711626077e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.90234375, "logps/chosen": -55.5, "logps/rejected": -32.0, "loss": 0.6951, "loss/demonstration_loss": -460.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01251220703125, "rewards/margins": -0.0072021484375, "rewards/rejected": -0.00531005859375, "step": 516 }, { "epoch": 0.4136, "grad_norm": 0.8522689901707869, "learning_rate": 3.6458483515452244e-07, "logits/chosen": 0.5859375, "logits/rejected": 0.5859375, "logps/chosen": -5.625, "logps/rejected": -5.625, "loss": 0.6904, "loss/demonstration_loss": -179.0, "loss/preference_loss": -179.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.0, "rewards/rejected": -0.0006256103515625, "step": 517 }, { "epoch": 0.4144, "grad_norm": 1.482879136671421, "learning_rate": 3.63963905590875e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.59375, "logps/chosen": -88.5, "logps/rejected": -80.0, "loss": 0.6946, "loss/demonstration_loss": -444.0, "loss/preference_loss": -446.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0174560546875, "rewards/margins": -0.004364013671875, "rewards/rejected": -0.01312255859375, "step": 518 }, { "epoch": 0.4152, "grad_norm": 1.2489794505017076, "learning_rate": 3.633420873137988e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.875, "logps/chosen": -42.5, "logps/rejected": -35.0, "loss": 0.6901, "loss/demonstration_loss": -408.0, "loss/preference_loss": -406.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01220703125, "rewards/margins": 1.52587890625e-05, "rewards/rejected": -0.01220703125, "step": 519 }, { "epoch": 0.416, "grad_norm": 1.1270040423312833, "learning_rate": 3.6271938517235765e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.78125, "logps/chosen": -89.0, "logps/rejected": -92.5, "loss": 0.6913, "loss/demonstration_loss": -576.0, "loss/preference_loss": -576.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01220703125, "rewards/margins": 0.0015716552734375, "rewards/rejected": -0.0137939453125, "step": 520 }, { "epoch": 0.4168, "grad_norm": 2.068644617603379, "learning_rate": 3.620958040225081e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.72265625, "logps/chosen": -49.5, "logps/rejected": -65.0, "loss": 0.6886, "loss/demonstration_loss": -304.0, "loss/preference_loss": -302.0, "rewards/accuracies": 0.3125, "rewards/chosen": 7.62939453125e-06, "rewards/margins": 0.01019287109375, "rewards/rejected": -0.01019287109375, "step": 521 }, { "epoch": 0.4176, "grad_norm": 1.3000102336427346, "learning_rate": 3.6147134872706104e-07, "logits/chosen": 0.73046875, "logits/rejected": 1.0390625, "logps/chosen": -76.0, "logps/rejected": -52.0, "loss": 0.6921, "loss/demonstration_loss": -408.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.010009765625, "rewards/margins": -0.0031585693359375, "rewards/rejected": -0.006866455078125, "step": 522 }, { "epoch": 0.4184, "grad_norm": 66.53751974897635, "learning_rate": 3.6084602415564424e-07, "logits/chosen": 0.78125, "logits/rejected": 0.671875, "logps/chosen": -53.0, "logps/rejected": -67.5, "loss": 0.6873, "loss/demonstration_loss": -480.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.00439453125, "rewards/margins": 0.015625, "rewards/rejected": -0.02001953125, "step": 523 }, { "epoch": 0.4192, "grad_norm": 1.2308374144474887, "learning_rate": 3.6021983518466463e-07, "logits/chosen": 0.53515625, "logits/rejected": 0.57421875, "logps/chosen": -43.0, "logps/rejected": -42.0, "loss": 0.694, "loss/demonstration_loss": -336.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01373291015625, "rewards/margins": -0.00970458984375, "rewards/rejected": -0.004058837890625, "step": 524 }, { "epoch": 0.42, "grad_norm": 1.4791370517472273, "learning_rate": 3.595927866972693e-07, "logits/chosen": 0.369140625, "logits/rejected": 0.330078125, "logps/chosen": -66.5, "logps/rejected": -66.5, "loss": 0.6929, "loss/demonstration_loss": -418.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0250244140625, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0244140625, "step": 525 }, { "epoch": 0.4208, "grad_norm": 1.2385469721718154, "learning_rate": 3.589648835833085e-07, "logits/chosen": 0.765625, "logits/rejected": 0.6953125, "logps/chosen": -52.75, "logps/rejected": -58.5, "loss": 0.6904, "loss/demonstration_loss": -294.0, "loss/preference_loss": -294.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.007659912109375, "rewards/margins": 0.00201416015625, "rewards/rejected": -0.00970458984375, "step": 526 }, { "epoch": 0.4216, "grad_norm": 1.267900366729273, "learning_rate": 3.583361307392968e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.765625, "logps/chosen": -45.75, "logps/rejected": -40.5, "loss": 0.6901, "loss/demonstration_loss": -344.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": 0.00640869140625, "rewards/rejected": -0.01202392578125, "step": 527 }, { "epoch": 0.4224, "grad_norm": 0.8757964434261207, "learning_rate": 3.577065330683751e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.7890625, "logps/chosen": -24.0, "logps/rejected": -22.5, "loss": 0.6924, "loss/demonstration_loss": -245.0, "loss/preference_loss": -247.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": -0.004058837890625, "rewards/rejected": -0.00156402587890625, "step": 528 }, { "epoch": 0.4232, "grad_norm": 1.7007600197089299, "learning_rate": 3.5707609548027254e-07, "logits/chosen": 0.859375, "logits/rejected": 0.66796875, "logps/chosen": -57.5, "logps/rejected": -63.75, "loss": 0.6907, "loss/demonstration_loss": -386.0, "loss/preference_loss": -384.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00469970703125, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.0062255859375, "step": 529 }, { "epoch": 0.424, "grad_norm": 1.2211824872667034, "learning_rate": 3.5644482289126813e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.80859375, "logps/chosen": -51.0, "logps/rejected": -32.0, "loss": 0.6907, "loss/demonstration_loss": -330.0, "loss/preference_loss": -328.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005615234375, "rewards/margins": 0.004669189453125, "rewards/rejected": -0.01031494140625, "step": 530 }, { "epoch": 0.4248, "grad_norm": 0.514029762808921, "learning_rate": 3.5581272022415237e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.58984375, "logps/chosen": -31.75, "logps/rejected": -31.125, "loss": 0.6926, "loss/demonstration_loss": -498.0, "loss/preference_loss": -500.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.00469970703125, "rewards/rejected": -0.003448486328125, "step": 531 }, { "epoch": 0.4256, "grad_norm": 1.480843097584366, "learning_rate": 3.551797924081887e-07, "logits/chosen": 0.92578125, "logits/rejected": 0.87890625, "logps/chosen": -40.25, "logps/rejected": -55.0, "loss": 0.6923, "loss/demonstration_loss": -506.0, "loss/preference_loss": -506.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.006256103515625, "step": 532 }, { "epoch": 0.4264, "grad_norm": 0.9079816037257802, "learning_rate": 3.545460443790753e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.55859375, "logps/chosen": -7.0, "logps/rejected": -13.4375, "loss": 0.6914, "loss/demonstration_loss": -320.0, "loss/preference_loss": -326.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.0031280517578125, "rewards/rejected": -0.0006256103515625, "step": 533 }, { "epoch": 0.4272, "grad_norm": 1.51845030214223, "learning_rate": 3.5391148107890694e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.72265625, "logps/chosen": -18.75, "logps/rejected": -9.6875, "loss": 0.691, "loss/demonstration_loss": -226.0, "loss/preference_loss": -226.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0020294189453125, "rewards/margins": -0.000316619873046875, "rewards/rejected": -0.00171661376953125, "step": 534 }, { "epoch": 0.428, "grad_norm": 5.688441479748442, "learning_rate": 3.5327610745613546e-07, "logits/chosen": 0.546875, "logits/rejected": 0.609375, "logps/chosen": -36.5, "logps/rejected": -30.25, "loss": 0.6877, "loss/demonstration_loss": -264.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": -0.0015716552734375, "rewards/rejected": -0.004058837890625, "step": 535 }, { "epoch": 0.4288, "grad_norm": 1.3995632195329637, "learning_rate": 3.5263992846553197e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.65234375, "logps/chosen": -89.5, "logps/rejected": -91.0, "loss": 0.6903, "loss/demonstration_loss": -572.0, "loss/preference_loss": -572.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.01251220703125, "step": 536 }, { "epoch": 0.4296, "grad_norm": 1.406469384171237, "learning_rate": 3.5200294906814823e-07, "logits/chosen": 0.4140625, "logits/rejected": 0.66796875, "logps/chosen": -88.5, "logps/rejected": -72.5, "loss": 0.693, "loss/demonstration_loss": -510.0, "loss/preference_loss": -512.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.010009765625, "rewards/margins": -0.0054931640625, "rewards/rejected": -0.004547119140625, "step": 537 }, { "epoch": 0.4304, "grad_norm": 1.353449082211998, "learning_rate": 3.5136517423127734e-07, "logits/chosen": 0.69140625, "logits/rejected": 0.6875, "logps/chosen": -42.25, "logps/rejected": -46.25, "loss": 0.6915, "loss/demonstration_loss": -472.0, "loss/preference_loss": -468.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0050048828125, "step": 538 }, { "epoch": 0.4312, "grad_norm": 1.476328400077812, "learning_rate": 3.5072660892841566e-07, "logits/chosen": 0.546875, "logits/rejected": 0.5703125, "logps/chosen": -17.125, "logps/rejected": -21.125, "loss": 0.6904, "loss/demonstration_loss": -608.0, "loss/preference_loss": -608.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.00250244140625, "step": 539 }, { "epoch": 0.432, "grad_norm": 1.1503291334410919, "learning_rate": 3.500872581392238e-07, "logits/chosen": 0.6640625, "logits/rejected": 0.61328125, "logps/chosen": -5.96875, "logps/rejected": -13.25, "loss": 0.6904, "loss/demonstration_loss": -300.0, "loss/preference_loss": -302.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004058837890625, "rewards/margins": -0.0009307861328125, "rewards/rejected": -0.0031280517578125, "step": 540 }, { "epoch": 0.4328, "grad_norm": 1.0894468035601588, "learning_rate": 3.4944712684948744e-07, "logits/chosen": 0.94140625, "logits/rejected": 0.9765625, "logps/chosen": -28.125, "logps/rejected": -28.0, "loss": 0.6908, "loss/demonstration_loss": -296.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.004058837890625, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0072021484375, "step": 541 }, { "epoch": 0.4336, "grad_norm": 1.672752135520411, "learning_rate": 3.488062200510791e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.89453125, "logps/chosen": -28.0, "logps/rejected": -13.125, "loss": 0.6908, "loss/demonstration_loss": -328.0, "loss/preference_loss": -324.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.00439453125, "rewards/rejected": -0.00439453125, "step": 542 }, { "epoch": 0.4344, "grad_norm": 1.7860372538615457, "learning_rate": 3.481645427419188e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.81640625, "logps/chosen": -70.0, "logps/rejected": -50.5, "loss": 0.6953, "loss/demonstration_loss": -320.0, "loss/preference_loss": -320.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01123046875, "rewards/margins": -0.004058837890625, "rewards/rejected": -0.0072021484375, "step": 543 }, { "epoch": 0.4352, "grad_norm": 1.1548279863356674, "learning_rate": 3.475220999259349e-07, "logits/chosen": 0.484375, "logits/rejected": 0.4765625, "logps/chosen": -41.0, "logps/rejected": -41.5, "loss": 0.6899, "loss/demonstration_loss": -328.0, "loss/preference_loss": -326.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005950927734375, "rewards/margins": 0.00311279296875, "rewards/rejected": -0.009033203125, "step": 544 }, { "epoch": 0.436, "grad_norm": 1.270131733540396, "learning_rate": 3.468788966130257e-07, "logits/chosen": 0.53515625, "logits/rejected": 0.5703125, "logps/chosen": -22.625, "logps/rejected": -24.625, "loss": 0.6913, "loss/demonstration_loss": -368.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0118408203125, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.006866455078125, "step": 545 }, { "epoch": 0.4368, "grad_norm": 1.1985685960080044, "learning_rate": 3.4623493781901983e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.796875, "logps/chosen": -35.25, "logps/rejected": -35.5, "loss": 0.691, "loss/demonstration_loss": -374.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.00811767578125, "step": 546 }, { "epoch": 0.4376, "grad_norm": 1.2902515076714973, "learning_rate": 3.4559022856563727e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.88671875, "logps/chosen": -14.375, "logps/rejected": -15.1875, "loss": 0.6897, "loss/demonstration_loss": -236.0, "loss/preference_loss": -232.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.005615234375, "rewards/rejected": -0.006256103515625, "step": 547 }, { "epoch": 0.4384, "grad_norm": 1.2334490360303725, "learning_rate": 3.4494477388045027e-07, "logits/chosen": 0.9609375, "logits/rejected": 0.79296875, "logps/chosen": -21.375, "logps/rejected": -31.25, "loss": 0.6925, "loss/demonstration_loss": -280.0, "loss/preference_loss": -282.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00156402587890625, "rewards/margins": -0.002197265625, "rewards/rejected": 0.0006256103515625, "step": 548 }, { "epoch": 0.4392, "grad_norm": 1.085420782657001, "learning_rate": 3.4429857879684414e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.8984375, "logps/chosen": -21.125, "logps/rejected": -16.875, "loss": 0.6925, "loss/demonstration_loss": -300.0, "loss/preference_loss": -304.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00439453125, "rewards/margins": -0.0028228759765625, "rewards/rejected": -0.00156402587890625, "step": 549 }, { "epoch": 0.44, "grad_norm": 2.991453874920797, "learning_rate": 3.43651648353978e-07, "logits/chosen": 0.609375, "logits/rejected": 0.75390625, "logps/chosen": -77.0, "logps/rejected": -68.5, "loss": 0.697, "loss/demonstration_loss": -458.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.025634765625, "rewards/margins": -0.018798828125, "rewards/rejected": -0.00689697265625, "step": 550 }, { "epoch": 0.4408, "grad_norm": 0.8939918891027746, "learning_rate": 3.4300398759674535e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.65234375, "logps/chosen": -16.5, "logps/rejected": -8.6875, "loss": 0.6945, "loss/demonstration_loss": -198.0, "loss/preference_loss": -202.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004058837890625, "rewards/margins": -0.004852294921875, "rewards/rejected": 0.000782012939453125, "step": 551 }, { "epoch": 0.4416, "grad_norm": 1.912028299744464, "learning_rate": 3.4235560157573484e-07, "logits/chosen": 0.859375, "logits/rejected": 0.9375, "logps/chosen": -52.75, "logps/rejected": -55.0, "loss": 0.696, "loss/demonstration_loss": -340.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.015625, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.0128173828125, "step": 552 }, { "epoch": 0.4424, "grad_norm": 1.1129300629643177, "learning_rate": 3.41706495347191e-07, "logits/chosen": 0.625, "logits/rejected": 0.671875, "logps/chosen": -24.5, "logps/rejected": -29.375, "loss": 0.6906, "loss/demonstration_loss": -286.0, "loss/preference_loss": -288.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002655029296875, "rewards/margins": -0.003753662109375, "rewards/rejected": 0.0010986328125, "step": 553 }, { "epoch": 0.4432, "grad_norm": 0.7901641986230934, "learning_rate": 3.4105667397297453e-07, "logits/chosen": 0.7734375, "logits/rejected": 0.7734375, "logps/chosen": -17.75, "logps/rejected": -18.125, "loss": 0.6898, "loss/demonstration_loss": -288.0, "loss/preference_loss": -284.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00093841552734375, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.004058837890625, "step": 554 }, { "epoch": 0.444, "grad_norm": 1.4497786661050769, "learning_rate": 3.40406142520523e-07, "logits/chosen": 1.0078125, "logits/rejected": 0.94921875, "logps/chosen": -29.625, "logps/rejected": -39.0, "loss": 0.6938, "loss/demonstration_loss": -362.0, "loss/preference_loss": -364.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.003753662109375, "step": 555 }, { "epoch": 0.4448, "grad_norm": 1.3746341511422195, "learning_rate": 3.3975490606281153e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.75, "logps/chosen": -33.75, "logps/rejected": -34.25, "loss": 0.6904, "loss/demonstration_loss": -360.0, "loss/preference_loss": -358.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0020294189453125, "rewards/margins": 0.00579833984375, "rewards/rejected": -0.0078125, "step": 556 }, { "epoch": 0.4456, "grad_norm": 1.4068955946923036, "learning_rate": 3.3910296967831265e-07, "logits/chosen": 0.671875, "logits/rejected": 0.6640625, "logps/chosen": -28.25, "logps/rejected": -28.5, "loss": 0.6925, "loss/demonstration_loss": -450.0, "loss/preference_loss": -448.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.00811767578125, "step": 557 }, { "epoch": 0.4464, "grad_norm": 1.4870637313628143, "learning_rate": 3.3845033845095735e-07, "logits/chosen": 0.83203125, "logits/rejected": 1.0, "logps/chosen": -48.5, "logps/rejected": -25.25, "loss": 0.6926, "loss/demonstration_loss": -386.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.0106201171875, "rewards/rejected": -0.0031280517578125, "step": 558 }, { "epoch": 0.4472, "grad_norm": 1.1701611347439234, "learning_rate": 3.37797017470095e-07, "logits/chosen": 0.75, "logits/rejected": 0.7421875, "logps/chosen": -14.9375, "logps/rejected": -12.375, "loss": 0.691, "loss/demonstration_loss": -432.0, "loss/preference_loss": -434.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0018768310546875, "step": 559 }, { "epoch": 0.448, "grad_norm": 0.8133122497454235, "learning_rate": 3.371430118304538e-07, "logits/chosen": 0.69140625, "logits/rejected": 0.64453125, "logps/chosen": -11.25, "logps/rejected": -15.375, "loss": 0.6895, "loss/demonstration_loss": -212.0, "loss/preference_loss": -210.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.003753662109375, "step": 560 }, { "epoch": 0.4488, "grad_norm": 0.5844048509854888, "learning_rate": 3.364883266321012e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.84375, "logps/chosen": -39.25, "logps/rejected": -39.75, "loss": 0.6902, "loss/demonstration_loss": -632.0, "loss/preference_loss": -628.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.00750732421875, "step": 561 }, { "epoch": 0.4496, "grad_norm": 1.6589594395703144, "learning_rate": 3.358329669804038e-07, "logits/chosen": 0.5859375, "logits/rejected": 0.52734375, "logps/chosen": -24.875, "logps/rejected": -42.5, "loss": 0.6888, "loss/demonstration_loss": -358.0, "loss/preference_loss": -354.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0017242431640625, "rewards/margins": 0.0079345703125, "rewards/rejected": -0.00970458984375, "step": 562 }, { "epoch": 0.4504, "grad_norm": 1.5169811842057364, "learning_rate": 3.3517693798598793e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.73046875, "logps/chosen": -69.5, "logps/rejected": -66.0, "loss": 0.6904, "loss/demonstration_loss": -432.0, "loss/preference_loss": -432.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.003448486328125, "rewards/rejected": -0.01092529296875, "step": 563 }, { "epoch": 0.4512, "grad_norm": 1.3419335099614402, "learning_rate": 3.345202447646993e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.796875, "logps/chosen": -25.125, "logps/rejected": -33.25, "loss": 0.691, "loss/demonstration_loss": -231.0, "loss/preference_loss": -231.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": -0.0010833740234375, "rewards/rejected": -0.004547119140625, "step": 564 }, { "epoch": 0.452, "grad_norm": 1.0942952040183698, "learning_rate": 3.338628924375638e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.83984375, "logps/chosen": -37.25, "logps/rejected": -35.0, "loss": 0.6912, "loss/demonstration_loss": -380.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0087890625, "rewards/margins": 0.0028076171875, "rewards/rejected": -0.01153564453125, "step": 565 }, { "epoch": 0.4528, "grad_norm": 1.395107258342547, "learning_rate": 3.3320488613074666e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.70703125, "logps/chosen": -24.25, "logps/rejected": -26.0, "loss": 0.6893, "loss/demonstration_loss": -268.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00156402587890625, "rewards/margins": 0.00531005859375, "rewards/rejected": -0.003753662109375, "step": 566 }, { "epoch": 0.4536, "grad_norm": 0.8903197476944568, "learning_rate": 3.3254623097551337e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.5625, "logps/chosen": -37.75, "logps/rejected": -38.0, "loss": 0.6912, "loss/demonstration_loss": -240.0, "loss/preference_loss": -240.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005615234375, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0087890625, "step": 567 }, { "epoch": 0.4544, "grad_norm": 1.5171757876240788, "learning_rate": 3.3188693210818917e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.7578125, "logps/chosen": -57.0, "logps/rejected": -55.75, "loss": 0.6925, "loss/demonstration_loss": -356.0, "loss/preference_loss": -358.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01220703125, "rewards/margins": -0.001556396484375, "rewards/rejected": -0.0106201171875, "step": 568 }, { "epoch": 0.4552, "grad_norm": 1.3032015280437894, "learning_rate": 3.312269946701191e-07, "logits/chosen": 0.79296875, "logits/rejected": 0.71875, "logps/chosen": -55.0, "logps/rejected": -64.0, "loss": 0.6926, "loss/demonstration_loss": -376.0, "loss/preference_loss": -378.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0166015625, "rewards/margins": -0.005950927734375, "rewards/rejected": -0.0106201171875, "step": 569 }, { "epoch": 0.456, "grad_norm": 2.00675770609056, "learning_rate": 3.305664238076278e-07, "logits/chosen": 0.71875, "logits/rejected": 0.671875, "logps/chosen": -88.0, "logps/rejected": -103.5, "loss": 0.6912, "loss/demonstration_loss": -378.0, "loss/preference_loss": -376.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.01904296875, "rewards/margins": 0.00994873046875, "rewards/rejected": -0.029052734375, "step": 570 }, { "epoch": 0.4568, "grad_norm": 1.300946409237165, "learning_rate": 3.2990522467197946e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.71875, "logps/chosen": -20.75, "logps/rejected": -20.75, "loss": 0.6898, "loss/demonstration_loss": -652.0, "loss/preference_loss": -652.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.0, "rewards/rejected": -0.00750732421875, "step": 571 }, { "epoch": 0.4576, "grad_norm": 0.9131923813463111, "learning_rate": 3.2924340241933796e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.80859375, "logps/chosen": -41.0, "logps/rejected": -27.375, "loss": 0.6932, "loss/demonstration_loss": -270.0, "loss/preference_loss": -272.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.00592041015625, "step": 572 }, { "epoch": 0.4584, "grad_norm": 1.009996208219368, "learning_rate": 3.28580962210726e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.83984375, "logps/chosen": -42.5, "logps/rejected": -32.5, "loss": 0.693, "loss/demonstration_loss": -600.0, "loss/preference_loss": -596.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.00439453125, "step": 573 }, { "epoch": 0.4592, "grad_norm": 1.5220609346392517, "learning_rate": 3.279179092119854e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.5703125, "logps/chosen": -63.5, "logps/rejected": -68.5, "loss": 0.6918, "loss/demonstration_loss": -418.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01171875, "rewards/margins": -0.00640869140625, "rewards/rejected": -0.00531005859375, "step": 574 }, { "epoch": 0.46, "grad_norm": 1.0669505081756336, "learning_rate": 3.272542485937368e-07, "logits/chosen": 0.71875, "logits/rejected": 0.71484375, "logps/chosen": -9.375, "logps/rejected": -9.375, "loss": 0.6913, "loss/demonstration_loss": -296.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.0, "rewards/rejected": -0.0031280517578125, "step": 575 }, { "epoch": 0.4608, "grad_norm": 1.504222458729267, "learning_rate": 3.265899855313389e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.71484375, "logps/chosen": -54.5, "logps/rejected": -45.75, "loss": 0.6888, "loss/demonstration_loss": -398.0, "loss/preference_loss": -398.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.000782012939453125, "rewards/rejected": -0.007354736328125, "step": 576 }, { "epoch": 0.4616, "grad_norm": 1.2241234592407775, "learning_rate": 3.2592512520484857e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.796875, "logps/chosen": -30.0, "logps/rejected": -32.0, "loss": 0.6936, "loss/demonstration_loss": -490.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01123046875, "rewards/margins": -0.00811767578125, "rewards/rejected": -0.0031280517578125, "step": 577 }, { "epoch": 0.4624, "grad_norm": 1.0261424408447029, "learning_rate": 3.2525967279898015e-07, "logits/chosen": 0.455078125, "logits/rejected": 0.443359375, "logps/chosen": -27.25, "logps/rejected": -25.75, "loss": 0.6903, "loss/demonstration_loss": -282.0, "loss/preference_loss": -282.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.00156402587890625, "step": 578 }, { "epoch": 0.4632, "grad_norm": 1.5972556250654506, "learning_rate": 3.2459363350306506e-07, "logits/chosen": 0.58984375, "logits/rejected": 0.69140625, "logps/chosen": -97.0, "logps/rejected": -95.5, "loss": 0.6929, "loss/demonstration_loss": -510.0, "loss/preference_loss": -510.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01190185546875, "rewards/margins": 0.0021820068359375, "rewards/rejected": -0.0140380859375, "step": 579 }, { "epoch": 0.464, "grad_norm": 1.8465316223904082, "learning_rate": 3.2392701251101167e-07, "logits/chosen": 0.8125, "logits/rejected": 0.7890625, "logps/chosen": -47.0, "logps/rejected": -48.5, "loss": 0.6863, "loss/demonstration_loss": -302.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.010009765625, "rewards/margins": 0.004669189453125, "rewards/rejected": -0.0146484375, "step": 580 }, { "epoch": 0.4648, "grad_norm": 1.6908126690080518, "learning_rate": 3.232598150212643e-07, "logits/chosen": 0.9375, "logits/rejected": 1.0703125, "logps/chosen": -72.0, "logps/rejected": -52.75, "loss": 0.6906, "loss/demonstration_loss": -398.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.00531005859375, "rewards/rejected": -0.01220703125, "step": 581 }, { "epoch": 0.4656, "grad_norm": 1.5185436857596326, "learning_rate": 3.2259204623676317e-07, "logits/chosen": 0.51171875, "logits/rejected": 0.72265625, "logps/chosen": -41.25, "logps/rejected": -37.75, "loss": 0.6941, "loss/demonstration_loss": -312.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.010009765625, "rewards/margins": -0.001556396484375, "rewards/rejected": -0.0084228515625, "step": 582 }, { "epoch": 0.4664, "grad_norm": 0.944750392027016, "learning_rate": 3.219237113649032e-07, "logits/chosen": 0.58203125, "logits/rejected": 0.5546875, "logps/chosen": -27.75, "logps/rejected": -32.5, "loss": 0.6909, "loss/demonstration_loss": -318.0, "loss/preference_loss": -314.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": 0.0062255859375, "rewards/rejected": -0.0118408203125, "step": 583 }, { "epoch": 0.4672, "grad_norm": 1.1806317569112634, "learning_rate": 3.21254815617494e-07, "logits/chosen": 1.046875, "logits/rejected": 1.0078125, "logps/chosen": -30.75, "logps/rejected": -38.75, "loss": 0.6912, "loss/demonstration_loss": -276.0, "loss/preference_loss": -276.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00360107421875, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.005462646484375, "step": 584 }, { "epoch": 0.468, "grad_norm": 1.3341603140796483, "learning_rate": 3.2058536421071914e-07, "logits/chosen": 0.47265625, "logits/rejected": 0.6328125, "logps/chosen": -46.25, "logps/rejected": -33.75, "loss": 0.6951, "loss/demonstration_loss": -418.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.016845703125, "rewards/margins": -0.00811767578125, "rewards/rejected": -0.0087890625, "step": 585 }, { "epoch": 0.4688, "grad_norm": 1.7700554253651566, "learning_rate": 3.19915362365095e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.64453125, "logps/chosen": -11.0, "logps/rejected": -18.75, "loss": 0.6923, "loss/demonstration_loss": -236.0, "loss/preference_loss": -236.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003448486328125, "rewards/margins": -7.62939453125e-06, "rewards/rejected": -0.0034332275390625, "step": 586 }, { "epoch": 0.4696, "grad_norm": 1.8402114627374067, "learning_rate": 3.192448153054306e-07, "logits/chosen": 1.046875, "logits/rejected": 0.90625, "logps/chosen": -40.0, "logps/rejected": -41.5, "loss": 0.6948, "loss/demonstration_loss": -256.0, "loss/preference_loss": -260.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01513671875, "rewards/margins": -0.0107421875, "rewards/rejected": -0.00439453125, "step": 587 }, { "epoch": 0.4704, "grad_norm": 1.423057510295079, "learning_rate": 3.1857372826078667e-07, "logits/chosen": 0.98046875, "logits/rejected": 0.8828125, "logps/chosen": -87.0, "logps/rejected": -96.0, "loss": 0.6918, "loss/demonstration_loss": -484.0, "loss/preference_loss": -484.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0150146484375, "rewards/margins": 0.003143310546875, "rewards/rejected": -0.0181884765625, "step": 588 }, { "epoch": 0.4712, "grad_norm": 1.4220268362287034, "learning_rate": 3.1790210646443465e-07, "logits/chosen": 0.80859375, "logits/rejected": 0.78515625, "logps/chosen": -20.25, "logps/rejected": -19.25, "loss": 0.6927, "loss/demonstration_loss": -206.0, "loss/preference_loss": -208.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.009521484375, "rewards/margins": -0.00360107421875, "rewards/rejected": -0.00592041015625, "step": 589 }, { "epoch": 0.472, "grad_norm": 1.0910522186519822, "learning_rate": 3.172299551538164e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.6328125, "logps/chosen": -24.25, "logps/rejected": -25.625, "loss": 0.6906, "loss/demonstration_loss": -392.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.0, "rewards/rejected": -0.00750732421875, "step": 590 }, { "epoch": 0.4728, "grad_norm": 0.7344484816628097, "learning_rate": 3.1655727957050284e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.73046875, "logps/chosen": -32.0, "logps/rejected": -31.875, "loss": 0.6919, "loss/demonstration_loss": -508.0, "loss/preference_loss": -510.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.003753662109375, "step": 591 }, { "epoch": 0.4736, "grad_norm": 1.7506261570486434, "learning_rate": 3.158840849601532e-07, "logits/chosen": 0.8671875, "logits/rejected": 1.0234375, "logps/chosen": -52.0, "logps/rejected": -34.25, "loss": 0.6937, "loss/demonstration_loss": -684.0, "loss/preference_loss": -692.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00439453125, "rewards/rejected": -0.0006256103515625, "step": 592 }, { "epoch": 0.4744, "grad_norm": 1.0147038167499292, "learning_rate": 3.152103765724742e-07, "logits/chosen": 0.3828125, "logits/rejected": 0.39453125, "logps/chosen": -26.75, "logps/rejected": -26.5, "loss": 0.6926, "loss/demonstration_loss": -422.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00439453125, "step": 593 }, { "epoch": 0.4752, "grad_norm": 1.358346409113352, "learning_rate": 3.145361596611794e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.61328125, "logps/chosen": -28.5, "logps/rejected": -23.625, "loss": 0.6921, "loss/demonstration_loss": -272.0, "loss/preference_loss": -276.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01251220703125, "rewards/margins": -0.009033203125, "rewards/rejected": -0.003448486328125, "step": 594 }, { "epoch": 0.476, "grad_norm": 2.077102585586657, "learning_rate": 3.1386143948394763e-07, "logits/chosen": 0.83203125, "logits/rejected": 0.79296875, "logps/chosen": -95.5, "logps/rejected": -93.5, "loss": 0.6946, "loss/demonstration_loss": -500.0, "loss/preference_loss": -504.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.017578125, "rewards/margins": -0.0150146484375, "rewards/rejected": -0.00250244140625, "step": 595 }, { "epoch": 0.4768, "grad_norm": 0.7765930907900492, "learning_rate": 3.131862213023823e-07, "logits/chosen": 0.703125, "logits/rejected": 0.6953125, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6913, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 596 }, { "epoch": 0.4776, "grad_norm": 1.0375881827752158, "learning_rate": 3.125105103819705e-07, "logits/chosen": 0.734375, "logits/rejected": 0.64453125, "logps/chosen": -20.5, "logps/rejected": -28.875, "loss": 0.6926, "loss/demonstration_loss": -388.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.0037384033203125, "rewards/rejected": -0.00439453125, "step": 597 }, { "epoch": 0.4784, "grad_norm": 1.361523157136997, "learning_rate": 3.1183431199204173e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.74609375, "logps/chosen": -28.875, "logps/rejected": -45.0, "loss": 0.6923, "loss/demonstration_loss": -392.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0032806396484375, "rewards/margins": -0.00140380859375, "rewards/rejected": -0.0018768310546875, "step": 598 }, { "epoch": 0.4792, "grad_norm": 1.253757703655083, "learning_rate": 3.111576314057268e-07, "logits/chosen": 0.76171875, "logits/rejected": 0.6171875, "logps/chosen": -25.25, "logps/rejected": -52.5, "loss": 0.6909, "loss/demonstration_loss": -410.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00958251953125, "rewards/margins": -0.00079345703125, "rewards/rejected": -0.0087890625, "step": 599 }, { "epoch": 0.48, "grad_norm": 2.521951337381573, "learning_rate": 3.104804738999169e-07, "logits/chosen": 0.8125, "logits/rejected": 0.91015625, "logps/chosen": -74.0, "logps/rejected": -56.25, "loss": 0.6908, "loss/demonstration_loss": -414.0, "loss/preference_loss": -414.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01019287109375, "rewards/margins": 0.00592041015625, "rewards/rejected": -0.01611328125, "step": 600 }, { "epoch": 0.4808, "grad_norm": 1.489454305176474, "learning_rate": 3.0980284475522233e-07, "logits/chosen": 1.015625, "logits/rejected": 0.96484375, "logps/chosen": -54.0, "logps/rejected": -57.75, "loss": 0.6909, "loss/demonstration_loss": -444.0, "loss/preference_loss": -442.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.009033203125, "rewards/margins": 0.00189208984375, "rewards/rejected": -0.010986328125, "step": 601 }, { "epoch": 0.4816, "grad_norm": 0.9840273348391562, "learning_rate": 3.091247492559312e-07, "logits/chosen": 0.64453125, "logits/rejected": 0.59375, "logps/chosen": -9.125, "logps/rejected": -12.125, "loss": 0.6899, "loss/demonstration_loss": -340.0, "loss/preference_loss": -328.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.00811767578125, "rewards/rejected": -0.00811767578125, "step": 602 }, { "epoch": 0.4824, "grad_norm": 1.079128410050442, "learning_rate": 3.084461926899684e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.7265625, "logps/chosen": -54.75, "logps/rejected": -64.0, "loss": 0.6918, "loss/demonstration_loss": -472.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005950927734375, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.005615234375, "step": 603 }, { "epoch": 0.4832, "grad_norm": 2.0667079072801844, "learning_rate": 3.0776718034885454e-07, "logits/chosen": 0.52734375, "logits/rejected": 0.490234375, "logps/chosen": -46.5, "logps/rejected": -57.75, "loss": 0.6917, "loss/demonstration_loss": -552.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01123046875, "rewards/margins": -0.001220703125, "rewards/rejected": -0.010009765625, "step": 604 }, { "epoch": 0.484, "grad_norm": 0.9466821435195066, "learning_rate": 3.0708771752766395e-07, "logits/chosen": 0.546875, "logits/rejected": 0.53515625, "logps/chosen": -7.5, "logps/rejected": -6.15625, "loss": 0.691, "loss/demonstration_loss": -112.0, "loss/preference_loss": -109.5, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00311279296875, "rewards/margins": 0.0029754638671875, "rewards/rejected": 0.000156402587890625, "step": 605 }, { "epoch": 0.4848, "grad_norm": 1.395502045781961, "learning_rate": 3.0640780952498435e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.78515625, "logps/chosen": -38.5, "logps/rejected": -40.25, "loss": 0.693, "loss/demonstration_loss": -314.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005950927734375, "rewards/margins": -0.000457763671875, "rewards/rejected": -0.005462646484375, "step": 606 }, { "epoch": 0.4856, "grad_norm": 1.3007619527847696, "learning_rate": 3.057274616428751e-07, "logits/chosen": 0.494140625, "logits/rejected": 0.5859375, "logps/chosen": -67.0, "logps/rejected": -52.75, "loss": 0.6925, "loss/demonstration_loss": -378.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.015625, "rewards/margins": -0.0067138671875, "rewards/rejected": -0.0089111328125, "step": 607 }, { "epoch": 0.4864, "grad_norm": 1.3286316480084783, "learning_rate": 3.0504667918682536e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.7890625, "logps/chosen": -35.25, "logps/rejected": -36.25, "loss": 0.6913, "loss/demonstration_loss": -378.0, "loss/preference_loss": -378.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.00469970703125, "step": 608 }, { "epoch": 0.4872, "grad_norm": 1.0676448071590354, "learning_rate": 3.043654674657137e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.80859375, "logps/chosen": -51.75, "logps/rejected": -45.5, "loss": 0.6953, "loss/demonstration_loss": -306.0, "loss/preference_loss": -310.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01019287109375, "rewards/margins": -0.00811767578125, "rewards/rejected": -0.0020294189453125, "step": 609 }, { "epoch": 0.488, "grad_norm": 1.1145452975689618, "learning_rate": 3.036838317917658e-07, "logits/chosen": 0.8125, "logits/rejected": 0.75, "logps/chosen": -25.75, "logps/rejected": -34.5, "loss": 0.6941, "loss/demonstration_loss": -476.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0078125, "rewards/margins": -0.00469970703125, "rewards/rejected": -0.0031280517578125, "step": 610 }, { "epoch": 0.4888, "grad_norm": 0.9455194723655654, "learning_rate": 3.030017774805137e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.8671875, "logps/chosen": -28.125, "logps/rejected": -16.125, "loss": 0.6918, "loss/demonstration_loss": -352.0, "loss/preference_loss": -352.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.001251220703125, "step": 611 }, { "epoch": 0.4896, "grad_norm": 3.2548088780476707, "learning_rate": 3.0231930985075376e-07, "logits/chosen": 0.703125, "logits/rejected": 0.609375, "logps/chosen": -38.0, "logps/rejected": -42.75, "loss": 0.6908, "loss/demonstration_loss": -428.0, "loss/preference_loss": -426.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00469970703125, "rewards/margins": 0.0034332275390625, "rewards/rejected": -0.00811767578125, "step": 612 }, { "epoch": 0.4904, "grad_norm": 1.2924247780095324, "learning_rate": 3.0163643422450585e-07, "logits/chosen": 0.69140625, "logits/rejected": 0.6640625, "logps/chosen": -23.375, "logps/rejected": -23.125, "loss": 0.6891, "loss/demonstration_loss": -736.0, "loss/preference_loss": -740.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.00250244140625, "step": 613 }, { "epoch": 0.4912, "grad_norm": 0.7006108103441456, "learning_rate": 3.009531559269712e-07, "logits/chosen": 0.65625, "logits/rejected": 0.67578125, "logps/chosen": -2.515625, "logps/rejected": -3.03125, "loss": 0.6931, "loss/demonstration_loss": -88.5, "loss/preference_loss": -91.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.000156402587890625, "rewards/margins": -0.0017242431640625, "rewards/rejected": 0.00156402587890625, "step": 614 }, { "epoch": 0.492, "grad_norm": 0.9418131527255468, "learning_rate": 3.002694802864912e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.7734375, "logps/chosen": -49.5, "logps/rejected": -59.5, "loss": 0.6924, "loss/demonstration_loss": -576.0, "loss/preference_loss": -576.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.015625, "rewards/margins": -0.003082275390625, "rewards/rejected": -0.01251220703125, "step": 615 }, { "epoch": 0.4928, "grad_norm": 1.5529442107651148, "learning_rate": 2.995854126345058e-07, "logits/chosen": 0.58984375, "logits/rejected": 0.58203125, "logps/chosen": -63.25, "logps/rejected": -59.5, "loss": 0.6899, "loss/demonstration_loss": -390.0, "loss/preference_loss": -390.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0115966796875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.01031494140625, "step": 616 }, { "epoch": 0.4936, "grad_norm": 1.5260283249058952, "learning_rate": 2.98900958305512e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.71875, "logps/chosen": -31.75, "logps/rejected": -31.75, "loss": 0.6918, "loss/demonstration_loss": -504.0, "loss/preference_loss": -504.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.000640869140625, "rewards/rejected": -0.004364013671875, "step": 617 }, { "epoch": 0.4944, "grad_norm": 1.207604937430794, "learning_rate": 2.9821612263702224e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.5859375, "logps/chosen": -32.25, "logps/rejected": -42.0, "loss": 0.693, "loss/demonstration_loss": -392.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.00750732421875, "step": 618 }, { "epoch": 0.4952, "grad_norm": 1.648411728522522, "learning_rate": 2.9753091096952255e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.94140625, "logps/chosen": -44.75, "logps/rejected": -43.75, "loss": 0.6915, "loss/demonstration_loss": -350.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00830078125, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.00982666015625, "step": 619 }, { "epoch": 0.496, "grad_norm": 1.5134168143087614, "learning_rate": 2.968453286464312e-07, "logits/chosen": 0.57421875, "logits/rejected": 0.578125, "logps/chosen": -71.5, "logps/rejected": -78.0, "loss": 0.6952, "loss/demonstration_loss": -394.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01513671875, "rewards/margins": -0.0067138671875, "rewards/rejected": -0.0084228515625, "step": 620 }, { "epoch": 0.4968, "grad_norm": 1.7353553341545696, "learning_rate": 2.9615938101405673e-07, "logits/chosen": 0.8984375, "logits/rejected": 0.9765625, "logps/chosen": -61.0, "logps/rejected": -47.25, "loss": 0.6897, "loss/demonstration_loss": -346.0, "loss/preference_loss": -346.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.001251220703125, "rewards/margins": -0.001251220703125, "rewards/rejected": 0.0, "step": 621 }, { "epoch": 0.4976, "grad_norm": 1.2285020928905002, "learning_rate": 2.954730734215567e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.7265625, "logps/chosen": -68.0, "logps/rejected": -60.5, "loss": 0.6934, "loss/demonstration_loss": -512.0, "loss/preference_loss": -512.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.006256103515625, "step": 622 }, { "epoch": 0.4984, "grad_norm": 1.2894754159330946, "learning_rate": 2.947864112208956e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.8046875, "logps/chosen": -57.5, "logps/rejected": -71.0, "loss": 0.6904, "loss/demonstration_loss": -676.0, "loss/preference_loss": -672.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0128173828125, "rewards/margins": 0.007171630859375, "rewards/rejected": -0.02001953125, "step": 623 }, { "epoch": 0.4992, "grad_norm": 2.034268183339288, "learning_rate": 2.940993997668031e-07, "logits/chosen": 0.734375, "logits/rejected": 0.73046875, "logps/chosen": -34.75, "logps/rejected": -32.75, "loss": 0.6892, "loss/demonstration_loss": -360.0, "loss/preference_loss": -360.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00469970703125, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.00093841552734375, "step": 624 }, { "epoch": 0.5, "grad_norm": 0.6376882756383476, "learning_rate": 2.934120444167326e-07, "logits/chosen": 0.53125, "logits/rejected": 0.53125, "logps/chosen": -30.625, "logps/rejected": -30.875, "loss": 0.6909, "loss/demonstration_loss": -328.0, "loss/preference_loss": -326.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00031280517578125, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.002197265625, "step": 625 }, { "epoch": 0.5008, "grad_norm": 1.6517881876753804, "learning_rate": 2.927243505308192e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.62890625, "logps/chosen": -41.25, "logps/rejected": -37.75, "loss": 0.6925, "loss/demonstration_loss": -418.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.0017242431640625, "rewards/rejected": -0.0032806396484375, "step": 626 }, { "epoch": 0.5016, "grad_norm": 1.2106948051583009, "learning_rate": 2.9203632347183787e-07, "logits/chosen": 0.74609375, "logits/rejected": 0.68359375, "logps/chosen": -40.0, "logps/rejected": -52.5, "loss": 0.6909, "loss/demonstration_loss": -494.0, "loss/preference_loss": -492.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0017242431640625, "rewards/margins": 0.00579833984375, "rewards/rejected": -0.00750732421875, "step": 627 }, { "epoch": 0.5024, "grad_norm": 1.1712282997027752, "learning_rate": 2.913479686051619e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.80859375, "logps/chosen": -18.25, "logps/rejected": -6.09375, "loss": 0.6941, "loss/demonstration_loss": -386.0, "loss/preference_loss": -384.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0009307861328125, "rewards/rejected": -0.0034332275390625, "step": 628 }, { "epoch": 0.5032, "grad_norm": 1.8242661512742842, "learning_rate": 2.906592912987209e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.66015625, "logps/chosen": -27.125, "logps/rejected": -34.5, "loss": 0.6914, "loss/demonstration_loss": -326.0, "loss/preference_loss": -324.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0040283203125, "rewards/margins": 0.00408935546875, "rewards/rejected": -0.00811767578125, "step": 629 }, { "epoch": 0.504, "grad_norm": 1.0587312608044013, "learning_rate": 2.899702969229587e-07, "logits/chosen": 0.53125, "logits/rejected": 0.55859375, "logps/chosen": -18.5, "logps/rejected": -18.25, "loss": 0.6899, "loss/demonstration_loss": -292.0, "loss/preference_loss": -294.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0020294189453125, "rewards/margins": -0.0032806396484375, "rewards/rejected": 0.001251220703125, "step": 630 }, { "epoch": 0.5048, "grad_norm": 1.9291083199725223, "learning_rate": 2.892809908507919e-07, "logits/chosen": 0.5, "logits/rejected": 0.474609375, "logps/chosen": -52.0, "logps/rejected": -66.0, "loss": 0.6897, "loss/demonstration_loss": -628.0, "loss/preference_loss": -624.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0087890625, "rewards/rejected": -0.01123046875, "step": 631 }, { "epoch": 0.5056, "grad_norm": 0.8266175806827856, "learning_rate": 2.885913784575678e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.85546875, "logps/chosen": -11.0, "logps/rejected": -11.1875, "loss": 0.6929, "loss/demonstration_loss": -176.0, "loss/preference_loss": -178.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00093841552734375, "rewards/margins": -0.001556396484375, "rewards/rejected": 0.0006256103515625, "step": 632 }, { "epoch": 0.5064, "grad_norm": 0.9440276905065321, "learning_rate": 2.8790146512102227e-07, "logits/chosen": 0.4921875, "logits/rejected": 0.474609375, "logps/chosen": -19.625, "logps/rejected": -19.5, "loss": 0.691, "loss/demonstration_loss": -620.0, "loss/preference_loss": -624.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00250244140625, "step": 633 }, { "epoch": 0.5072, "grad_norm": 1.0435978911889994, "learning_rate": 2.8721125622123806e-07, "logits/chosen": 0.71875, "logits/rejected": 0.6640625, "logps/chosen": -13.75, "logps/rejected": -14.5625, "loss": 0.6899, "loss/demonstration_loss": -448.0, "loss/preference_loss": -450.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0018768310546875, "step": 634 }, { "epoch": 0.508, "grad_norm": 0.739424853786443, "learning_rate": 2.865207571406029e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.6328125, "logps/chosen": -14.5, "logps/rejected": -14.5, "loss": 0.691, "loss/demonstration_loss": -462.0, "loss/preference_loss": -462.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0, "rewards/rejected": -0.001251220703125, "step": 635 }, { "epoch": 0.5088, "grad_norm": 2.495733024843621, "learning_rate": 2.8582997326376735e-07, "logits/chosen": 1.03125, "logits/rejected": 0.84375, "logps/chosen": -36.0, "logps/rejected": -62.0, "loss": 0.6887, "loss/demonstration_loss": -312.0, "loss/preference_loss": -310.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.000156402587890625, "rewards/margins": 0.00982666015625, "rewards/rejected": -0.010009765625, "step": 636 }, { "epoch": 0.5096, "grad_norm": 1.5955276053022556, "learning_rate": 2.851389099776027e-07, "logits/chosen": 0.79296875, "logits/rejected": 0.921875, "logps/chosen": -44.5, "logps/rejected": -28.5, "loss": 0.6892, "loss/demonstration_loss": -388.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00360107421875, "rewards/margins": -0.00016021728515625, "rewards/rejected": -0.003448486328125, "step": 637 }, { "epoch": 0.5104, "grad_norm": 0.5266561558620512, "learning_rate": 2.844475726711595e-07, "logits/chosen": 0.75, "logits/rejected": 0.765625, "logps/chosen": -15.8125, "logps/rejected": -16.0, "loss": 0.6907, "loss/demonstration_loss": -508.0, "loss/preference_loss": -504.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.0031280517578125, "step": 638 }, { "epoch": 0.5112, "grad_norm": 1.917632924134519, "learning_rate": 2.837559667356248e-07, "logits/chosen": 0.71875, "logits/rejected": 0.70703125, "logps/chosen": -44.5, "logps/rejected": -45.0, "loss": 0.6912, "loss/demonstration_loss": -466.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.02001953125, "rewards/margins": 0.00494384765625, "rewards/rejected": -0.0250244140625, "step": 639 }, { "epoch": 0.512, "grad_norm": 1.614065375910583, "learning_rate": 2.830640975642806e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.63671875, "logps/chosen": -28.5, "logps/rejected": -25.875, "loss": 0.6932, "loss/demonstration_loss": -288.0, "loss/preference_loss": -290.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.0029754638671875, "rewards/rejected": -0.000782012939453125, "step": 640 }, { "epoch": 0.5128, "grad_norm": 1.4940776662631898, "learning_rate": 2.823719705524617e-07, "logits/chosen": 0.94140625, "logits/rejected": 0.84765625, "logps/chosen": -37.0, "logps/rejected": -52.25, "loss": 0.693, "loss/demonstration_loss": -474.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0031280517578125, "step": 641 }, { "epoch": 0.5136, "grad_norm": 1.1704748823523885, "learning_rate": 2.816795910975137e-07, "logits/chosen": 0.62890625, "logits/rejected": 0.5703125, "logps/chosen": -21.0, "logps/rejected": -23.875, "loss": 0.6925, "loss/demonstration_loss": -238.0, "loss/preference_loss": -236.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0015716552734375, "rewards/margins": 0.00592041015625, "rewards/rejected": -0.00750732421875, "step": 642 }, { "epoch": 0.5144, "grad_norm": 1.126247044662104, "learning_rate": 2.809869645987504e-07, "logits/chosen": 0.86328125, "logits/rejected": 0.828125, "logps/chosen": -24.0, "logps/rejected": -36.5, "loss": 0.6907, "loss/demonstration_loss": -482.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0106201171875, "rewards/rejected": -0.014404296875, "step": 643 }, { "epoch": 0.5152, "grad_norm": 1.1872027354256682, "learning_rate": 2.8029409645741263e-07, "logits/chosen": 0.90625, "logits/rejected": 0.9140625, "logps/chosen": -14.8125, "logps/rejected": -14.875, "loss": 0.6903, "loss/demonstration_loss": -158.0, "loss/preference_loss": -158.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.000156402587890625, "rewards/margins": 0.0009307861328125, "rewards/rejected": -0.0010833740234375, "step": 644 }, { "epoch": 0.516, "grad_norm": 1.114190040520178, "learning_rate": 2.796009920766253e-07, "logits/chosen": 1.015625, "logits/rejected": 0.921875, "logps/chosen": -74.0, "logps/rejected": -96.0, "loss": 0.6917, "loss/demonstration_loss": -676.0, "loss/preference_loss": -672.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.01123046875, "step": 645 }, { "epoch": 0.5168, "grad_norm": 1.1687549203977408, "learning_rate": 2.7890765686135545e-07, "logits/chosen": 0.83203125, "logits/rejected": 0.94140625, "logps/chosen": -50.75, "logps/rejected": -47.25, "loss": 0.6926, "loss/demonstration_loss": -312.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.004364013671875, "rewards/rejected": -0.003753662109375, "step": 646 }, { "epoch": 0.5176, "grad_norm": 1.4153323141218905, "learning_rate": 2.7821409621837037e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.62109375, "logps/chosen": -42.75, "logps/rejected": -41.5, "loss": 0.6908, "loss/demonstration_loss": -450.0, "loss/preference_loss": -446.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00031280517578125, "rewards/margins": 0.006561279296875, "rewards/rejected": -0.006256103515625, "step": 647 }, { "epoch": 0.5184, "grad_norm": 1.759996894757585, "learning_rate": 2.7752031555619555e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.83203125, "logps/chosen": -55.0, "logps/rejected": -42.75, "loss": 0.6938, "loss/demonstration_loss": -310.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.00360107421875, "rewards/rejected": -0.0045166015625, "step": 648 }, { "epoch": 0.5192, "grad_norm": 1.3895663455554745, "learning_rate": 2.7682632028507165e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.58203125, "logps/chosen": -69.5, "logps/rejected": -74.0, "loss": 0.6936, "loss/demonstration_loss": -456.0, "loss/preference_loss": -456.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01470947265625, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.0128173828125, "step": 649 }, { "epoch": 0.52, "grad_norm": 1.252193781667357, "learning_rate": 2.761321158169134e-07, "logits/chosen": 0.9453125, "logits/rejected": 1.0390625, "logps/chosen": -63.0, "logps/rejected": -52.5, "loss": 0.6952, "loss/demonstration_loss": -364.0, "loss/preference_loss": -368.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0184326171875, "rewards/margins": -0.01531982421875, "rewards/rejected": -0.0031280517578125, "step": 650 }, { "epoch": 0.5208, "grad_norm": 1.4936995156199417, "learning_rate": 2.7543770756526657e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.78515625, "logps/chosen": -43.25, "logps/rejected": -41.5, "loss": 0.6938, "loss/demonstration_loss": -268.0, "loss/preference_loss": -270.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0089111328125, "rewards/margins": -0.00592041015625, "rewards/rejected": -0.0029754638671875, "step": 651 }, { "epoch": 0.5216, "grad_norm": 1.745674420170799, "learning_rate": 2.747431009452663e-07, "logits/chosen": 0.498046875, "logits/rejected": 0.478515625, "logps/chosen": -48.5, "logps/rejected": -48.5, "loss": 0.6952, "loss/demonstration_loss": -512.0, "loss/preference_loss": -516.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.001861572265625, "rewards/rejected": -0.006256103515625, "step": 652 }, { "epoch": 0.5224, "grad_norm": 0.7230563537558746, "learning_rate": 2.740483013735944e-07, "logits/chosen": 0.85546875, "logits/rejected": 0.8359375, "logps/chosen": -38.25, "logps/rejected": -39.25, "loss": 0.6892, "loss/demonstration_loss": -620.0, "loss/preference_loss": -612.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0, "rewards/margins": 0.010009765625, "rewards/rejected": -0.010009765625, "step": 653 }, { "epoch": 0.5232, "grad_norm": 1.694318043321162, "learning_rate": 2.7335331426843766e-07, "logits/chosen": 0.890625, "logits/rejected": 1.0859375, "logps/chosen": -46.5, "logps/rejected": -30.75, "loss": 0.6923, "loss/demonstration_loss": -304.0, "loss/preference_loss": -304.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01531982421875, "rewards/margins": -0.005950927734375, "rewards/rejected": -0.00933837890625, "step": 654 }, { "epoch": 0.524, "grad_norm": 2.058130536694481, "learning_rate": 2.726581450494451e-07, "logits/chosen": 1.0546875, "logits/rejected": 1.0234375, "logps/chosen": -66.5, "logps/rejected": -72.0, "loss": 0.6927, "loss/demonstration_loss": -552.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.00811767578125, "step": 655 }, { "epoch": 0.5248, "grad_norm": 1.743511769935925, "learning_rate": 2.7196279913768584e-07, "logits/chosen": 0.8125, "logits/rejected": 0.54296875, "logps/chosen": -8.5, "logps/rejected": -26.75, "loss": 0.6906, "loss/demonstration_loss": -280.0, "loss/preference_loss": -276.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.005645751953125, "rewards/rejected": -0.00750732421875, "step": 656 }, { "epoch": 0.5256, "grad_norm": 1.182925691190048, "learning_rate": 2.71267281955607e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.93359375, "logps/chosen": -23.0, "logps/rejected": -15.875, "loss": 0.6913, "loss/demonstration_loss": -308.0, "loss/preference_loss": -308.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0050048828125, "step": 657 }, { "epoch": 0.5264, "grad_norm": 1.3964345465139694, "learning_rate": 2.7057159892699137e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.765625, "logps/chosen": -35.0, "logps/rejected": -54.5, "loss": 0.6936, "loss/demonstration_loss": -476.0, "loss/preference_loss": -478.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0028076171875, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.00031280517578125, "step": 658 }, { "epoch": 0.5272, "grad_norm": 1.6473196952767002, "learning_rate": 2.6987575547691495e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.72265625, "logps/chosen": -42.25, "logps/rejected": -54.0, "loss": 0.6925, "loss/demonstration_loss": -382.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00640869140625, "rewards/margins": 0.00360107421875, "rewards/rejected": -0.010009765625, "step": 659 }, { "epoch": 0.528, "grad_norm": 1.1735245108930294, "learning_rate": 2.6917975703170465e-07, "logits/chosen": 0.9296875, "logits/rejected": 0.94140625, "logps/chosen": -45.5, "logps/rejected": -49.0, "loss": 0.6907, "loss/demonstration_loss": -378.0, "loss/preference_loss": -378.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.002197265625, "rewards/margins": 0.00189971923828125, "rewards/rejected": 0.00031280517578125, "step": 660 }, { "epoch": 0.5288, "grad_norm": 1.3067351023489444, "learning_rate": 2.684836090188963e-07, "logits/chosen": 0.65625, "logits/rejected": 0.58203125, "logps/chosen": -23.25, "logps/rejected": -38.0, "loss": 0.6896, "loss/demonstration_loss": -243.0, "loss/preference_loss": -243.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.00031280517578125, "rewards/rejected": -0.005950927734375, "step": 661 }, { "epoch": 0.5296, "grad_norm": 1.3191518835412392, "learning_rate": 2.6778731686719174e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.69140625, "logps/chosen": -23.25, "logps/rejected": -37.25, "loss": 0.6904, "loss/demonstration_loss": -484.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.00531005859375, "rewards/rejected": -0.0078125, "step": 662 }, { "epoch": 0.5304, "grad_norm": 0.997850774055083, "learning_rate": 2.6709088600641715e-07, "logits/chosen": 0.671875, "logits/rejected": 0.7265625, "logps/chosen": -28.5, "logps/rejected": -26.25, "loss": 0.6896, "loss/demonstration_loss": -438.0, "loss/preference_loss": -430.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.0106201171875, "step": 663 }, { "epoch": 0.5312, "grad_norm": 1.5966420977517302, "learning_rate": 2.663943218674804e-07, "logits/chosen": 0.57421875, "logits/rejected": 0.5234375, "logps/chosen": -2.40625, "logps/rejected": -8.5, "loss": 0.6882, "loss/demonstration_loss": -173.0, "loss/preference_loss": -176.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.000782012939453125, "rewards/margins": -0.00140380859375, "rewards/rejected": 0.0006256103515625, "step": 664 }, { "epoch": 0.532, "grad_norm": 0.8605816344252725, "learning_rate": 2.6569762988232837e-07, "logits/chosen": 0.6875, "logits/rejected": 0.62109375, "logps/chosen": -27.75, "logps/rejected": -35.0, "loss": 0.6923, "loss/demonstration_loss": -330.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0079345703125, "rewards/margins": -0.0020294189453125, "rewards/rejected": -0.00592041015625, "step": 665 }, { "epoch": 0.5328, "grad_norm": 0.9326143041596285, "learning_rate": 2.650008154839052e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.6171875, "logps/chosen": -14.5, "logps/rejected": -14.375, "loss": 0.6931, "loss/demonstration_loss": -456.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0018768310546875, "step": 666 }, { "epoch": 0.5336, "grad_norm": 1.7479328535802319, "learning_rate": 2.643038841061095e-07, "logits/chosen": 0.86328125, "logits/rejected": 0.83984375, "logps/chosen": -56.25, "logps/rejected": -59.0, "loss": 0.6954, "loss/demonstration_loss": -454.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0174560546875, "rewards/margins": -0.0118408203125, "rewards/rejected": -0.005615234375, "step": 667 }, { "epoch": 0.5344, "grad_norm": 1.3916432223349366, "learning_rate": 2.6360684118375225e-07, "logits/chosen": 0.33984375, "logits/rejected": 0.357421875, "logps/chosen": -55.5, "logps/rejected": -56.0, "loss": 0.6888, "loss/demonstration_loss": -356.0, "loss/preference_loss": -352.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00408935546875, "rewards/margins": 0.01153564453125, "rewards/rejected": -0.015625, "step": 668 }, { "epoch": 0.5352, "grad_norm": 1.0390568165232925, "learning_rate": 2.629096921525141e-07, "logits/chosen": 0.9375, "logits/rejected": 0.96484375, "logps/chosen": -40.0, "logps/rejected": -39.0, "loss": 0.6934, "loss/demonstration_loss": -416.0, "loss/preference_loss": -418.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.009033203125, "rewards/margins": -0.00653076171875, "rewards/rejected": -0.00250244140625, "step": 669 }, { "epoch": 0.536, "grad_norm": 2.243147092575663, "learning_rate": 2.6221244244890336e-07, "logits/chosen": 0.90625, "logits/rejected": 0.765625, "logps/chosen": -45.0, "logps/rejected": -67.5, "loss": 0.6932, "loss/demonstration_loss": -446.0, "loss/preference_loss": -446.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": 0.0, "rewards/rejected": -0.0093994140625, "step": 670 }, { "epoch": 0.5368, "grad_norm": 1.4019753059348719, "learning_rate": 2.615150975102131e-07, "logits/chosen": 0.79296875, "logits/rejected": 0.7578125, "logps/chosen": -48.25, "logps/rejected": -48.0, "loss": 0.6915, "loss/demonstration_loss": -760.0, "loss/preference_loss": -760.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0150146484375, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.01251220703125, "step": 671 }, { "epoch": 0.5376, "grad_norm": 1.432990018284756, "learning_rate": 2.6081766277447925e-07, "logits/chosen": 0.80078125, "logits/rejected": 0.796875, "logps/chosen": -99.5, "logps/rejected": -102.5, "loss": 0.6888, "loss/demonstration_loss": -456.0, "loss/preference_loss": -454.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.02099609375, "rewards/margins": 0.01025390625, "rewards/rejected": -0.03125, "step": 672 }, { "epoch": 0.5384, "grad_norm": 1.651245295498525, "learning_rate": 2.601201436804381e-07, "logits/chosen": 0.5234375, "logits/rejected": 0.76953125, "logps/chosen": -70.5, "logps/rejected": -58.75, "loss": 0.6923, "loss/demonstration_loss": -412.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.0031280517578125, "rewards/rejected": -0.0050048828125, "step": 673 }, { "epoch": 0.5392, "grad_norm": 1.117824510935096, "learning_rate": 2.5942254566748366e-07, "logits/chosen": 0.703125, "logits/rejected": 0.76953125, "logps/chosen": -27.875, "logps/rejected": -20.375, "loss": 0.6947, "loss/demonstration_loss": -252.0, "loss/preference_loss": -258.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.010498046875, "rewards/margins": -0.0111083984375, "rewards/rejected": 0.0006256103515625, "step": 674 }, { "epoch": 0.54, "grad_norm": 1.5796205437294821, "learning_rate": 2.5872487417562527e-07, "logits/chosen": 0.7578125, "logits/rejected": 0.7890625, "logps/chosen": -30.75, "logps/rejected": -27.25, "loss": 0.6914, "loss/demonstration_loss": -920.0, "loss/preference_loss": -920.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0, "rewards/rejected": -0.0050048828125, "step": 675 }, { "epoch": 0.5408, "grad_norm": 1.1943485277473358, "learning_rate": 2.580271346454454e-07, "logits/chosen": 0.7578125, "logits/rejected": 0.73046875, "logps/chosen": -14.5, "logps/rejected": -14.1875, "loss": 0.6907, "loss/demonstration_loss": -226.0, "loss/preference_loss": -228.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004058837890625, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.001251220703125, "step": 676 }, { "epoch": 0.5416, "grad_norm": 0.7901084920835292, "learning_rate": 2.573293325180571e-07, "logits/chosen": 0.8984375, "logits/rejected": 0.9375, "logps/chosen": -29.375, "logps/rejected": -23.375, "loss": 0.6906, "loss/demonstration_loss": -210.0, "loss/preference_loss": -209.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0004730224609375, "rewards/margins": 0.00390625, "rewards/rejected": -0.00439453125, "step": 677 }, { "epoch": 0.5424, "grad_norm": 1.6586234624622922, "learning_rate": 2.566314732350615e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.6796875, "logps/chosen": -12.4375, "logps/rejected": -23.625, "loss": 0.6902, "loss/demonstration_loss": -288.0, "loss/preference_loss": -286.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00031280517578125, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0028076171875, "step": 678 }, { "epoch": 0.5432, "grad_norm": 1.8182277919617185, "learning_rate": 2.5593356223850547e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.73828125, "logps/chosen": -49.5, "logps/rejected": -59.0, "loss": 0.6886, "loss/demonstration_loss": -430.0, "loss/preference_loss": -426.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.01373291015625, "rewards/rejected": -0.02001953125, "step": 679 }, { "epoch": 0.544, "grad_norm": 1.7306254592390489, "learning_rate": 2.5523560497083924e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.6953125, "logps/chosen": -14.3125, "logps/rejected": -11.0625, "loss": 0.6897, "loss/demonstration_loss": -402.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.006256103515625, "step": 680 }, { "epoch": 0.5448, "grad_norm": 1.2565775263997956, "learning_rate": 2.545376068748737e-07, "logits/chosen": 0.953125, "logits/rejected": 0.9140625, "logps/chosen": -30.5, "logps/rejected": -47.75, "loss": 0.692, "loss/demonstration_loss": -254.0, "loss/preference_loss": -251.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.010009765625, "rewards/margins": 0.00872802734375, "rewards/rejected": 0.001251220703125, "step": 681 }, { "epoch": 0.5456, "grad_norm": 1.06035999044165, "learning_rate": 2.538395733937382e-07, "logits/chosen": 0.984375, "logits/rejected": 1.0234375, "logps/chosen": -30.25, "logps/rejected": -27.0, "loss": 0.692, "loss/demonstration_loss": -452.0, "loss/preference_loss": -454.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.0006103515625, "rewards/rejected": -0.00750732421875, "step": 682 }, { "epoch": 0.5464, "grad_norm": 1.025873728261592, "learning_rate": 2.5314150997083816e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.609375, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6918, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 683 }, { "epoch": 0.5472, "grad_norm": 1.2052409603103538, "learning_rate": 2.524434220498123e-07, "logits/chosen": 0.4921875, "logits/rejected": 0.4921875, "logps/chosen": -25.875, "logps/rejected": -26.375, "loss": 0.6921, "loss/demonstration_loss": -418.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00016021728515625, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.004547119140625, "step": 684 }, { "epoch": 0.548, "grad_norm": 1.6663186549160551, "learning_rate": 2.5174531507449037e-07, "logits/chosen": 0.5625, "logits/rejected": 0.75390625, "logps/chosen": -37.5, "logps/rejected": -28.5, "loss": 0.696, "loss/demonstration_loss": -260.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.01251220703125, "rewards/rejected": -0.00031280517578125, "step": 685 }, { "epoch": 0.5488, "grad_norm": 3.0108558895072384, "learning_rate": 2.51047194488851e-07, "logits/chosen": 1.0390625, "logits/rejected": 0.78125, "logps/chosen": -78.0, "logps/rejected": -130.0, "loss": 0.693, "loss/demonstration_loss": -548.0, "loss/preference_loss": -548.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0206298828125, "rewards/margins": 0.01092529296875, "rewards/rejected": -0.031494140625, "step": 686 }, { "epoch": 0.5496, "grad_norm": 7.016895273957017, "learning_rate": 2.503490657369786e-07, "logits/chosen": 0.53125, "logits/rejected": 0.59375, "logps/chosen": -34.75, "logps/rejected": -35.25, "loss": 0.693, "loss/demonstration_loss": -548.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01434326171875, "rewards/margins": -0.0062255859375, "rewards/rejected": -0.00811767578125, "step": 687 }, { "epoch": 0.5504, "grad_norm": 1.6358538906009379, "learning_rate": 2.496509342630214e-07, "logits/chosen": 0.64453125, "logits/rejected": 0.53125, "logps/chosen": -48.5, "logps/rejected": -60.0, "loss": 0.6917, "loss/demonstration_loss": -580.0, "loss/preference_loss": -576.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.003753662109375, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.005615234375, "step": 688 }, { "epoch": 0.5512, "grad_norm": 1.497317084262155, "learning_rate": 2.4895280551114905e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.578125, "logps/chosen": -52.75, "logps/rejected": -53.0, "loss": 0.6924, "loss/demonstration_loss": -560.0, "loss/preference_loss": -560.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005615234375, "rewards/margins": 0.002197265625, "rewards/rejected": -0.0078125, "step": 689 }, { "epoch": 0.552, "grad_norm": 1.6691424441594434, "learning_rate": 2.482546849255096e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.63671875, "logps/chosen": -79.5, "logps/rejected": -82.0, "loss": 0.6943, "loss/demonstration_loss": -364.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.021728515625, "rewards/margins": -0.00823974609375, "rewards/rejected": -0.013427734375, "step": 690 }, { "epoch": 0.5528, "grad_norm": 1.2005192943858138, "learning_rate": 2.475565779501878e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.796875, "logps/chosen": -36.75, "logps/rejected": -29.125, "loss": 0.6934, "loss/demonstration_loss": -260.0, "loss/preference_loss": -262.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": -0.0059814453125, "rewards/rejected": -0.0028076171875, "step": 691 }, { "epoch": 0.5536, "grad_norm": 1.0550249449490605, "learning_rate": 2.468584900291618e-07, "logits/chosen": 0.98046875, "logits/rejected": 0.91015625, "logps/chosen": -17.875, "logps/rejected": -19.125, "loss": 0.6926, "loss/demonstration_loss": -195.0, "loss/preference_loss": -195.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00469970703125, "rewards/margins": -0.000946044921875, "rewards/rejected": -0.003753662109375, "step": 692 }, { "epoch": 0.5544, "grad_norm": 1.0883704993536707, "learning_rate": 2.4616042660626175e-07, "logits/chosen": 0.71484375, "logits/rejected": 0.84765625, "logps/chosen": -40.0, "logps/rejected": -32.5, "loss": 0.6925, "loss/demonstration_loss": -386.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.001251220703125, "step": 693 }, { "epoch": 0.5552, "grad_norm": 0.8438018810036966, "learning_rate": 2.4546239312512633e-07, "logits/chosen": 0.796875, "logits/rejected": 0.79296875, "logps/chosen": -30.0, "logps/rejected": -31.875, "loss": 0.6914, "loss/demonstration_loss": -246.0, "loss/preference_loss": -245.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0029754638671875, "rewards/margins": 0.00140380859375, "rewards/rejected": -0.004364013671875, "step": 694 }, { "epoch": 0.556, "grad_norm": 1.0019590061090333, "learning_rate": 2.447643950291608e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.7890625, "logps/chosen": -36.5, "logps/rejected": -31.5, "loss": 0.6885, "loss/demonstration_loss": -360.0, "loss/preference_loss": -358.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.00482177734375, "rewards/rejected": -0.00860595703125, "step": 695 }, { "epoch": 0.5568, "grad_norm": 1.9865391735353333, "learning_rate": 2.4406643776149456e-07, "logits/chosen": 0.69921875, "logits/rejected": 0.609375, "logps/chosen": -6.40625, "logps/rejected": -11.875, "loss": 0.6918, "loss/demonstration_loss": -146.0, "loss/preference_loss": -146.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.001251220703125, "step": 696 }, { "epoch": 0.5576, "grad_norm": 2.119960223857473, "learning_rate": 2.4336852676493845e-07, "logits/chosen": 0.98046875, "logits/rejected": 0.8515625, "logps/chosen": -48.0, "logps/rejected": -57.25, "loss": 0.6898, "loss/demonstration_loss": -336.0, "loss/preference_loss": -334.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.010009765625, "step": 697 }, { "epoch": 0.5584, "grad_norm": 1.6084789320793025, "learning_rate": 2.426706674819429e-07, "logits/chosen": 1.0, "logits/rejected": 1.078125, "logps/chosen": -58.0, "logps/rejected": -42.5, "loss": 0.6945, "loss/demonstration_loss": -396.0, "loss/preference_loss": -398.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.013427734375, "rewards/margins": -0.00469970703125, "rewards/rejected": -0.0087890625, "step": 698 }, { "epoch": 0.5592, "grad_norm": 1.481602243110687, "learning_rate": 2.419728653545546e-07, "logits/chosen": 0.76953125, "logits/rejected": 0.8125, "logps/chosen": -10.1875, "logps/rejected": -3.375, "loss": 0.6935, "loss/demonstration_loss": -213.0, "loss/preference_loss": -219.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.00390625, "rewards/rejected": 0.00140380859375, "step": 699 }, { "epoch": 0.56, "grad_norm": 1.5163111379739962, "learning_rate": 2.412751258243748e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.6484375, "logps/chosen": -9.625, "logps/rejected": -9.9375, "loss": 0.6907, "loss/demonstration_loss": -316.0, "loss/preference_loss": -310.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.001251220703125, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0018768310546875, "step": 700 }, { "epoch": 0.5608, "grad_norm": 1.8132340231185469, "learning_rate": 2.405774543325163e-07, "logits/chosen": 0.671875, "logits/rejected": 0.58984375, "logps/chosen": -45.25, "logps/rejected": -60.75, "loss": 0.6917, "loss/demonstration_loss": -340.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.0028076171875, "rewards/margins": 0.0072021484375, "rewards/rejected": -0.00439453125, "step": 701 }, { "epoch": 0.5616, "grad_norm": 3.415990877211999, "learning_rate": 2.398798563195619e-07, "logits/chosen": 0.80859375, "logits/rejected": 0.828125, "logps/chosen": -72.0, "logps/rejected": -85.0, "loss": 0.6908, "loss/demonstration_loss": -498.0, "loss/preference_loss": -500.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01031494140625, "rewards/margins": -0.0012664794921875, "rewards/rejected": -0.00909423828125, "step": 702 }, { "epoch": 0.5624, "grad_norm": 1.2969177266125662, "learning_rate": 2.3918233722552073e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.86328125, "logps/chosen": -46.5, "logps/rejected": -49.25, "loss": 0.6921, "loss/demonstration_loss": -378.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.00421142578125, "rewards/rejected": -0.005157470703125, "step": 703 }, { "epoch": 0.5632, "grad_norm": 1.2809830076597712, "learning_rate": 2.384849024897869e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.77734375, "logps/chosen": -35.0, "logps/rejected": -20.25, "loss": 0.6887, "loss/demonstration_loss": -436.0, "loss/preference_loss": -436.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0006103515625, "rewards/rejected": -0.005615234375, "step": 704 }, { "epoch": 0.564, "grad_norm": 1.3747705059854036, "learning_rate": 2.3778755755109667e-07, "logits/chosen": 0.9453125, "logits/rejected": 1.03125, "logps/chosen": -115.5, "logps/rejected": -103.0, "loss": 0.6927, "loss/demonstration_loss": -434.0, "loss/preference_loss": -434.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.00016021728515625, "rewards/rejected": -0.0126953125, "step": 705 }, { "epoch": 0.5648, "grad_norm": 0.9459095068890448, "learning_rate": 2.3709030784748586e-07, "logits/chosen": 0.71484375, "logits/rejected": 0.7890625, "logps/chosen": -17.625, "logps/rejected": -11.25, "loss": 0.6925, "loss/demonstration_loss": -232.0, "loss/preference_loss": -234.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.001251220703125, "rewards/margins": -0.003448486328125, "rewards/rejected": 0.00469970703125, "step": 706 }, { "epoch": 0.5656, "grad_norm": 1.4104807717534769, "learning_rate": 2.3639315881624776e-07, "logits/chosen": 0.890625, "logits/rejected": 0.921875, "logps/chosen": -85.0, "logps/rejected": -78.0, "loss": 0.6923, "loss/demonstration_loss": -432.0, "loss/preference_loss": -432.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.005950927734375, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.0084228515625, "step": 707 }, { "epoch": 0.5664, "grad_norm": 0.7839881260432638, "learning_rate": 2.3569611589389045e-07, "logits/chosen": 0.51953125, "logits/rejected": 0.53125, "logps/chosen": -5.34375, "logps/rejected": -5.53125, "loss": 0.6914, "loss/demonstration_loss": -172.0, "loss/preference_loss": -168.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00156402587890625, "rewards/margins": 0.00186920166015625, "rewards/rejected": -0.0034332275390625, "step": 708 }, { "epoch": 0.5672, "grad_norm": 1.1141522744457317, "learning_rate": 2.3499918451609488e-07, "logits/chosen": 0.703125, "logits/rejected": 0.70703125, "logps/chosen": -37.5, "logps/rejected": -36.25, "loss": 0.6914, "loss/demonstration_loss": -390.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005950927734375, "rewards/margins": -0.00469970703125, "rewards/rejected": -0.001251220703125, "step": 709 }, { "epoch": 0.568, "grad_norm": 1.5437856598243063, "learning_rate": 2.3430237011767164e-07, "logits/chosen": 0.875, "logits/rejected": 0.9140625, "logps/chosen": -54.75, "logps/rejected": -45.0, "loss": 0.6912, "loss/demonstration_loss": -398.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.004058837890625, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0072021484375, "step": 710 }, { "epoch": 0.5688, "grad_norm": 0.5333412967140774, "learning_rate": 2.3360567813251966e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.75, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6908, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 711 }, { "epoch": 0.5696, "grad_norm": 1.3101425240994364, "learning_rate": 2.3290911399358285e-07, "logits/chosen": 0.5234375, "logits/rejected": 0.50390625, "logps/chosen": -34.5, "logps/rejected": -35.75, "loss": 0.6891, "loss/demonstration_loss": -374.0, "loss/preference_loss": -370.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00061798095703125, "rewards/margins": 0.010009765625, "rewards/rejected": -0.0106201171875, "step": 712 }, { "epoch": 0.5704, "grad_norm": 0.15460563084904905, "learning_rate": 2.3221268313280834e-07, "logits/chosen": 0.47265625, "logits/rejected": 0.5, "logps/chosen": -4.03125, "logps/rejected": -4.1875, "loss": 0.691, "loss/demonstration_loss": -134.0, "loss/preference_loss": -131.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.001251220703125, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.00031280517578125, "step": 713 }, { "epoch": 0.5712, "grad_norm": 1.4034012632940982, "learning_rate": 2.3151639098110376e-07, "logits/chosen": 0.875, "logits/rejected": 0.87109375, "logps/chosen": -60.0, "logps/rejected": -63.75, "loss": 0.691, "loss/demonstration_loss": -392.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01470947265625, "rewards/margins": 0.00030517578125, "rewards/rejected": -0.0150146484375, "step": 714 }, { "epoch": 0.572, "grad_norm": 1.111464284300382, "learning_rate": 2.3082024296829532e-07, "logits/chosen": 0.671875, "logits/rejected": 0.76171875, "logps/chosen": -39.5, "logps/rejected": -43.0, "loss": 0.6923, "loss/demonstration_loss": -656.0, "loss/preference_loss": -656.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.006256103515625, "step": 715 }, { "epoch": 0.5728, "grad_norm": 1.4501645587869152, "learning_rate": 2.3012424452308508e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.734375, "logps/chosen": -22.125, "logps/rejected": -5.84375, "loss": 0.6951, "loss/demonstration_loss": -434.0, "loss/preference_loss": -448.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00872802734375, "rewards/margins": -0.00933837890625, "rewards/rejected": 0.0006256103515625, "step": 716 }, { "epoch": 0.5736, "grad_norm": 1.0640110614994867, "learning_rate": 2.2942840107300856e-07, "logits/chosen": 0.41015625, "logits/rejected": 0.40234375, "logps/chosen": -34.75, "logps/rejected": -37.25, "loss": 0.6896, "loss/demonstration_loss": -384.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.00156402587890625, "rewards/margins": 0.01092529296875, "rewards/rejected": -0.0093994140625, "step": 717 }, { "epoch": 0.5744, "grad_norm": 3.614978819266233, "learning_rate": 2.2873271804439297e-07, "logits/chosen": 0.828125, "logits/rejected": 0.7578125, "logps/chosen": -36.0, "logps/rejected": -43.0, "loss": 0.6891, "loss/demonstration_loss": -316.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.0010986328125, "rewards/margins": 0.0145263671875, "rewards/rejected": -0.013427734375, "step": 718 }, { "epoch": 0.5752, "grad_norm": 1.2189337935678328, "learning_rate": 2.280372008623142e-07, "logits/chosen": 0.91796875, "logits/rejected": 0.76171875, "logps/chosen": -25.0, "logps/rejected": -42.0, "loss": 0.6912, "loss/demonstration_loss": -268.0, "loss/preference_loss": -268.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0009307861328125, "rewards/margins": 0.001861572265625, "rewards/rejected": -0.0009307861328125, "step": 719 }, { "epoch": 0.576, "grad_norm": 0.9832080427539421, "learning_rate": 2.2734185495055498e-07, "logits/chosen": 0.8125, "logits/rejected": 0.65625, "logps/chosen": -9.1875, "logps/rejected": -34.75, "loss": 0.6904, "loss/demonstration_loss": -350.0, "loss/preference_loss": -346.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0020294189453125, "rewards/margins": 0.005126953125, "rewards/rejected": -0.007171630859375, "step": 720 }, { "epoch": 0.5768, "grad_norm": 0.5431491541518665, "learning_rate": 2.2664668573156237e-07, "logits/chosen": 0.640625, "logits/rejected": 0.6328125, "logps/chosen": -43.5, "logps/rejected": -43.5, "loss": 0.6918, "loss/demonstration_loss": -462.0, "loss/preference_loss": -462.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.000640869140625, "rewards/rejected": -0.00439453125, "step": 721 }, { "epoch": 0.5776, "grad_norm": 1.5657697343340136, "learning_rate": 2.2595169862640567e-07, "logits/chosen": 0.85546875, "logits/rejected": 0.73046875, "logps/chosen": -51.5, "logps/rejected": -81.5, "loss": 0.6868, "loss/demonstration_loss": -354.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.002349853515625, "rewards/margins": 0.01300048828125, "rewards/rejected": -0.01531982421875, "step": 722 }, { "epoch": 0.5784, "grad_norm": 0.726066597430071, "learning_rate": 2.2525689905473375e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.62890625, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6895, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 723 }, { "epoch": 0.5792, "grad_norm": 1.093713831270068, "learning_rate": 2.2456229243473344e-07, "logits/chosen": 0.3046875, "logits/rejected": 0.279296875, "logps/chosen": -15.3125, "logps/rejected": -18.75, "loss": 0.6903, "loss/demonstration_loss": -272.0, "loss/preference_loss": -272.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0006256103515625, "step": 724 }, { "epoch": 0.58, "grad_norm": 1.9843356147019682, "learning_rate": 2.2386788418308665e-07, "logits/chosen": 0.828125, "logits/rejected": 1.015625, "logps/chosen": -104.0, "logps/rejected": -93.0, "loss": 0.6884, "loss/demonstration_loss": -446.0, "loss/preference_loss": -446.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.02001953125, "rewards/margins": -0.00131988525390625, "rewards/rejected": -0.018798828125, "step": 725 }, { "epoch": 0.5808, "grad_norm": 1.2444699112656237, "learning_rate": 2.2317367971492832e-07, "logits/chosen": 0.9375, "logits/rejected": 0.91015625, "logps/chosen": -33.75, "logps/rejected": -39.25, "loss": 0.6906, "loss/demonstration_loss": -386.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.00592041015625, "rewards/rejected": -0.0128173828125, "step": 726 }, { "epoch": 0.5816, "grad_norm": 1.0750630874731386, "learning_rate": 2.2247968444380448e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.86328125, "logps/chosen": -49.5, "logps/rejected": -48.0, "loss": 0.6898, "loss/demonstration_loss": -776.0, "loss/preference_loss": -772.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.010009765625, "step": 727 }, { "epoch": 0.5824, "grad_norm": 2.2679127299897006, "learning_rate": 2.2178590378162955e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.81640625, "logps/chosen": -47.0, "logps/rejected": -44.25, "loss": 0.692, "loss/demonstration_loss": -360.0, "loss/preference_loss": -362.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.013427734375, "rewards/margins": -0.005889892578125, "rewards/rejected": -0.00750732421875, "step": 728 }, { "epoch": 0.5832, "grad_norm": 1.2361983308303337, "learning_rate": 2.2109234313864463e-07, "logits/chosen": 0.8046875, "logits/rejected": 0.7890625, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6913, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 729 }, { "epoch": 0.584, "grad_norm": 1.3552442782569134, "learning_rate": 2.2039900792337474e-07, "logits/chosen": 0.486328125, "logits/rejected": 0.5234375, "logps/chosen": -51.25, "logps/rejected": -42.0, "loss": 0.6943, "loss/demonstration_loss": -368.0, "loss/preference_loss": -368.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.013427734375, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.00970458984375, "step": 730 }, { "epoch": 0.5848, "grad_norm": 1.6122002047462358, "learning_rate": 2.1970590354258743e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.63671875, "logps/chosen": -53.0, "logps/rejected": -59.0, "loss": 0.6919, "loss/demonstration_loss": -356.0, "loss/preference_loss": -356.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0098876953125, "rewards/margins": -0.00125885009765625, "rewards/rejected": -0.00860595703125, "step": 731 }, { "epoch": 0.5856, "grad_norm": 1.383570177698159, "learning_rate": 2.1901303540124954e-07, "logits/chosen": 0.640625, "logits/rejected": 0.52734375, "logps/chosen": -45.75, "logps/rejected": -54.25, "loss": 0.6907, "loss/demonstration_loss": -396.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0106201171875, "rewards/margins": 0.0078125, "rewards/rejected": -0.0184326171875, "step": 732 }, { "epoch": 0.5864, "grad_norm": 1.567411370936918, "learning_rate": 2.1832040890248638e-07, "logits/chosen": 0.58984375, "logits/rejected": 0.60546875, "logps/chosen": -81.5, "logps/rejected": -79.0, "loss": 0.694, "loss/demonstration_loss": -510.0, "loss/preference_loss": -512.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.005615234375, "rewards/rejected": -0.00811767578125, "step": 733 }, { "epoch": 0.5872, "grad_norm": 1.6139394597227597, "learning_rate": 2.1762802944753827e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.8984375, "logps/chosen": -81.5, "logps/rejected": -72.0, "loss": 0.6898, "loss/demonstration_loss": -408.0, "loss/preference_loss": -406.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01092529296875, "rewards/margins": 0.002960205078125, "rewards/rejected": -0.013916015625, "step": 734 }, { "epoch": 0.588, "grad_norm": 1.9797010354284708, "learning_rate": 2.1693590243571935e-07, "logits/chosen": 0.6171875, "logits/rejected": 0.6953125, "logps/chosen": -39.5, "logps/rejected": -36.5, "loss": 0.6942, "loss/demonstration_loss": -300.0, "loss/preference_loss": -302.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.004730224609375, "rewards/rejected": -0.00592041015625, "step": 735 }, { "epoch": 0.5888, "grad_norm": 1.0221815815787014, "learning_rate": 2.162440332643752e-07, "logits/chosen": 0.3984375, "logits/rejected": 0.54296875, "logps/chosen": -46.5, "logps/rejected": -35.25, "loss": 0.6921, "loss/demonstration_loss": -432.0, "loss/preference_loss": -434.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.002960205078125, "rewards/rejected": -0.005157470703125, "step": 736 }, { "epoch": 0.5896, "grad_norm": 0.7976166415913162, "learning_rate": 2.155524273288405e-07, "logits/chosen": 0.953125, "logits/rejected": 1.0390625, "logps/chosen": -47.75, "logps/rejected": -34.5, "loss": 0.6906, "loss/demonstration_loss": -436.0, "loss/preference_loss": -434.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.001861572265625, "rewards/rejected": -0.00811767578125, "step": 737 }, { "epoch": 0.5904, "grad_norm": 1.6801006060366823, "learning_rate": 2.1486109002239726e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.72265625, "logps/chosen": -67.0, "logps/rejected": -70.0, "loss": 0.692, "loss/demonstration_loss": -724.0, "loss/preference_loss": -724.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0150146484375, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.01251220703125, "step": 738 }, { "epoch": 0.5912, "grad_norm": 1.8073147611405276, "learning_rate": 2.1417002673623263e-07, "logits/chosen": 0.875, "logits/rejected": 0.72265625, "logps/chosen": -45.0, "logps/rejected": -63.0, "loss": 0.6908, "loss/demonstration_loss": -342.0, "loss/preference_loss": -344.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01019287109375, "rewards/margins": -0.00579833984375, "rewards/rejected": -0.00439453125, "step": 739 }, { "epoch": 0.592, "grad_norm": 0.9794918590070832, "learning_rate": 2.1347924285939712e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.6328125, "logps/chosen": -3.390625, "logps/rejected": -3.46875, "loss": 0.6936, "loss/demonstration_loss": -110.0, "loss/preference_loss": -109.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.000156402587890625, "rewards/margins": 0.000782012939453125, "rewards/rejected": -0.0006256103515625, "step": 740 }, { "epoch": 0.5928, "grad_norm": 1.2347190237195829, "learning_rate": 2.1278874377876194e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.51953125, "logps/chosen": -5.8125, "logps/rejected": -24.625, "loss": 0.6921, "loss/demonstration_loss": -246.0, "loss/preference_loss": -242.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00360107421875, "rewards/margins": 0.00689697265625, "rewards/rejected": -0.0032806396484375, "step": 741 }, { "epoch": 0.5936, "grad_norm": 1.9211281756620884, "learning_rate": 2.1209853487897782e-07, "logits/chosen": 0.796875, "logits/rejected": 0.8046875, "logps/chosen": -53.75, "logps/rejected": -52.25, "loss": 0.6902, "loss/demonstration_loss": -420.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.00079345703125, "rewards/rejected": -0.01202392578125, "step": 742 }, { "epoch": 0.5944, "grad_norm": 0.8713818056783998, "learning_rate": 2.1140862154243218e-07, "logits/chosen": 0.53125, "logits/rejected": 0.5625, "logps/chosen": -13.9375, "logps/rejected": -7.78125, "loss": 0.6919, "loss/demonstration_loss": -171.0, "loss/preference_loss": -174.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003448486328125, "rewards/margins": -0.004058837890625, "rewards/rejected": 0.0006256103515625, "step": 743 }, { "epoch": 0.5952, "grad_norm": 0.9471793284790981, "learning_rate": 2.1071900914920814e-07, "logits/chosen": 0.8203125, "logits/rejected": 0.83984375, "logps/chosen": -27.75, "logps/rejected": -22.0, "loss": 0.6919, "loss/demonstration_loss": -394.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.00689697265625, "step": 744 }, { "epoch": 0.596, "grad_norm": 1.2096857598750612, "learning_rate": 2.100297030770413e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.7109375, "logps/chosen": -30.25, "logps/rejected": -30.25, "loss": 0.6884, "loss/demonstration_loss": -482.0, "loss/preference_loss": -482.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00311279296875, "rewards/margins": 0.000640869140625, "rewards/rejected": -0.003753662109375, "step": 745 }, { "epoch": 0.5968, "grad_norm": 1.6967019021413463, "learning_rate": 2.0934070870127909e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.67578125, "logps/chosen": -40.5, "logps/rejected": -40.0, "loss": 0.6929, "loss/demonstration_loss": -424.0, "loss/preference_loss": -426.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.00811767578125, "step": 746 }, { "epoch": 0.5976, "grad_norm": 1.1328862118773013, "learning_rate": 2.086520313948381e-07, "logits/chosen": 0.78515625, "logits/rejected": 0.78515625, "logps/chosen": -31.375, "logps/rejected": -29.125, "loss": 0.6923, "loss/demonstration_loss": -322.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.0028076171875, "rewards/rejected": -0.0021820068359375, "step": 747 }, { "epoch": 0.5984, "grad_norm": 1.5851137726931162, "learning_rate": 2.079636765281621e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.69921875, "logps/chosen": -54.0, "logps/rejected": -47.0, "loss": 0.694, "loss/demonstration_loss": -532.0, "loss/preference_loss": -532.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0118408203125, "rewards/margins": -0.001861572265625, "rewards/rejected": -0.010009765625, "step": 748 }, { "epoch": 0.5992, "grad_norm": 1.1312719004618215, "learning_rate": 2.0727564946918085e-07, "logits/chosen": 0.5078125, "logits/rejected": 0.494140625, "logps/chosen": -36.5, "logps/rejected": -36.5, "loss": 0.6914, "loss/demonstration_loss": -388.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00156402587890625, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.003448486328125, "step": 749 }, { "epoch": 0.6, "grad_norm": 0.5524452317603821, "learning_rate": 2.065879555832674e-07, "logits/chosen": 0.546875, "logits/rejected": 0.578125, "logps/chosen": -18.375, "logps/rejected": -17.625, "loss": 0.6921, "loss/demonstration_loss": -190.0, "loss/preference_loss": -190.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0031280517578125, "step": 750 }, { "epoch": 0.6008, "grad_norm": 1.803974186040543, "learning_rate": 2.0590060023319695e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.72265625, "logps/chosen": -4.625, "logps/rejected": -6.0, "loss": 0.6924, "loss/demonstration_loss": -168.0, "loss/preference_loss": -168.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00093841552734375, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.00156402587890625, "step": 751 }, { "epoch": 0.6016, "grad_norm": 0.5795357781642441, "learning_rate": 2.0521358877910441e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.671875, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6925, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 752 }, { "epoch": 0.6024, "grad_norm": 1.767126861597063, "learning_rate": 2.045269265784433e-07, "logits/chosen": 0.734375, "logits/rejected": 0.83984375, "logps/chosen": -57.25, "logps/rejected": -48.5, "loss": 0.6864, "loss/demonstration_loss": -338.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0004730224609375, "rewards/margins": 0.01904296875, "rewards/rejected": -0.01953125, "step": 753 }, { "epoch": 0.6032, "grad_norm": 0.9675276450074874, "learning_rate": 2.038406189859433e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.62109375, "logps/chosen": -23.75, "logps/rejected": -23.5, "loss": 0.6918, "loss/demonstration_loss": -748.0, "loss/preference_loss": -752.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.00250244140625, "step": 754 }, { "epoch": 0.604, "grad_norm": 1.1146034588657845, "learning_rate": 2.0315467135356878e-07, "logits/chosen": 0.69140625, "logits/rejected": 0.6796875, "logps/chosen": -7.84375, "logps/rejected": -7.84375, "loss": 0.6881, "loss/demonstration_loss": -247.0, "loss/preference_loss": -247.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0, "rewards/rejected": -0.00250244140625, "step": 755 }, { "epoch": 0.6048, "grad_norm": 0.7711131335257401, "learning_rate": 2.0246908903047748e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.6875, "logps/chosen": -5.21875, "logps/rejected": -4.8125, "loss": 0.6909, "loss/demonstration_loss": -162.0, "loss/preference_loss": -162.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.00093841552734375, "rewards/margins": -0.00031280517578125, "rewards/rejected": 0.001251220703125, "step": 756 }, { "epoch": 0.6056, "grad_norm": 1.4505827136978513, "learning_rate": 2.0178387736297768e-07, "logits/chosen": 0.71875, "logits/rejected": 0.78125, "logps/chosen": -58.25, "logps/rejected": -55.25, "loss": 0.691, "loss/demonstration_loss": -360.0, "loss/preference_loss": -362.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005462646484375, "rewards/margins": 0.0012359619140625, "rewards/rejected": -0.0067138671875, "step": 757 }, { "epoch": 0.6064, "grad_norm": 0.9681734338164087, "learning_rate": 2.0109904169448796e-07, "logits/chosen": 0.59375, "logits/rejected": 0.5234375, "logps/chosen": -44.5, "logps/rejected": -45.0, "loss": 0.6926, "loss/demonstration_loss": -470.0, "loss/preference_loss": -472.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.00341796875, "rewards/rejected": -0.0093994140625, "step": 758 }, { "epoch": 0.6072, "grad_norm": 1.2061830865088083, "learning_rate": 2.004145873654942e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.84765625, "logps/chosen": -11.0, "logps/rejected": -7.65625, "loss": 0.691, "loss/demonstration_loss": -292.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.001251220703125, "step": 759 }, { "epoch": 0.608, "grad_norm": 1.5746044934915364, "learning_rate": 1.9973051971350888e-07, "logits/chosen": 0.53125, "logits/rejected": 0.66015625, "logps/chosen": -52.5, "logps/rejected": -43.25, "loss": 0.6947, "loss/demonstration_loss": -506.0, "loss/preference_loss": -508.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.00689697265625, "step": 760 }, { "epoch": 0.6088, "grad_norm": 1.6294157898667747, "learning_rate": 1.9904684407302878e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.64453125, "logps/chosen": -105.5, "logps/rejected": -102.0, "loss": 0.6904, "loss/demonstration_loss": -660.0, "loss/preference_loss": -660.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.00811767578125, "rewards/rejected": -0.0162353515625, "step": 761 }, { "epoch": 0.6096, "grad_norm": 1.6431520253661818, "learning_rate": 1.9836356577549418e-07, "logits/chosen": 0.93359375, "logits/rejected": 0.95703125, "logps/chosen": -34.75, "logps/rejected": -37.0, "loss": 0.693, "loss/demonstration_loss": -227.0, "loss/preference_loss": -228.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00830078125, "rewards/margins": -0.00048828125, "rewards/rejected": -0.0078125, "step": 762 }, { "epoch": 0.6104, "grad_norm": 1.1810975325201682, "learning_rate": 1.9768069014924622e-07, "logits/chosen": 0.546875, "logits/rejected": 0.546875, "logps/chosen": -49.5, "logps/rejected": -49.5, "loss": 0.6926, "loss/demonstration_loss": -312.0, "loss/preference_loss": -314.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.00093841552734375, "rewards/rejected": -0.0084228515625, "step": 763 }, { "epoch": 0.6112, "grad_norm": 1.4723758989337616, "learning_rate": 1.9699822251948638e-07, "logits/chosen": 0.439453125, "logits/rejected": 0.55078125, "logps/chosen": -68.0, "logps/rejected": -62.25, "loss": 0.6907, "loss/demonstration_loss": -346.0, "loss/preference_loss": -342.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.006561279296875, "rewards/margins": 0.00872802734375, "rewards/rejected": -0.01531982421875, "step": 764 }, { "epoch": 0.612, "grad_norm": 1.0177414838908159, "learning_rate": 1.9631616820823418e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.66796875, "logps/chosen": -15.1875, "logps/rejected": -23.625, "loss": 0.6881, "loss/demonstration_loss": -312.0, "loss/preference_loss": -310.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00156402587890625, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.00093841552734375, "step": 765 }, { "epoch": 0.6128, "grad_norm": 1.745455824332646, "learning_rate": 1.956345325342863e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.671875, "logps/chosen": -43.0, "logps/rejected": -35.75, "loss": 0.6904, "loss/demonstration_loss": -416.0, "loss/preference_loss": -418.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.00439453125, "step": 766 }, { "epoch": 0.6136, "grad_norm": 1.379616650175435, "learning_rate": 1.9495332081317461e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.52734375, "logps/chosen": -65.5, "logps/rejected": -73.0, "loss": 0.6927, "loss/demonstration_loss": -438.0, "loss/preference_loss": -440.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.015625, "rewards/margins": -0.00311279296875, "rewards/rejected": -0.01251220703125, "step": 767 }, { "epoch": 0.6144, "grad_norm": 1.4079617064532994, "learning_rate": 1.9427253835712487e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.58203125, "logps/chosen": -59.5, "logps/rejected": -66.0, "loss": 0.6898, "loss/demonstration_loss": -400.0, "loss/preference_loss": -398.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.005950927734375, "rewards/margins": 0.00341796875, "rewards/rejected": -0.0093994140625, "step": 768 }, { "epoch": 0.6152, "grad_norm": 1.606422978711204, "learning_rate": 1.9359219047501563e-07, "logits/chosen": 0.6171875, "logits/rejected": 0.63671875, "logps/chosen": -61.25, "logps/rejected": -61.0, "loss": 0.6901, "loss/demonstration_loss": -486.0, "loss/preference_loss": -486.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.0012359619140625, "rewards/rejected": -0.0084228515625, "step": 769 }, { "epoch": 0.616, "grad_norm": 2.306863509647201, "learning_rate": 1.9291228247233603e-07, "logits/chosen": 1.1171875, "logits/rejected": 1.1328125, "logps/chosen": -91.0, "logps/rejected": -80.0, "loss": 0.6912, "loss/demonstration_loss": -452.0, "loss/preference_loss": -452.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0167236328125, "rewards/margins": -0.001434326171875, "rewards/rejected": -0.01531982421875, "step": 770 }, { "epoch": 0.6168, "grad_norm": 1.5152093864576677, "learning_rate": 1.9223281965114557e-07, "logits/chosen": 0.85546875, "logits/rejected": 1.0859375, "logps/chosen": -68.0, "logps/rejected": -42.75, "loss": 0.6923, "loss/demonstration_loss": -352.0, "loss/preference_loss": -354.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.00061798095703125, "rewards/rejected": -0.00750732421875, "step": 771 }, { "epoch": 0.6176, "grad_norm": 1.1775790179892787, "learning_rate": 1.915538073100316e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.5546875, "logps/chosen": -78.0, "logps/rejected": -77.5, "loss": 0.6929, "loss/demonstration_loss": -616.0, "loss/preference_loss": -620.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.005615234375, "rewards/rejected": -0.003753662109375, "step": 772 }, { "epoch": 0.6184, "grad_norm": 1.391869204622441, "learning_rate": 1.9087525074406887e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.7734375, "logps/chosen": -26.25, "logps/rejected": -25.25, "loss": 0.6921, "loss/demonstration_loss": -408.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00156402587890625, "rewards/rejected": -0.003448486328125, "step": 773 }, { "epoch": 0.6192, "grad_norm": 0.9800313283068981, "learning_rate": 1.9019715524477767e-07, "logits/chosen": 0.625, "logits/rejected": 0.6171875, "logps/chosen": -18.75, "logps/rejected": -23.0, "loss": 0.6904, "loss/demonstration_loss": -167.0, "loss/preference_loss": -165.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.000782012939453125, "rewards/margins": 0.006561279296875, "rewards/rejected": -0.00579833984375, "step": 774 }, { "epoch": 0.62, "grad_norm": 1.135183123031107, "learning_rate": 1.895195261000831e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.6484375, "logps/chosen": -66.5, "logps/rejected": -63.5, "loss": 0.694, "loss/demonstration_loss": -516.0, "loss/preference_loss": -520.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.010009765625, "rewards/margins": -0.004058837890625, "rewards/rejected": -0.005950927734375, "step": 775 }, { "epoch": 0.6208, "grad_norm": 1.5040274130312057, "learning_rate": 1.8884236859427318e-07, "logits/chosen": 0.640625, "logits/rejected": 0.66015625, "logps/chosen": -27.625, "logps/rejected": -18.25, "loss": 0.692, "loss/demonstration_loss": -356.0, "loss/preference_loss": -364.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.00909423828125, "rewards/rejected": -0.004058837890625, "step": 776 }, { "epoch": 0.6216, "grad_norm": 0.8485461861864265, "learning_rate": 1.8816568800795822e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.58203125, "logps/chosen": -24.75, "logps/rejected": -24.625, "loss": 0.6921, "loss/demonstration_loss": -262.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00156402587890625, "rewards/margins": -0.00156402587890625, "rewards/rejected": 0.0, "step": 777 }, { "epoch": 0.6224, "grad_norm": 1.4476936528082165, "learning_rate": 1.8748948961802946e-07, "logits/chosen": 0.8125, "logits/rejected": 0.84765625, "logps/chosen": -38.0, "logps/rejected": -35.25, "loss": 0.6885, "loss/demonstration_loss": -388.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.010009765625, "step": 778 }, { "epoch": 0.6232, "grad_norm": 0.7409253655616747, "learning_rate": 1.8681377869761767e-07, "logits/chosen": 0.796875, "logits/rejected": 0.796875, "logps/chosen": -21.75, "logps/rejected": -21.875, "loss": 0.6915, "loss/demonstration_loss": -692.0, "loss/preference_loss": -688.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0050048828125, "step": 779 }, { "epoch": 0.624, "grad_norm": 1.0719439260376407, "learning_rate": 1.861385605160524e-07, "logits/chosen": 0.50390625, "logits/rejected": 0.427734375, "logps/chosen": -7.125, "logps/rejected": -7.65625, "loss": 0.6901, "loss/demonstration_loss": -235.0, "loss/preference_loss": -225.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00093841552734375, "rewards/margins": 0.006256103515625, "rewards/rejected": -0.0072021484375, "step": 780 }, { "epoch": 0.6248, "grad_norm": 1.3784998980173926, "learning_rate": 1.854638403388206e-07, "logits/chosen": 0.55078125, "logits/rejected": 0.5625, "logps/chosen": -57.5, "logps/rejected": -67.5, "loss": 0.6935, "loss/demonstration_loss": -496.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.0003204345703125, "rewards/rejected": -0.013427734375, "step": 781 }, { "epoch": 0.6256, "grad_norm": 1.0564609745127846, "learning_rate": 1.8478962342752584e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.8515625, "logps/chosen": -6.0625, "logps/rejected": -5.9375, "loss": 0.6919, "loss/demonstration_loss": -189.0, "loss/preference_loss": -191.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0006256103515625, "step": 782 }, { "epoch": 0.6264, "grad_norm": 1.4302768967672004, "learning_rate": 1.8411591503984685e-07, "logits/chosen": 0.64453125, "logits/rejected": 0.64453125, "logps/chosen": -81.0, "logps/rejected": -83.0, "loss": 0.6875, "loss/demonstration_loss": -648.0, "loss/preference_loss": -644.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0137939453125, "rewards/margins": 0.0162353515625, "rewards/rejected": -0.030029296875, "step": 783 }, { "epoch": 0.6272, "grad_norm": 1.334937742617813, "learning_rate": 1.8344272042949721e-07, "logits/chosen": 0.81640625, "logits/rejected": 0.81640625, "logps/chosen": -27.75, "logps/rejected": -24.5, "loss": 0.692, "loss/demonstration_loss": -412.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.002197265625, "step": 784 }, { "epoch": 0.628, "grad_norm": 1.3924865716053223, "learning_rate": 1.8277004484618357e-07, "logits/chosen": 0.87890625, "logits/rejected": 0.8125, "logps/chosen": -21.875, "logps/rejected": -27.0, "loss": 0.692, "loss/demonstration_loss": -260.0, "loss/preference_loss": -258.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.000469207763671875, "rewards/margins": 0.00421142578125, "rewards/rejected": -0.00469970703125, "step": 785 }, { "epoch": 0.6288, "grad_norm": 1.6439205051975556, "learning_rate": 1.8209789353556525e-07, "logits/chosen": 0.546875, "logits/rejected": 0.57421875, "logps/chosen": -96.5, "logps/rejected": -89.5, "loss": 0.6908, "loss/demonstration_loss": -492.0, "loss/preference_loss": -492.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0159912109375, "rewards/margins": 0.0010986328125, "rewards/rejected": -0.01708984375, "step": 786 }, { "epoch": 0.6296, "grad_norm": 1.370630523651426, "learning_rate": 1.8142627173921339e-07, "logits/chosen": 0.76171875, "logits/rejected": 0.75390625, "logps/chosen": -15.8125, "logps/rejected": -17.375, "loss": 0.6947, "loss/demonstration_loss": -260.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": -0.00408935546875, "rewards/rejected": -0.00156402587890625, "step": 787 }, { "epoch": 0.6304, "grad_norm": 1.4922687890617476, "learning_rate": 1.807551846945694e-07, "logits/chosen": 0.8984375, "logits/rejected": 0.890625, "logps/chosen": -38.75, "logps/rejected": -34.75, "loss": 0.6924, "loss/demonstration_loss": -386.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.0062255859375, "rewards/rejected": -0.005615234375, "step": 788 }, { "epoch": 0.6312, "grad_norm": 1.4238547358447895, "learning_rate": 1.8008463763490505e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.90625, "logps/chosen": -48.25, "logps/rejected": -21.375, "loss": 0.6931, "loss/demonstration_loss": -366.0, "loss/preference_loss": -370.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0118408203125, "rewards/margins": -0.0072021484375, "rewards/rejected": -0.00469970703125, "step": 789 }, { "epoch": 0.632, "grad_norm": 1.2693455572398646, "learning_rate": 1.7941463578928083e-07, "logits/chosen": 0.59765625, "logits/rejected": 0.6171875, "logps/chosen": -27.25, "logps/rejected": -26.0, "loss": 0.6908, "loss/demonstration_loss": -282.0, "loss/preference_loss": -282.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.00032806396484375, "rewards/rejected": -0.002166748046875, "step": 790 }, { "epoch": 0.6328, "grad_norm": 1.0665630326221258, "learning_rate": 1.7874518438250595e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.61328125, "logps/chosen": -37.0, "logps/rejected": -41.5, "loss": 0.6918, "loss/demonstration_loss": -624.0, "loss/preference_loss": -616.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0087890625, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.01251220703125, "step": 791 }, { "epoch": 0.6336, "grad_norm": 1.6019030368429992, "learning_rate": 1.7807628863509683e-07, "logits/chosen": 0.8125, "logits/rejected": 1.0703125, "logps/chosen": -42.75, "logps/rejected": -24.875, "loss": 0.6917, "loss/demonstration_loss": -358.0, "loss/preference_loss": -360.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00531005859375, "rewards/margins": -0.00173187255859375, "rewards/rejected": -0.003570556640625, "step": 792 }, { "epoch": 0.6344, "grad_norm": 1.2978912702946952, "learning_rate": 1.774079537632369e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.6875, "logps/chosen": -39.0, "logps/rejected": -36.5, "loss": 0.6904, "loss/demonstration_loss": -300.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00030517578125, "rewards/rejected": -0.00469970703125, "step": 793 }, { "epoch": 0.6352, "grad_norm": 1.0993093678228523, "learning_rate": 1.7674018497873565e-07, "logits/chosen": 0.78125, "logits/rejected": 0.77734375, "logps/chosen": -31.0, "logps/rejected": -31.375, "loss": 0.6958, "loss/demonstration_loss": -328.0, "loss/preference_loss": -330.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.002349853515625, "rewards/rejected": -0.005157470703125, "step": 794 }, { "epoch": 0.636, "grad_norm": 1.7720349685417234, "learning_rate": 1.760729874889884e-07, "logits/chosen": 0.69921875, "logits/rejected": 0.796875, "logps/chosen": -55.75, "logps/rejected": -52.5, "loss": 0.6879, "loss/demonstration_loss": -344.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01080322265625, "rewards/margins": 0.0078125, "rewards/rejected": -0.0185546875, "step": 795 }, { "epoch": 0.6368, "grad_norm": 1.400253435201845, "learning_rate": 1.7540636649693494e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.75390625, "logps/chosen": -12.1875, "logps/rejected": -6.625, "loss": 0.6923, "loss/demonstration_loss": -302.0, "loss/preference_loss": -302.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0006256103515625, "rewards/margins": -0.00031280517578125, "rewards/rejected": 0.00093841552734375, "step": 796 }, { "epoch": 0.6376, "grad_norm": 1.0296800292099646, "learning_rate": 1.7474032720101988e-07, "logits/chosen": 0.703125, "logits/rejected": 0.73046875, "logps/chosen": -40.5, "logps/rejected": -38.5, "loss": 0.6888, "loss/demonstration_loss": -418.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00531005859375, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.0084228515625, "step": 797 }, { "epoch": 0.6384, "grad_norm": 1.530703856844397, "learning_rate": 1.7407487479515146e-07, "logits/chosen": 0.58984375, "logits/rejected": 0.71875, "logps/chosen": -46.75, "logps/rejected": -31.125, "loss": 0.6973, "loss/demonstration_loss": -406.0, "loss/preference_loss": -414.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.014404296875, "rewards/margins": -0.0115966796875, "rewards/rejected": -0.0028076171875, "step": 798 }, { "epoch": 0.6392, "grad_norm": 1.4141018080623653, "learning_rate": 1.73410014468661e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.796875, "logps/chosen": -20.0, "logps/rejected": -13.625, "loss": 0.6913, "loss/demonstration_loss": -272.0, "loss/preference_loss": -272.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.004058837890625, "rewards/margins": 0.00156402587890625, "rewards/rejected": 0.00250244140625, "step": 799 }, { "epoch": 0.64, "grad_norm": 1.114062039095475, "learning_rate": 1.7274575140626315e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.55078125, "logps/chosen": -35.25, "logps/rejected": -34.25, "loss": 0.6935, "loss/demonstration_loss": -276.0, "loss/preference_loss": -278.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005950927734375, "rewards/margins": -0.0037689208984375, "rewards/rejected": -0.002197265625, "step": 800 }, { "epoch": 0.6408, "grad_norm": 1.6468998052273758, "learning_rate": 1.7208209078801452e-07, "logits/chosen": 0.6875, "logits/rejected": 0.65625, "logps/chosen": -38.25, "logps/rejected": -37.75, "loss": 0.691, "loss/demonstration_loss": -404.0, "loss/preference_loss": -400.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0035858154296875, "rewards/margins": 0.0059814453125, "rewards/rejected": -0.009521484375, "step": 801 }, { "epoch": 0.6416, "grad_norm": 1.2436005086924389, "learning_rate": 1.7141903778927405e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.734375, "logps/chosen": -46.5, "logps/rejected": -70.5, "loss": 0.6929, "loss/demonstration_loss": -616.0, "loss/preference_loss": -620.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.0024871826171875, "rewards/rejected": -0.0093994140625, "step": 802 }, { "epoch": 0.6424, "grad_norm": 0.9203774718577699, "learning_rate": 1.7075659758066204e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.625, "logps/chosen": -45.0, "logps/rejected": -46.25, "loss": 0.6924, "loss/demonstration_loss": -486.0, "loss/preference_loss": -486.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00360107421875, "rewards/margins": -0.002349853515625, "rewards/rejected": -0.001251220703125, "step": 803 }, { "epoch": 0.6432, "grad_norm": 1.5591536719754442, "learning_rate": 1.7009477532802052e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.6015625, "logps/chosen": -32.75, "logps/rejected": -39.0, "loss": 0.6915, "loss/demonstration_loss": -572.0, "loss/preference_loss": -572.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00156402587890625, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.00250244140625, "step": 804 }, { "epoch": 0.644, "grad_norm": 1.4513036483127368, "learning_rate": 1.6943357619237225e-07, "logits/chosen": 0.7890625, "logits/rejected": 0.765625, "logps/chosen": -36.5, "logps/rejected": -44.0, "loss": 0.6959, "loss/demonstration_loss": -428.0, "loss/preference_loss": -428.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.005615234375, "step": 805 }, { "epoch": 0.6448, "grad_norm": 1.7476726289696796, "learning_rate": 1.6877300532988092e-07, "logits/chosen": 0.84375, "logits/rejected": 0.89453125, "logps/chosen": -79.0, "logps/rejected": -74.5, "loss": 0.6924, "loss/demonstration_loss": -404.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01531982421875, "rewards/margins": 0.002471923828125, "rewards/rejected": -0.017822265625, "step": 806 }, { "epoch": 0.6456, "grad_norm": 1.5337317038711051, "learning_rate": 1.6811306789181078e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.57421875, "logps/chosen": -9.8125, "logps/rejected": -24.125, "loss": 0.692, "loss/demonstration_loss": -272.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0010986328125, "rewards/margins": 0.00701904296875, "rewards/rejected": -0.005950927734375, "step": 807 }, { "epoch": 0.6464, "grad_norm": 1.4798451538218524, "learning_rate": 1.6745376902448655e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.68359375, "logps/chosen": -49.5, "logps/rejected": -42.25, "loss": 0.6935, "loss/demonstration_loss": -364.0, "loss/preference_loss": -364.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.009521484375, "rewards/margins": -0.002960205078125, "rewards/rejected": -0.006561279296875, "step": 808 }, { "epoch": 0.6472, "grad_norm": 1.1412097928927454, "learning_rate": 1.6679511386925334e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.50390625, "logps/chosen": -43.75, "logps/rejected": -43.5, "loss": 0.6926, "loss/demonstration_loss": -462.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006561279296875, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.003753662109375, "step": 809 }, { "epoch": 0.648, "grad_norm": 1.8774652314987879, "learning_rate": 1.6613710756243627e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.5390625, "logps/chosen": -22.375, "logps/rejected": -17.875, "loss": 0.6895, "loss/demonstration_loss": -210.0, "loss/preference_loss": -208.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0093994140625, "rewards/margins": 0.0040283203125, "rewards/rejected": -0.013427734375, "step": 810 }, { "epoch": 0.6488, "grad_norm": 1.123910706911031, "learning_rate": 1.6547975523530073e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.65234375, "logps/chosen": -26.5, "logps/rejected": -24.625, "loss": 0.6924, "loss/demonstration_loss": -202.0, "loss/preference_loss": -202.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005767822265625, "rewards/margins": 0.0003204345703125, "rewards/rejected": -0.006072998046875, "step": 811 }, { "epoch": 0.6496, "grad_norm": 0.622322368274587, "learning_rate": 1.6482306201401208e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.73828125, "logps/chosen": -16.375, "logps/rejected": -16.125, "loss": 0.6927, "loss/demonstration_loss": -512.0, "loss/preference_loss": -516.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004364013671875, "rewards/margins": -0.0024871826171875, "rewards/rejected": -0.0018768310546875, "step": 812 }, { "epoch": 0.6504, "grad_norm": 1.9145845507384405, "learning_rate": 1.641670330195962e-07, "logits/chosen": 0.498046875, "logits/rejected": 0.486328125, "logps/chosen": -38.0, "logps/rejected": -29.375, "loss": 0.6958, "loss/demonstration_loss": -346.0, "loss/preference_loss": -356.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0262451171875, "rewards/margins": -0.01953125, "rewards/rejected": -0.006744384765625, "step": 813 }, { "epoch": 0.6512, "grad_norm": 1.5215956970787288, "learning_rate": 1.635116733678988e-07, "logits/chosen": 0.90625, "logits/rejected": 0.86328125, "logps/chosen": -65.0, "logps/rejected": -75.5, "loss": 0.6907, "loss/demonstration_loss": -374.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.004852294921875, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.009521484375, "step": 814 }, { "epoch": 0.652, "grad_norm": 1.1779756309999243, "learning_rate": 1.6285698816954624e-07, "logits/chosen": 0.55859375, "logits/rejected": 0.57421875, "logps/chosen": -41.75, "logps/rejected": -41.75, "loss": 0.6938, "loss/demonstration_loss": -330.0, "loss/preference_loss": -334.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.010009765625, "rewards/margins": -0.00909423828125, "rewards/rejected": -0.00093841552734375, "step": 815 }, { "epoch": 0.6528, "grad_norm": 1.5004105193461328, "learning_rate": 1.62202982529905e-07, "logits/chosen": 0.8203125, "logits/rejected": 0.9375, "logps/chosen": -54.0, "logps/rejected": -47.75, "loss": 0.693, "loss/demonstration_loss": -322.0, "loss/preference_loss": -324.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.005157470703125, "rewards/rejected": -0.004547119140625, "step": 816 }, { "epoch": 0.6536, "grad_norm": 1.2417571148998352, "learning_rate": 1.6154966154904263e-07, "logits/chosen": 0.6875, "logits/rejected": 0.8125, "logps/chosen": -42.5, "logps/rejected": -24.0, "loss": 0.6927, "loss/demonstration_loss": -524.0, "loss/preference_loss": -528.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.010009765625, "rewards/margins": -0.0087890625, "rewards/rejected": -0.001251220703125, "step": 817 }, { "epoch": 0.6544, "grad_norm": 1.8522837064587105, "learning_rate": 1.6089703032168733e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.953125, "logps/chosen": -53.75, "logps/rejected": -44.0, "loss": 0.6855, "loss/demonstration_loss": -384.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01611328125, "rewards/margins": -0.002655029296875, "rewards/rejected": -0.013427734375, "step": 818 }, { "epoch": 0.6552, "grad_norm": 1.7534659863939335, "learning_rate": 1.6024509393718844e-07, "logits/chosen": 0.65625, "logits/rejected": 0.640625, "logps/chosen": -107.5, "logps/rejected": -100.5, "loss": 0.6959, "loss/demonstration_loss": -660.0, "loss/preference_loss": -660.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01806640625, "rewards/margins": -0.004364013671875, "rewards/rejected": -0.0137939453125, "step": 819 }, { "epoch": 0.656, "grad_norm": 1.247585874851812, "learning_rate": 1.5959385747947695e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.7734375, "logps/chosen": -63.25, "logps/rejected": -55.75, "loss": 0.6926, "loss/demonstration_loss": -472.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0084228515625, "rewards/margins": -0.005462646484375, "rewards/rejected": -0.0029754638671875, "step": 820 }, { "epoch": 0.6568, "grad_norm": 1.6486248922442621, "learning_rate": 1.5894332602702542e-07, "logits/chosen": 0.89453125, "logits/rejected": 1.0078125, "logps/chosen": -34.25, "logps/rejected": -15.9375, "loss": 0.6947, "loss/demonstration_loss": -392.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.005950927734375, "step": 821 }, { "epoch": 0.6576, "grad_norm": 1.4630617314064298, "learning_rate": 1.5829350465280898e-07, "logits/chosen": 0.859375, "logits/rejected": 0.94140625, "logps/chosen": -109.0, "logps/rejected": -111.0, "loss": 0.6875, "loss/demonstration_loss": -438.0, "loss/preference_loss": -434.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.0172119140625, "rewards/rejected": -0.0240478515625, "step": 822 }, { "epoch": 0.6584, "grad_norm": 1.5075280001731497, "learning_rate": 1.5764439842426514e-07, "logits/chosen": 0.55078125, "logits/rejected": 0.53125, "logps/chosen": -38.25, "logps/rejected": -35.0, "loss": 0.6927, "loss/demonstration_loss": -576.0, "loss/preference_loss": -576.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.010009765625, "rewards/margins": -0.001220703125, "rewards/rejected": -0.0087890625, "step": 823 }, { "epoch": 0.6592, "grad_norm": 1.1378405276583938, "learning_rate": 1.569960124032547e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.74609375, "logps/chosen": -7.0, "logps/rejected": -4.125, "loss": 0.6917, "loss/demonstration_loss": -176.0, "loss/preference_loss": -179.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00093841552734375, "rewards/margins": -0.00156402587890625, "rewards/rejected": 0.0006256103515625, "step": 824 }, { "epoch": 0.66, "grad_norm": 1.4977107610440932, "learning_rate": 1.5634835164602196e-07, "logits/chosen": 0.58203125, "logits/rejected": 0.5625, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6899, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 825 }, { "epoch": 0.6608, "grad_norm": 1.413687838862912, "learning_rate": 1.557014212031559e-07, "logits/chosen": 0.80859375, "logits/rejected": 0.75, "logps/chosen": -29.0, "logps/rejected": -30.5, "loss": 0.6945, "loss/demonstration_loss": -234.0, "loss/preference_loss": -240.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00872802734375, "rewards/margins": -0.01409912109375, "rewards/rejected": 0.00531005859375, "step": 826 }, { "epoch": 0.6616, "grad_norm": 1.0092613484033055, "learning_rate": 1.5505522611954973e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.62890625, "logps/chosen": -11.5, "logps/rejected": -11.375, "loss": 0.6913, "loss/demonstration_loss": -183.0, "loss/preference_loss": -182.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.00093841552734375, "step": 827 }, { "epoch": 0.6624, "grad_norm": 7.9685872397746875, "learning_rate": 1.5440977143436268e-07, "logits/chosen": 0.4375, "logits/rejected": 0.48828125, "logps/chosen": -63.0, "logps/rejected": -66.0, "loss": 0.6936, "loss/demonstration_loss": -408.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.016357421875, "rewards/margins": -0.007659912109375, "rewards/rejected": -0.0087890625, "step": 828 }, { "epoch": 0.6632, "grad_norm": 2.141149126703448, "learning_rate": 1.5376506218098014e-07, "logits/chosen": 0.71875, "logits/rejected": 0.71875, "logps/chosen": -108.5, "logps/rejected": -106.5, "loss": 0.6897, "loss/demonstration_loss": -490.0, "loss/preference_loss": -490.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.007476806640625, "rewards/margins": 0.006744384765625, "rewards/rejected": -0.01422119140625, "step": 829 }, { "epoch": 0.664, "grad_norm": 0.8379935022986486, "learning_rate": 1.5312110338697427e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.67578125, "logps/chosen": -39.25, "logps/rejected": -39.5, "loss": 0.6924, "loss/demonstration_loss": -624.0, "loss/preference_loss": -624.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.00750732421875, "step": 830 }, { "epoch": 0.6648, "grad_norm": 1.656969203493446, "learning_rate": 1.5247790007406507e-07, "logits/chosen": 0.671875, "logits/rejected": 0.64453125, "logps/chosen": -117.5, "logps/rejected": -127.0, "loss": 0.6901, "loss/demonstration_loss": -552.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.02099609375, "rewards/margins": 0.00372314453125, "rewards/rejected": -0.024658203125, "step": 831 }, { "epoch": 0.6656, "grad_norm": 1.3938097374080671, "learning_rate": 1.5183545725808125e-07, "logits/chosen": 0.65234375, "logits/rejected": 0.6796875, "logps/chosen": -8.1875, "logps/rejected": -2.328125, "loss": 0.6921, "loss/demonstration_loss": -157.0, "loss/preference_loss": -169.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006866455078125, "rewards/margins": -0.00732421875, "rewards/rejected": 0.000469207763671875, "step": 832 }, { "epoch": 0.6664, "grad_norm": 1.9147790685414274, "learning_rate": 1.5119377994892094e-07, "logits/chosen": 0.78125, "logits/rejected": 0.66015625, "logps/chosen": -75.5, "logps/rejected": -81.0, "loss": 0.6915, "loss/demonstration_loss": -496.0, "loss/preference_loss": -494.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0174560546875, "rewards/margins": 0.00128173828125, "rewards/rejected": -0.018798828125, "step": 833 }, { "epoch": 0.6672, "grad_norm": 1.3682035955544707, "learning_rate": 1.5055287315051257e-07, "logits/chosen": 1.1015625, "logits/rejected": 1.0390625, "logps/chosen": -52.5, "logps/rejected": -65.0, "loss": 0.6934, "loss/demonstration_loss": -374.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0128173828125, "rewards/margins": 0.002044677734375, "rewards/rejected": -0.01483154296875, "step": 834 }, { "epoch": 0.668, "grad_norm": 1.6364267359458384, "learning_rate": 1.4991274186077628e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.77734375, "logps/chosen": -18.25, "logps/rejected": -4.875, "loss": 0.6917, "loss/demonstration_loss": -360.0, "loss/preference_loss": -368.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.001251220703125, "step": 835 }, { "epoch": 0.6688, "grad_norm": 1.3405041373531108, "learning_rate": 1.4927339107158435e-07, "logits/chosen": 0.671875, "logits/rejected": 0.67578125, "logps/chosen": -47.75, "logps/rejected": -47.25, "loss": 0.6927, "loss/demonstration_loss": -376.0, "loss/preference_loss": -378.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.0029754638671875, "rewards/rejected": -0.0032806396484375, "step": 836 }, { "epoch": 0.6696, "grad_norm": 1.664345313152491, "learning_rate": 1.4863482576872275e-07, "logits/chosen": 0.54296875, "logits/rejected": 0.51171875, "logps/chosen": -55.25, "logps/rejected": -55.5, "loss": 0.6884, "loss/demonstration_loss": -294.0, "loss/preference_loss": -292.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0023345947265625, "rewards/margins": 0.0084228515625, "rewards/rejected": -0.01080322265625, "step": 837 }, { "epoch": 0.6704, "grad_norm": 0.8304744793802379, "learning_rate": 1.479970509318518e-07, "logits/chosen": 0.88671875, "logits/rejected": 0.8984375, "logps/chosen": -11.0, "logps/rejected": -13.75, "loss": 0.6925, "loss/demonstration_loss": -200.0, "loss/preference_loss": -199.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0017242431640625, "rewards/margins": 0.0004730224609375, "rewards/rejected": 0.001251220703125, "step": 838 }, { "epoch": 0.6712, "grad_norm": 1.1656475747466573, "learning_rate": 1.47360071534468e-07, "logits/chosen": 0.9296875, "logits/rejected": 0.98828125, "logps/chosen": -30.75, "logps/rejected": -17.5, "loss": 0.692, "loss/demonstration_loss": -380.0, "loss/preference_loss": -384.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0078125, "rewards/margins": -0.004669189453125, "rewards/rejected": -0.0031280517578125, "step": 839 }, { "epoch": 0.672, "grad_norm": 1.1097709025330045, "learning_rate": 1.4672389254386457e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.76953125, "logps/chosen": -35.25, "logps/rejected": -30.375, "loss": 0.6897, "loss/demonstration_loss": -174.0, "loss/preference_loss": -173.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00469970703125, "rewards/margins": 0.005462646484375, "rewards/rejected": -0.0101318359375, "step": 840 }, { "epoch": 0.6728, "grad_norm": 2.0769715184304745, "learning_rate": 1.4608851892109304e-07, "logits/chosen": 0.8515625, "logits/rejected": 0.75390625, "logps/chosen": -43.0, "logps/rejected": -56.75, "loss": 0.6935, "loss/demonstration_loss": -264.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00860595703125, "rewards/margins": 0.0032806396484375, "rewards/rejected": -0.0118408203125, "step": 841 }, { "epoch": 0.6736, "grad_norm": 1.6116727725854807, "learning_rate": 1.4545395562092467e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.859375, "logps/chosen": -59.5, "logps/rejected": -37.0, "loss": 0.6965, "loss/demonstration_loss": -508.0, "loss/preference_loss": -512.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01123046875, "rewards/margins": -0.009521484375, "rewards/rejected": -0.0017242431640625, "step": 842 }, { "epoch": 0.6744, "grad_norm": 1.0087684476156136, "learning_rate": 1.4482020759181134e-07, "logits/chosen": 0.84375, "logits/rejected": 0.84375, "logps/chosen": -28.0, "logps/rejected": -28.0, "loss": 0.6919, "loss/demonstration_loss": -448.0, "loss/preference_loss": -448.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00156402587890625, "rewards/margins": 0.00156402587890625, "rewards/rejected": 0.0, "step": 843 }, { "epoch": 0.6752, "grad_norm": 1.1975388674040752, "learning_rate": 1.4418727977584771e-07, "logits/chosen": 0.484375, "logits/rejected": 0.6015625, "logps/chosen": -16.25, "logps/rejected": -5.8125, "loss": 0.6925, "loss/demonstration_loss": -348.0, "loss/preference_loss": -348.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0, "rewards/rejected": -0.003753662109375, "step": 844 }, { "epoch": 0.676, "grad_norm": 10.733010010040205, "learning_rate": 1.4355517710873182e-07, "logits/chosen": 0.78125, "logits/rejected": 0.8046875, "logps/chosen": -73.0, "logps/rejected": -82.0, "loss": 0.6847, "loss/demonstration_loss": -616.0, "loss/preference_loss": -612.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01190185546875, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.021240234375, "step": 845 }, { "epoch": 0.6768, "grad_norm": 1.700821673109667, "learning_rate": 1.4292390451972744e-07, "logits/chosen": 0.78125, "logits/rejected": 0.859375, "logps/chosen": -16.375, "logps/rejected": -5.71875, "loss": 0.6929, "loss/demonstration_loss": -346.0, "loss/preference_loss": -354.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.0050048828125, "rewards/rejected": 0.0, "step": 846 }, { "epoch": 0.6776, "grad_norm": 1.2721664785252134, "learning_rate": 1.4229346693162497e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.84765625, "logps/chosen": -34.75, "logps/rejected": -28.25, "loss": 0.6941, "loss/demonstration_loss": -336.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.00860595703125, "rewards/rejected": 0.0067138671875, "step": 847 }, { "epoch": 0.6784, "grad_norm": 1.625024455498773, "learning_rate": 1.416638692607032e-07, "logits/chosen": 0.9375, "logits/rejected": 0.8828125, "logps/chosen": -36.0, "logps/rejected": -26.875, "loss": 0.6917, "loss/demonstration_loss": -494.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.00189208984375, "rewards/rejected": -0.00750732421875, "step": 848 }, { "epoch": 0.6792, "grad_norm": 0.4708805076799469, "learning_rate": 1.410351164166915e-07, "logits/chosen": 0.71484375, "logits/rejected": 0.69921875, "logps/chosen": -25.875, "logps/rejected": -26.5, "loss": 0.6899, "loss/demonstration_loss": -420.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.006561279296875, "rewards/rejected": -0.005615234375, "step": 849 }, { "epoch": 0.68, "grad_norm": 1.2108189570198569, "learning_rate": 1.404072133027306e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.578125, "logps/chosen": -49.0, "logps/rejected": -59.0, "loss": 0.6927, "loss/demonstration_loss": -856.0, "loss/preference_loss": -856.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.00433349609375, "rewards/rejected": -0.01123046875, "step": 850 }, { "epoch": 0.6808, "grad_norm": 1.7295211491089932, "learning_rate": 1.397801648153354e-07, "logits/chosen": 1.171875, "logits/rejected": 0.890625, "logps/chosen": -66.5, "logps/rejected": -117.0, "loss": 0.6899, "loss/demonstration_loss": -584.0, "loss/preference_loss": -580.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0162353515625, "rewards/margins": 0.0118408203125, "rewards/rejected": -0.028076171875, "step": 851 }, { "epoch": 0.6816, "grad_norm": 1.202549448065808, "learning_rate": 1.3915397584435563e-07, "logits/chosen": 0.400390625, "logits/rejected": 0.44140625, "logps/chosen": -16.25, "logps/rejected": -8.1875, "loss": 0.6888, "loss/demonstration_loss": -392.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.001251220703125, "rewards/margins": 0.0034332275390625, "rewards/rejected": -0.0021820068359375, "step": 852 }, { "epoch": 0.6824, "grad_norm": 1.3816556896219294, "learning_rate": 1.38528651272939e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.65625, "logps/chosen": -27.25, "logps/rejected": -43.5, "loss": 0.6898, "loss/demonstration_loss": -376.0, "loss/preference_loss": -374.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0032806396484375, "rewards/margins": 0.002655029296875, "rewards/rejected": -0.005950927734375, "step": 853 }, { "epoch": 0.6832, "grad_norm": 1.2615238981300563, "learning_rate": 1.37904195977492e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.63671875, "logps/chosen": -30.875, "logps/rejected": -38.75, "loss": 0.6942, "loss/demonstration_loss": -548.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01220703125, "rewards/margins": -0.00531005859375, "rewards/rejected": -0.00689697265625, "step": 854 }, { "epoch": 0.684, "grad_norm": 0.7400340370501739, "learning_rate": 1.3728061482764235e-07, "logits/chosen": 0.5078125, "logits/rejected": 0.59375, "logps/chosen": -35.75, "logps/rejected": -23.0, "loss": 0.6926, "loss/demonstration_loss": -464.0, "loss/preference_loss": -468.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.005615234375, "rewards/rejected": 0.0, "step": 855 }, { "epoch": 0.6848, "grad_norm": 1.0309996726156274, "learning_rate": 1.366579126862012e-07, "logits/chosen": 0.87109375, "logits/rejected": 0.859375, "logps/chosen": -29.375, "logps/rejected": -27.375, "loss": 0.6892, "loss/demonstration_loss": -226.0, "loss/preference_loss": -226.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00156402587890625, "rewards/margins": 0.002197265625, "rewards/rejected": -0.003753662109375, "step": 856 }, { "epoch": 0.6856, "grad_norm": 1.5606648473589133, "learning_rate": 1.3603609440912505e-07, "logits/chosen": 0.85546875, "logits/rejected": 1.125, "logps/chosen": -105.0, "logps/rejected": -76.5, "loss": 0.6909, "loss/demonstration_loss": -412.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01190185546875, "rewards/margins": 0.0012664794921875, "rewards/rejected": -0.01312255859375, "step": 857 }, { "epoch": 0.6864, "grad_norm": 1.925877443944817, "learning_rate": 1.3541516484547751e-07, "logits/chosen": 0.55859375, "logits/rejected": 0.59765625, "logps/chosen": -56.0, "logps/rejected": -46.5, "loss": 0.6936, "loss/demonstration_loss": -324.0, "loss/preference_loss": -326.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0166015625, "rewards/margins": -0.0087890625, "rewards/rejected": -0.0078125, "step": 858 }, { "epoch": 0.6872, "grad_norm": 1.6520004363469707, "learning_rate": 1.347951288373923e-07, "logits/chosen": 0.640625, "logits/rejected": 0.765625, "logps/chosen": -42.0, "logps/rejected": -23.25, "loss": 0.6952, "loss/demonstration_loss": -340.0, "loss/preference_loss": -348.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.012939453125, "rewards/rejected": -0.000782012939453125, "step": 859 }, { "epoch": 0.688, "grad_norm": 1.2489950434565282, "learning_rate": 1.341759912200346e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.75, "logps/chosen": -56.0, "logps/rejected": -68.0, "loss": 0.6886, "loss/demonstration_loss": -490.0, "loss/preference_loss": -488.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01190185546875, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.016845703125, "step": 860 }, { "epoch": 0.6888, "grad_norm": 1.997811009524077, "learning_rate": 1.335577568215639e-07, "logits/chosen": 0.96875, "logits/rejected": 0.84375, "logps/chosen": -78.5, "logps/rejected": -101.5, "loss": 0.6898, "loss/demonstration_loss": -476.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.0087890625, "rewards/rejected": -0.016845703125, "step": 861 }, { "epoch": 0.6896, "grad_norm": 1.3677552500206054, "learning_rate": 1.329404304630964e-07, "logits/chosen": 0.73046875, "logits/rejected": 0.68359375, "logps/chosen": -51.75, "logps/rejected": -63.75, "loss": 0.6921, "loss/demonstration_loss": -460.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.005615234375, "step": 862 }, { "epoch": 0.6904, "grad_norm": 1.2151654219567027, "learning_rate": 1.3232401695866685e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.703125, "logps/chosen": -37.0, "logps/rejected": -38.25, "loss": 0.6899, "loss/demonstration_loss": -300.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.0106201171875, "rewards/rejected": -0.01251220703125, "step": 863 }, { "epoch": 0.6912, "grad_norm": 1.1794352319903272, "learning_rate": 1.3170852111519173e-07, "logits/chosen": 0.70703125, "logits/rejected": 0.75, "logps/chosen": -40.75, "logps/rejected": -41.5, "loss": 0.6908, "loss/demonstration_loss": -328.0, "loss/preference_loss": -326.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.000782012939453125, "rewards/margins": 0.00689697265625, "rewards/rejected": -0.007659912109375, "step": 864 }, { "epoch": 0.692, "grad_norm": 1.7936056201843886, "learning_rate": 1.3109394773243115e-07, "logits/chosen": 0.63671875, "logits/rejected": 0.625, "logps/chosen": -15.5, "logps/rejected": -14.625, "loss": 0.6934, "loss/demonstration_loss": -237.0, "loss/preference_loss": -235.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0028076171875, "rewards/rejected": -0.0078125, "step": 865 }, { "epoch": 0.6928, "grad_norm": 1.5218610664481411, "learning_rate": 1.3048030160295195e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.765625, "logps/chosen": -33.5, "logps/rejected": -38.75, "loss": 0.6859, "loss/demonstration_loss": -384.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.000469207763671875, "rewards/margins": 0.006561279296875, "rewards/rejected": -0.006103515625, "step": 866 }, { "epoch": 0.6936, "grad_norm": 0.8263766556868458, "learning_rate": 1.2986758751208983e-07, "logits/chosen": 0.9453125, "logits/rejected": 1.0546875, "logps/chosen": -19.5, "logps/rejected": -10.0, "loss": 0.6934, "loss/demonstration_loss": -228.0, "loss/preference_loss": -234.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.006561279296875, "rewards/rejected": -0.0028076171875, "step": 867 }, { "epoch": 0.6944, "grad_norm": 1.2793151175629132, "learning_rate": 1.2925581023791237e-07, "logits/chosen": 0.51171875, "logits/rejected": 0.5078125, "logps/chosen": -80.5, "logps/rejected": -82.0, "loss": 0.6876, "loss/demonstration_loss": -432.0, "loss/preference_loss": -430.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.004058837890625, "rewards/margins": 0.006866455078125, "rewards/rejected": -0.01092529296875, "step": 868 }, { "epoch": 0.6952, "grad_norm": 0.45044572687378576, "learning_rate": 1.286449745511815e-07, "logits/chosen": 0.5, "logits/rejected": 0.50390625, "logps/chosen": -24.25, "logps/rejected": -24.375, "loss": 0.6912, "loss/demonstration_loss": -776.0, "loss/preference_loss": -772.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.003753662109375, "step": 869 }, { "epoch": 0.696, "grad_norm": 9.304792238148186, "learning_rate": 1.2803508521531677e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.64453125, "logps/chosen": -29.25, "logps/rejected": -29.75, "loss": 0.689, "loss/demonstration_loss": -462.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01123046875, "rewards/margins": 0.00433349609375, "rewards/rejected": -0.015625, "step": 870 }, { "epoch": 0.6968, "grad_norm": 1.2411737408600245, "learning_rate": 1.2742614698635782e-07, "logits/chosen": 0.80859375, "logits/rejected": 0.78515625, "logps/chosen": -10.875, "logps/rejected": -11.9375, "loss": 0.6938, "loss/demonstration_loss": -182.0, "loss/preference_loss": -181.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.00140380859375, "rewards/rejected": -0.0020294189453125, "step": 871 }, { "epoch": 0.6976, "grad_norm": 1.246174799579288, "learning_rate": 1.2681816461292713e-07, "logits/chosen": 0.5859375, "logits/rejected": 0.76953125, "logps/chosen": -49.25, "logps/rejected": -35.0, "loss": 0.6909, "loss/demonstration_loss": -268.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.002960205078125, "rewards/margins": 0.007049560546875, "rewards/rejected": -0.010009765625, "step": 872 }, { "epoch": 0.6984, "grad_norm": 1.0541978685592808, "learning_rate": 1.2621114283619344e-07, "logits/chosen": 0.494140625, "logits/rejected": 0.46875, "logps/chosen": -16.75, "logps/rejected": -16.625, "loss": 0.6913, "loss/demonstration_loss": -528.0, "loss/preference_loss": -528.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.003753662109375, "step": 873 }, { "epoch": 0.6992, "grad_norm": 1.591804398641076, "learning_rate": 1.2560508638983435e-07, "logits/chosen": 0.84375, "logits/rejected": 0.69140625, "logps/chosen": -94.5, "logps/rejected": -109.0, "loss": 0.692, "loss/demonstration_loss": -462.0, "loss/preference_loss": -462.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0172119140625, "rewards/margins": -0.0014190673828125, "rewards/rejected": -0.0157470703125, "step": 874 }, { "epoch": 0.7, "grad_norm": 1.5503350351970266, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 0.77734375, "logits/rejected": 0.796875, "logps/chosen": -69.0, "logps/rejected": -65.0, "loss": 0.6906, "loss/demonstration_loss": -712.0, "loss/preference_loss": -708.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.01190185546875, "rewards/rejected": -0.014404296875, "step": 875 }, { "epoch": 0.7008, "grad_norm": 1.3518954660228601, "learning_rate": 1.243958883852755e-07, "logits/chosen": 0.7734375, "logits/rejected": 0.75390625, "logps/chosen": -50.0, "logps/rejected": -48.0, "loss": 0.6906, "loss/demonstration_loss": -516.0, "loss/preference_loss": -512.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0118408203125, "rewards/margins": 0.004058837890625, "rewards/rejected": -0.015869140625, "step": 876 }, { "epoch": 0.7016, "grad_norm": 1.6236383889427688, "learning_rate": 1.237927562566446e-07, "logits/chosen": 0.8125, "logits/rejected": 0.859375, "logps/chosen": -20.125, "logps/rejected": -14.625, "loss": 0.6935, "loss/demonstration_loss": -276.0, "loss/preference_loss": -276.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.000946044921875, "rewards/rejected": -0.0009307861328125, "step": 877 }, { "epoch": 0.7024, "grad_norm": 1.4249353879629278, "learning_rate": 1.2319060831745272e-07, "logits/chosen": 0.9921875, "logits/rejected": 0.8203125, "logps/chosen": -30.375, "logps/rejected": -45.5, "loss": 0.6937, "loss/demonstration_loss": -242.0, "loss/preference_loss": -242.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.001251220703125, "rewards/margins": -0.00061798095703125, "rewards/rejected": -0.0006256103515625, "step": 878 }, { "epoch": 0.7032, "grad_norm": 1.3788806698054148, "learning_rate": 1.2258944926337055e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.67578125, "logps/chosen": -39.75, "logps/rejected": -40.25, "loss": 0.692, "loss/demonstration_loss": -182.0, "loss/preference_loss": -181.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0078125, "rewards/margins": 0.000156402587890625, "rewards/rejected": -0.00799560546875, "step": 879 }, { "epoch": 0.704, "grad_norm": 1.805864312307662, "learning_rate": 1.2198928378235715e-07, "logits/chosen": 0.8359375, "logits/rejected": 0.97265625, "logps/chosen": -44.0, "logps/rejected": -16.0, "loss": 0.6926, "loss/demonstration_loss": -478.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.002197265625, "rewards/rejected": -0.00093841552734375, "step": 880 }, { "epoch": 0.7048, "grad_norm": 1.4023867389538942, "learning_rate": 1.2139011655462336e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.51171875, "logps/chosen": -14.3125, "logps/rejected": -25.375, "loss": 0.6898, "loss/demonstration_loss": -628.0, "loss/preference_loss": -620.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.004974365234375, "rewards/rejected": -0.00872802734375, "step": 881 }, { "epoch": 0.7056, "grad_norm": 1.0650408670781197, "learning_rate": 1.2079195225259578e-07, "logits/chosen": 0.71484375, "logits/rejected": 0.7578125, "logps/chosen": -9.1875, "logps/rejected": -6.8125, "loss": 0.6906, "loss/demonstration_loss": -256.0, "loss/preference_loss": -252.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.00250244140625, "step": 882 }, { "epoch": 0.7064, "grad_norm": 1.4143292381224746, "learning_rate": 1.2019479554087963e-07, "logits/chosen": 0.7734375, "logits/rejected": 0.84375, "logps/chosen": -52.75, "logps/rejected": -48.75, "loss": 0.6893, "loss/demonstration_loss": -324.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005950927734375, "rewards/margins": 0.00029754638671875, "rewards/rejected": -0.0062255859375, "step": 883 }, { "epoch": 0.7072, "grad_norm": 0.964710506206473, "learning_rate": 1.1959865107622305e-07, "logits/chosen": 0.33203125, "logits/rejected": 0.51171875, "logps/chosen": -40.25, "logps/rejected": -35.5, "loss": 0.6932, "loss/demonstration_loss": -402.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.00408935546875, "rewards/rejected": 0.00031280517578125, "step": 884 }, { "epoch": 0.708, "grad_norm": 1.2007183714819325, "learning_rate": 1.1900352350748024e-07, "logits/chosen": 0.78125, "logits/rejected": 0.921875, "logps/chosen": -54.25, "logps/rejected": -55.0, "loss": 0.6897, "loss/demonstration_loss": -286.0, "loss/preference_loss": -286.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.014404296875, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.0194091796875, "step": 885 }, { "epoch": 0.7088, "grad_norm": 0.9726809728180215, "learning_rate": 1.1840941747557556e-07, "logits/chosen": 0.64453125, "logits/rejected": 0.734375, "logps/chosen": -53.0, "logps/rejected": -48.75, "loss": 0.6937, "loss/demonstration_loss": -320.0, "loss/preference_loss": -324.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01434326171875, "rewards/margins": -0.00750732421875, "rewards/rejected": -0.00689697265625, "step": 886 }, { "epoch": 0.7096, "grad_norm": 1.5378988710976689, "learning_rate": 1.1781633761346707e-07, "logits/chosen": 0.94140625, "logits/rejected": 0.86328125, "logps/chosen": -22.0, "logps/rejected": -40.25, "loss": 0.6909, "loss/demonstration_loss": -251.0, "loss/preference_loss": -247.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0032958984375, "rewards/margins": 0.01019287109375, "rewards/rejected": -0.0068359375, "step": 887 }, { "epoch": 0.7104, "grad_norm": 1.7449897963800156, "learning_rate": 1.1722428854611088e-07, "logits/chosen": 0.84375, "logits/rejected": 0.8203125, "logps/chosen": -31.125, "logps/rejected": -34.5, "loss": 0.6947, "loss/demonstration_loss": -348.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": -0.00408935546875, "rewards/rejected": -0.00156402587890625, "step": 888 }, { "epoch": 0.7112, "grad_norm": 2.0143690382819024, "learning_rate": 1.1663327489042435e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.9765625, "logps/chosen": -112.0, "logps/rejected": -102.0, "loss": 0.6962, "loss/demonstration_loss": -338.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.0238037109375, "rewards/margins": 0.0032806396484375, "rewards/rejected": -0.027099609375, "step": 889 }, { "epoch": 0.712, "grad_norm": 1.0445528363130556, "learning_rate": 1.1604330125525078e-07, "logits/chosen": 0.640625, "logits/rejected": 0.5078125, "logps/chosen": -6.8125, "logps/rejected": -13.875, "loss": 0.6923, "loss/demonstration_loss": -330.0, "loss/preference_loss": -328.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00031280517578125, "rewards/margins": 0.002197265625, "rewards/rejected": -0.00250244140625, "step": 890 }, { "epoch": 0.7128, "grad_norm": 1.617699680189613, "learning_rate": 1.1545437224132318e-07, "logits/chosen": 0.4765625, "logits/rejected": 0.380859375, "logps/chosen": -59.5, "logps/rejected": -71.0, "loss": 0.6913, "loss/demonstration_loss": -520.0, "loss/preference_loss": -520.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.00750732421875, "step": 891 }, { "epoch": 0.7136, "grad_norm": 1.2422362434762462, "learning_rate": 1.1486649244122823e-07, "logits/chosen": 0.8828125, "logits/rejected": 0.87109375, "logps/chosen": -31.25, "logps/rejected": -27.125, "loss": 0.693, "loss/demonstration_loss": -308.0, "loss/preference_loss": -310.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00531005859375, "rewards/margins": -0.00157928466796875, "rewards/rejected": -0.0037384033203125, "step": 892 }, { "epoch": 0.7144, "grad_norm": 1.2139481951250988, "learning_rate": 1.1427966643937067e-07, "logits/chosen": 0.67578125, "logits/rejected": 0.7421875, "logps/chosen": -35.25, "logps/rejected": -26.625, "loss": 0.6908, "loss/demonstration_loss": -328.0, "loss/preference_loss": -326.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005950927734375, "rewards/margins": -0.000469207763671875, "rewards/rejected": -0.005462646484375, "step": 893 }, { "epoch": 0.7152, "grad_norm": 1.0615649067908788, "learning_rate": 1.1369389881193747e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.6484375, "logps/chosen": -3.109375, "logps/rejected": -3.359375, "loss": 0.6897, "loss/demonstration_loss": -102.0, "loss/preference_loss": -104.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.000782012939453125, "rewards/margins": -0.0010986328125, "rewards/rejected": 0.00031280517578125, "step": 894 }, { "epoch": 0.716, "grad_norm": 1.719476604209144, "learning_rate": 1.1310919412686245e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.625, "logps/chosen": -30.5, "logps/rejected": -29.5, "loss": 0.6885, "loss/demonstration_loss": -476.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00439453125, "rewards/margins": -0.0031280517578125, "rewards/rejected": -0.001251220703125, "step": 895 }, { "epoch": 0.7168, "grad_norm": 1.796250756649082, "learning_rate": 1.1252555694379004e-07, "logits/chosen": 0.984375, "logits/rejected": 0.93359375, "logps/chosen": -33.75, "logps/rejected": -34.75, "loss": 0.6924, "loss/demonstration_loss": -362.0, "loss/preference_loss": -362.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.002655029296875, "rewards/rejected": -0.0089111328125, "step": 896 }, { "epoch": 0.7176, "grad_norm": 1.143486386290085, "learning_rate": 1.1194299181404034e-07, "logits/chosen": 0.8203125, "logits/rejected": 0.83984375, "logps/chosen": -14.4375, "logps/rejected": -15.125, "loss": 0.6908, "loss/demonstration_loss": -233.0, "loss/preference_loss": -235.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00469970703125, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.0018768310546875, "step": 897 }, { "epoch": 0.7184, "grad_norm": 0.7307679881443147, "learning_rate": 1.1136150328057323e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.63671875, "logps/chosen": -20.625, "logps/rejected": -20.875, "loss": 0.6903, "loss/demonstration_loss": -660.0, "loss/preference_loss": -656.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.0050048828125, "step": 898 }, { "epoch": 0.7192, "grad_norm": 1.3467059081207795, "learning_rate": 1.107810958779531e-07, "logits/chosen": 0.6875, "logits/rejected": 0.76953125, "logps/chosen": -65.5, "logps/rejected": -58.75, "loss": 0.692, "loss/demonstration_loss": -394.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": -0.0032958984375, "rewards/rejected": -0.005462646484375, "step": 899 }, { "epoch": 0.72, "grad_norm": 1.3947584640039608, "learning_rate": 1.1020177413231332e-07, "logits/chosen": 0.84375, "logits/rejected": 0.640625, "logps/chosen": -29.0, "logps/rejected": -60.5, "loss": 0.6895, "loss/demonstration_loss": -474.0, "loss/preference_loss": -472.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.0087890625, "rewards/rejected": -0.0106201171875, "step": 900 }, { "epoch": 0.7208, "grad_norm": 1.4201454878988553, "learning_rate": 1.096235425613214e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.6171875, "logps/chosen": -46.25, "logps/rejected": -60.5, "loss": 0.6864, "loss/demonstration_loss": -428.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.000156402587890625, "rewards/margins": 0.017822265625, "rewards/rejected": -0.0177001953125, "step": 901 }, { "epoch": 0.7216, "grad_norm": 0.31524296046358874, "learning_rate": 1.090464056741433e-07, "logits/chosen": 0.72265625, "logits/rejected": 0.734375, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6904, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 902 }, { "epoch": 0.7224, "grad_norm": 2.161391000225901, "learning_rate": 1.084703679714083e-07, "logits/chosen": 0.9375, "logits/rejected": 0.83203125, "logps/chosen": -55.25, "logps/rejected": -75.0, "loss": 0.6914, "loss/demonstration_loss": -344.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0084228515625, "rewards/margins": 0.0147705078125, "rewards/rejected": -0.023193359375, "step": 903 }, { "epoch": 0.7232, "grad_norm": 3.333167284685878, "learning_rate": 1.0789543394517434e-07, "logits/chosen": 0.84375, "logits/rejected": 0.7578125, "logps/chosen": -80.0, "logps/rejected": -87.0, "loss": 0.6936, "loss/demonstration_loss": -528.0, "loss/preference_loss": -528.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0166015625, "rewards/margins": -0.00408935546875, "rewards/rejected": -0.012451171875, "step": 904 }, { "epoch": 0.724, "grad_norm": 1.6043511628702467, "learning_rate": 1.073216080788921e-07, "logits/chosen": 0.73828125, "logits/rejected": 0.94921875, "logps/chosen": -18.375, "logps/rejected": -4.4375, "loss": 0.6893, "loss/demonstration_loss": -368.0, "loss/preference_loss": -370.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.001251220703125, "rewards/margins": -0.0018768310546875, "rewards/rejected": 0.0031280517578125, "step": 905 }, { "epoch": 0.7248, "grad_norm": 1.1450158782733726, "learning_rate": 1.0674889484737123e-07, "logits/chosen": 0.83984375, "logits/rejected": 0.90234375, "logps/chosen": -34.5, "logps/rejected": -31.75, "loss": 0.6909, "loss/demonstration_loss": -346.0, "loss/preference_loss": -344.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01312255859375, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.016845703125, "step": 906 }, { "epoch": 0.7256, "grad_norm": 1.6595922784028694, "learning_rate": 1.0617729871674436e-07, "logits/chosen": 0.84765625, "logits/rejected": 0.76953125, "logps/chosen": -24.5, "logps/rejected": -27.75, "loss": 0.6936, "loss/demonstration_loss": -416.0, "loss/preference_loss": -414.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.001861572265625, "rewards/margins": 0.00439453125, "rewards/rejected": -0.006256103515625, "step": 907 }, { "epoch": 0.7264, "grad_norm": 1.4809562486166972, "learning_rate": 1.0560682414443314e-07, "logits/chosen": 0.6640625, "logits/rejected": 0.6953125, "logps/chosen": -45.5, "logps/rejected": -43.25, "loss": 0.6904, "loss/demonstration_loss": -352.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005950927734375, "rewards/margins": 0.00421142578125, "rewards/rejected": -0.01019287109375, "step": 908 }, { "epoch": 0.7272, "grad_norm": 1.6881852068498795, "learning_rate": 1.0503747557911269e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.6328125, "logps/chosen": -61.25, "logps/rejected": -63.0, "loss": 0.6908, "loss/demonstration_loss": -394.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01123046875, "rewards/margins": 0.0028076171875, "rewards/rejected": -0.01409912109375, "step": 909 }, { "epoch": 0.728, "grad_norm": 1.3340987930218087, "learning_rate": 1.0446925746067766e-07, "logits/chosen": 1.078125, "logits/rejected": 0.94140625, "logps/chosen": -62.25, "logps/rejected": -96.5, "loss": 0.6882, "loss/demonstration_loss": -504.0, "loss/preference_loss": -500.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01031494140625, "rewards/margins": 0.0115966796875, "rewards/rejected": -0.0218505859375, "step": 910 }, { "epoch": 0.7288, "grad_norm": 17.32290186717492, "learning_rate": 1.03902174220207e-07, "logits/chosen": 0.8125, "logits/rejected": 0.58984375, "logps/chosen": -32.0, "logps/rejected": -50.5, "loss": 0.6879, "loss/demonstration_loss": -434.0, "loss/preference_loss": -432.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01190185546875, "rewards/margins": 0.00494384765625, "rewards/rejected": -0.016845703125, "step": 911 }, { "epoch": 0.7296, "grad_norm": 0.6234090365614416, "learning_rate": 1.0333623027992969e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.78515625, "logps/chosen": -24.5, "logps/rejected": -21.25, "loss": 0.6899, "loss/demonstration_loss": -368.0, "loss/preference_loss": -364.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0050048828125, "rewards/margins": 0.006561279296875, "rewards/rejected": -0.00156402587890625, "step": 912 }, { "epoch": 0.7304, "grad_norm": 1.0619711042258273, "learning_rate": 1.0277143005319036e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.81640625, "logps/chosen": -29.375, "logps/rejected": -29.25, "loss": 0.6918, "loss/demonstration_loss": -308.0, "loss/preference_loss": -310.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.002197265625, "rewards/rejected": -0.00531005859375, "step": 913 }, { "epoch": 0.7312, "grad_norm": 0.8442409864918629, "learning_rate": 1.0220777794441448e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.71875, "logps/chosen": -12.75, "logps/rejected": -14.4375, "loss": 0.6912, "loss/demonstration_loss": -216.0, "loss/preference_loss": -214.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0031280517578125, "step": 914 }, { "epoch": 0.732, "grad_norm": 0.9455695902331075, "learning_rate": 1.0164527834907466e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.6640625, "logps/chosen": -26.25, "logps/rejected": -21.875, "loss": 0.6892, "loss/demonstration_loss": -384.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.00439453125, "step": 915 }, { "epoch": 0.7328, "grad_norm": 0.6293573627777045, "learning_rate": 1.010839356536555e-07, "logits/chosen": 0.64453125, "logits/rejected": 0.66015625, "logps/chosen": -45.5, "logps/rejected": -46.25, "loss": 0.6895, "loss/demonstration_loss": -486.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0084228515625, "rewards/margins": 0.0084228515625, "rewards/rejected": -0.016845703125, "step": 916 }, { "epoch": 0.7336, "grad_norm": 2.319233125510302, "learning_rate": 1.0052375423562037e-07, "logits/chosen": 0.609375, "logits/rejected": 0.54296875, "logps/chosen": -47.25, "logps/rejected": -53.25, "loss": 0.6912, "loss/demonstration_loss": -532.0, "loss/preference_loss": -532.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0084228515625, "rewards/margins": 0.0028228759765625, "rewards/rejected": -0.01123046875, "step": 917 }, { "epoch": 0.7344, "grad_norm": 2.1808143353712377, "learning_rate": 9.996473846337613e-08, "logits/chosen": 0.80859375, "logits/rejected": 0.88671875, "logps/chosen": -92.5, "logps/rejected": -90.5, "loss": 0.6942, "loss/demonstration_loss": -484.0, "loss/preference_loss": -486.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.014404296875, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.01123046875, "step": 918 }, { "epoch": 0.7352, "grad_norm": 0.7931942545546204, "learning_rate": 9.940689269624039e-08, "logits/chosen": 0.9375, "logits/rejected": 1.0703125, "logps/chosen": -42.75, "logps/rejected": -34.75, "loss": 0.6904, "loss/demonstration_loss": -410.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.00421142578125, "rewards/rejected": -0.006103515625, "step": 919 }, { "epoch": 0.736, "grad_norm": 1.7509059385165844, "learning_rate": 9.885022128440629e-08, "logits/chosen": 0.88671875, "logits/rejected": 0.87890625, "logps/chosen": -47.0, "logps/rejected": -53.5, "loss": 0.6906, "loss/demonstration_loss": -320.0, "loss/preference_loss": -318.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.005615234375, "rewards/margins": 0.002197265625, "rewards/rejected": -0.0078125, "step": 920 }, { "epoch": 0.7368, "grad_norm": 1.550131068327315, "learning_rate": 9.829472856890941e-08, "logits/chosen": 0.59375, "logits/rejected": 0.58984375, "logps/chosen": -39.0, "logps/rejected": -40.5, "loss": 0.6919, "loss/demonstration_loss": -628.0, "loss/preference_loss": -632.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.00311279296875, "rewards/rejected": -0.006256103515625, "step": 921 }, { "epoch": 0.7376, "grad_norm": 1.9856255979638722, "learning_rate": 9.774041888159362e-08, "logits/chosen": 0.96484375, "logits/rejected": 1.078125, "logps/chosen": -108.5, "logps/rejected": -100.5, "loss": 0.6908, "loss/demonstration_loss": -556.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.00156402587890625, "rewards/rejected": -0.0115966796875, "step": 922 }, { "epoch": 0.7384, "grad_norm": 1.2365847262130263, "learning_rate": 9.718729654507712e-08, "logits/chosen": 0.9140625, "logits/rejected": 1.0078125, "logps/chosen": -24.75, "logps/rejected": -21.75, "loss": 0.6892, "loss/demonstration_loss": -187.0, "loss/preference_loss": -184.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.002197265625, "rewards/margins": 0.00701904296875, "rewards/rejected": -0.004852294921875, "step": 923 }, { "epoch": 0.7392, "grad_norm": 1.1747896223735779, "learning_rate": 9.6635365872719e-08, "logits/chosen": 0.498046875, "logits/rejected": 0.48828125, "logps/chosen": -12.5, "logps/rejected": -12.375, "loss": 0.6924, "loss/demonstration_loss": -396.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.0006256103515625, "step": 924 }, { "epoch": 0.74, "grad_norm": 1.5601746911368672, "learning_rate": 9.608463116858542e-08, "logits/chosen": 0.66015625, "logits/rejected": 0.796875, "logps/chosen": -30.25, "logps/rejected": -17.25, "loss": 0.6934, "loss/demonstration_loss": -378.0, "loss/preference_loss": -376.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00360107421875, "rewards/margins": -0.0010986328125, "rewards/rejected": -0.00250244140625, "step": 925 }, { "epoch": 0.7408, "grad_norm": 1.158954410383129, "learning_rate": 9.553509672741644e-08, "logits/chosen": 0.90234375, "logits/rejected": 0.89453125, "logps/chosen": -38.25, "logps/rejected": -39.5, "loss": 0.6902, "loss/demonstration_loss": -249.0, "loss/preference_loss": -248.0, "rewards/accuracies": 0.125, "rewards/chosen": -7.62939453125e-06, "rewards/margins": 0.002655029296875, "rewards/rejected": -0.002655029296875, "step": 926 }, { "epoch": 0.7416, "grad_norm": 1.4853667385238307, "learning_rate": 9.498676683459183e-08, "logits/chosen": 0.64453125, "logits/rejected": 0.8359375, "logps/chosen": -46.0, "logps/rejected": -33.25, "loss": 0.6936, "loss/demonstration_loss": -312.0, "loss/preference_loss": -314.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01251220703125, "rewards/margins": -0.0036163330078125, "rewards/rejected": -0.0089111328125, "step": 927 }, { "epoch": 0.7424, "grad_norm": 1.4402847623577668, "learning_rate": 9.443964576609842e-08, "logits/chosen": 0.55078125, "logits/rejected": 0.53125, "logps/chosen": -65.5, "logps/rejected": -62.5, "loss": 0.6932, "loss/demonstration_loss": -506.0, "loss/preference_loss": -508.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01220703125, "rewards/margins": -0.0062255859375, "rewards/rejected": -0.005950927734375, "step": 928 }, { "epoch": 0.7432, "grad_norm": 0.9471369627353898, "learning_rate": 9.389373778849611e-08, "logits/chosen": 0.9296875, "logits/rejected": 1.0078125, "logps/chosen": -32.5, "logps/rejected": -21.75, "loss": 0.6941, "loss/demonstration_loss": -288.0, "loss/preference_loss": -288.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.0029754638671875, "rewards/rejected": -0.000782012939453125, "step": 929 }, { "epoch": 0.744, "grad_norm": 1.2716686041955905, "learning_rate": 9.334904715888494e-08, "logits/chosen": 0.69140625, "logits/rejected": 0.6875, "logps/chosen": -91.5, "logps/rejected": -91.0, "loss": 0.6903, "loss/demonstration_loss": -724.0, "loss/preference_loss": -724.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.014404296875, "rewards/margins": -0.00439453125, "rewards/rejected": -0.010009765625, "step": 930 }, { "epoch": 0.7448, "grad_norm": 1.7975092698998272, "learning_rate": 9.280557812487186e-08, "logits/chosen": 0.453125, "logits/rejected": 0.6015625, "logps/chosen": -34.5, "logps/rejected": -12.375, "loss": 0.6931, "loss/demonstration_loss": -372.0, "loss/preference_loss": -378.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.0072021484375, "rewards/rejected": 0.0021820068359375, "step": 931 }, { "epoch": 0.7456, "grad_norm": 1.1410651797586013, "learning_rate": 9.226333492453758e-08, "logits/chosen": 0.90234375, "logits/rejected": 0.88671875, "logps/chosen": -59.5, "logps/rejected": -65.0, "loss": 0.6903, "loss/demonstration_loss": -498.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002197265625, "rewards/margins": 0.004638671875, "rewards/rejected": -0.00689697265625, "step": 932 }, { "epoch": 0.7464, "grad_norm": 1.5288913042492218, "learning_rate": 9.17223217864036e-08, "logits/chosen": 0.63671875, "logits/rejected": 0.76953125, "logps/chosen": -84.5, "logps/rejected": -68.5, "loss": 0.6938, "loss/demonstration_loss": -404.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01123046875, "rewards/margins": -0.0089111328125, "rewards/rejected": -0.002349853515625, "step": 933 }, { "epoch": 0.7472, "grad_norm": 0.7883687999886849, "learning_rate": 9.118254292939889e-08, "logits/chosen": 0.75, "logits/rejected": 0.75, "logps/chosen": -20.125, "logps/rejected": -20.5, "loss": 0.6909, "loss/demonstration_loss": -648.0, "loss/preference_loss": -640.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.006256103515625, "step": 934 }, { "epoch": 0.748, "grad_norm": 1.3224300010593948, "learning_rate": 9.064400256282755e-08, "logits/chosen": 0.6953125, "logits/rejected": 0.71484375, "logps/chosen": -2.78125, "logps/rejected": -2.859375, "loss": 0.6904, "loss/demonstration_loss": -91.0, "loss/preference_loss": -89.5, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00031280517578125, "rewards/margins": 0.000782012939453125, "rewards/rejected": -0.000469207763671875, "step": 935 }, { "epoch": 0.7488, "grad_norm": 1.3386284354044142, "learning_rate": 9.010670488633551e-08, "logits/chosen": 0.734375, "logits/rejected": 0.8515625, "logps/chosen": -51.25, "logps/rejected": -46.5, "loss": 0.693, "loss/demonstration_loss": -388.0, "loss/preference_loss": -390.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": -0.00439453125, "rewards/rejected": -0.00439453125, "step": 936 }, { "epoch": 0.7496, "grad_norm": 1.2210420837002112, "learning_rate": 8.957065408987796e-08, "logits/chosen": 0.83203125, "logits/rejected": 0.79296875, "logps/chosen": -20.5, "logps/rejected": -28.625, "loss": 0.6909, "loss/demonstration_loss": -256.0, "loss/preference_loss": -258.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.010009765625, "rewards/margins": -0.004669189453125, "rewards/rejected": -0.00531005859375, "step": 937 }, { "epoch": 0.7504, "grad_norm": 1.359477650815042, "learning_rate": 8.903585435368658e-08, "logits/chosen": 0.466796875, "logits/rejected": 0.671875, "logps/chosen": -99.0, "logps/rejected": -84.5, "loss": 0.6935, "loss/demonstration_loss": -484.0, "loss/preference_loss": -486.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.025634765625, "rewards/margins": -0.010009765625, "rewards/rejected": -0.015625, "step": 938 }, { "epoch": 0.7512, "grad_norm": 0.6724069218093511, "learning_rate": 8.850230984823734e-08, "logits/chosen": 0.69921875, "logits/rejected": 0.76953125, "logps/chosen": -21.75, "logps/rejected": -22.25, "loss": 0.6918, "loss/demonstration_loss": -237.0, "loss/preference_loss": -237.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.004058837890625, "rewards/margins": 0.00188446044921875, "rewards/rejected": 0.0021820068359375, "step": 939 }, { "epoch": 0.752, "grad_norm": 0.9586880697013337, "learning_rate": 8.797002473421727e-08, "logits/chosen": 0.8359375, "logits/rejected": 0.77734375, "logps/chosen": -18.625, "logps/rejected": -22.875, "loss": 0.6903, "loss/demonstration_loss": -221.0, "loss/preference_loss": -221.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00093841552734375, "rewards/margins": -0.001251220703125, "rewards/rejected": 0.00031280517578125, "step": 940 }, { "epoch": 0.7528, "grad_norm": 1.3634620151929975, "learning_rate": 8.743900316249273e-08, "logits/chosen": 0.69921875, "logits/rejected": 0.70703125, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6926, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 941 }, { "epoch": 0.7536, "grad_norm": 27.97341187390925, "learning_rate": 8.690924927407678e-08, "logits/chosen": 0.5234375, "logits/rejected": 0.6171875, "logps/chosen": -30.125, "logps/rejected": -20.375, "loss": 0.6935, "loss/demonstration_loss": -268.0, "loss/preference_loss": -268.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.004852294921875, "rewards/margins": -0.002349853515625, "rewards/rejected": -0.00250244140625, "step": 942 }, { "epoch": 0.7544, "grad_norm": 7.267149654204037, "learning_rate": 8.63807672000963e-08, "logits/chosen": 0.69921875, "logits/rejected": 0.6953125, "logps/chosen": -24.25, "logps/rejected": -24.25, "loss": 0.6946, "loss/demonstration_loss": -384.0, "loss/preference_loss": -384.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.003753662109375, "step": 943 }, { "epoch": 0.7552, "grad_norm": 1.2394436921793168, "learning_rate": 8.585356106176092e-08, "logits/chosen": 0.82421875, "logits/rejected": 0.6953125, "logps/chosen": -22.375, "logps/rejected": -33.25, "loss": 0.6942, "loss/demonstration_loss": -444.0, "loss/preference_loss": -452.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0017242431640625, "rewards/margins": -0.00799560546875, "rewards/rejected": 0.006256103515625, "step": 944 }, { "epoch": 0.756, "grad_norm": 1.4477182960404582, "learning_rate": 8.532763497032986e-08, "logits/chosen": 0.60546875, "logits/rejected": 0.61328125, "logps/chosen": -53.0, "logps/rejected": -52.25, "loss": 0.6936, "loss/demonstration_loss": -556.0, "loss/preference_loss": -560.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.0078125, "rewards/rejected": -0.0028076171875, "step": 945 }, { "epoch": 0.7568, "grad_norm": 1.5545551538424451, "learning_rate": 8.480299302708058e-08, "logits/chosen": 0.75, "logits/rejected": 0.74609375, "logps/chosen": -47.5, "logps/rejected": -54.75, "loss": 0.6931, "loss/demonstration_loss": -404.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.006561279296875, "rewards/rejected": -0.0031280517578125, "step": 946 }, { "epoch": 0.7576, "grad_norm": 0.7060394394208399, "learning_rate": 8.42796393232762e-08, "logits/chosen": 0.89453125, "logits/rejected": 0.88671875, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6921, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 947 }, { "epoch": 0.7584, "grad_norm": 1.2590471037784507, "learning_rate": 8.375757794013414e-08, "logits/chosen": 0.76171875, "logits/rejected": 0.92578125, "logps/chosen": -46.75, "logps/rejected": -36.25, "loss": 0.6908, "loss/demonstration_loss": -436.0, "loss/preference_loss": -434.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01123046875, "rewards/margins": 0.0040283203125, "rewards/rejected": -0.01531982421875, "step": 948 }, { "epoch": 0.7592, "grad_norm": 0.5273909910098106, "learning_rate": 8.323681294879392e-08, "logits/chosen": 0.6015625, "logits/rejected": 0.578125, "logps/chosen": -3.296875, "logps/rejected": -3.84375, "loss": 0.6908, "loss/demonstration_loss": -113.0, "loss/preference_loss": -113.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.00031280517578125, "rewards/rejected": -0.00093841552734375, "step": 949 }, { "epoch": 0.76, "grad_norm": 1.9356019319087348, "learning_rate": 8.271734841028552e-08, "logits/chosen": 1.1875, "logits/rejected": 0.9453125, "logps/chosen": -106.0, "logps/rejected": -113.0, "loss": 0.6896, "loss/demonstration_loss": -498.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.006591796875, "rewards/margins": 0.009033203125, "rewards/rejected": -0.015625, "step": 950 }, { "epoch": 0.7608, "grad_norm": 1.010431075971755, "learning_rate": 8.219918837549769e-08, "logits/chosen": 0.53515625, "logits/rejected": 0.53125, "logps/chosen": -24.875, "logps/rejected": -24.875, "loss": 0.6908, "loss/demonstration_loss": -262.0, "loss/preference_loss": -262.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00531005859375, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.005950927734375, "step": 951 }, { "epoch": 0.7616, "grad_norm": 0.9027503585791814, "learning_rate": 8.168233688514654e-08, "logits/chosen": 0.73046875, "logits/rejected": 0.60546875, "logps/chosen": -28.375, "logps/rejected": -38.75, "loss": 0.6885, "loss/demonstration_loss": -540.0, "loss/preference_loss": -532.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00311279296875, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.006256103515625, "step": 952 }, { "epoch": 0.7624, "grad_norm": 4.253649539474107, "learning_rate": 8.116679796974387e-08, "logits/chosen": 0.76171875, "logits/rejected": 0.85546875, "logps/chosen": -34.75, "logps/rejected": -18.25, "loss": 0.6953, "loss/demonstration_loss": -280.0, "loss/preference_loss": -282.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00439453125, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.00124359130859375, "step": 953 }, { "epoch": 0.7632, "grad_norm": 1.71730243110936, "learning_rate": 8.065257564956571e-08, "logits/chosen": 0.8515625, "logits/rejected": 0.8515625, "logps/chosen": -38.0, "logps/rejected": -29.625, "loss": 0.6962, "loss/demonstration_loss": -354.0, "loss/preference_loss": -362.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01031494140625, "rewards/margins": -0.0146484375, "rewards/rejected": 0.00439453125, "step": 954 }, { "epoch": 0.764, "grad_norm": 1.3023238417469747, "learning_rate": 8.013967393462093e-08, "logits/chosen": 0.7421875, "logits/rejected": 0.72265625, "logps/chosen": -52.0, "logps/rejected": -46.0, "loss": 0.694, "loss/demonstration_loss": -516.0, "loss/preference_loss": -520.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.004058837890625, "rewards/rejected": -0.006561279296875, "step": 955 }, { "epoch": 0.7648, "grad_norm": 1.4326478619514535, "learning_rate": 7.962809682462007e-08, "logits/chosen": 0.490234375, "logits/rejected": 0.482421875, "logps/chosen": -22.0, "logps/rejected": -21.5, "loss": 0.6946, "loss/demonstration_loss": -338.0, "loss/preference_loss": -344.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01123046875, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.0062255859375, "step": 956 }, { "epoch": 0.7656, "grad_norm": 1.406391160690902, "learning_rate": 7.911784830894439e-08, "logits/chosen": 0.765625, "logits/rejected": 1.0234375, "logps/chosen": -29.75, "logps/rejected": -13.125, "loss": 0.6951, "loss/demonstration_loss": -336.0, "loss/preference_loss": -342.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.00531005859375, "rewards/rejected": -0.002197265625, "step": 957 }, { "epoch": 0.7664, "grad_norm": 1.6802267773256105, "learning_rate": 7.860893236661411e-08, "logits/chosen": 0.6953125, "logits/rejected": 0.625, "logps/chosen": -7.15625, "logps/rejected": -20.375, "loss": 0.6957, "loss/demonstration_loss": -436.0, "loss/preference_loss": -440.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0021820068359375, "rewards/margins": -0.0021820068359375, "rewards/rejected": 0.0, "step": 958 }, { "epoch": 0.7672, "grad_norm": 1.1949770979679704, "learning_rate": 7.810135296625817e-08, "logits/chosen": 0.8984375, "logits/rejected": 0.9765625, "logps/chosen": -38.5, "logps/rejected": -32.25, "loss": 0.6907, "loss/demonstration_loss": -280.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00469970703125, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.0072021484375, "step": 959 }, { "epoch": 0.768, "grad_norm": 1.3428572489247181, "learning_rate": 7.759511406608255e-08, "logits/chosen": 0.921875, "logits/rejected": 0.91796875, "logps/chosen": -35.25, "logps/rejected": -35.75, "loss": 0.6925, "loss/demonstration_loss": -376.0, "loss/preference_loss": -374.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005615234375, "rewards/margins": 0.0034332275390625, "rewards/rejected": -0.009033203125, "step": 960 }, { "epoch": 0.7688, "grad_norm": 1.5468272753485568, "learning_rate": 7.709021961384e-08, "logits/chosen": 0.875, "logits/rejected": 0.81640625, "logps/chosen": -57.5, "logps/rejected": -61.0, "loss": 0.6902, "loss/demonstration_loss": -314.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.004058837890625, "rewards/margins": 0.0087890625, "rewards/rejected": -0.0128173828125, "step": 961 }, { "epoch": 0.7696, "grad_norm": 1.531032078889124, "learning_rate": 7.658667354679879e-08, "logits/chosen": 0.578125, "logits/rejected": 0.439453125, "logps/chosen": -42.5, "logps/rejected": -53.0, "loss": 0.6913, "loss/demonstration_loss": -380.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0029754638671875, "rewards/margins": 0.0020294189453125, "rewards/rejected": -0.0050048828125, "step": 962 }, { "epoch": 0.7704, "grad_norm": 1.833279236053714, "learning_rate": 7.608447979171228e-08, "logits/chosen": 0.875, "logits/rejected": 0.8203125, "logps/chosen": -32.0, "logps/rejected": -41.25, "loss": 0.691, "loss/demonstration_loss": -292.0, "loss/preference_loss": -288.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.0072021484375, "rewards/rejected": -0.01031494140625, "step": 963 }, { "epoch": 0.7712, "grad_norm": 1.720229714146368, "learning_rate": 7.558364226478841e-08, "logits/chosen": 0.828125, "logits/rejected": 0.65625, "logps/chosen": -53.75, "logps/rejected": -72.0, "loss": 0.6924, "loss/demonstration_loss": -496.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01422119140625, "rewards/margins": 0.0020599365234375, "rewards/rejected": -0.0162353515625, "step": 964 }, { "epoch": 0.772, "grad_norm": 0.8977415727935738, "learning_rate": 7.508416487165862e-08, "logits/chosen": 0.578125, "logits/rejected": 0.65234375, "logps/chosen": -32.75, "logps/rejected": -30.25, "loss": 0.6886, "loss/demonstration_loss": -504.0, "loss/preference_loss": -494.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.010009765625, "rewards/rejected": -0.01123046875, "step": 965 }, { "epoch": 0.7728, "grad_norm": 19.65756505829685, "learning_rate": 7.458605150734815e-08, "logits/chosen": 0.79296875, "logits/rejected": 0.8515625, "logps/chosen": -44.5, "logps/rejected": -30.25, "loss": 0.694, "loss/demonstration_loss": -392.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.00689697265625, "rewards/rejected": -0.0050048828125, "step": 966 }, { "epoch": 0.7736, "grad_norm": 0.9334306418408576, "learning_rate": 7.408930605624497e-08, "logits/chosen": 0.71875, "logits/rejected": 0.68359375, "logps/chosen": -26.5, "logps/rejected": -29.25, "loss": 0.6925, "loss/demonstration_loss": -292.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.0050048828125, "step": 967 }, { "epoch": 0.7744, "grad_norm": 1.201853848183564, "learning_rate": 7.359393239206991e-08, "logits/chosen": 0.6875, "logits/rejected": 0.70703125, "logps/chosen": -45.5, "logps/rejected": -45.0, "loss": 0.6923, "loss/demonstration_loss": -480.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.00469970703125, "step": 968 }, { "epoch": 0.7752, "grad_norm": 0.9763552446380445, "learning_rate": 7.309993437784623e-08, "logits/chosen": 0.51953125, "logits/rejected": 0.5703125, "logps/chosen": -63.5, "logps/rejected": -58.75, "loss": 0.693, "loss/demonstration_loss": -482.0, "loss/preference_loss": -484.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.016845703125, "rewards/margins": -0.006256103515625, "rewards/rejected": -0.0106201171875, "step": 969 }, { "epoch": 0.776, "grad_norm": 2.2280749598959244, "learning_rate": 7.260731586586982e-08, "logits/chosen": 0.91015625, "logits/rejected": 0.8984375, "logps/chosen": -44.25, "logps/rejected": -64.0, "loss": 0.693, "loss/demonstration_loss": -286.0, "loss/preference_loss": -284.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01092529296875, "rewards/margins": 0.004058837890625, "rewards/rejected": -0.0150146484375, "step": 970 }, { "epoch": 0.7768, "grad_norm": 1.8801210106089712, "learning_rate": 7.211608069767867e-08, "logits/chosen": 0.88671875, "logits/rejected": 0.9140625, "logps/chosen": -51.0, "logps/rejected": -45.0, "loss": 0.6902, "loss/demonstration_loss": -302.0, "loss/preference_loss": -302.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0155029296875, "rewards/margins": 0.0026397705078125, "rewards/rejected": -0.0181884765625, "step": 971 }, { "epoch": 0.7776, "grad_norm": 1.6099251683955174, "learning_rate": 7.162623270402335e-08, "logits/chosen": 0.83203125, "logits/rejected": 0.74609375, "logps/chosen": -45.25, "logps/rejected": -67.0, "loss": 0.6907, "loss/demonstration_loss": -296.0, "loss/preference_loss": -294.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01312255859375, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.017822265625, "step": 972 }, { "epoch": 0.7784, "grad_norm": 1.308462131037711, "learning_rate": 7.1137775704837e-08, "logits/chosen": 0.3515625, "logits/rejected": 0.4375, "logps/chosen": -30.5, "logps/rejected": -30.125, "loss": 0.6907, "loss/demonstration_loss": -324.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0018768310546875, "step": 973 }, { "epoch": 0.7792, "grad_norm": 1.3205262554885735, "learning_rate": 7.065071350920538e-08, "logits/chosen": 0.8359375, "logits/rejected": 0.9453125, "logps/chosen": -44.25, "logps/rejected": -40.25, "loss": 0.6936, "loss/demonstration_loss": -446.0, "loss/preference_loss": -448.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01123046875, "rewards/margins": -0.0040283203125, "rewards/rejected": -0.0072021484375, "step": 974 }, { "epoch": 0.78, "grad_norm": 1.0484173613241718, "learning_rate": 7.016504991533726e-08, "logits/chosen": 0.51953125, "logits/rejected": 0.57421875, "logps/chosen": -30.0, "logps/rejected": -21.25, "loss": 0.6921, "loss/demonstration_loss": -404.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.003448486328125, "rewards/rejected": -0.002197265625, "step": 975 }, { "epoch": 0.7808, "grad_norm": 1.3238563530547462, "learning_rate": 6.968078871053487e-08, "logits/chosen": 0.625, "logits/rejected": 0.5859375, "logps/chosen": -45.25, "logps/rejected": -48.5, "loss": 0.6885, "loss/demonstration_loss": -374.0, "loss/preference_loss": -370.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.010009765625, "rewards/rejected": -0.0093994140625, "step": 976 }, { "epoch": 0.7816, "grad_norm": 1.524782026349477, "learning_rate": 6.919793367116453e-08, "logits/chosen": 0.89453125, "logits/rejected": 1.0546875, "logps/chosen": -73.0, "logps/rejected": -63.0, "loss": 0.693, "loss/demonstration_loss": -540.0, "loss/preference_loss": -540.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0093994140625, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0106201171875, "step": 977 }, { "epoch": 0.7824, "grad_norm": 0.874797966594204, "learning_rate": 6.871648856262665e-08, "logits/chosen": 0.84375, "logits/rejected": 0.7578125, "logps/chosen": -19.25, "logps/rejected": -30.75, "loss": 0.6898, "loss/demonstration_loss": -408.0, "loss/preference_loss": -400.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00811767578125, "rewards/margins": 0.006866455078125, "rewards/rejected": 0.001251220703125, "step": 978 }, { "epoch": 0.7832, "grad_norm": 2.549201122338777, "learning_rate": 6.823645713932708e-08, "logits/chosen": 0.53125, "logits/rejected": 0.515625, "logps/chosen": -37.25, "logps/rejected": -37.25, "loss": 0.6907, "loss/demonstration_loss": -394.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01031494140625, "rewards/margins": -0.00311279296875, "rewards/rejected": -0.0072021484375, "step": 979 }, { "epoch": 0.784, "grad_norm": 4.082747039662025, "learning_rate": 6.775784314464716e-08, "logits/chosen": 0.66015625, "logits/rejected": 0.6796875, "logps/chosen": -17.75, "logps/rejected": -17.75, "loss": 0.6923, "loss/demonstration_loss": -564.0, "loss/preference_loss": -564.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0, "rewards/rejected": -0.00250244140625, "step": 980 }, { "epoch": 0.7848, "grad_norm": 1.514239368820571, "learning_rate": 6.728065031091502e-08, "logits/chosen": 0.53125, "logits/rejected": 0.51171875, "logps/chosen": -73.0, "logps/rejected": -70.5, "loss": 0.689, "loss/demonstration_loss": -456.0, "loss/preference_loss": -452.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.01312255859375, "rewards/rejected": -0.01806640625, "step": 981 }, { "epoch": 0.7856, "grad_norm": 1.4341844608404033, "learning_rate": 6.680488235937613e-08, "logits/chosen": 0.7421875, "logits/rejected": 0.60546875, "logps/chosen": -31.75, "logps/rejected": -46.0, "loss": 0.6927, "loss/demonstration_loss": -616.0, "loss/preference_loss": -616.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.006256103515625, "step": 982 }, { "epoch": 0.7864, "grad_norm": 5.220169146086935, "learning_rate": 6.633054300016464e-08, "logits/chosen": 0.75, "logits/rejected": 0.86328125, "logps/chosen": -57.25, "logps/rejected": -47.25, "loss": 0.6978, "loss/demonstration_loss": -330.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01409912109375, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.00909423828125, "step": 983 }, { "epoch": 0.7872, "grad_norm": 1.062812176783423, "learning_rate": 6.585763593227419e-08, "logits/chosen": 0.482421875, "logits/rejected": 0.5, "logps/chosen": -7.90625, "logps/rejected": -7.9375, "loss": 0.6907, "loss/demonstration_loss": -252.0, "loss/preference_loss": -251.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.00031280517578125, "rewards/rejected": -0.00156402587890625, "step": 984 }, { "epoch": 0.788, "grad_norm": 1.620141991941391, "learning_rate": 6.538616484352902e-08, "logits/chosen": 0.69140625, "logits/rejected": 0.98828125, "logps/chosen": -45.25, "logps/rejected": -13.625, "loss": 0.6967, "loss/demonstration_loss": -310.0, "loss/preference_loss": -314.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.00640869140625, "rewards/rejected": -0.0017242431640625, "step": 985 }, { "epoch": 0.7888, "grad_norm": 1.4595662662248503, "learning_rate": 6.491613341055546e-08, "logits/chosen": 0.61328125, "logits/rejected": 0.6015625, "logps/chosen": -37.5, "logps/rejected": -36.5, "loss": 0.6885, "loss/demonstration_loss": -394.0, "loss/preference_loss": -390.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.009033203125, "rewards/rejected": -0.01153564453125, "step": 986 }, { "epoch": 0.7896, "grad_norm": 0.7495256853487566, "learning_rate": 6.444754529875302e-08, "logits/chosen": 0.83984375, "logits/rejected": 0.71875, "logps/chosen": -2.984375, "logps/rejected": -22.25, "loss": 0.6901, "loss/demonstration_loss": -402.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00140380859375, "rewards/margins": 0.004852294921875, "rewards/rejected": -0.006256103515625, "step": 987 }, { "epoch": 0.7904, "grad_norm": 1.434797712706959, "learning_rate": 6.398040416226591e-08, "logits/chosen": 0.66015625, "logits/rejected": 0.55078125, "logps/chosen": -53.25, "logps/rejected": -72.5, "loss": 0.6929, "loss/demonstration_loss": -334.0, "loss/preference_loss": -336.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00421142578125, "rewards/margins": -0.002044677734375, "rewards/rejected": -0.002197265625, "step": 988 }, { "epoch": 0.7912, "grad_norm": 2.136310317703171, "learning_rate": 6.351471364395447e-08, "logits/chosen": 0.7421875, "logits/rejected": 0.65625, "logps/chosen": -55.25, "logps/rejected": -61.5, "loss": 0.6919, "loss/demonstration_loss": -464.0, "loss/preference_loss": -466.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.0006103515625, "rewards/rejected": -0.00689697265625, "step": 989 }, { "epoch": 0.792, "grad_norm": 1.2351154494088756, "learning_rate": 6.305047737536707e-08, "logits/chosen": 0.75, "logits/rejected": 0.65625, "logps/chosen": -30.75, "logps/rejected": -43.0, "loss": 0.6913, "loss/demonstration_loss": -584.0, "loss/preference_loss": -584.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0087890625, "step": 990 }, { "epoch": 0.7928, "grad_norm": 1.4988816919334704, "learning_rate": 6.258769897671124e-08, "logits/chosen": 0.4453125, "logits/rejected": 0.44921875, "logps/chosen": -55.0, "logps/rejected": -54.75, "loss": 0.6914, "loss/demonstration_loss": -576.0, "loss/preference_loss": -580.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0162353515625, "rewards/margins": -0.002227783203125, "rewards/rejected": -0.0140380859375, "step": 991 }, { "epoch": 0.7936, "grad_norm": 1.5063813681542284, "learning_rate": 6.212638205682599e-08, "logits/chosen": 0.7265625, "logits/rejected": 0.75390625, "logps/chosen": -51.25, "logps/rejected": -54.5, "loss": 0.694, "loss/demonstration_loss": -418.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.010009765625, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.0050048828125, "step": 992 }, { "epoch": 0.7944, "grad_norm": 1.4643184744611846, "learning_rate": 6.166653021315335e-08, "logits/chosen": 0.80078125, "logits/rejected": 0.94140625, "logps/chosen": -34.5, "logps/rejected": -18.625, "loss": 0.6967, "loss/demonstration_loss": -278.0, "loss/preference_loss": -284.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.015625, "rewards/rejected": 0.0018768310546875, "step": 993 }, { "epoch": 0.7952, "grad_norm": 1.9726873630080421, "learning_rate": 6.120814703171023e-08, "logits/chosen": 0.96875, "logits/rejected": 0.83203125, "logps/chosen": -79.0, "logps/rejected": -99.5, "loss": 0.6923, "loss/demonstration_loss": -472.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.005645751953125, "rewards/rejected": -0.0062255859375, "step": 994 }, { "epoch": 0.796, "grad_norm": 1.8227750839023964, "learning_rate": 6.075123608706093e-08, "logits/chosen": 0.59765625, "logits/rejected": 0.70703125, "logps/chosen": -77.5, "logps/rejected": -64.5, "loss": 0.6931, "loss/demonstration_loss": -560.0, "loss/preference_loss": -564.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01806640625, "rewards/margins": -0.013427734375, "rewards/rejected": -0.00469970703125, "step": 995 }, { "epoch": 0.7968, "grad_norm": 1.4561480150923762, "learning_rate": 6.029580094228862e-08, "logits/chosen": 0.609375, "logits/rejected": 0.6953125, "logps/chosen": -59.25, "logps/rejected": -53.5, "loss": 0.6923, "loss/demonstration_loss": -596.0, "loss/preference_loss": -596.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00689697265625, "step": 996 }, { "epoch": 0.7976, "grad_norm": 1.128869175092203, "learning_rate": 5.98418451489682e-08, "logits/chosen": 0.71875, "logits/rejected": 0.7734375, "logps/chosen": -12.9375, "logps/rejected": -4.71875, "loss": 0.6924, "loss/demonstration_loss": -276.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.0018768310546875, "step": 997 }, { "epoch": 0.7984, "grad_norm": 1.2677294128520422, "learning_rate": 5.9389372247138e-08, "logits/chosen": 0.69921875, "logits/rejected": 0.8671875, "logps/chosen": -65.0, "logps/rejected": -59.5, "loss": 0.6923, "loss/demonstration_loss": -398.0, "loss/preference_loss": -398.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.00189208984375, "rewards/rejected": -0.0012664794921875, "step": 998 }, { "epoch": 0.7992, "grad_norm": 0.6507283268439041, "learning_rate": 5.893838576527274e-08, "logits/chosen": 0.451171875, "logits/rejected": 0.435546875, "logps/chosen": -34.5, "logps/rejected": -34.5, "loss": 0.6915, "loss/demonstration_loss": -1096.0, "loss/preference_loss": -1096.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.0, "rewards/rejected": -0.0050048828125, "step": 999 }, { "epoch": 0.8, "grad_norm": 1.2354204517157883, "learning_rate": 5.848888922025552e-08, "logits/chosen": 0.6328125, "logits/rejected": 0.55078125, "logps/chosen": -15.25, "logps/rejected": -30.875, "loss": 0.6907, "loss/demonstration_loss": -368.0, "loss/preference_loss": -368.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.002197265625, "rewards/margins": 0.0, "rewards/rejected": -0.002197265625, "step": 1000 }, { "epoch": 0.8008, "grad_norm": 1.4749003888384493, "learning_rate": 5.80408861173507e-08, "logits/chosen": 0.8671875, "logits/rejected": 0.8046875, "logps/chosen": -112.5, "logps/rejected": -108.0, "loss": 0.6931, "loss/demonstration_loss": -502.0, "loss/preference_loss": -502.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01312255859375, "rewards/margins": 0.0012359619140625, "rewards/rejected": -0.014404296875, "step": 1001 }, { "epoch": 0.8016, "grad_norm": 1.1322154371023632, "learning_rate": 5.759437995017638e-08, "logits/chosen": 0.6328125, "logits/rejected": 0.828125, "logps/chosen": -73.5, "logps/rejected": -68.0, "loss": 0.6899, "loss/demonstration_loss": -374.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01251220703125, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.0162353515625, "step": 1002 }, { "epoch": 0.8024, "grad_norm": 1.5839167755019032, "learning_rate": 5.714937420067745e-08, "logits/chosen": 0.453125, "logits/rejected": 0.47265625, "logps/chosen": -20.0, "logps/rejected": -20.125, "loss": 0.6923, "loss/demonstration_loss": -640.0, "loss/preference_loss": -640.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.001251220703125, "step": 1003 }, { "epoch": 0.8032, "grad_norm": 1.511326815128913, "learning_rate": 5.670587233909818e-08, "logits/chosen": 0.6015625, "logits/rejected": 0.6171875, "logps/chosen": -46.0, "logps/rejected": -34.0, "loss": 0.6931, "loss/demonstration_loss": -420.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.000335693359375, "rewards/rejected": -0.01251220703125, "step": 1004 }, { "epoch": 0.804, "grad_norm": 1.1301674436891789, "learning_rate": 5.6263877823955115e-08, "logits/chosen": 0.6953125, "logits/rejected": 0.71875, "logps/chosen": -19.0, "logps/rejected": -17.875, "loss": 0.6921, "loss/demonstration_loss": -290.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.005645751953125, "rewards/rejected": -0.0006256103515625, "step": 1005 }, { "epoch": 0.8048, "grad_norm": 0.8859121649317453, "learning_rate": 5.5823394102010285e-08, "logits/chosen": 0.7890625, "logits/rejected": 0.7890625, "logps/chosen": -10.375, "logps/rejected": -9.8125, "loss": 0.6929, "loss/demonstration_loss": -312.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.005615234375, "rewards/rejected": -0.0006256103515625, "step": 1006 }, { "epoch": 0.8056, "grad_norm": 1.5232093448024755, "learning_rate": 5.538442460824416e-08, "logits/chosen": 0.640625, "logits/rejected": 0.65625, "logps/chosen": -90.0, "logps/rejected": -86.0, "loss": 0.6921, "loss/demonstration_loss": -466.0, "loss/preference_loss": -468.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0174560546875, "rewards/margins": -0.0076904296875, "rewards/rejected": -0.00982666015625, "step": 1007 }, { "epoch": 0.8064, "grad_norm": 1.5837103979700113, "learning_rate": 5.4946972765829156e-08, "logits/chosen": 0.79296875, "logits/rejected": 0.8203125, "logps/chosen": -35.5, "logps/rejected": -33.0, "loss": 0.6898, "loss/demonstration_loss": -552.0, "loss/preference_loss": -544.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00250244140625, "rewards/margins": 0.00811767578125, "rewards/rejected": -0.005615234375, "step": 1008 }, { "epoch": 0.8072, "grad_norm": 1.7599903145404843, "learning_rate": 5.451104198610248e-08, "logits/chosen": 0.91015625, "logits/rejected": 0.9609375, "logps/chosen": -54.0, "logps/rejected": -45.25, "loss": 0.6953, "loss/demonstration_loss": -394.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0072021484375, "rewards/margins": -0.0034332275390625, "rewards/rejected": -0.003753662109375, "step": 1009 }, { "epoch": 0.808, "grad_norm": 1.4904826893337975, "learning_rate": 5.4076635668540065e-08, "logits/chosen": 0.8671875, "logits/rejected": 0.765625, "logps/chosen": -53.25, "logps/rejected": -59.25, "loss": 0.6901, "loss/demonstration_loss": -304.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.0107421875, "rewards/margins": 0.00921630859375, "rewards/rejected": 0.00156402587890625, "step": 1010 }, { "epoch": 0.8088, "grad_norm": 1.0293214598062261, "learning_rate": 5.364375720072953e-08, "logits/chosen": 0.71484375, "logits/rejected": 0.80078125, "logps/chosen": -19.375, "logps/rejected": -4.375, "loss": 0.6901, "loss/demonstration_loss": -380.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.0006256103515625, "step": 1011 }, { "epoch": 0.8096, "grad_norm": 1.4427101031249194, "learning_rate": 5.3212409958344195e-08, "logits/chosen": 0.890625, "logits/rejected": 0.73828125, "logps/chosen": -36.25, "logps/rejected": -57.0, "loss": 0.691, "loss/demonstration_loss": -496.0, "loss/preference_loss": -496.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0050048828125, "step": 1012 }, { "epoch": 0.8104, "grad_norm": 10.103853698331122, "learning_rate": 5.27825973051165e-08, "logits/chosen": 1.109375, "logits/rejected": 1.0234375, "logps/chosen": -37.0, "logps/rejected": -52.0, "loss": 0.6906, "loss/demonstration_loss": -472.0, "loss/preference_loss": -470.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.0024871826171875, "rewards/rejected": -0.0087890625, "step": 1013 }, { "epoch": 0.8112, "grad_norm": 1.646613105428865, "learning_rate": 5.235432259281175e-08, "logits/chosen": 0.921875, "logits/rejected": 0.9140625, "logps/chosen": -36.0, "logps/rejected": -38.5, "loss": 0.6926, "loss/demonstration_loss": -390.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0115966796875, "rewards/margins": -0.00750732421875, "rewards/rejected": -0.004058837890625, "step": 1014 }, { "epoch": 0.812, "grad_norm": 2.10676564020642, "learning_rate": 5.192758916120235e-08, "logits/chosen": 1.1015625, "logits/rejected": 0.9453125, "logps/chosen": -49.25, "logps/rejected": -66.0, "loss": 0.694, "loss/demonstration_loss": -458.0, "loss/preference_loss": -458.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.00811767578125, "step": 1015 }, { "epoch": 0.8128, "grad_norm": 1.4161507574255257, "learning_rate": 5.150240033804115e-08, "logits/chosen": 0.76953125, "logits/rejected": 0.8671875, "logps/chosen": -17.875, "logps/rejected": -13.0625, "loss": 0.6929, "loss/demonstration_loss": -246.0, "loss/preference_loss": -249.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00156402587890625, "rewards/margins": -0.0028076171875, "rewards/rejected": 0.001251220703125, "step": 1016 }, { "epoch": 0.8136, "grad_norm": 1.2913430125878609, "learning_rate": 5.107875943903614e-08, "logits/chosen": 0.80859375, "logits/rejected": 0.84765625, "logps/chosen": -40.75, "logps/rejected": -34.75, "loss": 0.6929, "loss/demonstration_loss": -400.0, "loss/preference_loss": -402.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.005950927734375, "rewards/rejected": -0.00031280517578125, "step": 1017 }, { "epoch": 0.8144, "grad_norm": 1.5088124569917867, "learning_rate": 5.065666976782412e-08, "logits/chosen": 0.5234375, "logits/rejected": 0.59375, "logps/chosen": -62.5, "logps/rejected": -60.5, "loss": 0.6952, "loss/demonstration_loss": -392.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.005615234375, "rewards/rejected": -0.003753662109375, "step": 1018 }, { "epoch": 0.8152, "grad_norm": 1.0038374528239902, "learning_rate": 5.023613461594511e-08, "logits/chosen": 0.578125, "logits/rejected": 0.63671875, "logps/chosen": -10.875, "logps/rejected": -10.875, "loss": 0.6895, "loss/demonstration_loss": -348.0, "loss/preference_loss": -348.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.0, "rewards/rejected": -0.0006256103515625, "step": 1019 }, { "epoch": 0.816, "grad_norm": 0.6183201431792337, "learning_rate": 4.981715726281666e-08, "logits/chosen": 0.83203125, "logits/rejected": 0.90234375, "logps/chosen": -16.875, "logps/rejected": -3.328125, "loss": 0.6919, "loss/demonstration_loss": -320.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.002197265625, "rewards/rejected": -0.00031280517578125, "step": 1020 }, { "epoch": 0.8168, "grad_norm": 0.9225760496714414, "learning_rate": 4.93997409757084e-08, "logits/chosen": 0.85546875, "logits/rejected": 0.88671875, "logps/chosen": -35.25, "logps/rejected": -35.25, "loss": 0.692, "loss/demonstration_loss": -560.0, "loss/preference_loss": -560.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.0031280517578125, "step": 1021 }, { "epoch": 0.8176, "grad_norm": 1.6340181870328139, "learning_rate": 4.898388900971634e-08, "logits/chosen": 0.8203125, "logits/rejected": 0.94140625, "logps/chosen": -109.0, "logps/rejected": -96.0, "loss": 0.6949, "loss/demonstration_loss": -648.0, "loss/preference_loss": -648.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.03125, "rewards/margins": -0.0146484375, "rewards/rejected": -0.0166015625, "step": 1022 }, { "epoch": 0.8184, "grad_norm": 1.6195242364351694, "learning_rate": 4.856960460773765e-08, "logits/chosen": 0.83203125, "logits/rejected": 0.91015625, "logps/chosen": -84.0, "logps/rejected": -65.0, "loss": 0.6937, "loss/demonstration_loss": -472.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01129150390625, "rewards/margins": -0.0012664794921875, "rewards/rejected": -0.010009765625, "step": 1023 }, { "epoch": 0.8192, "grad_norm": 1.6045266179905944, "learning_rate": 4.81568910004454e-08, "logits/chosen": 0.6796875, "logits/rejected": 0.671875, "logps/chosen": -25.875, "logps/rejected": -25.875, "loss": 0.6918, "loss/demonstration_loss": -828.0, "loss/preference_loss": -828.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1024 }, { "epoch": 0.82, "grad_norm": 0.6597775758286529, "learning_rate": 4.774575140626316e-08, "logits/chosen": 0.7734375, "logits/rejected": 0.79296875, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6907, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1025 }, { "epoch": 0.8208, "grad_norm": 2.0085705824306896, "learning_rate": 4.733618903134004e-08, "logits/chosen": 0.6796875, "logits/rejected": 0.6953125, "logps/chosen": -40.75, "logps/rejected": -53.0, "loss": 0.6838, "loss/demonstration_loss": -374.0, "loss/preference_loss": -362.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.000782012939453125, "rewards/margins": 0.028564453125, "rewards/rejected": -0.0294189453125, "step": 1026 }, { "epoch": 0.8216, "grad_norm": 1.067353276053082, "learning_rate": 4.692820706952569e-08, "logits/chosen": 0.4140625, "logits/rejected": 0.388671875, "logps/chosen": -62.5, "logps/rejected": -61.0, "loss": 0.6912, "loss/demonstration_loss": -394.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.000640869140625, "rewards/rejected": -0.0087890625, "step": 1027 }, { "epoch": 0.8224, "grad_norm": 0.8564305754846535, "learning_rate": 4.652180870234551e-08, "logits/chosen": 0.83203125, "logits/rejected": 0.8125, "logps/chosen": -34.0, "logps/rejected": -34.75, "loss": 0.691, "loss/demonstration_loss": -218.0, "loss/preference_loss": -217.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.007354736328125, "rewards/margins": 0.0015716552734375, "rewards/rejected": -0.0089111328125, "step": 1028 }, { "epoch": 0.8232, "grad_norm": 1.2188739356485379, "learning_rate": 4.611699709897546e-08, "logits/chosen": 0.6484375, "logits/rejected": 0.61328125, "logps/chosen": -10.375, "logps/rejected": -10.875, "loss": 0.6927, "loss/demonstration_loss": -344.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0018768310546875, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.0006256103515625, "step": 1029 }, { "epoch": 0.824, "grad_norm": 1.0991396845552102, "learning_rate": 4.5713775416217875e-08, "logits/chosen": 0.90625, "logits/rejected": 0.8359375, "logps/chosen": -6.6875, "logps/rejected": -13.875, "loss": 0.6891, "loss/demonstration_loss": -330.0, "loss/preference_loss": -320.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.0062255859375, "rewards/rejected": -0.005615234375, "step": 1030 }, { "epoch": 0.8248, "grad_norm": 1.7925938102337986, "learning_rate": 4.531214679847639e-08, "logits/chosen": 0.8515625, "logits/rejected": 0.83203125, "logps/chosen": -27.625, "logps/rejected": -28.375, "loss": 0.6873, "loss/demonstration_loss": -296.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.003448486328125, "rewards/margins": 0.0037384033203125, "rewards/rejected": -0.0072021484375, "step": 1031 }, { "epoch": 0.8256, "grad_norm": 1.8419316063760678, "learning_rate": 4.491211437773165e-08, "logits/chosen": 0.66015625, "logits/rejected": 0.62890625, "logps/chosen": -46.25, "logps/rejected": -43.75, "loss": 0.6927, "loss/demonstration_loss": -286.0, "loss/preference_loss": -288.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004547119140625, "rewards/margins": -0.007049560546875, "rewards/rejected": 0.00250244140625, "step": 1032 }, { "epoch": 0.8264, "grad_norm": 0.926130001303649, "learning_rate": 4.451368127351674e-08, "logits/chosen": 0.65234375, "logits/rejected": 0.65234375, "logps/chosen": -7.09375, "logps/rejected": -9.875, "loss": 0.6895, "loss/demonstration_loss": -266.0, "loss/preference_loss": -266.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0031280517578125, "step": 1033 }, { "epoch": 0.8272, "grad_norm": 1.6238646009869515, "learning_rate": 4.4116850592893135e-08, "logits/chosen": 0.859375, "logits/rejected": 0.89453125, "logps/chosen": -48.75, "logps/rejected": -47.25, "loss": 0.6912, "loss/demonstration_loss": -508.0, "loss/preference_loss": -506.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00970458984375, "rewards/margins": 0.0, "rewards/rejected": -0.00970458984375, "step": 1034 }, { "epoch": 0.828, "grad_norm": 1.595234126533992, "learning_rate": 4.372162543042623e-08, "logits/chosen": 0.61328125, "logits/rejected": 0.59765625, "logps/chosen": -23.75, "logps/rejected": -23.875, "loss": 0.6898, "loss/demonstration_loss": -374.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00872802734375, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.010009765625, "step": 1035 }, { "epoch": 0.8288, "grad_norm": 1.3539761165188222, "learning_rate": 4.3328008868161124e-08, "logits/chosen": 0.72265625, "logits/rejected": 0.703125, "logps/chosen": -30.125, "logps/rejected": -30.375, "loss": 0.6925, "loss/demonstration_loss": -484.0, "loss/preference_loss": -480.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00093841552734375, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.003448486328125, "step": 1036 }, { "epoch": 0.8296, "grad_norm": 1.3295654327654085, "learning_rate": 4.293600397559896e-08, "logits/chosen": 0.796875, "logits/rejected": 0.77734375, "logps/chosen": -47.75, "logps/rejected": -51.5, "loss": 0.6914, "loss/demonstration_loss": -394.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.005157470703125, "rewards/rejected": -0.01202392578125, "step": 1037 }, { "epoch": 0.8304, "grad_norm": 1.3325427223545998, "learning_rate": 4.254561380967259e-08, "logits/chosen": 0.498046875, "logits/rejected": 0.59375, "logps/chosen": -53.75, "logps/rejected": -58.75, "loss": 0.6926, "loss/demonstration_loss": -592.0, "loss/preference_loss": -592.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01434326171875, "rewards/margins": -0.00372314453125, "rewards/rejected": -0.0106201171875, "step": 1038 }, { "epoch": 0.8312, "grad_norm": 1.5422215379099904, "learning_rate": 4.215684141472292e-08, "logits/chosen": 0.6171875, "logits/rejected": 0.703125, "logps/chosen": -59.25, "logps/rejected": -54.5, "loss": 0.6891, "loss/demonstration_loss": -452.0, "loss/preference_loss": -448.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.00970458984375, "rewards/rejected": -0.0159912109375, "step": 1039 }, { "epoch": 0.832, "grad_norm": 1.6371624550961532, "learning_rate": 4.176968982247514e-08, "logits/chosen": 0.8359375, "logits/rejected": 0.75390625, "logps/chosen": -60.5, "logps/rejected": -77.5, "loss": 0.6899, "loss/demonstration_loss": -368.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.002655029296875, "rewards/margins": 0.0029754638671875, "rewards/rejected": -0.005615234375, "step": 1040 }, { "epoch": 0.8328, "grad_norm": 2.0544684634496515, "learning_rate": 4.1384162052015255e-08, "logits/chosen": 0.7890625, "logits/rejected": 0.8046875, "logps/chosen": -80.0, "logps/rejected": -74.5, "loss": 0.6898, "loss/demonstration_loss": -350.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01092529296875, "rewards/margins": 0.0078125, "rewards/rejected": -0.018798828125, "step": 1041 }, { "epoch": 0.8336, "grad_norm": 1.3945413792032841, "learning_rate": 4.100026110976615e-08, "logits/chosen": 0.87109375, "logits/rejected": 0.88671875, "logps/chosen": -41.0, "logps/rejected": -40.5, "loss": 0.6947, "loss/demonstration_loss": -644.0, "loss/preference_loss": -648.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.010009765625, "rewards/margins": -0.005615234375, "rewards/rejected": -0.00439453125, "step": 1042 }, { "epoch": 0.8344, "grad_norm": 1.149964609562771, "learning_rate": 4.061798998946459e-08, "logits/chosen": 0.6328125, "logits/rejected": 0.80078125, "logps/chosen": -64.0, "logps/rejected": -45.5, "loss": 0.6923, "loss/demonstration_loss": -580.0, "loss/preference_loss": -580.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0087890625, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.006256103515625, "step": 1043 }, { "epoch": 0.8352, "grad_norm": 1.7443260891832302, "learning_rate": 4.023735167213751e-08, "logits/chosen": 0.5859375, "logits/rejected": 0.71875, "logps/chosen": -70.0, "logps/rejected": -57.5, "loss": 0.6936, "loss/demonstration_loss": -336.0, "loss/preference_loss": -336.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.015625, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.0137939453125, "step": 1044 }, { "epoch": 0.836, "grad_norm": 1.1322347552140417, "learning_rate": 3.9858349126078936e-08, "logits/chosen": 0.69140625, "logits/rejected": 0.79296875, "logps/chosen": -22.0, "logps/rejected": -9.625, "loss": 0.6914, "loss/demonstration_loss": -248.0, "loss/preference_loss": -252.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006866455078125, "rewards/margins": -0.0045166015625, "rewards/rejected": -0.002349853515625, "step": 1045 }, { "epoch": 0.8368, "grad_norm": 2.0903954890636873, "learning_rate": 3.9480985306826945e-08, "logits/chosen": 1.015625, "logits/rejected": 0.94921875, "logps/chosen": -85.5, "logps/rejected": -104.0, "loss": 0.6917, "loss/demonstration_loss": -502.0, "loss/preference_loss": -500.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0111083984375, "rewards/margins": 0.00360107421875, "rewards/rejected": -0.01470947265625, "step": 1046 }, { "epoch": 0.8376, "grad_norm": 0.6135775034114216, "learning_rate": 3.91052631571403e-08, "logits/chosen": 0.765625, "logits/rejected": 0.76171875, "logps/chosen": -15.5, "logps/rejected": -15.1875, "loss": 0.6926, "loss/demonstration_loss": -243.0, "loss/preference_loss": -246.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0029754638671875, "rewards/rejected": 0.000469207763671875, "step": 1047 }, { "epoch": 0.8384, "grad_norm": 1.3726979816097749, "learning_rate": 3.8731185606975916e-08, "logits/chosen": 0.8828125, "logits/rejected": 1.0546875, "logps/chosen": -36.5, "logps/rejected": -23.25, "loss": 0.6926, "loss/demonstration_loss": -476.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.005615234375, "step": 1048 }, { "epoch": 0.8392, "grad_norm": 1.1238338568703539, "learning_rate": 3.8358755573465516e-08, "logits/chosen": 0.6640625, "logits/rejected": 0.65625, "logps/chosen": -20.5, "logps/rejected": -25.5, "loss": 0.6906, "loss/demonstration_loss": -364.0, "loss/preference_loss": -364.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.0012359619140625, "rewards/rejected": -0.00439453125, "step": 1049 }, { "epoch": 0.84, "grad_norm": 1.4864957975994917, "learning_rate": 3.798797596089351e-08, "logits/chosen": 0.63671875, "logits/rejected": 0.64453125, "logps/chosen": -30.375, "logps/rejected": -26.625, "loss": 0.689, "loss/demonstration_loss": -452.0, "loss/preference_loss": -456.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00360107421875, "rewards/margins": -0.0017242431640625, "rewards/rejected": -0.0018768310546875, "step": 1050 }, { "epoch": 0.8408, "grad_norm": 1.3454451930677325, "learning_rate": 3.761884966067369e-08, "logits/chosen": 0.84375, "logits/rejected": 0.796875, "logps/chosen": -43.5, "logps/rejected": -41.5, "loss": 0.6914, "loss/demonstration_loss": -272.0, "loss/preference_loss": -270.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.002197265625, "rewards/margins": 0.00830078125, "rewards/rejected": -0.006103515625, "step": 1051 }, { "epoch": 0.8416, "grad_norm": 1.558596709422178, "learning_rate": 3.725137955132707e-08, "logits/chosen": 0.75390625, "logits/rejected": 0.7578125, "logps/chosen": -59.0, "logps/rejected": -60.25, "loss": 0.6893, "loss/demonstration_loss": -476.0, "loss/preference_loss": -474.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.00030517578125, "rewards/rejected": -0.0072021484375, "step": 1052 }, { "epoch": 0.8424, "grad_norm": 0.5606358111854268, "learning_rate": 3.688556849845939e-08, "logits/chosen": 0.8359375, "logits/rejected": 0.8359375, "logps/chosen": -14.0, "logps/rejected": -14.125, "loss": 0.6929, "loss/demonstration_loss": -452.0, "loss/preference_loss": -448.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0006256103515625, "step": 1053 }, { "epoch": 0.8432, "grad_norm": 1.465416666177844, "learning_rate": 3.652141935473873e-08, "logits/chosen": 0.7109375, "logits/rejected": 0.65234375, "logps/chosen": -40.0, "logps/rejected": -54.25, "loss": 0.691, "loss/demonstration_loss": -752.0, "loss/preference_loss": -744.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": 0.003143310546875, "rewards/rejected": -0.0087890625, "step": 1054 }, { "epoch": 0.844, "grad_norm": 1.1864142258337966, "learning_rate": 3.615893495987335e-08, "logits/chosen": 0.7109375, "logits/rejected": 0.765625, "logps/chosen": -54.75, "logps/rejected": -48.75, "loss": 0.6908, "loss/demonstration_loss": -548.0, "loss/preference_loss": -544.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.0068359375, "rewards/rejected": -0.01434326171875, "step": 1055 }, { "epoch": 0.8448, "grad_norm": 1.4768091014479339, "learning_rate": 3.5798118140589276e-08, "logits/chosen": 0.8203125, "logits/rejected": 0.81640625, "logps/chosen": -41.0, "logps/rejected": -43.25, "loss": 0.6914, "loss/demonstration_loss": -444.0, "loss/preference_loss": -450.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.0078125, "rewards/rejected": -0.00156402587890625, "step": 1056 }, { "epoch": 0.8456, "grad_norm": 0.4404615828297363, "learning_rate": 3.5438971710608614e-08, "logits/chosen": 0.6640625, "logits/rejected": 0.63671875, "logps/chosen": -2.140625, "logps/rejected": -4.875, "loss": 0.6919, "loss/demonstration_loss": -112.0, "loss/preference_loss": -112.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00031280517578125, "rewards/margins": 0.0, "rewards/rejected": -0.00031280517578125, "step": 1057 }, { "epoch": 0.8464, "grad_norm": 1.3873536348756044, "learning_rate": 3.508149847062724e-08, "logits/chosen": 0.9453125, "logits/rejected": 1.1015625, "logps/chosen": -46.0, "logps/rejected": -30.75, "loss": 0.6923, "loss/demonstration_loss": -304.0, "loss/preference_loss": -306.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": -0.00439453125, "rewards/rejected": -0.00124359130859375, "step": 1058 }, { "epoch": 0.8472, "grad_norm": 1.317046425303454, "learning_rate": 3.472570120829343e-08, "logits/chosen": 0.8984375, "logits/rejected": 0.94921875, "logps/chosen": -29.0, "logps/rejected": -36.5, "loss": 0.6852, "loss/demonstration_loss": -262.0, "loss/preference_loss": -256.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.000469207763671875, "rewards/margins": 0.01263427734375, "rewards/rejected": -0.01312255859375, "step": 1059 }, { "epoch": 0.848, "grad_norm": 1.5776124621685386, "learning_rate": 3.437158269818563e-08, "logits/chosen": 0.62890625, "logits/rejected": 0.57421875, "logps/chosen": -62.25, "logps/rejected": -77.5, "loss": 0.6881, "loss/demonstration_loss": -368.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.013427734375, "rewards/margins": 0.01220703125, "rewards/rejected": -0.025634765625, "step": 1060 }, { "epoch": 0.8488, "grad_norm": 1.410075074911165, "learning_rate": 3.401914570179118e-08, "logits/chosen": 0.88671875, "logits/rejected": 0.83203125, "logps/chosen": -47.0, "logps/rejected": -57.0, "loss": 0.6908, "loss/demonstration_loss": -332.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.003753662109375, "rewards/rejected": 1.52587890625e-05, "step": 1061 }, { "epoch": 0.8496, "grad_norm": 1.3192070543119652, "learning_rate": 3.36683929674845e-08, "logits/chosen": 0.67578125, "logits/rejected": 0.75, "logps/chosen": -55.5, "logps/rejected": -53.25, "loss": 0.6906, "loss/demonstration_loss": -580.0, "loss/preference_loss": -576.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.001251220703125, "rewards/margins": 0.00811767578125, "rewards/rejected": -0.00689697265625, "step": 1062 }, { "epoch": 0.8504, "grad_norm": 1.4655164914888594, "learning_rate": 3.331932723050596e-08, "logits/chosen": 0.62109375, "logits/rejected": 0.7265625, "logps/chosen": -63.0, "logps/rejected": -60.5, "loss": 0.6935, "loss/demonstration_loss": -390.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0166015625, "rewards/margins": -0.0045166015625, "rewards/rejected": -0.01202392578125, "step": 1063 }, { "epoch": 0.8512, "grad_norm": 1.2928146146164001, "learning_rate": 3.297195121294022e-08, "logits/chosen": 0.734375, "logits/rejected": 0.75, "logps/chosen": -66.5, "logps/rejected": -62.25, "loss": 0.6901, "loss/demonstration_loss": -412.0, "loss/preference_loss": -410.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.006256103515625, "rewards/rejected": -0.0087890625, "step": 1064 }, { "epoch": 0.852, "grad_norm": 1.2841087282470227, "learning_rate": 3.262626762369525e-08, "logits/chosen": 0.63671875, "logits/rejected": 0.66015625, "logps/chosen": -22.25, "logps/rejected": -18.0, "loss": 0.692, "loss/demonstration_loss": -320.0, "loss/preference_loss": -324.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.00469970703125, "rewards/rejected": 0.0028076171875, "step": 1065 }, { "epoch": 0.8528, "grad_norm": 1.4552084470157933, "learning_rate": 3.2282279158481165e-08, "logits/chosen": 0.83203125, "logits/rejected": 0.6328125, "logps/chosen": -38.75, "logps/rejected": -68.5, "loss": 0.6908, "loss/demonstration_loss": -342.0, "loss/preference_loss": -342.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.001556396484375, "rewards/rejected": -0.0078125, "step": 1066 }, { "epoch": 0.8536, "grad_norm": 1.9244757855535262, "learning_rate": 3.1939988499789074e-08, "logits/chosen": 0.58984375, "logits/rejected": 0.75, "logps/chosen": -113.5, "logps/rejected": -103.0, "loss": 0.6882, "loss/demonstration_loss": -576.0, "loss/preference_loss": -568.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01190185546875, "rewards/margins": 0.01806640625, "rewards/rejected": -0.030029296875, "step": 1067 }, { "epoch": 0.8544, "grad_norm": 0.9721300865731134, "learning_rate": 3.159939831687033e-08, "logits/chosen": 0.97265625, "logits/rejected": 0.96484375, "logps/chosen": -20.375, "logps/rejected": -29.375, "loss": 0.6912, "loss/demonstration_loss": -398.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0031280517578125, "step": 1068 }, { "epoch": 0.8552, "grad_norm": 2.3236996001482595, "learning_rate": 3.12605112657156e-08, "logits/chosen": 0.70703125, "logits/rejected": 0.66796875, "logps/chosen": -127.0, "logps/rejected": -120.0, "loss": 0.6935, "loss/demonstration_loss": -490.0, "loss/preference_loss": -488.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.015625, "rewards/margins": 0.005615234375, "rewards/rejected": -0.021240234375, "step": 1069 }, { "epoch": 0.856, "grad_norm": 1.1435737092372544, "learning_rate": 3.092332998903416e-08, "logits/chosen": 0.416015625, "logits/rejected": 0.455078125, "logps/chosen": -51.25, "logps/rejected": -51.0, "loss": 0.6906, "loss/demonstration_loss": -540.0, "loss/preference_loss": -544.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0078125, "rewards/margins": -0.001556396484375, "rewards/rejected": -0.006256103515625, "step": 1070 }, { "epoch": 0.8568, "grad_norm": 3.341191520606165, "learning_rate": 3.058785711623327e-08, "logits/chosen": 0.8359375, "logits/rejected": 0.78125, "logps/chosen": -27.25, "logps/rejected": -26.75, "loss": 0.6899, "loss/demonstration_loss": -432.0, "loss/preference_loss": -426.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.001556396484375, "rewards/margins": 0.0078125, "rewards/rejected": -0.006256103515625, "step": 1071 }, { "epoch": 0.8576, "grad_norm": 0.5721562584247167, "learning_rate": 3.025409526339792e-08, "logits/chosen": 0.765625, "logits/rejected": 0.7578125, "logps/chosen": -35.5, "logps/rejected": -35.5, "loss": 0.6914, "loss/demonstration_loss": -374.0, "loss/preference_loss": -374.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00909423828125, "rewards/margins": 0.00030517578125, "rewards/rejected": -0.0093994140625, "step": 1072 }, { "epoch": 0.8584, "grad_norm": 1.4788495932846006, "learning_rate": 2.9922047033269945e-08, "logits/chosen": 0.68359375, "logits/rejected": 0.6796875, "logps/chosen": -18.5, "logps/rejected": -18.375, "loss": 0.6917, "loss/demonstration_loss": -588.0, "loss/preference_loss": -592.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.001251220703125, "rewards/margins": -0.001251220703125, "rewards/rejected": 0.0, "step": 1073 }, { "epoch": 0.8592, "grad_norm": 0.6675271263685103, "learning_rate": 2.959171501522828e-08, "logits/chosen": 0.5859375, "logits/rejected": 0.65625, "logps/chosen": -14.375, "logps/rejected": -14.25, "loss": 0.6918, "loss/demonstration_loss": -227.0, "loss/preference_loss": -227.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0018768310546875, "step": 1074 }, { "epoch": 0.86, "grad_norm": 1.4375998383932613, "learning_rate": 2.9263101785268252e-08, "logits/chosen": 0.75, "logits/rejected": 0.5078125, "logps/chosen": -52.75, "logps/rejected": -78.0, "loss": 0.6907, "loss/demonstration_loss": -520.0, "loss/preference_loss": -520.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.010009765625, "step": 1075 }, { "epoch": 0.8608, "grad_norm": 2.043812785564522, "learning_rate": 2.8936209905981916e-08, "logits/chosen": 0.57421875, "logits/rejected": 0.66015625, "logps/chosen": -69.0, "logps/rejected": -73.0, "loss": 0.6918, "loss/demonstration_loss": -378.0, "loss/preference_loss": -376.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0028076171875, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.003448486328125, "step": 1076 }, { "epoch": 0.8616, "grad_norm": 1.2308090333660802, "learning_rate": 2.8611041926537793e-08, "logits/chosen": 0.5546875, "logits/rejected": 0.515625, "logps/chosen": -40.25, "logps/rejected": -56.5, "loss": 0.6897, "loss/demonstration_loss": -382.0, "loss/preference_loss": -382.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.010009765625, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.012451171875, "step": 1077 }, { "epoch": 0.8624, "grad_norm": 1.0119061487139949, "learning_rate": 2.8287600382661037e-08, "logits/chosen": 0.6015625, "logits/rejected": 0.55078125, "logps/chosen": -53.25, "logps/rejected": -55.0, "loss": 0.6906, "loss/demonstration_loss": -430.0, "loss/preference_loss": -432.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.004669189453125, "rewards/rejected": -0.00469970703125, "step": 1078 }, { "epoch": 0.8632, "grad_norm": 1.1388162110372368, "learning_rate": 2.796588779661388e-08, "logits/chosen": 0.8984375, "logits/rejected": 0.84375, "logps/chosen": -7.59375, "logps/rejected": -11.4375, "loss": 0.6897, "loss/demonstration_loss": -304.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00031280517578125, "rewards/margins": 0.0028076171875, "rewards/rejected": -0.00250244140625, "step": 1079 }, { "epoch": 0.864, "grad_norm": 12.010774016876995, "learning_rate": 2.764590667717562e-08, "logits/chosen": 0.71484375, "logits/rejected": 0.71484375, "logps/chosen": -65.0, "logps/rejected": -59.0, "loss": 0.6945, "loss/demonstration_loss": -326.0, "loss/preference_loss": -328.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.00482177734375, "rewards/rejected": -0.00799560546875, "step": 1080 }, { "epoch": 0.8648, "grad_norm": 1.044322524896567, "learning_rate": 2.732765951962335e-08, "logits/chosen": 0.859375, "logits/rejected": 0.80859375, "logps/chosen": -23.0, "logps/rejected": -30.5, "loss": 0.693, "loss/demonstration_loss": -284.0, "loss/preference_loss": -286.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.005615234375, "rewards/rejected": 0.0006256103515625, "step": 1081 }, { "epoch": 0.8656, "grad_norm": 1.9125454888551825, "learning_rate": 2.7011148805712315e-08, "logits/chosen": 0.640625, "logits/rejected": 0.466796875, "logps/chosen": -40.5, "logps/rejected": -68.5, "loss": 0.6885, "loss/demonstration_loss": -348.0, "loss/preference_loss": -342.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.000774383544921875, "rewards/margins": 0.021728515625, "rewards/rejected": -0.0224609375, "step": 1082 }, { "epoch": 0.8664, "grad_norm": 1.557150200664872, "learning_rate": 2.6696377003656652e-08, "logits/chosen": 0.515625, "logits/rejected": 0.7109375, "logps/chosen": -44.5, "logps/rejected": -24.125, "loss": 0.6932, "loss/demonstration_loss": -540.0, "loss/preference_loss": -552.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.010986328125, "rewards/rejected": 0.003448486328125, "step": 1083 }, { "epoch": 0.8672, "grad_norm": 1.3761099687870624, "learning_rate": 2.638334656811006e-08, "logits/chosen": 0.6171875, "logits/rejected": 0.63671875, "logps/chosen": -25.5, "logps/rejected": -25.5, "loss": 0.6885, "loss/demonstration_loss": -404.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005157470703125, "rewards/margins": 0.00031280517578125, "rewards/rejected": -0.005462646484375, "step": 1084 }, { "epoch": 0.868, "grad_norm": 1.142356260586737, "learning_rate": 2.6072059940146772e-08, "logits/chosen": 0.80078125, "logits/rejected": 0.9296875, "logps/chosen": -65.5, "logps/rejected": -66.0, "loss": 0.6902, "loss/demonstration_loss": -420.0, "loss/preference_loss": -418.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.002655029296875, "rewards/margins": 0.00799560546875, "rewards/rejected": -0.005340576171875, "step": 1085 }, { "epoch": 0.8688, "grad_norm": 1.109821159399735, "learning_rate": 2.5762519547242513e-08, "logits/chosen": 0.81640625, "logits/rejected": 1.0, "logps/chosen": -27.5, "logps/rejected": -9.5, "loss": 0.6909, "loss/demonstration_loss": -296.0, "loss/preference_loss": -294.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.00250244140625, "step": 1086 }, { "epoch": 0.8696, "grad_norm": 1.0514475784692854, "learning_rate": 2.545472780325536e-08, "logits/chosen": 0.427734375, "logits/rejected": 0.5703125, "logps/chosen": -40.5, "logps/rejected": -30.125, "loss": 0.6932, "loss/demonstration_loss": -372.0, "loss/preference_loss": -376.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00982666015625, "rewards/margins": -0.0089111328125, "rewards/rejected": -0.00093841552734375, "step": 1087 }, { "epoch": 0.8704, "grad_norm": 1.1463918128621566, "learning_rate": 2.514868710840723e-08, "logits/chosen": 0.61328125, "logits/rejected": 0.59765625, "logps/chosen": -20.75, "logps/rejected": -23.125, "loss": 0.6891, "loss/demonstration_loss": -236.0, "loss/preference_loss": -233.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0035858154296875, "rewards/margins": 0.006103515625, "rewards/rejected": -0.00250244140625, "step": 1088 }, { "epoch": 0.8712, "grad_norm": 0.9149385915381125, "learning_rate": 2.4844399849264924e-08, "logits/chosen": 0.76953125, "logits/rejected": 0.87890625, "logps/chosen": -34.0, "logps/rejected": -29.5, "loss": 0.693, "loss/demonstration_loss": -504.0, "loss/preference_loss": -508.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.00531005859375, "rewards/rejected": -0.00093841552734375, "step": 1089 }, { "epoch": 0.872, "grad_norm": 1.0451396744112715, "learning_rate": 2.4541868398721576e-08, "logits/chosen": 0.49609375, "logits/rejected": 0.482421875, "logps/chosen": -33.0, "logps/rejected": -32.0, "loss": 0.6912, "loss/demonstration_loss": -510.0, "loss/preference_loss": -520.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.010009765625, "rewards/rejected": -0.0006256103515625, "step": 1090 }, { "epoch": 0.8728, "grad_norm": 1.7172934703885334, "learning_rate": 2.4241095115978215e-08, "logits/chosen": 0.88671875, "logits/rejected": 0.953125, "logps/chosen": -37.25, "logps/rejected": -32.75, "loss": 0.6906, "loss/demonstration_loss": -280.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00030517578125, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.002197265625, "step": 1091 }, { "epoch": 0.8736, "grad_norm": 1.7662142609509188, "learning_rate": 2.3942082346525338e-08, "logits/chosen": 0.7734375, "logits/rejected": 0.92578125, "logps/chosen": -79.5, "logps/rejected": -79.5, "loss": 0.6918, "loss/demonstration_loss": -420.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.006256103515625, "rewards/rejected": -0.00689697265625, "step": 1092 }, { "epoch": 0.8744, "grad_norm": 1.1983507569849365, "learning_rate": 2.364483242212456e-08, "logits/chosen": 1.0546875, "logits/rejected": 0.99609375, "logps/chosen": -38.0, "logps/rejected": -50.0, "loss": 0.6926, "loss/demonstration_loss": -460.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.015625, "rewards/margins": -0.000274658203125, "rewards/rejected": -0.01531982421875, "step": 1093 }, { "epoch": 0.8752, "grad_norm": 1.0019058412458557, "learning_rate": 2.334934766079058e-08, "logits/chosen": 0.55859375, "logits/rejected": 0.51171875, "logps/chosen": -17.375, "logps/rejected": -24.75, "loss": 0.6896, "loss/demonstration_loss": -223.0, "loss/preference_loss": -223.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.002655029296875, "rewards/margins": 0.000152587890625, "rewards/rejected": -0.0028076171875, "step": 1094 }, { "epoch": 0.876, "grad_norm": 9.21195194221327, "learning_rate": 2.3055630366772856e-08, "logits/chosen": 0.57421875, "logits/rejected": 0.6640625, "logps/chosen": -76.0, "logps/rejected": -62.75, "loss": 0.696, "loss/demonstration_loss": -440.0, "loss/preference_loss": -442.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00872802734375, "rewards/margins": -0.00156402587890625, "rewards/rejected": -0.0072021484375, "step": 1095 }, { "epoch": 0.8768, "grad_norm": 1.0587832527683882, "learning_rate": 2.276368283053781e-08, "logits/chosen": 0.65625, "logits/rejected": 0.640625, "logps/chosen": -19.25, "logps/rejected": -18.875, "loss": 0.6931, "loss/demonstration_loss": -202.0, "loss/preference_loss": -204.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.003448486328125, "rewards/rejected": 0.00031280517578125, "step": 1096 }, { "epoch": 0.8776, "grad_norm": 0.7314929507527174, "learning_rate": 2.2473507328751084e-08, "logits/chosen": 0.55859375, "logits/rejected": 0.5546875, "logps/chosen": -5.21875, "logps/rejected": -4.84375, "loss": 0.691, "loss/demonstration_loss": -156.0, "loss/preference_loss": -160.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0028076171875, "rewards/margins": -0.00186920166015625, "rewards/rejected": -0.00093841552734375, "step": 1097 }, { "epoch": 0.8784, "grad_norm": 1.3576990833338212, "learning_rate": 2.2185106124259446e-08, "logits/chosen": 0.74609375, "logits/rejected": 0.65234375, "logps/chosen": -64.5, "logps/rejected": -75.5, "loss": 0.6902, "loss/demonstration_loss": -444.0, "loss/preference_loss": -444.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0137939453125, "rewards/margins": 0.00311279296875, "rewards/rejected": -0.016845703125, "step": 1098 }, { "epoch": 0.8792, "grad_norm": 1.2798079253775632, "learning_rate": 2.1898481466073483e-08, "logits/chosen": 0.7578125, "logits/rejected": 0.75, "logps/chosen": -33.5, "logps/rejected": -33.5, "loss": 0.6915, "loss/demonstration_loss": -354.0, "loss/preference_loss": -354.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": 0.001556396484375, "rewards/rejected": -0.0072021484375, "step": 1099 }, { "epoch": 0.88, "grad_norm": 1.5393427283037606, "learning_rate": 2.1613635589349756e-08, "logits/chosen": 0.5703125, "logits/rejected": 0.59765625, "logps/chosen": -50.0, "logps/rejected": -50.0, "loss": 0.6926, "loss/demonstration_loss": -792.0, "loss/preference_loss": -792.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006256103515625, "rewards/margins": 0.0, "rewards/rejected": -0.006256103515625, "step": 1100 }, { "epoch": 0.8808, "grad_norm": 1.6123158125747867, "learning_rate": 2.1330570715373752e-08, "logits/chosen": 0.65234375, "logits/rejected": 0.61328125, "logps/chosen": -27.625, "logps/rejected": -33.5, "loss": 0.6913, "loss/demonstration_loss": -324.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.004730224609375, "rewards/rejected": -0.0078125, "step": 1101 }, { "epoch": 0.8816, "grad_norm": 1.8040001618110257, "learning_rate": 2.1049289051542185e-08, "logits/chosen": 0.7265625, "logits/rejected": 0.7109375, "logps/chosen": -67.0, "logps/rejected": -65.5, "loss": 0.6952, "loss/demonstration_loss": -422.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.005615234375, "rewards/rejected": -0.004058837890625, "step": 1102 }, { "epoch": 0.8824, "grad_norm": 1.3939190564511215, "learning_rate": 2.076979279134594e-08, "logits/chosen": 0.6171875, "logits/rejected": 0.5546875, "logps/chosen": -50.5, "logps/rejected": -63.25, "loss": 0.692, "loss/demonstration_loss": -446.0, "loss/preference_loss": -450.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0206298828125, "rewards/margins": -0.00714111328125, "rewards/rejected": -0.013427734375, "step": 1103 }, { "epoch": 0.8832, "grad_norm": 1.1612925429776122, "learning_rate": 2.0492084114352964e-08, "logits/chosen": 0.859375, "logits/rejected": 0.859375, "logps/chosen": -21.25, "logps/rejected": -16.25, "loss": 0.6938, "loss/demonstration_loss": -592.0, "loss/preference_loss": -596.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0031280517578125, "step": 1104 }, { "epoch": 0.884, "grad_norm": 1.4687836222952555, "learning_rate": 2.0216165186191404e-08, "logits/chosen": 0.75390625, "logits/rejected": 0.69921875, "logps/chosen": -7.375, "logps/rejected": -24.75, "loss": 0.6904, "loss/demonstration_loss": -256.0, "loss/preference_loss": -256.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.000782012939453125, "rewards/margins": 0.000782012939453125, "rewards/rejected": -0.00156402587890625, "step": 1105 }, { "epoch": 0.8848, "grad_norm": 1.8318707803214023, "learning_rate": 1.9942038158532403e-08, "logits/chosen": 0.78515625, "logits/rejected": 1.0234375, "logps/chosen": -56.0, "logps/rejected": -41.25, "loss": 0.6924, "loss/demonstration_loss": -384.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01080322265625, "rewards/margins": -0.00830078125, "rewards/rejected": -0.00250244140625, "step": 1106 }, { "epoch": 0.8856, "grad_norm": 0.9410620636173742, "learning_rate": 1.966970516907368e-08, "logits/chosen": 0.8203125, "logits/rejected": 0.7109375, "logps/chosen": -52.0, "logps/rejected": -63.75, "loss": 0.6887, "loss/demonstration_loss": -464.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.00030517578125, "rewards/margins": 0.01153564453125, "rewards/rejected": -0.01123046875, "step": 1107 }, { "epoch": 0.8864, "grad_norm": 1.3541444351281653, "learning_rate": 1.9399168341522527e-08, "logits/chosen": 0.94921875, "logits/rejected": 0.81640625, "logps/chosen": -7.875, "logps/rejected": -15.6875, "loss": 0.6899, "loss/demonstration_loss": -380.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.001251220703125, "rewards/margins": 0.00439453125, "rewards/rejected": -0.0031280517578125, "step": 1108 }, { "epoch": 0.8872, "grad_norm": 74.0187705363922, "learning_rate": 1.9130429785579437e-08, "logits/chosen": 0.74609375, "logits/rejected": 0.8203125, "logps/chosen": -39.5, "logps/rejected": -36.25, "loss": 0.6957, "loss/demonstration_loss": -396.0, "loss/preference_loss": -398.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0159912109375, "rewards/margins": -0.006866455078125, "rewards/rejected": -0.00909423828125, "step": 1109 }, { "epoch": 0.888, "grad_norm": 1.7946920641295423, "learning_rate": 1.8863491596921743e-08, "logits/chosen": 0.7734375, "logits/rejected": 0.79296875, "logps/chosen": -28.375, "logps/rejected": -22.875, "loss": 0.6941, "loss/demonstration_loss": -400.0, "loss/preference_loss": -406.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01092529296875, "rewards/margins": -0.0045166015625, "rewards/rejected": -0.00640869140625, "step": 1110 }, { "epoch": 0.8888, "grad_norm": 1.4718791731113439, "learning_rate": 1.859835585718697e-08, "logits/chosen": 0.87890625, "logits/rejected": 0.6953125, "logps/chosen": -32.75, "logps/rejected": -47.25, "loss": 0.6917, "loss/demonstration_loss": -318.0, "loss/preference_loss": -316.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005615234375, "rewards/margins": 0.001556396484375, "rewards/rejected": -0.0072021484375, "step": 1111 }, { "epoch": 0.8896, "grad_norm": 1.6974493058297315, "learning_rate": 1.8335024633956975e-08, "logits/chosen": 0.4453125, "logits/rejected": 0.57421875, "logps/chosen": -56.75, "logps/rejected": -47.75, "loss": 0.6968, "loss/demonstration_loss": -334.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.002349853515625, "rewards/margins": 0.0032806396484375, "rewards/rejected": -0.005615234375, "step": 1112 }, { "epoch": 0.8904, "grad_norm": 1.4617941376789287, "learning_rate": 1.8073499980741425e-08, "logits/chosen": 0.71484375, "logits/rejected": 0.490234375, "logps/chosen": -39.75, "logps/rejected": -57.25, "loss": 0.6938, "loss/demonstration_loss": -512.0, "loss/preference_loss": -516.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0084228515625, "rewards/margins": -0.00531005859375, "rewards/rejected": -0.0031280517578125, "step": 1113 }, { "epoch": 0.8912, "grad_norm": 1.1324137260489795, "learning_rate": 1.7813783936962257e-08, "logits/chosen": 0.5859375, "logits/rejected": 0.578125, "logps/chosen": -69.0, "logps/rejected": -75.5, "loss": 0.6946, "loss/demonstration_loss": -458.0, "loss/preference_loss": -460.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.014404296875, "rewards/margins": -0.01123046875, "rewards/rejected": -0.0031280517578125, "step": 1114 }, { "epoch": 0.892, "grad_norm": 1.3035746436200473, "learning_rate": 1.7555878527937163e-08, "logits/chosen": 0.50390625, "logits/rejected": 0.59375, "logps/chosen": -58.5, "logps/rejected": -47.0, "loss": 0.6903, "loss/demonstration_loss": -418.0, "loss/preference_loss": -418.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.001708984375, "rewards/rejected": -0.00982666015625, "step": 1115 }, { "epoch": 0.8928, "grad_norm": 1.623495629802976, "learning_rate": 1.7299785764864432e-08, "logits/chosen": 0.91796875, "logits/rejected": 0.875, "logps/chosen": -24.25, "logps/rejected": -33.75, "loss": 0.6929, "loss/demonstration_loss": -466.0, "loss/preference_loss": -462.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.000469207763671875, "rewards/margins": 0.00421142578125, "rewards/rejected": -0.003753662109375, "step": 1116 }, { "epoch": 0.8936, "grad_norm": 2.2607248662698534, "learning_rate": 1.704550764480689e-08, "logits/chosen": 0.78125, "logits/rejected": 0.765625, "logps/chosen": -53.25, "logps/rejected": -59.25, "loss": 0.6895, "loss/demonstration_loss": -256.0, "loss/preference_loss": -256.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.006072998046875, "rewards/margins": 0.00031280517578125, "rewards/rejected": -0.00640869140625, "step": 1117 }, { "epoch": 0.8944, "grad_norm": 1.9140870139944237, "learning_rate": 1.6793046150676337e-08, "logits/chosen": 0.58984375, "logits/rejected": 0.515625, "logps/chosen": -69.0, "logps/rejected": -69.5, "loss": 0.6899, "loss/demonstration_loss": -370.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.0, "rewards/margins": 0.01220703125, "rewards/rejected": -0.01220703125, "step": 1118 }, { "epoch": 0.8952, "grad_norm": 1.472700888629391, "learning_rate": 1.6542403251218305e-08, "logits/chosen": 0.8125, "logits/rejected": 1.0703125, "logps/chosen": -84.5, "logps/rejected": -65.5, "loss": 0.6921, "loss/demonstration_loss": -340.0, "loss/preference_loss": -342.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0128173828125, "rewards/margins": -0.00640869140625, "rewards/rejected": -0.00640869140625, "step": 1119 }, { "epoch": 0.896, "grad_norm": 1.7947946916790058, "learning_rate": 1.629358090099639e-08, "logits/chosen": 0.6796875, "logits/rejected": 0.71875, "logps/chosen": -88.0, "logps/rejected": -87.5, "loss": 0.6938, "loss/demonstration_loss": -696.0, "loss/preference_loss": -696.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01806640625, "rewards/margins": -0.006866455078125, "rewards/rejected": -0.01123046875, "step": 1120 }, { "epoch": 0.8968, "grad_norm": 0.6503825163638228, "learning_rate": 1.6046581040377316e-08, "logits/chosen": 0.57421875, "logits/rejected": 0.625, "logps/chosen": -29.75, "logps/rejected": -26.0, "loss": 0.6927, "loss/demonstration_loss": -292.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0093994140625, "rewards/margins": -0.005615234375, "rewards/rejected": -0.003753662109375, "step": 1121 }, { "epoch": 0.8976, "grad_norm": 1.3888002536456194, "learning_rate": 1.5801405595515543e-08, "logits/chosen": 0.6328125, "logits/rejected": 0.6015625, "logps/chosen": -62.0, "logps/rejected": -68.0, "loss": 0.6932, "loss/demonstration_loss": -346.0, "loss/preference_loss": -346.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.004547119140625, "rewards/margins": -0.0007781982421875, "rewards/rejected": -0.003753662109375, "step": 1122 }, { "epoch": 0.8984, "grad_norm": 2.0362224614258686, "learning_rate": 1.555805647833852e-08, "logits/chosen": 0.609375, "logits/rejected": 0.81640625, "logps/chosen": -39.25, "logps/rejected": -18.75, "loss": 0.6915, "loss/demonstration_loss": -310.0, "loss/preference_loss": -306.0, "rewards/accuracies": 0.125, "rewards/chosen": -1.52587890625e-05, "rewards/margins": 0.00531005859375, "rewards/rejected": -0.00531005859375, "step": 1123 }, { "epoch": 0.8992, "grad_norm": 1.6286450173655833, "learning_rate": 1.5316535586531483e-08, "logits/chosen": 0.5078125, "logits/rejected": 0.5390625, "logps/chosen": -28.25, "logps/rejected": -32.25, "loss": 0.6934, "loss/demonstration_loss": -318.0, "loss/preference_loss": -320.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00689697265625, "rewards/margins": -0.00095367431640625, "rewards/rejected": -0.00592041015625, "step": 1124 }, { "epoch": 0.9, "grad_norm": 1.4793966500955311, "learning_rate": 1.507684480352292e-08, "logits/chosen": 0.68359375, "logits/rejected": 0.87109375, "logps/chosen": -28.5, "logps/rejected": -8.75, "loss": 0.6923, "loss/demonstration_loss": -592.0, "loss/preference_loss": -600.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.001251220703125, "rewards/margins": -0.00250244140625, "rewards/rejected": 0.001251220703125, "step": 1125 }, { "epoch": 0.9008, "grad_norm": 2.078661430544348, "learning_rate": 1.4838985998469638e-08, "logits/chosen": 0.828125, "logits/rejected": 0.8046875, "logps/chosen": -63.25, "logps/rejected": -54.25, "loss": 0.6932, "loss/demonstration_loss": -372.0, "loss/preference_loss": -378.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.010009765625, "rewards/margins": -0.0106201171875, "rewards/rejected": 0.0006256103515625, "step": 1126 }, { "epoch": 0.9016, "grad_norm": 1.3582695590972977, "learning_rate": 1.4602961026242478e-08, "logits/chosen": 0.58203125, "logits/rejected": 0.59375, "logps/chosen": -18.5, "logps/rejected": -17.875, "loss": 0.6965, "loss/demonstration_loss": -288.0, "loss/preference_loss": -292.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0028076171875, "rewards/margins": -0.005615234375, "rewards/rejected": 0.0028076171875, "step": 1127 }, { "epoch": 0.9024, "grad_norm": 1.2901601523121078, "learning_rate": 1.4368771727411494e-08, "logits/chosen": 0.609375, "logits/rejected": 0.625, "logps/chosen": -39.5, "logps/rejected": -39.25, "loss": 0.6923, "loss/demonstration_loss": -416.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.006256103515625, "step": 1128 }, { "epoch": 0.9032, "grad_norm": 1.1429833672189331, "learning_rate": 1.4136419928231891e-08, "logits/chosen": 0.6484375, "logits/rejected": 0.52734375, "logps/chosen": -71.0, "logps/rejected": -87.5, "loss": 0.6866, "loss/demonstration_loss": -632.0, "loss/preference_loss": -628.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.003753662109375, "rewards/margins": 0.0150146484375, "rewards/rejected": -0.018798828125, "step": 1129 }, { "epoch": 0.904, "grad_norm": 1.0323307839175135, "learning_rate": 1.390590744062975e-08, "logits/chosen": 0.6171875, "logits/rejected": 0.6640625, "logps/chosen": -35.75, "logps/rejected": -39.5, "loss": 0.6917, "loss/demonstration_loss": -300.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.004669189453125, "rewards/margins": 2.288818359375e-05, "rewards/rejected": -0.00469970703125, "step": 1130 }, { "epoch": 0.9048, "grad_norm": 1.0258331558630676, "learning_rate": 1.3677236062187653e-08, "logits/chosen": 0.734375, "logits/rejected": 0.74609375, "logps/chosen": -23.75, "logps/rejected": -23.75, "loss": 0.6915, "loss/demonstration_loss": -384.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.002349853515625, "rewards/margins": 0.00360107421875, "rewards/rejected": -0.001251220703125, "step": 1131 }, { "epoch": 0.9056, "grad_norm": 1.653359338583155, "learning_rate": 1.3450407576131029e-08, "logits/chosen": 0.59375, "logits/rejected": 0.578125, "logps/chosen": -2.59375, "logps/rejected": -4.4375, "loss": 0.6876, "loss/demonstration_loss": -111.5, "loss/preference_loss": -107.5, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.00250244140625, "rewards/rejected": -0.0031280517578125, "step": 1132 }, { "epoch": 0.9064, "grad_norm": 0.9675260630365345, "learning_rate": 1.3225423751313941e-08, "logits/chosen": 0.50390625, "logits/rejected": 0.494140625, "logps/chosen": -46.5, "logps/rejected": -47.0, "loss": 0.6893, "loss/demonstration_loss": -748.0, "loss/preference_loss": -740.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.006256103515625, "rewards/rejected": -0.0087890625, "step": 1133 }, { "epoch": 0.9072, "grad_norm": 1.7580081612923972, "learning_rate": 1.300228634220546e-08, "logits/chosen": 0.7421875, "logits/rejected": 0.7890625, "logps/chosen": -50.0, "logps/rejected": -45.5, "loss": 0.6929, "loss/demonstration_loss": -380.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006866455078125, "rewards/margins": -0.00061798095703125, "rewards/rejected": -0.006256103515625, "step": 1134 }, { "epoch": 0.908, "grad_norm": 1.3019204526744688, "learning_rate": 1.2780997088875866e-08, "logits/chosen": 0.828125, "logits/rejected": 0.765625, "logps/chosen": -49.25, "logps/rejected": -59.0, "loss": 0.6908, "loss/demonstration_loss": -432.0, "loss/preference_loss": -430.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00439453125, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.005950927734375, "step": 1135 }, { "epoch": 0.9088, "grad_norm": 1.3349142913521606, "learning_rate": 1.2561557716983307e-08, "logits/chosen": 0.96875, "logits/rejected": 0.625, "logps/chosen": -48.0, "logps/rejected": -83.0, "loss": 0.6853, "loss/demonstration_loss": -416.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.006561279296875, "rewards/margins": 0.0172119140625, "rewards/rejected": -0.023681640625, "step": 1136 }, { "epoch": 0.9096, "grad_norm": 1.1916094099641847, "learning_rate": 1.2343969937759991e-08, "logits/chosen": 0.6328125, "logits/rejected": 0.5078125, "logps/chosen": -11.625, "logps/rejected": -18.125, "loss": 0.689, "loss/demonstration_loss": -241.0, "loss/preference_loss": -232.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.003753662109375, "rewards/margins": 0.0118408203125, "rewards/rejected": -0.00811767578125, "step": 1137 }, { "epoch": 0.9104, "grad_norm": 1.8282455707619507, "learning_rate": 1.212823544799918e-08, "logits/chosen": 0.875, "logits/rejected": 1.1171875, "logps/chosen": -88.5, "logps/rejected": -67.5, "loss": 0.6941, "loss/demonstration_loss": -410.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0230712890625, "rewards/margins": -0.013427734375, "rewards/rejected": -0.00970458984375, "step": 1138 }, { "epoch": 0.9112, "grad_norm": 1.2469762925941543, "learning_rate": 1.1914355930041836e-08, "logits/chosen": 0.734375, "logits/rejected": 0.671875, "logps/chosen": -33.25, "logps/rejected": -41.5, "loss": 0.6901, "loss/demonstration_loss": -396.0, "loss/preference_loss": -392.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.004852294921875, "rewards/margins": 0.0076904296875, "rewards/rejected": -0.01251220703125, "step": 1139 }, { "epoch": 0.912, "grad_norm": 1.1522272231949893, "learning_rate": 1.1702333051763268e-08, "logits/chosen": 0.8046875, "logits/rejected": 0.90234375, "logps/chosen": -42.75, "logps/rejected": -31.875, "loss": 0.6925, "loss/demonstration_loss": -296.0, "loss/preference_loss": -296.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0067138671875, "rewards/margins": -0.00077056884765625, "rewards/rejected": -0.005950927734375, "step": 1140 }, { "epoch": 0.9128, "grad_norm": 1.7496937222332365, "learning_rate": 1.1492168466560538e-08, "logits/chosen": 0.474609375, "logits/rejected": 0.455078125, "logps/chosen": -8.3125, "logps/rejected": -8.125, "loss": 0.691, "loss/demonstration_loss": -260.0, "loss/preference_loss": -262.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0018768310546875, "rewards/rejected": -0.0006256103515625, "step": 1141 }, { "epoch": 0.9136, "grad_norm": 0.8838797761226832, "learning_rate": 1.1283863813339262e-08, "logits/chosen": 0.8125, "logits/rejected": 0.7734375, "logps/chosen": -5.46875, "logps/rejected": -11.5625, "loss": 0.6934, "loss/demonstration_loss": -136.0, "loss/preference_loss": -137.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00109100341796875, "rewards/margins": -0.00140380859375, "rewards/rejected": 0.00031280517578125, "step": 1142 }, { "epoch": 0.9144, "grad_norm": 1.3594698900094921, "learning_rate": 1.107742071650103e-08, "logits/chosen": 0.98046875, "logits/rejected": 0.78125, "logps/chosen": -41.5, "logps/rejected": -56.25, "loss": 0.6885, "loss/demonstration_loss": -258.0, "loss/preference_loss": -256.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01141357421875, "rewards/margins": 0.00640869140625, "rewards/rejected": -0.017822265625, "step": 1143 }, { "epoch": 0.9152, "grad_norm": 0.6206807590015176, "learning_rate": 1.087284078593051e-08, "logits/chosen": 0.66796875, "logits/rejected": 0.68359375, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6903, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1144 }, { "epoch": 0.916, "grad_norm": 1.5887854182799697, "learning_rate": 1.0670125616983189e-08, "logits/chosen": 0.7109375, "logits/rejected": 0.671875, "logps/chosen": -51.5, "logps/rejected": -63.0, "loss": 0.6892, "loss/demonstration_loss": -608.0, "loss/preference_loss": -604.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00250244140625, "rewards/margins": 0.01123046875, "rewards/rejected": -0.01373291015625, "step": 1145 }, { "epoch": 0.9168, "grad_norm": 1.2764561837978137, "learning_rate": 1.0469276790472603e-08, "logits/chosen": 0.80859375, "logits/rejected": 0.80078125, "logps/chosen": -51.5, "logps/rejected": -48.0, "loss": 0.6917, "loss/demonstration_loss": -314.0, "loss/preference_loss": -316.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.01470947265625, "rewards/margins": -0.00689697265625, "rewards/rejected": -0.0078125, "step": 1146 }, { "epoch": 0.9176, "grad_norm": 1.9203697376763125, "learning_rate": 1.0270295872658263e-08, "logits/chosen": 0.6328125, "logits/rejected": 0.5078125, "logps/chosen": -95.0, "logps/rejected": -109.5, "loss": 0.6901, "loss/demonstration_loss": -544.0, "loss/preference_loss": -540.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0106201171875, "rewards/margins": 0.0072021484375, "rewards/rejected": -0.017822265625, "step": 1147 }, { "epoch": 0.9184, "grad_norm": 1.2973226424045257, "learning_rate": 1.0073184415233333e-08, "logits/chosen": 1.046875, "logits/rejected": 0.89453125, "logps/chosen": -41.5, "logps/rejected": -52.75, "loss": 0.6913, "loss/demonstration_loss": -374.0, "loss/preference_loss": -374.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.006866455078125, "rewards/margins": 0.002532958984375, "rewards/rejected": -0.0093994140625, "step": 1148 }, { "epoch": 0.9192, "grad_norm": 0.8934127191764434, "learning_rate": 9.877943955312551e-09, "logits/chosen": 0.8125, "logits/rejected": 0.90625, "logps/chosen": -36.0, "logps/rejected": -23.0, "loss": 0.6923, "loss/demonstration_loss": -312.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006103515625, "rewards/margins": -0.00140380859375, "rewards/rejected": -0.00469970703125, "step": 1149 }, { "epoch": 0.92, "grad_norm": 1.2530721549708124, "learning_rate": 9.684576015420275e-09, "logits/chosen": 0.5546875, "logits/rejected": 0.57421875, "logps/chosen": -12.125, "logps/rejected": -14.75, "loss": 0.6921, "loss/demonstration_loss": -418.0, "loss/preference_loss": -424.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.004364013671875, "step": 1150 }, { "epoch": 0.9208, "grad_norm": 1.2091383249554941, "learning_rate": 9.493082103478517e-09, "logits/chosen": 0.82421875, "logits/rejected": 1.078125, "logps/chosen": -45.75, "logps/rejected": -29.0, "loss": 0.6915, "loss/demonstration_loss": -300.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.000469207763671875, "rewards/margins": -0.001251220703125, "rewards/rejected": 0.0017242431640625, "step": 1151 }, { "epoch": 0.9216, "grad_norm": 1.7185223363597997, "learning_rate": 9.303463712795306e-09, "logits/chosen": 0.47265625, "logits/rejected": 0.53125, "logps/chosen": -47.75, "logps/rejected": -43.5, "loss": 0.693, "loss/demonstration_loss": -364.0, "loss/preference_loss": -362.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006561279296875, "rewards/margins": 0.00188446044921875, "rewards/rejected": -0.0084228515625, "step": 1152 }, { "epoch": 0.9224, "grad_norm": 1.27463644695769, "learning_rate": 9.115722322052876e-09, "logits/chosen": 0.5546875, "logits/rejected": 0.60546875, "logps/chosen": -72.5, "logps/rejected": -64.0, "loss": 0.6914, "loss/demonstration_loss": -544.0, "loss/preference_loss": -544.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.002655029296875, "rewards/rejected": -0.009521484375, "step": 1153 }, { "epoch": 0.9232, "grad_norm": 1.2677489712170062, "learning_rate": 8.929859395296363e-09, "logits/chosen": 0.9296875, "logits/rejected": 0.828125, "logps/chosen": -9.5, "logps/rejected": -17.0, "loss": 0.6921, "loss/demonstration_loss": -424.0, "loss/preference_loss": -426.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": -0.001251220703125, "rewards/rejected": 0.001251220703125, "step": 1154 }, { "epoch": 0.924, "grad_norm": 1.190570591928671, "learning_rate": 8.745876381922146e-09, "logits/chosen": 0.92578125, "logits/rejected": 0.90234375, "logps/chosen": -17.25, "logps/rejected": -15.625, "loss": 0.6896, "loss/demonstration_loss": -266.0, "loss/preference_loss": -260.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0045166015625, "rewards/margins": 0.00921630859375, "rewards/rejected": -0.00469970703125, "step": 1155 }, { "epoch": 0.9248, "grad_norm": 1.1343153892040287, "learning_rate": 8.563774716666777e-09, "logits/chosen": 0.97265625, "logits/rejected": 0.87890625, "logps/chosen": -37.5, "logps/rejected": -47.5, "loss": 0.6908, "loss/demonstration_loss": -450.0, "loss/preference_loss": -446.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.007476806640625, "rewards/rejected": -0.0150146484375, "step": 1156 }, { "epoch": 0.9256, "grad_norm": 1.10957816007386, "learning_rate": 8.383555819595601e-09, "logits/chosen": 0.59765625, "logits/rejected": 0.578125, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1157 }, { "epoch": 0.9264, "grad_norm": 1.4274774548274372, "learning_rate": 8.205221096091786e-09, "logits/chosen": 0.765625, "logits/rejected": 0.8203125, "logps/chosen": -35.75, "logps/rejected": -20.125, "loss": 0.6909, "loss/demonstration_loss": -888.0, "loss/preference_loss": -896.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.00250244140625, "rewards/rejected": 0.0, "step": 1158 }, { "epoch": 0.9272, "grad_norm": 1.829607066041713, "learning_rate": 8.028771936845341e-09, "logits/chosen": 0.7421875, "logits/rejected": 0.734375, "logps/chosen": -16.5, "logps/rejected": -19.25, "loss": 0.687, "loss/demonstration_loss": -288.0, "loss/preference_loss": -284.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.003753662109375, "step": 1159 }, { "epoch": 0.928, "grad_norm": 0.9688449832533774, "learning_rate": 7.85420971784223e-09, "logits/chosen": 0.58984375, "logits/rejected": 0.69140625, "logps/chosen": -44.0, "logps/rejected": -34.0, "loss": 0.6917, "loss/demonstration_loss": -416.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00031280517578125, "rewards/margins": 0.00469970703125, "rewards/rejected": -0.0050048828125, "step": 1160 }, { "epoch": 0.9288, "grad_norm": 1.0916230055883323, "learning_rate": 7.681535800353717e-09, "logits/chosen": 0.640625, "logits/rejected": 0.6015625, "logps/chosen": -34.0, "logps/rejected": -38.25, "loss": 0.6903, "loss/demonstration_loss": -572.0, "loss/preference_loss": -568.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00531005859375, "rewards/margins": 0.004364013671875, "rewards/rejected": -0.00970458984375, "step": 1161 }, { "epoch": 0.9296, "grad_norm": 1.9620982021636055, "learning_rate": 7.510751530925675e-09, "logits/chosen": 0.8984375, "logits/rejected": 0.953125, "logps/chosen": -73.0, "logps/rejected": -73.0, "loss": 0.6877, "loss/demonstration_loss": -332.0, "loss/preference_loss": -332.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.00360107421875, "rewards/margins": 0.00531005859375, "rewards/rejected": -0.0089111328125, "step": 1162 }, { "epoch": 0.9304, "grad_norm": 1.4173608806658746, "learning_rate": 7.341858241368182e-09, "logits/chosen": 0.51171875, "logits/rejected": 0.53515625, "logps/chosen": -78.0, "logps/rejected": -79.5, "loss": 0.691, "loss/demonstration_loss": -624.0, "loss/preference_loss": -624.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01190185546875, "rewards/margins": 0.000152587890625, "rewards/rejected": -0.01202392578125, "step": 1163 }, { "epoch": 0.9312, "grad_norm": 1.025777116200613, "learning_rate": 7.174857248745003e-09, "logits/chosen": 0.86328125, "logits/rejected": 0.6953125, "logps/chosen": -18.5, "logps/rejected": -32.75, "loss": 0.6917, "loss/demonstration_loss": -408.0, "loss/preference_loss": -408.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00311279296875, "rewards/margins": -0.0006103515625, "rewards/rejected": -0.00250244140625, "step": 1164 }, { "epoch": 0.932, "grad_norm": 1.5249923711870665, "learning_rate": 7.009749855363456e-09, "logits/chosen": 0.5625, "logits/rejected": 0.5390625, "logps/chosen": -41.5, "logps/rejected": -40.25, "loss": 0.6925, "loss/demonstration_loss": -258.0, "loss/preference_loss": -260.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.006256103515625, "rewards/rejected": -0.00439453125, "step": 1165 }, { "epoch": 0.9328, "grad_norm": 1.3156176394638273, "learning_rate": 6.846537348764114e-09, "logits/chosen": 0.59765625, "logits/rejected": 0.5625, "logps/chosen": -28.625, "logps/rejected": -40.0, "loss": 0.6888, "loss/demonstration_loss": -276.0, "loss/preference_loss": -274.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.006256103515625, "rewards/margins": 0.00592041015625, "rewards/rejected": 0.00030517578125, "step": 1166 }, { "epoch": 0.9336, "grad_norm": 1.51622658038377, "learning_rate": 6.685221001710955e-09, "logits/chosen": 0.97265625, "logits/rejected": 0.83203125, "logps/chosen": -54.25, "logps/rejected": -68.0, "loss": 0.6915, "loss/demonstration_loss": -488.0, "loss/preference_loss": -488.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.003448486328125, "rewards/margins": -0.000335693359375, "rewards/rejected": -0.0031280517578125, "step": 1167 }, { "epoch": 0.9344, "grad_norm": 1.5714471499039935, "learning_rate": 6.525802072181203e-09, "logits/chosen": 0.7578125, "logits/rejected": 0.80078125, "logps/chosen": -18.25, "logps/rejected": -15.75, "loss": 0.6959, "loss/demonstration_loss": -268.0, "loss/preference_loss": -270.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.004058837890625, "rewards/margins": -0.002197265625, "rewards/rejected": -0.0018768310546875, "step": 1168 }, { "epoch": 0.9352, "grad_norm": 2.4965458647488914, "learning_rate": 6.368281803355691e-09, "logits/chosen": 0.74609375, "logits/rejected": 0.796875, "logps/chosen": -42.0, "logps/rejected": -36.75, "loss": 0.6915, "loss/demonstration_loss": -312.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00872802734375, "rewards/margins": 0.0015411376953125, "rewards/rejected": -0.01031494140625, "step": 1169 }, { "epoch": 0.936, "grad_norm": 1.432374957333342, "learning_rate": 6.2126614236091834e-09, "logits/chosen": 0.7421875, "logits/rejected": 0.69140625, "logps/chosen": -18.625, "logps/rejected": -22.5, "loss": 0.6906, "loss/demonstration_loss": -324.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.006866455078125, "rewards/margins": 0.000946044921875, "rewards/rejected": -0.0078125, "step": 1170 }, { "epoch": 0.9368, "grad_norm": 1.6066865966750805, "learning_rate": 6.0589421465006286e-09, "logits/chosen": 0.703125, "logits/rejected": 0.73046875, "logps/chosen": -89.5, "logps/rejected": -83.5, "loss": 0.6945, "loss/demonstration_loss": -552.0, "loss/preference_loss": -556.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01220703125, "rewards/margins": -0.0128173828125, "rewards/rejected": 0.0006256103515625, "step": 1171 }, { "epoch": 0.9376, "grad_norm": 1.1681647040702836, "learning_rate": 5.907125170763805e-09, "logits/chosen": 0.47265625, "logits/rejected": 0.55078125, "logps/chosen": -65.5, "logps/rejected": -51.0, "loss": 0.6934, "loss/demonstration_loss": -456.0, "loss/preference_loss": -462.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.023193359375, "rewards/margins": -0.01190185546875, "rewards/rejected": -0.01123046875, "step": 1172 }, { "epoch": 0.9384, "grad_norm": 2.1601487242136383, "learning_rate": 5.7572116802979685e-09, "logits/chosen": 0.64453125, "logits/rejected": 0.578125, "logps/chosen": -27.5, "logps/rejected": -34.0, "loss": 0.6936, "loss/demonstration_loss": -324.0, "loss/preference_loss": -322.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.005950927734375, "rewards/margins": 0.002166748046875, "rewards/rejected": -0.00811767578125, "step": 1173 }, { "epoch": 0.9392, "grad_norm": 1.3078001484099284, "learning_rate": 5.609202844158723e-09, "logits/chosen": 0.8359375, "logits/rejected": 0.76953125, "logps/chosen": -29.75, "logps/rejected": -36.5, "loss": 0.6901, "loss/demonstration_loss": -264.0, "loss/preference_loss": -262.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00390625, "rewards/margins": 0.0031280517578125, "rewards/rejected": -0.007049560546875, "step": 1174 }, { "epoch": 0.94, "grad_norm": 1.8585246144876966, "learning_rate": 5.463099816548577e-09, "logits/chosen": 0.80078125, "logits/rejected": 0.81640625, "logps/chosen": -44.5, "logps/rejected": -37.75, "loss": 0.6919, "loss/demonstration_loss": -324.0, "loss/preference_loss": -324.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0115966796875, "rewards/margins": 0.00030517578125, "rewards/rejected": -0.0118408203125, "step": 1175 }, { "epoch": 0.9408, "grad_norm": 1.698724618693562, "learning_rate": 5.318903736808406e-09, "logits/chosen": 0.8046875, "logits/rejected": 0.71875, "logps/chosen": -26.375, "logps/rejected": -29.75, "loss": 0.6897, "loss/demonstration_loss": -224.0, "loss/preference_loss": -220.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.00046539306640625, "rewards/margins": 0.0106201171875, "rewards/rejected": -0.01019287109375, "step": 1176 }, { "epoch": 0.9416, "grad_norm": 1.4031313156939034, "learning_rate": 5.176615729408168e-09, "logits/chosen": 0.40625, "logits/rejected": 0.640625, "logps/chosen": -71.0, "logps/rejected": -51.5, "loss": 0.6932, "loss/demonstration_loss": -386.0, "loss/preference_loss": -388.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0146484375, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.01409912109375, "step": 1177 }, { "epoch": 0.9424, "grad_norm": 1.5462809118557266, "learning_rate": 5.0362369039382845e-09, "logits/chosen": 0.7421875, "logits/rejected": 0.7421875, "logps/chosen": -78.0, "logps/rejected": -71.5, "loss": 0.6919, "loss/demonstration_loss": -394.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01190185546875, "rewards/margins": -0.0001678466796875, "rewards/rejected": -0.01171875, "step": 1178 }, { "epoch": 0.9432, "grad_norm": 1.785577265952326, "learning_rate": 4.897768355101084e-09, "logits/chosen": 0.87109375, "logits/rejected": 0.9140625, "logps/chosen": -57.0, "logps/rejected": -57.0, "loss": 0.6917, "loss/demonstration_loss": -604.0, "loss/preference_loss": -604.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0087890625, "step": 1179 }, { "epoch": 0.944, "grad_norm": 1.8889450464193784, "learning_rate": 4.761211162702117e-09, "logits/chosen": 0.59765625, "logits/rejected": 0.77734375, "logps/chosen": -37.5, "logps/rejected": -24.25, "loss": 0.6895, "loss/demonstration_loss": -245.0, "loss/preference_loss": -243.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0032806396484375, "rewards/margins": 0.005615234375, "rewards/rejected": -0.0089111328125, "step": 1180 }, { "epoch": 0.9448, "grad_norm": 1.322713702935189, "learning_rate": 4.626566391641773e-09, "logits/chosen": 0.578125, "logits/rejected": 0.58203125, "logps/chosen": -12.75, "logps/rejected": -13.75, "loss": 0.6906, "loss/demonstration_loss": -141.0, "loss/preference_loss": -141.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0004730224609375, "rewards/margins": 0.0003204345703125, "rewards/rejected": -0.00078582763671875, "step": 1181 }, { "epoch": 0.9456, "grad_norm": 0.9464676388380862, "learning_rate": 4.493835091907067e-09, "logits/chosen": 0.75390625, "logits/rejected": 0.765625, "logps/chosen": -23.25, "logps/rejected": -24.5, "loss": 0.6918, "loss/demonstration_loss": -378.0, "loss/preference_loss": -380.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00250244140625, "step": 1182 }, { "epoch": 0.9464, "grad_norm": 1.3530796604854878, "learning_rate": 4.363018298563309e-09, "logits/chosen": 0.8046875, "logits/rejected": 0.83203125, "logps/chosen": -42.0, "logps/rejected": -41.75, "loss": 0.6918, "loss/demonstration_loss": -438.0, "loss/preference_loss": -440.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.014404296875, "rewards/margins": -0.00189208984375, "rewards/rejected": -0.01251220703125, "step": 1183 }, { "epoch": 0.9472, "grad_norm": 1.253612731811808, "learning_rate": 4.234117031746142e-09, "logits/chosen": 0.89453125, "logits/rejected": 0.95703125, "logps/chosen": -47.0, "logps/rejected": -44.5, "loss": 0.6918, "loss/demonstration_loss": -484.0, "loss/preference_loss": -486.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.006256103515625, "rewards/margins": -0.0028228759765625, "rewards/rejected": -0.0034332275390625, "step": 1184 }, { "epoch": 0.948, "grad_norm": 1.3195942140676589, "learning_rate": 4.107132296653548e-09, "logits/chosen": 0.52734375, "logits/rejected": 0.52734375, "logps/chosen": -20.0, "logps/rejected": -20.0, "loss": 0.6918, "loss/demonstration_loss": -640.0, "loss/preference_loss": -640.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.0, "rewards/rejected": -0.001251220703125, "step": 1185 }, { "epoch": 0.9488, "grad_norm": 1.587153865749255, "learning_rate": 3.982065083537961e-09, "logits/chosen": 0.7109375, "logits/rejected": 0.61328125, "logps/chosen": -24.75, "logps/rejected": -34.0, "loss": 0.6897, "loss/demonstration_loss": -468.0, "loss/preference_loss": -464.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00439453125, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.0093994140625, "step": 1186 }, { "epoch": 0.9496, "grad_norm": 0.7398440884732881, "learning_rate": 3.858916367698667e-09, "logits/chosen": 1.0703125, "logits/rejected": 1.0703125, "logps/chosen": -5.28125, "logps/rejected": -3.1875, "loss": 0.6929, "loss/demonstration_loss": -132.0, "loss/preference_loss": -139.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.00421142578125, "rewards/rejected": 0.002349853515625, "step": 1187 }, { "epoch": 0.9504, "grad_norm": 1.9063223182896287, "learning_rate": 3.737687109474058e-09, "logits/chosen": 0.75390625, "logits/rejected": 0.71484375, "logps/chosen": -53.5, "logps/rejected": -56.25, "loss": 0.6912, "loss/demonstration_loss": -352.0, "loss/preference_loss": -350.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.0031280517578125, "rewards/margins": 0.00439453125, "rewards/rejected": -0.001251220703125, "step": 1188 }, { "epoch": 0.9512, "grad_norm": 1.13019160810721, "learning_rate": 3.6183782542343056e-09, "logits/chosen": 0.609375, "logits/rejected": 0.7421875, "logps/chosen": -40.5, "logps/rejected": -25.5, "loss": 0.6937, "loss/demonstration_loss": -520.0, "loss/preference_loss": -528.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0087890625, "rewards/margins": -0.0084228515625, "rewards/rejected": -0.00031280517578125, "step": 1189 }, { "epoch": 0.952, "grad_norm": 1.1619362897632275, "learning_rate": 3.5009907323737818e-09, "logits/chosen": 1.046875, "logits/rejected": 0.99609375, "logps/chosen": -10.4375, "logps/rejected": -20.75, "loss": 0.6888, "loss/demonstration_loss": -496.0, "loss/preference_loss": -488.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.005615234375, "rewards/rejected": -0.00750732421875, "step": 1190 }, { "epoch": 0.9528, "grad_norm": 1.0878355151408994, "learning_rate": 3.385525459303956e-09, "logits/chosen": 0.8828125, "logits/rejected": 0.91015625, "logps/chosen": -42.25, "logps/rejected": -41.25, "loss": 0.692, "loss/demonstration_loss": -442.0, "loss/preference_loss": -442.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00811767578125, "rewards/margins": 0.0, "rewards/rejected": -0.00811767578125, "step": 1191 }, { "epoch": 0.9536, "grad_norm": 1.2542485899751812, "learning_rate": 3.2719833354462323e-09, "logits/chosen": 0.75390625, "logits/rejected": 0.765625, "logps/chosen": -34.5, "logps/rejected": -29.625, "loss": 0.6918, "loss/demonstration_loss": -336.0, "loss/preference_loss": -340.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.01220703125, "rewards/margins": -0.0062255859375, "rewards/rejected": -0.005950927734375, "step": 1192 }, { "epoch": 0.9544, "grad_norm": 1.436101144942731, "learning_rate": 3.1603652462249e-09, "logits/chosen": 0.83984375, "logits/rejected": 0.76171875, "logps/chosen": -12.625, "logps/rejected": -22.125, "loss": 0.6896, "loss/demonstration_loss": -272.0, "loss/preference_loss": -272.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00689697265625, "rewards/margins": 0.0015411376953125, "rewards/rejected": -0.0084228515625, "step": 1193 }, { "epoch": 0.9552, "grad_norm": 1.4490907173214072, "learning_rate": 3.0506720620602776e-09, "logits/chosen": 0.48046875, "logits/rejected": 0.462890625, "logps/chosen": -116.0, "logps/rejected": -110.0, "loss": 0.692, "loss/demonstration_loss": -512.0, "loss/preference_loss": -512.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.01708984375, "rewards/margins": -0.00080108642578125, "rewards/rejected": -0.0162353515625, "step": 1194 }, { "epoch": 0.956, "grad_norm": 1.08410904644305, "learning_rate": 2.9429046383618038e-09, "logits/chosen": 0.5, "logits/rejected": 0.50390625, "logps/chosen": -9.375, "logps/rejected": -9.25, "loss": 0.6919, "loss/demonstration_loss": -150.0, "loss/preference_loss": -149.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0010986328125, "rewards/margins": 0.0017242431640625, "rewards/rejected": -0.0006256103515625, "step": 1195 }, { "epoch": 0.9568, "grad_norm": 1.1096823277504066, "learning_rate": 2.837063815521512e-09, "logits/chosen": 0.671875, "logits/rejected": 0.6328125, "logps/chosen": -38.25, "logps/rejected": -45.0, "loss": 0.6906, "loss/demonstration_loss": -330.0, "loss/preference_loss": -328.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00469970703125, "rewards/margins": 0.005615234375, "rewards/rejected": -0.01031494140625, "step": 1196 }, { "epoch": 0.9576, "grad_norm": 1.5728343997107623, "learning_rate": 2.7331504189073982e-09, "logits/chosen": 0.65625, "logits/rejected": 0.625, "logps/chosen": -27.375, "logps/rejected": -28.25, "loss": 0.6942, "loss/demonstration_loss": -298.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0031280517578125, "rewards/margins": -0.0021820068359375, "rewards/rejected": 0.00531005859375, "step": 1197 }, { "epoch": 0.9584, "grad_norm": 1.9691600231553619, "learning_rate": 2.631165258856982e-09, "logits/chosen": 0.97265625, "logits/rejected": 0.71484375, "logps/chosen": -59.5, "logps/rejected": -81.5, "loss": 0.6891, "loss/demonstration_loss": -376.0, "loss/preference_loss": -372.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.00031280517578125, "rewards/margins": 0.0172119140625, "rewards/rejected": -0.0174560546875, "step": 1198 }, { "epoch": 0.9592, "grad_norm": 1.7101634742572305, "learning_rate": 2.5311091306710606e-09, "logits/chosen": 0.72265625, "logits/rejected": 0.62890625, "logps/chosen": -29.375, "logps/rejected": -36.25, "loss": 0.6919, "loss/demonstration_loss": -264.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.004547119140625, "rewards/margins": 0.00299072265625, "rewards/rejected": 0.00156402587890625, "step": 1199 }, { "epoch": 0.96, "grad_norm": 1.4215620147906423, "learning_rate": 2.4329828146074096e-09, "logits/chosen": 0.72265625, "logits/rejected": 0.734375, "logps/chosen": -31.75, "logps/rejected": -30.75, "loss": 0.6902, "loss/demonstration_loss": -248.0, "loss/preference_loss": -245.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.00439453125, "rewards/margins": 0.005950927734375, "rewards/rejected": -0.01031494140625, "step": 1200 }, { "epoch": 0.9608, "grad_norm": 1.9451986202547633, "learning_rate": 2.3367870758747853e-09, "logits/chosen": 0.828125, "logits/rejected": 0.6953125, "logps/chosen": -25.75, "logps/rejected": -41.5, "loss": 0.6882, "loss/demonstration_loss": -536.0, "loss/preference_loss": -528.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0031280517578125, "rewards/margins": 0.00750732421875, "rewards/rejected": -0.0106201171875, "step": 1201 }, { "epoch": 0.9616, "grad_norm": 1.3977476287656918, "learning_rate": 2.2425226646268224e-09, "logits/chosen": 0.69921875, "logits/rejected": 0.81640625, "logps/chosen": -46.75, "logps/rejected": -35.25, "loss": 0.6946, "loss/demonstration_loss": -260.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.01263427734375, "rewards/rejected": 0.002960205078125, "step": 1202 }, { "epoch": 0.9624, "grad_norm": 1.3933742065735353, "learning_rate": 2.1501903159563683e-09, "logits/chosen": 0.7734375, "logits/rejected": 0.76171875, "logps/chosen": -31.5, "logps/rejected": -33.25, "loss": 0.6913, "loss/demonstration_loss": -344.0, "loss/preference_loss": -342.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0034332275390625, "rewards/margins": 0.0037384033203125, "rewards/rejected": -0.007171630859375, "step": 1203 }, { "epoch": 0.9632, "grad_norm": 1.0670986598230137, "learning_rate": 2.0597907498896006e-09, "logits/chosen": 0.60546875, "logits/rejected": 0.7734375, "logps/chosen": -20.0, "logps/rejected": -9.625, "loss": 0.694, "loss/demonstration_loss": -234.0, "loss/preference_loss": -236.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.002655029296875, "rewards/rejected": -0.0010986328125, "step": 1204 }, { "epoch": 0.964, "grad_norm": 1.309735791828411, "learning_rate": 1.9713246713805587e-09, "logits/chosen": 0.5859375, "logits/rejected": 0.55859375, "logps/chosen": -14.6875, "logps/rejected": -16.125, "loss": 0.6931, "loss/demonstration_loss": -247.0, "loss/preference_loss": -246.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.0009307861328125, "rewards/margins": 0.0021820068359375, "rewards/rejected": -0.001251220703125, "step": 1205 }, { "epoch": 0.9648, "grad_norm": 1.6925013105046838, "learning_rate": 1.884792770305399e-09, "logits/chosen": 0.9453125, "logits/rejected": 0.8984375, "logps/chosen": -46.5, "logps/rejected": -42.0, "loss": 0.6912, "loss/demonstration_loss": -352.0, "loss/preference_loss": -354.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.005615234375, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.00063323974609375, "step": 1206 }, { "epoch": 0.9656, "grad_norm": 1.3797502064599056, "learning_rate": 1.8001957214573704e-09, "logits/chosen": 0.68359375, "logits/rejected": 0.7578125, "logps/chosen": -36.5, "logps/rejected": -32.0, "loss": 0.6932, "loss/demonstration_loss": -272.0, "loss/preference_loss": -272.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0078125, "rewards/margins": -0.003143310546875, "rewards/rejected": -0.00469970703125, "step": 1207 }, { "epoch": 0.9664, "grad_norm": 1.364649876050948, "learning_rate": 1.7175341845411529e-09, "logits/chosen": 0.8046875, "logits/rejected": 0.80078125, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6921, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1208 }, { "epoch": 0.9672, "grad_norm": 16.61913922690547, "learning_rate": 1.6368088041681105e-09, "logits/chosen": 0.71875, "logits/rejected": 0.7578125, "logps/chosen": -124.0, "logps/rejected": -103.5, "loss": 0.6833, "loss/demonstration_loss": -604.0, "loss/preference_loss": -592.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01123046875, "rewards/margins": 0.041748046875, "rewards/rejected": -0.052978515625, "step": 1209 }, { "epoch": 0.968, "grad_norm": 1.2617733219106222, "learning_rate": 1.5580202098509076e-09, "logits/chosen": 0.65234375, "logits/rejected": 0.66796875, "logps/chosen": -38.0, "logps/rejected": -32.5, "loss": 0.6908, "loss/demonstration_loss": -278.0, "loss/preference_loss": -280.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.01092529296875, "rewards/margins": -0.0026702880859375, "rewards/rejected": -0.00830078125, "step": 1210 }, { "epoch": 0.9688, "grad_norm": 1.0989572147801576, "learning_rate": 1.4811690159988454e-09, "logits/chosen": 0.74609375, "logits/rejected": 0.60546875, "logps/chosen": -9.5, "logps/rejected": -19.125, "loss": 0.6913, "loss/demonstration_loss": -456.0, "loss/preference_loss": -454.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.00250244140625, "step": 1211 }, { "epoch": 0.9696, "grad_norm": 0.9991611388655717, "learning_rate": 1.406255821913005e-09, "logits/chosen": 0.77734375, "logits/rejected": 0.78125, "logps/chosen": -46.0, "logps/rejected": -45.25, "loss": 0.6937, "loss/demonstration_loss": -484.0, "loss/preference_loss": -486.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0084228515625, "rewards/margins": -0.005950927734375, "rewards/rejected": -0.00250244140625, "step": 1212 }, { "epoch": 0.9704, "grad_norm": 1.3781363391971941, "learning_rate": 1.3332812117814728e-09, "logits/chosen": 0.5546875, "logits/rejected": 0.8125, "logps/chosen": -51.0, "logps/rejected": -26.875, "loss": 0.6921, "loss/demonstration_loss": -308.0, "loss/preference_loss": -310.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00970458984375, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.00469970703125, "step": 1213 }, { "epoch": 0.9712, "grad_norm": 0.7455477049818497, "learning_rate": 1.2622457546749566e-09, "logits/chosen": 0.58203125, "logits/rejected": 0.6796875, "logps/chosen": -18.875, "logps/rejected": -7.625, "loss": 0.6935, "loss/demonstration_loss": -416.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.0028228759765625, "rewards/rejected": -0.0021820068359375, "step": 1214 }, { "epoch": 0.972, "grad_norm": 1.6878385952292154, "learning_rate": 1.1931500045422038e-09, "logits/chosen": 0.96875, "logits/rejected": 0.91796875, "logps/chosen": -29.25, "logps/rejected": -37.25, "loss": 0.6913, "loss/demonstration_loss": -266.0, "loss/preference_loss": -264.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.000640869140625, "rewards/margins": 0.00372314453125, "rewards/rejected": -0.004364013671875, "step": 1215 }, { "epoch": 0.9728, "grad_norm": 1.3880684450453529, "learning_rate": 1.125994500205757e-09, "logits/chosen": 0.5859375, "logits/rejected": 0.5234375, "logps/chosen": -68.0, "logps/rejected": -72.5, "loss": 0.6926, "loss/demonstration_loss": -444.0, "loss/preference_loss": -444.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0179443359375, "rewards/margins": -0.0019989013671875, "rewards/rejected": -0.0159912109375, "step": 1216 }, { "epoch": 0.9736, "grad_norm": 1.3479140948653887, "learning_rate": 1.0607797653577333e-09, "logits/chosen": 0.72265625, "logits/rejected": 0.78125, "logps/chosen": -30.0, "logps/rejected": -24.0, "loss": 0.6896, "loss/demonstration_loss": -288.0, "loss/preference_loss": -286.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0018768310546875, "step": 1217 }, { "epoch": 0.9744, "grad_norm": 0.7847829378874979, "learning_rate": 9.975063085557177e-10, "logits/chosen": 0.80078125, "logits/rejected": 0.78515625, "logps/chosen": -45.25, "logps/rejected": -52.75, "loss": 0.691, "loss/demonstration_loss": -314.0, "loss/preference_loss": -312.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.000782012939453125, "rewards/margins": 0.002960205078125, "rewards/rejected": -0.003753662109375, "step": 1218 }, { "epoch": 0.9752, "grad_norm": 0.9911047226205093, "learning_rate": 9.361746232188495e-10, "logits/chosen": 0.390625, "logits/rejected": 0.45703125, "logps/chosen": -28.25, "logps/rejected": -28.125, "loss": 0.6931, "loss/demonstration_loss": -296.0, "loss/preference_loss": -300.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.007171630859375, "rewards/rejected": -0.00093841552734375, "step": 1219 }, { "epoch": 0.976, "grad_norm": 1.2028148789470947, "learning_rate": 8.767851876239074e-10, "logits/chosen": 0.73046875, "logits/rejected": 0.71484375, "logps/chosen": -10.6875, "logps/rejected": -15.0, "loss": 0.6902, "loss/demonstration_loss": -404.0, "loss/preference_loss": -412.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.00439453125, "rewards/rejected": 0.0006256103515625, "step": 1220 }, { "epoch": 0.9768, "grad_norm": 1.3020512229924224, "learning_rate": 8.193384649017032e-10, "logits/chosen": 0.5078125, "logits/rejected": 0.5859375, "logps/chosen": -35.5, "logps/rejected": -34.25, "loss": 0.6907, "loss/demonstration_loss": -368.0, "loss/preference_loss": -366.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.005950927734375, "rewards/margins": 0.005157470703125, "rewards/rejected": -0.0111083984375, "step": 1221 }, { "epoch": 0.9776, "grad_norm": 1.0080795043925865, "learning_rate": 7.638349030332503e-10, "logits/chosen": 0.63671875, "logits/rejected": 0.64453125, "logps/chosen": 0.0, "logps/rejected": 0.0, "loss": 0.6903, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1222 }, { "epoch": 0.9784, "grad_norm": 1.6621829887295259, "learning_rate": 7.102749348465165e-10, "logits/chosen": 0.82421875, "logits/rejected": 0.93359375, "logps/chosen": -34.5, "logps/rejected": -27.875, "loss": 0.6892, "loss/demonstration_loss": -248.0, "loss/preference_loss": -248.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00390625, "rewards/margins": -0.0010986328125, "rewards/rejected": -0.0028076171875, "step": 1223 }, { "epoch": 0.9792, "grad_norm": 1.172974145069399, "learning_rate": 6.586589780128715e-10, "logits/chosen": 0.58984375, "logits/rejected": 0.5625, "logps/chosen": -5.5, "logps/rejected": -5.59375, "loss": 0.6906, "loss/demonstration_loss": -176.0, "loss/preference_loss": -174.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.00093841552734375, "rewards/margins": 0.00093841552734375, "rewards/rejected": -0.0018768310546875, "step": 1224 }, { "epoch": 0.98, "grad_norm": 0.6379980853339672, "learning_rate": 6.089874350439505e-10, "logits/chosen": 0.82421875, "logits/rejected": 0.81640625, "logps/chosen": -8.875, "logps/rejected": -8.9375, "loss": 0.6901, "loss/demonstration_loss": -284.0, "loss/preference_loss": -284.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.0006256103515625, "rewards/rejected": -0.001251220703125, "step": 1225 }, { "epoch": 0.9808, "grad_norm": 1.0713800236831827, "learning_rate": 5.612606932883512e-10, "logits/chosen": 0.703125, "logits/rejected": 0.67578125, "logps/chosen": -29.75, "logps/rejected": -37.25, "loss": 0.6917, "loss/demonstration_loss": -356.0, "loss/preference_loss": -354.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0028076171875, "rewards/margins": 0.00156402587890625, "rewards/rejected": -0.00439453125, "step": 1226 }, { "epoch": 0.9816, "grad_norm": 1.2220305545936485, "learning_rate": 5.154791249288859e-10, "logits/chosen": 0.50390625, "logits/rejected": 0.5078125, "logps/chosen": -50.75, "logps/rejected": -50.75, "loss": 0.6908, "loss/demonstration_loss": -404.0, "loss/preference_loss": -404.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0072021484375, "rewards/margins": -0.00030517578125, "rewards/rejected": -0.00689697265625, "step": 1227 }, { "epoch": 0.9824, "grad_norm": 1.5726603029363821, "learning_rate": 4.716430869793342e-10, "logits/chosen": 0.75390625, "logits/rejected": 0.6953125, "logps/chosen": -125.5, "logps/rejected": -131.0, "loss": 0.6953, "loss/demonstration_loss": -580.0, "loss/preference_loss": -584.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0250244140625, "rewards/margins": -0.012451171875, "rewards/rejected": -0.01251220703125, "step": 1228 }, { "epoch": 0.9832, "grad_norm": 1.295823047472496, "learning_rate": 4.2975292128200057e-10, "logits/chosen": 0.546875, "logits/rejected": 0.5625, "logps/chosen": -14.0, "logps/rejected": -13.4375, "loss": 0.692, "loss/demonstration_loss": -434.0, "loss/preference_loss": -444.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.005615234375, "rewards/rejected": 0.00250244140625, "step": 1229 }, { "epoch": 0.984, "grad_norm": 0.9821056256873398, "learning_rate": 3.898089545047445e-10, "logits/chosen": 0.416015625, "logits/rejected": 0.412109375, "logps/chosen": -29.5, "logps/rejected": -28.75, "loss": 0.6924, "loss/demonstration_loss": -462.0, "loss/preference_loss": -466.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.005615234375, "rewards/margins": -0.00689697265625, "rewards/rejected": 0.001251220703125, "step": 1230 }, { "epoch": 0.9848, "grad_norm": 1.467965602278172, "learning_rate": 3.5181149813870436e-10, "logits/chosen": 0.7578125, "logits/rejected": 0.6875, "logps/chosen": -61.0, "logps/rejected": -75.0, "loss": 0.6901, "loss/demonstration_loss": -544.0, "loss/preference_loss": -544.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.001251220703125, "rewards/margins": 0.00689697265625, "rewards/rejected": -0.00811767578125, "step": 1231 }, { "epoch": 0.9856, "grad_norm": 1.8771616282312868, "learning_rate": 3.157608484956331e-10, "logits/chosen": 0.41015625, "logits/rejected": 0.451171875, "logps/chosen": -11.0, "logps/rejected": -7.40625, "loss": 0.6868, "loss/demonstration_loss": -149.0, "loss/preference_loss": -147.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.002349853515625, "rewards/margins": 0.00311279296875, "rewards/rejected": -0.000782012939453125, "step": 1232 }, { "epoch": 0.9864, "grad_norm": 0.9977493674506112, "learning_rate": 2.8165728670573316e-10, "logits/chosen": 0.396484375, "logits/rejected": 0.404296875, "logps/chosen": -24.875, "logps/rejected": -24.875, "loss": 0.6924, "loss/demonstration_loss": -394.0, "loss/preference_loss": -394.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.004058837890625, "rewards/margins": 0.00031280517578125, "rewards/rejected": -0.00439453125, "step": 1233 }, { "epoch": 0.9872, "grad_norm": 2.0957758581465256, "learning_rate": 2.495010787154916e-10, "logits/chosen": 1.0390625, "logits/rejected": 0.83203125, "logps/chosen": -65.0, "logps/rejected": -77.5, "loss": 0.6864, "loss/demonstration_loss": -450.0, "loss/preference_loss": -444.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.0172119140625, "rewards/margins": 0.015869140625, "rewards/rejected": -0.033203125, "step": 1234 }, { "epoch": 0.988, "grad_norm": 1.1988848317829686, "learning_rate": 2.1929247528540418e-10, "logits/chosen": 0.765625, "logits/rejected": 0.8671875, "logps/chosen": -30.0, "logps/rejected": -16.875, "loss": 0.6919, "loss/demonstration_loss": -249.0, "loss/preference_loss": -251.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0031280517578125, "rewards/margins": -0.003143310546875, "rewards/rejected": 0.0, "step": 1235 }, { "epoch": 0.9888, "grad_norm": 1.2025221134429054, "learning_rate": 1.9103171198828205e-10, "logits/chosen": 0.66796875, "logits/rejected": 0.55078125, "logps/chosen": -11.375, "logps/rejected": -32.75, "loss": 0.6923, "loss/demonstration_loss": -700.0, "loss/preference_loss": -704.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.003753662109375, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.00250244140625, "step": 1236 }, { "epoch": 0.9896, "grad_norm": 1.1564034050129222, "learning_rate": 1.6471900920719816e-10, "logits/chosen": 0.953125, "logits/rejected": 1.078125, "logps/chosen": -51.5, "logps/rejected": -38.25, "loss": 0.691, "loss/demonstration_loss": -356.0, "loss/preference_loss": -356.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.00982666015625, "rewards/margins": -0.0011138916015625, "rewards/rejected": -0.00872802734375, "step": 1237 }, { "epoch": 0.9904, "grad_norm": 1.468813123185437, "learning_rate": 1.4035457213393276e-10, "logits/chosen": 0.640625, "logits/rejected": 0.60546875, "logps/chosen": -31.125, "logps/rejected": -48.5, "loss": 0.6934, "loss/demonstration_loss": -632.0, "loss/preference_loss": -636.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.005615234375, "rewards/rejected": -0.00250244140625, "step": 1238 }, { "epoch": 0.9912, "grad_norm": 1.425907624996944, "learning_rate": 1.1793859076722479e-10, "logits/chosen": 0.58984375, "logits/rejected": 0.62109375, "logps/chosen": -40.75, "logps/rejected": -38.25, "loss": 0.6891, "loss/demonstration_loss": -422.0, "loss/preference_loss": -420.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.0018768310546875, "rewards/rejected": -0.003753662109375, "step": 1239 }, { "epoch": 0.992, "grad_norm": 1.052236887131718, "learning_rate": 9.747123991141193e-11, "logits/chosen": 0.58984375, "logits/rejected": 0.57421875, "logps/chosen": -27.375, "logps/rejected": -29.125, "loss": 0.6901, "loss/demonstration_loss": -452.0, "loss/preference_loss": -450.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0010986328125, "rewards/margins": 0.0032806396484375, "rewards/rejected": -0.00439453125, "step": 1240 }, { "epoch": 0.9928, "grad_norm": 1.477007229056372, "learning_rate": 7.895267917501503e-11, "logits/chosen": 0.55078125, "logits/rejected": 0.6484375, "logps/chosen": -48.25, "logps/rejected": -31.25, "loss": 0.6909, "loss/demonstration_loss": -424.0, "loss/preference_loss": -422.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0018768310546875, "rewards/margins": 0.006591796875, "rewards/rejected": -0.00469970703125, "step": 1241 }, { "epoch": 0.9936, "grad_norm": 1.2219293122403274, "learning_rate": 6.238305296946134e-11, "logits/chosen": 0.95703125, "logits/rejected": 1.1015625, "logps/chosen": -22.25, "logps/rejected": -14.625, "loss": 0.6907, "loss/demonstration_loss": -197.0, "loss/preference_loss": -196.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.00031280517578125, "rewards/margins": -0.000152587890625, "rewards/rejected": 0.000469207763671875, "step": 1242 }, { "epoch": 0.9944, "grad_norm": 1.2516605041727067, "learning_rate": 4.7762490508057584e-11, "logits/chosen": 0.57421875, "logits/rejected": 0.54296875, "logps/chosen": -24.75, "logps/rejected": -25.5, "loss": 0.6888, "loss/demonstration_loss": -402.0, "loss/preference_loss": -396.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0, "rewards/margins": 0.00750732421875, "rewards/rejected": -0.00750732421875, "step": 1243 }, { "epoch": 0.9952, "grad_norm": 1.6037024298647007, "learning_rate": 3.5091105804907485e-11, "logits/chosen": 0.8203125, "logits/rejected": 0.8359375, "logps/chosen": -47.75, "logps/rejected": -45.0, "loss": 0.693, "loss/demonstration_loss": -292.0, "loss/preference_loss": -294.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.0145263671875, "rewards/margins": -0.00885009765625, "rewards/rejected": -0.005615234375, "step": 1244 }, { "epoch": 0.996, "grad_norm": 0.7824884485485597, "learning_rate": 2.4368997673940294e-11, "logits/chosen": 0.61328125, "logits/rejected": 0.55078125, "logps/chosen": -20.875, "logps/rejected": -18.875, "loss": 0.692, "loss/demonstration_loss": -312.0, "loss/preference_loss": -314.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.006561279296875, "rewards/margins": -0.0028076171875, "rewards/rejected": -0.003753662109375, "step": 1245 }, { "epoch": 0.9968, "grad_norm": 1.132899019750773, "learning_rate": 1.559624972838347e-11, "logits/chosen": 0.95703125, "logits/rejected": 0.890625, "logps/chosen": -27.5, "logps/rejected": -30.25, "loss": 0.6897, "loss/demonstration_loss": -308.0, "loss/preference_loss": -308.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.0018768310546875, "rewards/margins": -0.00093841552734375, "rewards/rejected": -0.00093841552734375, "step": 1246 }, { "epoch": 0.9976, "grad_norm": 1.0345781199071054, "learning_rate": 8.772930379846721e-12, "logits/chosen": 0.84375, "logits/rejected": 0.79296875, "logps/chosen": -22.0, "logps/rejected": -35.0, "loss": 0.6898, "loss/demonstration_loss": -304.0, "loss/preference_loss": -298.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.01129150390625, "rewards/rejected": -0.01031494140625, "step": 1247 }, { "epoch": 0.9984, "grad_norm": 0.8513448217819443, "learning_rate": 3.899092837933438e-12, "logits/chosen": 0.78125, "logits/rejected": 0.80078125, "logps/chosen": -25.25, "logps/rejected": -23.125, "loss": 0.6945, "loss/demonstration_loss": -384.0, "loss/preference_loss": -386.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.003448486328125, "rewards/margins": -0.004058837890625, "rewards/rejected": 0.0006256103515625, "step": 1248 }, { "epoch": 0.9992, "grad_norm": 1.2689918441682992, "learning_rate": 9.747751098521107e-13, "logits/chosen": 0.62890625, "logits/rejected": 0.61328125, "logps/chosen": -27.5, "logps/rejected": -24.75, "loss": 0.6908, "loss/demonstration_loss": -416.0, "loss/preference_loss": -416.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0006256103515625, "rewards/rejected": -0.0018768310546875, "step": 1249 }, { "epoch": 1.0, "grad_norm": 1.1800002066599917, "learning_rate": 0.0, "logits/chosen": 0.83203125, "logits/rejected": 0.75, "logps/chosen": -30.625, "logps/rejected": -33.5, "loss": 0.6912, "loss/demonstration_loss": -342.0, "loss/preference_loss": -338.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.0, "rewards/margins": 0.00421142578125, "rewards/rejected": -0.00421142578125, "step": 1250 }, { "epoch": 1.0, "step": 1250, "total_flos": 0.0, "train_loss": 0.69168408203125, "train_runtime": 25620.0654, "train_samples_per_second": 1.561, "train_steps_per_second": 0.049 } ], "logging_steps": 1, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }