diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30630 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5192557334487234, + "eval_steps": 500, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002884754074715131, + "grad_norm": 15.105622098355184, + "learning_rate": 1.440922190201729e-09, + "logits/chosen": 2.75, + "logits/rejected": 2.828125, + "logps/chosen": -1552.0, + "logps/rejected": -1752.0, + "loss": 0.6947, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1435546875, + "rewards/margins": 0.012451171875, + "rewards/rejected": -0.15625, + "step": 1 + }, + { + "epoch": 0.0005769508149430262, + "grad_norm": 13.38233941885805, + "learning_rate": 2.881844380403458e-09, + "logits/chosen": 2.71875, + "logits/rejected": 2.75, + "logps/chosen": -1640.0, + "logps/rejected": -1560.0, + "loss": 0.6732, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09521484375, + "rewards/margins": 0.058837890625, + "rewards/rejected": -0.154296875, + "step": 2 + }, + { + "epoch": 0.0008654262224145391, + "grad_norm": 10.63226296734449, + "learning_rate": 4.3227665706051874e-09, + "logits/chosen": 2.875, + "logits/rejected": 2.8125, + "logps/chosen": -1632.0, + "logps/rejected": -1672.0, + "loss": 0.7098, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.09326171875, + "rewards/margins": -0.0230712890625, + "rewards/rejected": -0.0703125, + "step": 3 + }, + { + "epoch": 0.0011539016298860523, + "grad_norm": 14.234756126774585, + "learning_rate": 5.763688760806916e-09, + "logits/chosen": 2.734375, + "logits/rejected": 2.6875, + "logps/chosen": -1656.0, + "logps/rejected": -1824.0, + "loss": 0.6915, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1865234375, + "rewards/margins": -0.0306396484375, + "rewards/rejected": -0.15625, + "step": 4 + }, + { + "epoch": 0.0014423770373575653, + "grad_norm": 14.47425314509406, + "learning_rate": 7.204610951008645e-09, + "logits/chosen": 2.875, + "logits/rejected": 2.859375, + "logps/chosen": -1816.0, + "logps/rejected": -1472.0, + "loss": 0.6982, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1240234375, + "rewards/margins": 0.041748046875, + "rewards/rejected": -0.166015625, + "step": 5 + }, + { + "epoch": 0.0017308524448290783, + "grad_norm": 12.392300151453513, + "learning_rate": 8.645533141210375e-09, + "logits/chosen": 2.765625, + "logits/rejected": 2.859375, + "logps/chosen": -1944.0, + "logps/rejected": -1824.0, + "loss": 0.7202, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1376953125, + "rewards/margins": -0.02001953125, + "rewards/rejected": -0.11767578125, + "step": 6 + }, + { + "epoch": 0.0020193278523005912, + "grad_norm": 11.958961211656046, + "learning_rate": 1.0086455331412104e-08, + "logits/chosen": 2.78125, + "logits/rejected": 2.8125, + "logps/chosen": -1768.0, + "logps/rejected": -1792.0, + "loss": 0.6884, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1318359375, + "rewards/margins": 0.0081787109375, + "rewards/rejected": -0.140625, + "step": 7 + }, + { + "epoch": 0.0023078032597721046, + "grad_norm": 12.540658855939016, + "learning_rate": 1.1527377521613832e-08, + "logits/chosen": 2.921875, + "logits/rejected": 2.9375, + "logps/chosen": -1864.0, + "logps/rejected": -1928.0, + "loss": 0.7207, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.12890625, + "rewards/margins": -0.02880859375, + "rewards/rejected": -0.10009765625, + "step": 8 + }, + { + "epoch": 0.0025962786672436176, + "grad_norm": 13.090625634922002, + "learning_rate": 1.2968299711815562e-08, + "logits/chosen": 2.65625, + "logits/rejected": 2.71875, + "logps/chosen": -2208.0, + "logps/rejected": -2144.0, + "loss": 0.7173, + "loss/demonstration_loss": -4352.0, + "loss/preference_loss": -4352.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.240234375, + "rewards/margins": -0.053955078125, + "rewards/rejected": -0.1865234375, + "step": 9 + }, + { + "epoch": 0.0028847540747151306, + "grad_norm": 15.235836453697397, + "learning_rate": 1.440922190201729e-08, + "logits/chosen": 2.953125, + "logits/rejected": 3.015625, + "logps/chosen": -1504.0, + "logps/rejected": -1336.0, + "loss": 0.7042, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2832.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1279296875, + "rewards/margins": -0.0263671875, + "rewards/rejected": -0.10107421875, + "step": 10 + }, + { + "epoch": 0.0031732294821866435, + "grad_norm": 14.204409404027695, + "learning_rate": 1.585014409221902e-08, + "logits/chosen": 2.8125, + "logits/rejected": 2.75, + "logps/chosen": -1616.0, + "logps/rejected": -1648.0, + "loss": 0.6975, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.13671875, + "rewards/margins": 0.00115966796875, + "rewards/rejected": -0.1376953125, + "step": 11 + }, + { + "epoch": 0.0034617048896581565, + "grad_norm": 18.844810544155305, + "learning_rate": 1.729106628242075e-08, + "logits/chosen": 2.796875, + "logits/rejected": 2.8125, + "logps/chosen": -1576.0, + "logps/rejected": -1440.0, + "loss": 0.7234, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.142578125, + "rewards/margins": -0.0419921875, + "rewards/rejected": -0.1005859375, + "step": 12 + }, + { + "epoch": 0.0037501802971296695, + "grad_norm": 12.791797669534505, + "learning_rate": 1.8731988472622476e-08, + "logits/chosen": 2.859375, + "logits/rejected": 2.859375, + "logps/chosen": -1744.0, + "logps/rejected": -1696.0, + "loss": 0.6899, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1259765625, + "rewards/margins": 0.03466796875, + "rewards/rejected": -0.16015625, + "step": 13 + }, + { + "epoch": 0.0040386557046011825, + "grad_norm": 12.869716561025204, + "learning_rate": 2.0172910662824208e-08, + "logits/chosen": 2.71875, + "logits/rejected": 2.6875, + "logps/chosen": -1672.0, + "logps/rejected": -1744.0, + "loss": 0.6702, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08740234375, + "rewards/margins": 0.0125732421875, + "rewards/rejected": -0.10009765625, + "step": 14 + }, + { + "epoch": 0.004327131112072695, + "grad_norm": 12.93353721714942, + "learning_rate": 2.1613832853025937e-08, + "logits/chosen": 2.75, + "logits/rejected": 2.71875, + "logps/chosen": -1568.0, + "logps/rejected": -1408.0, + "loss": 0.6797, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.130859375, + "rewards/margins": -0.00689697265625, + "rewards/rejected": -0.1240234375, + "step": 15 + }, + { + "epoch": 0.004615606519544209, + "grad_norm": 12.88234502217038, + "learning_rate": 2.3054755043227663e-08, + "logits/chosen": 2.828125, + "logits/rejected": 2.859375, + "logps/chosen": -1712.0, + "logps/rejected": -1736.0, + "loss": 0.7031, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.181640625, + "rewards/margins": 0.00616455078125, + "rewards/rejected": -0.1875, + "step": 16 + }, + { + "epoch": 0.004904081927015722, + "grad_norm": 15.034716429557914, + "learning_rate": 2.4495677233429392e-08, + "logits/chosen": 2.65625, + "logits/rejected": 2.5625, + "logps/chosen": -1648.0, + "logps/rejected": -1672.0, + "loss": 0.7188, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1630859375, + "rewards/margins": -0.06591796875, + "rewards/rejected": -0.09716796875, + "step": 17 + }, + { + "epoch": 0.005192557334487235, + "grad_norm": 17.502122918901282, + "learning_rate": 2.5936599423631125e-08, + "logits/chosen": 2.859375, + "logits/rejected": 2.828125, + "logps/chosen": -1888.0, + "logps/rejected": -1984.0, + "loss": 0.7198, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.181640625, + "rewards/margins": -0.07861328125, + "rewards/rejected": -0.1025390625, + "step": 18 + }, + { + "epoch": 0.005481032741958748, + "grad_norm": 15.442527217563052, + "learning_rate": 2.7377521613832854e-08, + "logits/chosen": 2.921875, + "logits/rejected": 3.015625, + "logps/chosen": -1944.0, + "logps/rejected": -1936.0, + "loss": 0.7505, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.2216796875, + "rewards/margins": -0.095703125, + "rewards/rejected": -0.1259765625, + "step": 19 + }, + { + "epoch": 0.005769508149430261, + "grad_norm": 15.794170115027935, + "learning_rate": 2.881844380403458e-08, + "logits/chosen": 2.90625, + "logits/rejected": 2.921875, + "logps/chosen": -1560.0, + "logps/rejected": -1512.0, + "loss": 0.7106, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.150390625, + "rewards/margins": -0.04443359375, + "rewards/rejected": -0.1064453125, + "step": 20 + }, + { + "epoch": 0.006057983556901774, + "grad_norm": 12.564747763926832, + "learning_rate": 3.025936599423631e-08, + "logits/chosen": 2.84375, + "logits/rejected": 2.890625, + "logps/chosen": -1616.0, + "logps/rejected": -1544.0, + "loss": 0.7095, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1591796875, + "rewards/margins": -0.048828125, + "rewards/rejected": -0.1103515625, + "step": 21 + }, + { + "epoch": 0.006346458964373287, + "grad_norm": 12.59940769824516, + "learning_rate": 3.170028818443804e-08, + "logits/chosen": 2.6875, + "logits/rejected": 2.59375, + "logps/chosen": -1960.0, + "logps/rejected": -1720.0, + "loss": 0.7194, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1875, + "rewards/margins": -0.07373046875, + "rewards/rejected": -0.11376953125, + "step": 22 + }, + { + "epoch": 0.0066349343718448, + "grad_norm": 17.171978453205284, + "learning_rate": 3.314121037463977e-08, + "logits/chosen": 2.890625, + "logits/rejected": 2.953125, + "logps/chosen": -1960.0, + "logps/rejected": -1752.0, + "loss": 0.7145, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.154296875, + "rewards/margins": -0.0576171875, + "rewards/rejected": -0.09619140625, + "step": 23 + }, + { + "epoch": 0.006923409779316313, + "grad_norm": 16.54562857083525, + "learning_rate": 3.45821325648415e-08, + "logits/chosen": 2.828125, + "logits/rejected": 2.796875, + "logps/chosen": -1272.0, + "logps/rejected": -1336.0, + "loss": 0.7218, + "loss/demonstration_loss": -2592.0, + "loss/preference_loss": -2608.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.16015625, + "rewards/margins": -0.095703125, + "rewards/rejected": -0.064453125, + "step": 24 + }, + { + "epoch": 0.007211885186787826, + "grad_norm": 13.444888343207758, + "learning_rate": 3.6023054755043225e-08, + "logits/chosen": 2.609375, + "logits/rejected": 2.703125, + "logps/chosen": -1592.0, + "logps/rejected": -1584.0, + "loss": 0.6952, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.150390625, + "rewards/margins": -0.0263671875, + "rewards/rejected": -0.1240234375, + "step": 25 + }, + { + "epoch": 0.007500360594259339, + "grad_norm": 12.995839736424188, + "learning_rate": 3.746397694524495e-08, + "logits/chosen": 2.890625, + "logits/rejected": 2.8125, + "logps/chosen": -1624.0, + "logps/rejected": -1840.0, + "loss": 0.6986, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1162109375, + "rewards/margins": -0.01556396484375, + "rewards/rejected": -0.1005859375, + "step": 26 + }, + { + "epoch": 0.007788836001730853, + "grad_norm": 13.86653715582766, + "learning_rate": 3.8904899135446684e-08, + "logits/chosen": 2.765625, + "logits/rejected": 2.75, + "logps/chosen": -1600.0, + "logps/rejected": -1624.0, + "loss": 0.6911, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.134765625, + "rewards/margins": -0.02001953125, + "rewards/rejected": -0.115234375, + "step": 27 + }, + { + "epoch": 0.008077311409202365, + "grad_norm": 13.030726174120343, + "learning_rate": 4.0345821325648416e-08, + "logits/chosen": 2.734375, + "logits/rejected": 2.71875, + "logps/chosen": -1536.0, + "logps/rejected": -1464.0, + "loss": 0.7053, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1796875, + "rewards/margins": -0.0263671875, + "rewards/rejected": -0.154296875, + "step": 28 + }, + { + "epoch": 0.008365786816673878, + "grad_norm": 15.683952023128214, + "learning_rate": 4.178674351585014e-08, + "logits/chosen": 2.921875, + "logits/rejected": 3.0, + "logps/chosen": -1624.0, + "logps/rejected": -1192.0, + "loss": 0.6995, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08642578125, + "rewards/margins": 0.030517578125, + "rewards/rejected": -0.11669921875, + "step": 29 + }, + { + "epoch": 0.00865426222414539, + "grad_norm": 13.90180927179223, + "learning_rate": 4.3227665706051874e-08, + "logits/chosen": 2.875, + "logits/rejected": 2.859375, + "logps/chosen": -1832.0, + "logps/rejected": -1872.0, + "loss": 0.6894, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.080078125, + "rewards/margins": 0.09521484375, + "rewards/rejected": -0.1748046875, + "step": 30 + }, + { + "epoch": 0.008942737631616904, + "grad_norm": 17.76007574919579, + "learning_rate": 4.46685878962536e-08, + "logits/chosen": 2.84375, + "logits/rejected": 2.859375, + "logps/chosen": -1880.0, + "logps/rejected": -1816.0, + "loss": 0.6858, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15234375, + "rewards/margins": 0.018798828125, + "rewards/rejected": -0.171875, + "step": 31 + }, + { + "epoch": 0.009231213039088419, + "grad_norm": 12.92750956578199, + "learning_rate": 4.6109510086455326e-08, + "logits/chosen": 2.890625, + "logits/rejected": 2.84375, + "logps/chosen": -1816.0, + "logps/rejected": -1824.0, + "loss": 0.7167, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.15625, + "rewards/margins": -0.0224609375, + "rewards/rejected": -0.1337890625, + "step": 32 + }, + { + "epoch": 0.009519688446559932, + "grad_norm": 13.672123525697366, + "learning_rate": 4.755043227665706e-08, + "logits/chosen": 2.625, + "logits/rejected": 2.6875, + "logps/chosen": -1256.0, + "logps/rejected": -1224.0, + "loss": 0.7009, + "loss/demonstration_loss": -2464.0, + "loss/preference_loss": -2464.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.142578125, + "rewards/margins": -0.003753662109375, + "rewards/rejected": -0.138671875, + "step": 33 + }, + { + "epoch": 0.009808163854031444, + "grad_norm": 11.021237673944645, + "learning_rate": 4.8991354466858784e-08, + "logits/chosen": 2.90625, + "logits/rejected": 2.921875, + "logps/chosen": -1384.0, + "logps/rejected": -1600.0, + "loss": 0.6967, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11279296875, + "rewards/margins": -0.012451171875, + "rewards/rejected": -0.10009765625, + "step": 34 + }, + { + "epoch": 0.010096639261502957, + "grad_norm": 14.153400876179415, + "learning_rate": 5.043227665706052e-08, + "logits/chosen": 2.75, + "logits/rejected": 2.8125, + "logps/chosen": -1816.0, + "logps/rejected": -1640.0, + "loss": 0.7126, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.19921875, + "rewards/margins": -0.043212890625, + "rewards/rejected": -0.1552734375, + "step": 35 + }, + { + "epoch": 0.01038511466897447, + "grad_norm": 13.06943765898696, + "learning_rate": 5.187319884726225e-08, + "logits/chosen": 2.90625, + "logits/rejected": 2.984375, + "logps/chosen": -1696.0, + "logps/rejected": -1680.0, + "loss": 0.6866, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.10107421875, + "rewards/margins": -0.015380859375, + "rewards/rejected": -0.08544921875, + "step": 36 + }, + { + "epoch": 0.010673590076445983, + "grad_norm": 11.48175185532301, + "learning_rate": 5.3314121037463975e-08, + "logits/chosen": 2.875, + "logits/rejected": 2.859375, + "logps/chosen": -1992.0, + "logps/rejected": -2112.0, + "loss": 0.6775, + "loss/demonstration_loss": -4080.0, + "loss/preference_loss": -4080.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.158203125, + "rewards/margins": 0.046875, + "rewards/rejected": -0.205078125, + "step": 37 + }, + { + "epoch": 0.010962065483917496, + "grad_norm": 12.88120116924046, + "learning_rate": 5.475504322766571e-08, + "logits/chosen": 2.9375, + "logits/rejected": 2.9375, + "logps/chosen": -1656.0, + "logps/rejected": -1568.0, + "loss": 0.6763, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1455078125, + "rewards/margins": 0.034912109375, + "rewards/rejected": -0.1796875, + "step": 38 + }, + { + "epoch": 0.01125054089138901, + "grad_norm": 13.50696208674342, + "learning_rate": 5.6195965417867433e-08, + "logits/chosen": 2.921875, + "logits/rejected": 2.890625, + "logps/chosen": -1472.0, + "logps/rejected": -1376.0, + "loss": 0.7014, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0888671875, + "rewards/margins": 0.002838134765625, + "rewards/rejected": -0.091796875, + "step": 39 + }, + { + "epoch": 0.011539016298860522, + "grad_norm": 12.5791808171264, + "learning_rate": 5.763688760806916e-08, + "logits/chosen": 2.828125, + "logits/rejected": 2.796875, + "logps/chosen": -1984.0, + "logps/rejected": -1952.0, + "loss": 0.6979, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3920.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0927734375, + "rewards/margins": 0.0400390625, + "rewards/rejected": -0.1328125, + "step": 40 + }, + { + "epoch": 0.011827491706332035, + "grad_norm": 18.17073907429118, + "learning_rate": 5.907780979827089e-08, + "logits/chosen": 2.796875, + "logits/rejected": 2.78125, + "logps/chosen": -1768.0, + "logps/rejected": -1632.0, + "loss": 0.7208, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1875, + "rewards/margins": -0.1025390625, + "rewards/rejected": -0.0849609375, + "step": 41 + }, + { + "epoch": 0.012115967113803548, + "grad_norm": 17.653131599336444, + "learning_rate": 6.051873198847262e-08, + "logits/chosen": 2.890625, + "logits/rejected": 2.890625, + "logps/chosen": -1680.0, + "logps/rejected": -1488.0, + "loss": 0.6991, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1416015625, + "rewards/margins": -0.002838134765625, + "rewards/rejected": -0.138671875, + "step": 42 + }, + { + "epoch": 0.012404442521275061, + "grad_norm": 15.3987909231053, + "learning_rate": 6.195965417867434e-08, + "logits/chosen": 2.796875, + "logits/rejected": 2.8125, + "logps/chosen": -2384.0, + "logps/rejected": -1936.0, + "loss": 0.7133, + "loss/demonstration_loss": -4320.0, + "loss/preference_loss": -4320.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22265625, + "rewards/margins": 0.02490234375, + "rewards/rejected": -0.248046875, + "step": 43 + }, + { + "epoch": 0.012692917928746574, + "grad_norm": 13.991809287766788, + "learning_rate": 6.340057636887608e-08, + "logits/chosen": 2.78125, + "logits/rejected": 2.71875, + "logps/chosen": -1768.0, + "logps/rejected": -1864.0, + "loss": 0.7067, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1611328125, + "rewards/margins": -0.021240234375, + "rewards/rejected": -0.1396484375, + "step": 44 + }, + { + "epoch": 0.012981393336218087, + "grad_norm": 13.067207440719582, + "learning_rate": 6.484149855907781e-08, + "logits/chosen": 2.921875, + "logits/rejected": 2.953125, + "logps/chosen": -1952.0, + "logps/rejected": -1720.0, + "loss": 0.7087, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1474609375, + "rewards/margins": -0.0224609375, + "rewards/rejected": -0.125, + "step": 45 + }, + { + "epoch": 0.0132698687436896, + "grad_norm": 12.431510651646285, + "learning_rate": 6.628242074927953e-08, + "logits/chosen": 2.9375, + "logits/rejected": 2.921875, + "logps/chosen": -1896.0, + "logps/rejected": -1864.0, + "loss": 0.7286, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1953125, + "rewards/margins": -0.0859375, + "rewards/rejected": -0.109375, + "step": 46 + }, + { + "epoch": 0.013558344151161113, + "grad_norm": 14.707946307588788, + "learning_rate": 6.772334293948126e-08, + "logits/chosen": 2.765625, + "logits/rejected": 2.6875, + "logps/chosen": -1992.0, + "logps/rejected": -2032.0, + "loss": 0.6945, + "loss/demonstration_loss": -4016.0, + "loss/preference_loss": -4016.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1728515625, + "rewards/margins": 0.0, + "rewards/rejected": -0.1728515625, + "step": 47 + }, + { + "epoch": 0.013846819558632626, + "grad_norm": 13.897995519062215, + "learning_rate": 6.9164265129683e-08, + "logits/chosen": 2.8125, + "logits/rejected": 2.828125, + "logps/chosen": -1632.0, + "logps/rejected": -1400.0, + "loss": 0.7103, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.10888671875, + "rewards/margins": 0.0, + "rewards/rejected": -0.10888671875, + "step": 48 + }, + { + "epoch": 0.014135294966104139, + "grad_norm": 12.275750579827823, + "learning_rate": 7.060518731988472e-08, + "logits/chosen": 2.96875, + "logits/rejected": 2.96875, + "logps/chosen": -2192.0, + "logps/rejected": -2144.0, + "loss": 0.6973, + "loss/demonstration_loss": -4320.0, + "loss/preference_loss": -4320.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.146484375, + "rewards/margins": 0.039306640625, + "rewards/rejected": -0.185546875, + "step": 49 + }, + { + "epoch": 0.014423770373575652, + "grad_norm": 12.468090358827032, + "learning_rate": 7.204610951008645e-08, + "logits/chosen": 2.921875, + "logits/rejected": 2.859375, + "logps/chosen": -2008.0, + "logps/rejected": -2144.0, + "loss": 0.6705, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0732421875, + "rewards/margins": 0.119140625, + "rewards/rejected": -0.1923828125, + "step": 50 + }, + { + "epoch": 0.014712245781047165, + "grad_norm": 12.444198551952201, + "learning_rate": 7.348703170028818e-08, + "logits/chosen": 2.796875, + "logits/rejected": 2.75, + "logps/chosen": -2000.0, + "logps/rejected": -1784.0, + "loss": 0.7191, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.1650390625, + "rewards/margins": -0.08984375, + "rewards/rejected": -0.0751953125, + "step": 51 + }, + { + "epoch": 0.015000721188518678, + "grad_norm": 15.483526762563997, + "learning_rate": 7.49279538904899e-08, + "logits/chosen": 2.90625, + "logits/rejected": 2.984375, + "logps/chosen": -1696.0, + "logps/rejected": -1568.0, + "loss": 0.7098, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.09765625, + "rewards/margins": 0.0050048828125, + "rewards/rejected": -0.1025390625, + "step": 52 + }, + { + "epoch": 0.015289196595990193, + "grad_norm": 13.08033369855885, + "learning_rate": 7.636887608069163e-08, + "logits/chosen": 2.828125, + "logits/rejected": 2.859375, + "logps/chosen": -1704.0, + "logps/rejected": -1384.0, + "loss": 0.6826, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.171875, + "rewards/margins": -0.0224609375, + "rewards/rejected": -0.1484375, + "step": 53 + }, + { + "epoch": 0.015577672003461706, + "grad_norm": 13.451313196439699, + "learning_rate": 7.780979827089337e-08, + "logits/chosen": 2.8125, + "logits/rejected": 2.78125, + "logps/chosen": -1888.0, + "logps/rejected": -1760.0, + "loss": 0.7076, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.11767578125, + "rewards/margins": -0.0025177001953125, + "rewards/rejected": -0.115234375, + "step": 54 + }, + { + "epoch": 0.01586614741093322, + "grad_norm": 16.85520055071422, + "learning_rate": 7.925072046109509e-08, + "logits/chosen": 2.75, + "logits/rejected": 2.828125, + "logps/chosen": -1632.0, + "logps/rejected": -1560.0, + "loss": 0.7167, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2177734375, + "rewards/margins": -0.039306640625, + "rewards/rejected": -0.1787109375, + "step": 55 + }, + { + "epoch": 0.01615462281840473, + "grad_norm": 12.865724187002614, + "learning_rate": 8.069164265129683e-08, + "logits/chosen": 2.828125, + "logits/rejected": 2.90625, + "logps/chosen": -2032.0, + "logps/rejected": -1784.0, + "loss": 0.71, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.259765625, + "rewards/margins": -0.05078125, + "rewards/rejected": -0.2099609375, + "step": 56 + }, + { + "epoch": 0.016443098225876245, + "grad_norm": 11.59857459648346, + "learning_rate": 8.213256484149856e-08, + "logits/chosen": 2.765625, + "logits/rejected": 2.6875, + "logps/chosen": -1640.0, + "logps/rejected": -1776.0, + "loss": 0.7191, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09765625, + "rewards/margins": 0.003753662109375, + "rewards/rejected": -0.1015625, + "step": 57 + }, + { + "epoch": 0.016731573633347756, + "grad_norm": 14.902926837818534, + "learning_rate": 8.357348703170028e-08, + "logits/chosen": 2.5625, + "logits/rejected": 2.71875, + "logps/chosen": -1800.0, + "logps/rejected": -1488.0, + "loss": 0.7085, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.146484375, + "rewards/margins": -0.07568359375, + "rewards/rejected": -0.0703125, + "step": 58 + }, + { + "epoch": 0.01702004904081927, + "grad_norm": 13.728302979266594, + "learning_rate": 8.501440922190202e-08, + "logits/chosen": 2.8125, + "logits/rejected": 2.828125, + "logps/chosen": -1696.0, + "logps/rejected": -1832.0, + "loss": 0.6771, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1376953125, + "rewards/margins": 0.0400390625, + "rewards/rejected": -0.177734375, + "step": 59 + }, + { + "epoch": 0.01730852444829078, + "grad_norm": 16.461096859082186, + "learning_rate": 8.645533141210375e-08, + "logits/chosen": 2.75, + "logits/rejected": 2.71875, + "logps/chosen": -1816.0, + "logps/rejected": -1920.0, + "loss": 0.7271, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1513671875, + "rewards/margins": -0.068359375, + "rewards/rejected": -0.0830078125, + "step": 60 + }, + { + "epoch": 0.017596999855762296, + "grad_norm": 10.891807329172197, + "learning_rate": 8.789625360230547e-08, + "logits/chosen": 2.859375, + "logits/rejected": 2.796875, + "logps/chosen": -1184.0, + "logps/rejected": -1152.0, + "loss": 0.6869, + "loss/demonstration_loss": -2320.0, + "loss/preference_loss": -2320.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08642578125, + "rewards/margins": 0.00750732421875, + "rewards/rejected": -0.09423828125, + "step": 61 + }, + { + "epoch": 0.017885475263233808, + "grad_norm": 11.177363888430538, + "learning_rate": 8.93371757925072e-08, + "logits/chosen": 2.9375, + "logits/rejected": 2.984375, + "logps/chosen": -1568.0, + "logps/rejected": -1536.0, + "loss": 0.6735, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05810546875, + "rewards/margins": 0.04443359375, + "rewards/rejected": -0.1025390625, + "step": 62 + }, + { + "epoch": 0.018173950670705322, + "grad_norm": 14.063135789692483, + "learning_rate": 9.077809798270893e-08, + "logits/chosen": 2.921875, + "logits/rejected": 2.9375, + "logps/chosen": -1976.0, + "logps/rejected": -1896.0, + "loss": 0.7209, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1435546875, + "rewards/margins": -0.06396484375, + "rewards/rejected": -0.080078125, + "step": 63 + }, + { + "epoch": 0.018462426078176837, + "grad_norm": 15.930494726432237, + "learning_rate": 9.221902017291065e-08, + "logits/chosen": 2.6875, + "logits/rejected": 2.65625, + "logps/chosen": -1416.0, + "logps/rejected": -1424.0, + "loss": 0.7045, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2832.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12060546875, + "rewards/margins": -0.0230712890625, + "rewards/rejected": -0.09765625, + "step": 64 + }, + { + "epoch": 0.01875090148564835, + "grad_norm": 11.892934715925998, + "learning_rate": 9.365994236311239e-08, + "logits/chosen": 2.8125, + "logits/rejected": 2.765625, + "logps/chosen": -1736.0, + "logps/rejected": -1664.0, + "loss": 0.698, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1201171875, + "rewards/margins": 0.04052734375, + "rewards/rejected": -0.1611328125, + "step": 65 + }, + { + "epoch": 0.019039376893119863, + "grad_norm": 13.55115315282989, + "learning_rate": 9.510086455331412e-08, + "logits/chosen": 2.921875, + "logits/rejected": 2.9375, + "logps/chosen": -1808.0, + "logps/rejected": -1696.0, + "loss": 0.7002, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08642578125, + "rewards/margins": 0.06884765625, + "rewards/rejected": -0.1552734375, + "step": 66 + }, + { + "epoch": 0.019327852300591374, + "grad_norm": 13.20889923237034, + "learning_rate": 9.654178674351584e-08, + "logits/chosen": 2.8125, + "logits/rejected": 2.859375, + "logps/chosen": -1872.0, + "logps/rejected": -1864.0, + "loss": 0.7179, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.203125, + "rewards/margins": -0.0301513671875, + "rewards/rejected": -0.1728515625, + "step": 67 + }, + { + "epoch": 0.01961632770806289, + "grad_norm": 13.858685182563192, + "learning_rate": 9.798270893371757e-08, + "logits/chosen": 2.796875, + "logits/rejected": 2.765625, + "logps/chosen": -1632.0, + "logps/rejected": -1472.0, + "loss": 0.7089, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.197265625, + "rewards/margins": -0.07275390625, + "rewards/rejected": -0.12451171875, + "step": 68 + }, + { + "epoch": 0.0199048031155344, + "grad_norm": 13.189866812536431, + "learning_rate": 9.94236311239193e-08, + "logits/chosen": 2.875, + "logits/rejected": 2.84375, + "logps/chosen": -1808.0, + "logps/rejected": -1776.0, + "loss": 0.706, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1611328125, + "rewards/margins": -0.01434326171875, + "rewards/rejected": -0.146484375, + "step": 69 + }, + { + "epoch": 0.020193278523005915, + "grad_norm": 12.815904493910594, + "learning_rate": 1.0086455331412103e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.84375, + "logps/chosen": -1816.0, + "logps/rejected": -1816.0, + "loss": 0.7035, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.125, + "rewards/margins": -0.00872802734375, + "rewards/rejected": -0.1162109375, + "step": 70 + }, + { + "epoch": 0.020481753930477426, + "grad_norm": 16.66476038366138, + "learning_rate": 1.0230547550432277e-07, + "logits/chosen": 2.703125, + "logits/rejected": 2.84375, + "logps/chosen": -1368.0, + "logps/rejected": -1088.0, + "loss": 0.7234, + "loss/demonstration_loss": -2432.0, + "loss/preference_loss": -2448.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1806640625, + "rewards/margins": -0.08056640625, + "rewards/rejected": -0.10009765625, + "step": 71 + }, + { + "epoch": 0.02077022933794894, + "grad_norm": 11.2773314220454, + "learning_rate": 1.037463976945245e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.921875, + "logps/chosen": -1440.0, + "logps/rejected": -1216.0, + "loss": 0.697, + "loss/demonstration_loss": -2640.0, + "loss/preference_loss": -2656.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12255859375, + "rewards/margins": -0.0162353515625, + "rewards/rejected": -0.1064453125, + "step": 72 + }, + { + "epoch": 0.021058704745420452, + "grad_norm": 12.97423541376043, + "learning_rate": 1.0518731988472622e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.796875, + "logps/chosen": -1528.0, + "logps/rejected": -1440.0, + "loss": 0.6967, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1611328125, + "rewards/margins": 0.0054931640625, + "rewards/rejected": -0.166015625, + "step": 73 + }, + { + "epoch": 0.021347180152891967, + "grad_norm": 13.209840943982604, + "learning_rate": 1.0662824207492795e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.78125, + "logps/chosen": -1552.0, + "logps/rejected": -1336.0, + "loss": 0.7178, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1064453125, + "rewards/margins": -0.04638671875, + "rewards/rejected": -0.06005859375, + "step": 74 + }, + { + "epoch": 0.021635655560363478, + "grad_norm": 16.866049245330643, + "learning_rate": 1.0806916426512968e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.96875, + "logps/chosen": -1936.0, + "logps/rejected": -1576.0, + "loss": 0.7155, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.173828125, + "rewards/margins": -0.0286865234375, + "rewards/rejected": -0.1455078125, + "step": 75 + }, + { + "epoch": 0.021924130967834993, + "grad_norm": 11.6298357116585, + "learning_rate": 1.0951008645533142e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.765625, + "logps/chosen": -1704.0, + "logps/rejected": -1680.0, + "loss": 0.6767, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0703125, + "rewards/margins": 0.061279296875, + "rewards/rejected": -0.1318359375, + "step": 76 + }, + { + "epoch": 0.022212606375306504, + "grad_norm": 14.091723378559442, + "learning_rate": 1.1095100864553314e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.84375, + "logps/chosen": -2128.0, + "logps/rejected": -2208.0, + "loss": 0.707, + "loss/demonstration_loss": -4320.0, + "loss/preference_loss": -4320.0, + "rewards/accuracies": 0.0625, + "rewards/chosen": -0.1875, + "rewards/margins": -0.09521484375, + "rewards/rejected": -0.0927734375, + "step": 77 + }, + { + "epoch": 0.02250108178277802, + "grad_norm": 13.86489071963396, + "learning_rate": 1.1239193083573487e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.875, + "logps/chosen": -1896.0, + "logps/rejected": -1928.0, + "loss": 0.6869, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.125, + "rewards/margins": 0.0625, + "rewards/rejected": -0.1875, + "step": 78 + }, + { + "epoch": 0.02278955719024953, + "grad_norm": 12.316191181409046, + "learning_rate": 1.1383285302593659e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.953125, + "logps/chosen": -1984.0, + "logps/rejected": -1768.0, + "loss": 0.7059, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1064453125, + "rewards/margins": -0.03564453125, + "rewards/rejected": -0.07080078125, + "step": 79 + }, + { + "epoch": 0.023078032597721045, + "grad_norm": 15.10554682285765, + "learning_rate": 1.1527377521613832e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.890625, + "logps/chosen": -1768.0, + "logps/rejected": -1808.0, + "loss": 0.74, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.20703125, + "rewards/margins": -0.056396484375, + "rewards/rejected": -0.150390625, + "step": 80 + }, + { + "epoch": 0.023366508005192556, + "grad_norm": 12.586347980858184, + "learning_rate": 1.1671469740634004e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.828125, + "logps/chosen": -1624.0, + "logps/rejected": -1712.0, + "loss": 0.6951, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1171875, + "rewards/margins": -0.034423828125, + "rewards/rejected": -0.08251953125, + "step": 81 + }, + { + "epoch": 0.02365498341266407, + "grad_norm": 14.22680500938693, + "learning_rate": 1.1815561959654178e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.796875, + "logps/chosen": -1416.0, + "logps/rejected": -1440.0, + "loss": 0.7041, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1201171875, + "rewards/margins": -0.0113525390625, + "rewards/rejected": -0.10888671875, + "step": 82 + }, + { + "epoch": 0.023943458820135582, + "grad_norm": 11.682746980381589, + "learning_rate": 1.195965417867435e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.953125, + "logps/chosen": -2112.0, + "logps/rejected": -2128.0, + "loss": 0.6851, + "loss/demonstration_loss": -4224.0, + "loss/preference_loss": -4224.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.10498046875, + "rewards/margins": -0.0050048828125, + "rewards/rejected": -0.10009765625, + "step": 83 + }, + { + "epoch": 0.024231934227607096, + "grad_norm": 13.036826056110216, + "learning_rate": 1.2103746397694524e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.78125, + "logps/chosen": -1944.0, + "logps/rejected": -1760.0, + "loss": 0.6872, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12255859375, + "rewards/margins": -0.012451171875, + "rewards/rejected": -0.1103515625, + "step": 84 + }, + { + "epoch": 0.02452040963507861, + "grad_norm": 12.935984208490911, + "learning_rate": 1.2247838616714696e-07, + "logits/chosen": 2.703125, + "logits/rejected": 2.625, + "logps/chosen": -1848.0, + "logps/rejected": -1904.0, + "loss": 0.6606, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.134765625, + "rewards/margins": 0.0673828125, + "rewards/rejected": -0.203125, + "step": 85 + }, + { + "epoch": 0.024808885042550122, + "grad_norm": 13.826537769831619, + "learning_rate": 1.2391930835734869e-07, + "logits/chosen": 2.9375, + "logits/rejected": 3.0, + "logps/chosen": -1816.0, + "logps/rejected": -1768.0, + "loss": 0.7135, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1953125, + "rewards/margins": -0.06494140625, + "rewards/rejected": -0.1298828125, + "step": 86 + }, + { + "epoch": 0.025097360450021637, + "grad_norm": 12.225318828058995, + "learning_rate": 1.2536023054755044e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.984375, + "logps/chosen": -1640.0, + "logps/rejected": -1608.0, + "loss": 0.7056, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1328125, + "rewards/margins": -0.0150146484375, + "rewards/rejected": -0.11767578125, + "step": 87 + }, + { + "epoch": 0.02538583585749315, + "grad_norm": 14.57331483472006, + "learning_rate": 1.2680115273775216e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.90625, + "logps/chosen": -2048.0, + "logps/rejected": -1872.0, + "loss": 0.7023, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.09521484375, + "rewards/margins": 0.0181884765625, + "rewards/rejected": -0.11328125, + "step": 88 + }, + { + "epoch": 0.025674311264964663, + "grad_norm": 14.106600156674954, + "learning_rate": 1.282420749279539e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.984375, + "logps/chosen": -1792.0, + "logps/rejected": -1720.0, + "loss": 0.7029, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1611328125, + "rewards/margins": -0.0712890625, + "rewards/rejected": -0.08984375, + "step": 89 + }, + { + "epoch": 0.025962786672436174, + "grad_norm": 13.01103712770991, + "learning_rate": 1.2968299711815562e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.828125, + "logps/chosen": -1240.0, + "logps/rejected": -1320.0, + "loss": 0.7124, + "loss/demonstration_loss": -2528.0, + "loss/preference_loss": -2544.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.18359375, + "rewards/margins": -0.068359375, + "rewards/rejected": -0.11572265625, + "step": 90 + }, + { + "epoch": 0.02625126207990769, + "grad_norm": 12.517008339381956, + "learning_rate": 1.3112391930835734e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.796875, + "logps/chosen": -1656.0, + "logps/rejected": -1776.0, + "loss": 0.6832, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.11962890625, + "rewards/margins": 0.0615234375, + "rewards/rejected": -0.1806640625, + "step": 91 + }, + { + "epoch": 0.0265397374873792, + "grad_norm": 13.570496963689006, + "learning_rate": 1.3256484149855907e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.90625, + "logps/chosen": -1312.0, + "logps/rejected": -1336.0, + "loss": 0.6992, + "loss/demonstration_loss": -2640.0, + "loss/preference_loss": -2640.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1279296875, + "rewards/margins": -0.0113525390625, + "rewards/rejected": -0.1162109375, + "step": 92 + }, + { + "epoch": 0.026828212894850715, + "grad_norm": 13.340900026542391, + "learning_rate": 1.340057636887608e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.875, + "logps/chosen": -1696.0, + "logps/rejected": -1648.0, + "loss": 0.7139, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1455078125, + "rewards/margins": -0.0830078125, + "rewards/rejected": -0.0625, + "step": 93 + }, + { + "epoch": 0.027116688302322226, + "grad_norm": 12.511035832484655, + "learning_rate": 1.3544668587896252e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.84375, + "logps/chosen": -2160.0, + "logps/rejected": -1840.0, + "loss": 0.6982, + "loss/demonstration_loss": -4000.0, + "loss/preference_loss": -3984.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.11279296875, + "rewards/margins": 0.012451171875, + "rewards/rejected": -0.125, + "step": 94 + }, + { + "epoch": 0.02740516370979374, + "grad_norm": 12.497757623359346, + "learning_rate": 1.3688760806916425e-07, + "logits/chosen": 2.90625, + "logits/rejected": 3.0, + "logps/chosen": -2080.0, + "logps/rejected": -1792.0, + "loss": 0.698, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.12255859375, + "rewards/margins": -0.023681640625, + "rewards/rejected": -0.0986328125, + "step": 95 + }, + { + "epoch": 0.027693639117265252, + "grad_norm": 12.601935370996427, + "learning_rate": 1.38328530259366e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.828125, + "logps/chosen": -1656.0, + "logps/rejected": -1672.0, + "loss": 0.6927, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07080078125, + "rewards/margins": 0.052490234375, + "rewards/rejected": -0.123046875, + "step": 96 + }, + { + "epoch": 0.027982114524736767, + "grad_norm": 13.988997792118045, + "learning_rate": 1.3976945244956772e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.84375, + "logps/chosen": -1760.0, + "logps/rejected": -1600.0, + "loss": 0.6881, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09619140625, + "rewards/margins": 0.0218505859375, + "rewards/rejected": -0.1181640625, + "step": 97 + }, + { + "epoch": 0.028270589932208278, + "grad_norm": 12.27595689606032, + "learning_rate": 1.4121037463976945e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.875, + "logps/chosen": -1744.0, + "logps/rejected": -1616.0, + "loss": 0.7092, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.107421875, + "rewards/margins": -0.0400390625, + "rewards/rejected": -0.0673828125, + "step": 98 + }, + { + "epoch": 0.028559065339679793, + "grad_norm": 15.315086949174743, + "learning_rate": 1.4265129682997118e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.828125, + "logps/chosen": -1928.0, + "logps/rejected": -1832.0, + "loss": 0.7043, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.166015625, + "rewards/margins": 0.0067138671875, + "rewards/rejected": -0.1728515625, + "step": 99 + }, + { + "epoch": 0.028847540747151304, + "grad_norm": 12.130639080352182, + "learning_rate": 1.440922190201729e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.875, + "logps/chosen": -1976.0, + "logps/rejected": -1728.0, + "loss": 0.69, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.052490234375, + "rewards/margins": 0.043212890625, + "rewards/rejected": -0.095703125, + "step": 100 + }, + { + "epoch": 0.02913601615462282, + "grad_norm": 13.273837294627922, + "learning_rate": 1.4553314121037463e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1800.0, + "logps/rejected": -1480.0, + "loss": 0.7045, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.15234375, + "rewards/margins": -0.036865234375, + "rewards/rejected": -0.11572265625, + "step": 101 + }, + { + "epoch": 0.02942449156209433, + "grad_norm": 13.2645073150252, + "learning_rate": 1.4697406340057635e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.796875, + "logps/chosen": -1632.0, + "logps/rejected": -1736.0, + "loss": 0.6984, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1298828125, + "rewards/margins": 0.0274658203125, + "rewards/rejected": -0.1572265625, + "step": 102 + }, + { + "epoch": 0.029712966969565845, + "grad_norm": 14.798587107954155, + "learning_rate": 1.4841498559077808e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.828125, + "logps/chosen": -1504.0, + "logps/rejected": -1632.0, + "loss": 0.7145, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.09765625, + "rewards/margins": -0.061279296875, + "rewards/rejected": -0.036376953125, + "step": 103 + }, + { + "epoch": 0.030001442377037356, + "grad_norm": 14.76828065189772, + "learning_rate": 1.498559077809798e-07, + "logits/chosen": 2.796875, + "logits/rejected": 2.765625, + "logps/chosen": -1624.0, + "logps/rejected": -1800.0, + "loss": 0.6865, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12255859375, + "rewards/margins": 0.08740234375, + "rewards/rejected": -0.2099609375, + "step": 104 + }, + { + "epoch": 0.03028991778450887, + "grad_norm": 14.609936274691126, + "learning_rate": 1.5129682997118153e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.953125, + "logps/chosen": -1616.0, + "logps/rejected": -1528.0, + "loss": 0.6688, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06884765625, + "rewards/margins": 0.0118408203125, + "rewards/rejected": -0.08056640625, + "step": 105 + }, + { + "epoch": 0.030578393191980385, + "grad_norm": 12.412594246631425, + "learning_rate": 1.5273775216138326e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.890625, + "logps/chosen": -1784.0, + "logps/rejected": -1656.0, + "loss": 0.6895, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0849609375, + "rewards/margins": 0.036376953125, + "rewards/rejected": -0.12158203125, + "step": 106 + }, + { + "epoch": 0.030866868599451897, + "grad_norm": 13.978415254779147, + "learning_rate": 1.54178674351585e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.796875, + "logps/chosen": -1728.0, + "logps/rejected": -1608.0, + "loss": 0.7108, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1474609375, + "rewards/margins": -0.03125, + "rewards/rejected": -0.1162109375, + "step": 107 + }, + { + "epoch": 0.03115534400692341, + "grad_norm": 16.041591734644538, + "learning_rate": 1.5561959654178673e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.859375, + "logps/chosen": -1744.0, + "logps/rejected": -1688.0, + "loss": 0.71, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1533203125, + "rewards/margins": -0.061767578125, + "rewards/rejected": -0.091796875, + "step": 108 + }, + { + "epoch": 0.031443819414394926, + "grad_norm": 14.176549820260293, + "learning_rate": 1.5706051873198846e-07, + "logits/chosen": 2.6875, + "logits/rejected": 2.734375, + "logps/chosen": -1472.0, + "logps/rejected": -1528.0, + "loss": 0.7073, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.125, + "rewards/margins": -0.0169677734375, + "rewards/rejected": -0.10791015625, + "step": 109 + }, + { + "epoch": 0.03173229482186644, + "grad_norm": 14.018013413620691, + "learning_rate": 1.5850144092219019e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.84375, + "logps/chosen": -1512.0, + "logps/rejected": -1488.0, + "loss": 0.6989, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.123046875, + "rewards/margins": -0.015625, + "rewards/rejected": -0.107421875, + "step": 110 + }, + { + "epoch": 0.03202077022933795, + "grad_norm": 14.354386488282962, + "learning_rate": 1.5994236311239194e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.75, + "logps/chosen": -2032.0, + "logps/rejected": -2024.0, + "loss": 0.7012, + "loss/demonstration_loss": -4048.0, + "loss/preference_loss": -4048.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.13671875, + "rewards/margins": 0.001220703125, + "rewards/rejected": -0.1376953125, + "step": 111 + }, + { + "epoch": 0.03230924563680946, + "grad_norm": 11.234816353620845, + "learning_rate": 1.6138328530259366e-07, + "logits/chosen": 2.703125, + "logits/rejected": 2.703125, + "logps/chosen": -1392.0, + "logps/rejected": -1520.0, + "loss": 0.7076, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.138671875, + "rewards/margins": -0.033935546875, + "rewards/rejected": -0.10498046875, + "step": 112 + }, + { + "epoch": 0.03259772104428098, + "grad_norm": 12.791484180303588, + "learning_rate": 1.628242074927954e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.90625, + "logps/chosen": -1640.0, + "logps/rejected": -1440.0, + "loss": 0.7328, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.15625, + "rewards/margins": -0.05859375, + "rewards/rejected": -0.09765625, + "step": 113 + }, + { + "epoch": 0.03288619645175249, + "grad_norm": 13.20804432595701, + "learning_rate": 1.6426512968299712e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.9375, + "logps/chosen": -1680.0, + "logps/rejected": -1496.0, + "loss": 0.719, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.111328125, + "rewards/margins": -0.021240234375, + "rewards/rejected": -0.08984375, + "step": 114 + }, + { + "epoch": 0.033174671859224, + "grad_norm": 12.745479685925165, + "learning_rate": 1.6570605187319884e-07, + "logits/chosen": 2.703125, + "logits/rejected": 2.71875, + "logps/chosen": -1656.0, + "logps/rejected": -1488.0, + "loss": 0.6754, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.050048828125, + "rewards/margins": 0.0439453125, + "rewards/rejected": -0.09375, + "step": 115 + }, + { + "epoch": 0.03346314726669551, + "grad_norm": 13.451664393092186, + "learning_rate": 1.6714697406340057e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.71875, + "logps/chosen": -1488.0, + "logps/rejected": -1384.0, + "loss": 0.722, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2864.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1923828125, + "rewards/margins": -0.07373046875, + "rewards/rejected": -0.11865234375, + "step": 116 + }, + { + "epoch": 0.03375162267416703, + "grad_norm": 12.456018395696045, + "learning_rate": 1.685878962536023e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.890625, + "logps/chosen": -1848.0, + "logps/rejected": -1824.0, + "loss": 0.6857, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.076171875, + "rewards/margins": 0.03369140625, + "rewards/rejected": -0.1103515625, + "step": 117 + }, + { + "epoch": 0.03404009808163854, + "grad_norm": 12.801575501642514, + "learning_rate": 1.7002881844380405e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.859375, + "logps/chosen": -2208.0, + "logps/rejected": -2080.0, + "loss": 0.6843, + "loss/demonstration_loss": -4288.0, + "loss/preference_loss": -4288.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.12158203125, + "rewards/margins": -0.001251220703125, + "rewards/rejected": -0.1201171875, + "step": 118 + }, + { + "epoch": 0.03432857348911005, + "grad_norm": 17.31704302667602, + "learning_rate": 1.7146974063400577e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.84375, + "logps/chosen": -2048.0, + "logps/rejected": -1608.0, + "loss": 0.7007, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.111328125, + "rewards/margins": -0.03125, + "rewards/rejected": -0.080078125, + "step": 119 + }, + { + "epoch": 0.03461704889658156, + "grad_norm": 14.117864982793359, + "learning_rate": 1.729106628242075e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.859375, + "logps/chosen": -1848.0, + "logps/rejected": -1552.0, + "loss": 0.7281, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.10498046875, + "rewards/margins": -0.00738525390625, + "rewards/rejected": -0.09765625, + "step": 120 + }, + { + "epoch": 0.03490552430405308, + "grad_norm": 10.992825594000015, + "learning_rate": 1.7435158501440922e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.828125, + "logps/chosen": -1448.0, + "logps/rejected": -1464.0, + "loss": 0.6809, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08154296875, + "rewards/margins": -0.0068359375, + "rewards/rejected": -0.07421875, + "step": 121 + }, + { + "epoch": 0.03519399971152459, + "grad_norm": 11.472419374590679, + "learning_rate": 1.7579250720461095e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.90625, + "logps/chosen": -1680.0, + "logps/rejected": -2080.0, + "loss": 0.667, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.039306640625, + "rewards/margins": 0.09423828125, + "rewards/rejected": -0.1337890625, + "step": 122 + }, + { + "epoch": 0.035482475118996104, + "grad_norm": 14.585823976713755, + "learning_rate": 1.7723342939481268e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.75, + "logps/chosen": -1848.0, + "logps/rejected": -1840.0, + "loss": 0.7073, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.150390625, + "rewards/margins": -0.030029296875, + "rewards/rejected": -0.1201171875, + "step": 123 + }, + { + "epoch": 0.035770950526467615, + "grad_norm": 13.530686368045174, + "learning_rate": 1.786743515850144e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.96875, + "logps/chosen": -1512.0, + "logps/rejected": -1536.0, + "loss": 0.658, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06884765625, + "rewards/margins": 0.10009765625, + "rewards/rejected": -0.1689453125, + "step": 124 + }, + { + "epoch": 0.036059425933939133, + "grad_norm": 14.11384466587561, + "learning_rate": 1.8011527377521613e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.90625, + "logps/chosen": -1752.0, + "logps/rejected": -1784.0, + "loss": 0.7164, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1318359375, + "rewards/margins": -0.04443359375, + "rewards/rejected": -0.08740234375, + "step": 125 + }, + { + "epoch": 0.036347901341410645, + "grad_norm": 12.345335830360996, + "learning_rate": 1.8155619596541785e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.90625, + "logps/chosen": -1968.0, + "logps/rejected": -1872.0, + "loss": 0.6993, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1201171875, + "rewards/margins": -0.052001953125, + "rewards/rejected": -0.068359375, + "step": 126 + }, + { + "epoch": 0.036636376748882156, + "grad_norm": 15.304159764818719, + "learning_rate": 1.8299711815561958e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.75, + "logps/chosen": -1312.0, + "logps/rejected": -1216.0, + "loss": 0.7166, + "loss/demonstration_loss": -2512.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.10205078125, + "rewards/margins": -0.0247802734375, + "rewards/rejected": -0.07763671875, + "step": 127 + }, + { + "epoch": 0.036924852156353674, + "grad_norm": 11.856000837487837, + "learning_rate": 1.844380403458213e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.875, + "logps/chosen": -1264.0, + "logps/rejected": -1152.0, + "loss": 0.6893, + "loss/demonstration_loss": -2400.0, + "loss/preference_loss": -2416.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08056640625, + "rewards/margins": -0.0306396484375, + "rewards/rejected": -0.050048828125, + "step": 128 + }, + { + "epoch": 0.037213327563825185, + "grad_norm": 13.671704598170162, + "learning_rate": 1.8587896253602306e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.828125, + "logps/chosen": -2144.0, + "logps/rejected": -1928.0, + "loss": 0.6885, + "loss/demonstration_loss": -4048.0, + "loss/preference_loss": -4048.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1162109375, + "rewards/margins": 0.0498046875, + "rewards/rejected": -0.166015625, + "step": 129 + }, + { + "epoch": 0.0375018029712967, + "grad_norm": 14.264431962101927, + "learning_rate": 1.8731988472622478e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.8125, + "logps/chosen": -1792.0, + "logps/rejected": -2040.0, + "loss": 0.6912, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0712890625, + "rewards/margins": -0.0037841796875, + "rewards/rejected": -0.0673828125, + "step": 130 + }, + { + "epoch": 0.03779027837876821, + "grad_norm": 12.183805910868063, + "learning_rate": 1.887608069164265e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.953125, + "logps/chosen": -1632.0, + "logps/rejected": -1560.0, + "loss": 0.6959, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.11865234375, + "rewards/margins": -0.0625, + "rewards/rejected": -0.056396484375, + "step": 131 + }, + { + "epoch": 0.038078753786239726, + "grad_norm": 12.949687293287298, + "learning_rate": 1.9020172910662823e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.9375, + "logps/chosen": -2000.0, + "logps/rejected": -1808.0, + "loss": 0.71, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.11865234375, + "rewards/margins": -0.04443359375, + "rewards/rejected": -0.07421875, + "step": 132 + }, + { + "epoch": 0.03836722919371124, + "grad_norm": 10.92216606166572, + "learning_rate": 1.9164265129682996e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.015625, + "logps/chosen": -1384.0, + "logps/rejected": -1424.0, + "loss": 0.6804, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.058837890625, + "rewards/margins": 0.0283203125, + "rewards/rejected": -0.0869140625, + "step": 133 + }, + { + "epoch": 0.03865570460118275, + "grad_norm": 14.268022353633459, + "learning_rate": 1.9308357348703169e-07, + "logits/chosen": 2.734375, + "logits/rejected": 2.8125, + "logps/chosen": -1576.0, + "logps/rejected": -1424.0, + "loss": 0.6696, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02880859375, + "rewards/margins": 0.07373046875, + "rewards/rejected": -0.1025390625, + "step": 134 + }, + { + "epoch": 0.03894418000865426, + "grad_norm": 12.639083223763501, + "learning_rate": 1.945244956772334e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.84375, + "logps/chosen": -1920.0, + "logps/rejected": -1712.0, + "loss": 0.7136, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1357421875, + "rewards/margins": -0.05078125, + "rewards/rejected": -0.0849609375, + "step": 135 + }, + { + "epoch": 0.03923265541612578, + "grad_norm": 11.257423131412404, + "learning_rate": 1.9596541786743514e-07, + "logits/chosen": 2.796875, + "logits/rejected": 2.859375, + "logps/chosen": -2080.0, + "logps/rejected": -1984.0, + "loss": 0.6952, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4048.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1640625, + "rewards/margins": -0.021240234375, + "rewards/rejected": -0.142578125, + "step": 136 + }, + { + "epoch": 0.03952113082359729, + "grad_norm": 14.19130633312644, + "learning_rate": 1.9740634005763686e-07, + "logits/chosen": 2.6875, + "logits/rejected": 2.796875, + "logps/chosen": -1640.0, + "logps/rejected": -1408.0, + "loss": 0.7228, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.13671875, + "rewards/margins": -0.10693359375, + "rewards/rejected": -0.0303955078125, + "step": 137 + }, + { + "epoch": 0.0398096062310688, + "grad_norm": 12.951420099355532, + "learning_rate": 1.988472622478386e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.796875, + "logps/chosen": -1824.0, + "logps/rejected": -1648.0, + "loss": 0.7073, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.1259765625, + "rewards/margins": -0.03369140625, + "rewards/rejected": -0.0927734375, + "step": 138 + }, + { + "epoch": 0.04009808163854031, + "grad_norm": 12.327022694242821, + "learning_rate": 2.0028818443804031e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.046875, + "logps/chosen": -1776.0, + "logps/rejected": -1712.0, + "loss": 0.7034, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08056640625, + "rewards/margins": 0.0025177001953125, + "rewards/rejected": -0.0830078125, + "step": 139 + }, + { + "epoch": 0.04038655704601183, + "grad_norm": 14.121671421167523, + "learning_rate": 2.0172910662824207e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.953125, + "logps/chosen": -1600.0, + "logps/rejected": -1616.0, + "loss": 0.7064, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13671875, + "rewards/margins": 0.02197265625, + "rewards/rejected": -0.1591796875, + "step": 140 + }, + { + "epoch": 0.04067503245348334, + "grad_norm": 18.92552737036175, + "learning_rate": 2.031700288184438e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.65625, + "logps/chosen": -1616.0, + "logps/rejected": -1304.0, + "loss": 0.7136, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1474609375, + "rewards/margins": -0.1015625, + "rewards/rejected": -0.04638671875, + "step": 141 + }, + { + "epoch": 0.04096350786095485, + "grad_norm": 13.699282288775105, + "learning_rate": 2.0461095100864555e-07, + "logits/chosen": 2.921875, + "logits/rejected": 3.0, + "logps/chosen": -1936.0, + "logps/rejected": -1912.0, + "loss": 0.6901, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.09521484375, + "rewards/margins": -0.00445556640625, + "rewards/rejected": -0.0908203125, + "step": 142 + }, + { + "epoch": 0.041251983268426363, + "grad_norm": 14.837494800798574, + "learning_rate": 2.0605187319884727e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.875, + "logps/chosen": -1936.0, + "logps/rejected": -1928.0, + "loss": 0.6838, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.146484375, + "rewards/margins": 0.015869140625, + "rewards/rejected": -0.162109375, + "step": 143 + }, + { + "epoch": 0.04154045867589788, + "grad_norm": 13.555400417587519, + "learning_rate": 2.07492795389049e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.9375, + "logps/chosen": -1312.0, + "logps/rejected": -1080.0, + "loss": 0.6891, + "loss/demonstration_loss": -2384.0, + "loss/preference_loss": -2384.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.123046875, + "rewards/margins": -0.0458984375, + "rewards/rejected": -0.0771484375, + "step": 144 + }, + { + "epoch": 0.04182893408336939, + "grad_norm": 14.267474595802195, + "learning_rate": 2.0893371757925072e-07, + "logits/chosen": 2.546875, + "logits/rejected": 2.578125, + "logps/chosen": -1408.0, + "logps/rejected": -1592.0, + "loss": 0.717, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1748046875, + "rewards/margins": -0.032958984375, + "rewards/rejected": -0.1416015625, + "step": 145 + }, + { + "epoch": 0.042117409490840904, + "grad_norm": 15.505696769909273, + "learning_rate": 2.1037463976945245e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1760.0, + "logps/rejected": -1728.0, + "loss": 0.7064, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.09375, + "rewards/margins": -0.03369140625, + "rewards/rejected": -0.06005859375, + "step": 146 + }, + { + "epoch": 0.04240588489831242, + "grad_norm": 11.35862329173575, + "learning_rate": 2.1181556195965417e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.765625, + "logps/chosen": -1680.0, + "logps/rejected": -1672.0, + "loss": 0.6884, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08203125, + "rewards/margins": -0.018798828125, + "rewards/rejected": -0.06298828125, + "step": 147 + }, + { + "epoch": 0.042694360305783934, + "grad_norm": 13.796716005612092, + "learning_rate": 2.132564841498559e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.84375, + "logps/chosen": -1704.0, + "logps/rejected": -1608.0, + "loss": 0.7004, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08056640625, + "rewards/margins": 0.0087890625, + "rewards/rejected": -0.08935546875, + "step": 148 + }, + { + "epoch": 0.042982835713255445, + "grad_norm": 12.974192271382416, + "learning_rate": 2.1469740634005763e-07, + "logits/chosen": 2.6875, + "logits/rejected": 2.703125, + "logps/chosen": -1840.0, + "logps/rejected": -1976.0, + "loss": 0.71, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.095703125, + "rewards/margins": 0.005645751953125, + "rewards/rejected": -0.1015625, + "step": 149 + }, + { + "epoch": 0.043271311120726956, + "grad_norm": 12.81111710173094, + "learning_rate": 2.1613832853025935e-07, + "logits/chosen": 2.796875, + "logits/rejected": 2.703125, + "logps/chosen": -1800.0, + "logps/rejected": -1976.0, + "loss": 0.7019, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.125, + "rewards/margins": -0.0093994140625, + "rewards/rejected": -0.11572265625, + "step": 150 + }, + { + "epoch": 0.043559786528198474, + "grad_norm": 13.444000972153871, + "learning_rate": 2.1757925072046108e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.796875, + "logps/chosen": -1464.0, + "logps/rejected": -1568.0, + "loss": 0.6994, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.146484375, + "rewards/margins": -0.0159912109375, + "rewards/rejected": -0.130859375, + "step": 151 + }, + { + "epoch": 0.043848261935669985, + "grad_norm": 12.898639566580528, + "learning_rate": 2.1902017291066283e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.78125, + "logps/chosen": -1520.0, + "logps/rejected": -1432.0, + "loss": 0.6987, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.162109375, + "rewards/margins": 0.0272216796875, + "rewards/rejected": -0.189453125, + "step": 152 + }, + { + "epoch": 0.0441367373431415, + "grad_norm": 11.630726904366368, + "learning_rate": 2.2046109510086456e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.84375, + "logps/chosen": -1720.0, + "logps/rejected": -1800.0, + "loss": 0.6931, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09326171875, + "rewards/margins": 0.0419921875, + "rewards/rejected": -0.134765625, + "step": 153 + }, + { + "epoch": 0.04442521275061301, + "grad_norm": 11.169770201898757, + "learning_rate": 2.2190201729106628e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.78125, + "logps/chosen": -1704.0, + "logps/rejected": -1856.0, + "loss": 0.6864, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.125, + "rewards/margins": -0.017578125, + "rewards/rejected": -0.107421875, + "step": 154 + }, + { + "epoch": 0.044713688158084526, + "grad_norm": 13.455924614723772, + "learning_rate": 2.23342939481268e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.8125, + "logps/chosen": -1568.0, + "logps/rejected": -1440.0, + "loss": 0.7084, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12158203125, + "rewards/margins": -0.032470703125, + "rewards/rejected": -0.0888671875, + "step": 155 + }, + { + "epoch": 0.04500216356555604, + "grad_norm": 12.553915789493537, + "learning_rate": 2.2478386167146973e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.8125, + "logps/chosen": -2024.0, + "logps/rejected": -1904.0, + "loss": 0.7038, + "loss/demonstration_loss": -3920.0, + "loss/preference_loss": -3920.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1630859375, + "rewards/margins": -0.09375, + "rewards/rejected": -0.06884765625, + "step": 156 + }, + { + "epoch": 0.04529063897302755, + "grad_norm": 13.478313307695142, + "learning_rate": 2.2622478386167146e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.8125, + "logps/chosen": -2040.0, + "logps/rejected": -1952.0, + "loss": 0.6841, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3984.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1025390625, + "rewards/margins": 0.03759765625, + "rewards/rejected": -0.140625, + "step": 157 + }, + { + "epoch": 0.04557911438049906, + "grad_norm": 16.691461931401555, + "learning_rate": 2.2766570605187319e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.859375, + "logps/chosen": -1960.0, + "logps/rejected": -1752.0, + "loss": 0.7239, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.083984375, + "rewards/margins": -0.01611328125, + "rewards/rejected": -0.0673828125, + "step": 158 + }, + { + "epoch": 0.04586758978797058, + "grad_norm": 11.385581956074466, + "learning_rate": 2.291066282420749e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.921875, + "logps/chosen": -1792.0, + "logps/rejected": -1704.0, + "loss": 0.6858, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0576171875, + "rewards/margins": 0.027099609375, + "rewards/rejected": -0.08447265625, + "step": 159 + }, + { + "epoch": 0.04615606519544209, + "grad_norm": 15.23127200272958, + "learning_rate": 2.3054755043227664e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.953125, + "logps/chosen": -2144.0, + "logps/rejected": -1640.0, + "loss": 0.7311, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1357421875, + "rewards/margins": -0.0791015625, + "rewards/rejected": -0.056640625, + "step": 160 + }, + { + "epoch": 0.0464445406029136, + "grad_norm": 11.63659336058798, + "learning_rate": 2.3198847262247836e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.921875, + "logps/chosen": -1504.0, + "logps/rejected": -1392.0, + "loss": 0.7076, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0888671875, + "rewards/margins": -0.0262451171875, + "rewards/rejected": -0.0625, + "step": 161 + }, + { + "epoch": 0.04673301601038511, + "grad_norm": 11.850391571770666, + "learning_rate": 2.334293948126801e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.71875, + "logps/chosen": -1552.0, + "logps/rejected": -1680.0, + "loss": 0.6801, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07177734375, + "rewards/margins": 0.05322265625, + "rewards/rejected": -0.125, + "step": 162 + }, + { + "epoch": 0.04702149141785663, + "grad_norm": 22.090105508211398, + "learning_rate": 2.3487031700288184e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.890625, + "logps/chosen": -1920.0, + "logps/rejected": -1832.0, + "loss": 0.7281, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1376953125, + "rewards/margins": 0.0019989013671875, + "rewards/rejected": -0.1396484375, + "step": 163 + }, + { + "epoch": 0.04730996682532814, + "grad_norm": 13.361887958756412, + "learning_rate": 2.3631123919308357e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.859375, + "logps/chosen": -1456.0, + "logps/rejected": -1464.0, + "loss": 0.6953, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.115234375, + "rewards/margins": -0.0137939453125, + "rewards/rejected": -0.1015625, + "step": 164 + }, + { + "epoch": 0.04759844223279965, + "grad_norm": 12.742986548037846, + "learning_rate": 2.377521613832853e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.953125, + "logps/chosen": -1824.0, + "logps/rejected": -1928.0, + "loss": 0.7018, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1162109375, + "rewards/margins": -0.03564453125, + "rewards/rejected": -0.08056640625, + "step": 165 + }, + { + "epoch": 0.047886917640271164, + "grad_norm": 14.605682008275203, + "learning_rate": 2.39193083573487e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.859375, + "logps/chosen": -1848.0, + "logps/rejected": -1568.0, + "loss": 0.7106, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12255859375, + "rewards/margins": -0.0673828125, + "rewards/rejected": -0.05517578125, + "step": 166 + }, + { + "epoch": 0.04817539304774268, + "grad_norm": 12.533637459912104, + "learning_rate": 2.4063400576368874e-07, + "logits/chosen": 2.65625, + "logits/rejected": 2.59375, + "logps/chosen": -2008.0, + "logps/rejected": -2048.0, + "loss": 0.7023, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4048.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1064453125, + "rewards/margins": -0.023193359375, + "rewards/rejected": -0.0830078125, + "step": 167 + }, + { + "epoch": 0.04846386845521419, + "grad_norm": 13.271104990641957, + "learning_rate": 2.4207492795389047e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.84375, + "logps/chosen": -1360.0, + "logps/rejected": -1552.0, + "loss": 0.6725, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08203125, + "rewards/margins": 0.0281982421875, + "rewards/rejected": -0.10986328125, + "step": 168 + }, + { + "epoch": 0.048752343862685704, + "grad_norm": 15.575067536250037, + "learning_rate": 2.435158501440922e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.84375, + "logps/chosen": -1760.0, + "logps/rejected": -2000.0, + "loss": 0.7154, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.1376953125, + "rewards/margins": -0.1123046875, + "rewards/rejected": -0.02490234375, + "step": 169 + }, + { + "epoch": 0.04904081927015722, + "grad_norm": 12.850108851026656, + "learning_rate": 2.449567723342939e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.796875, + "logps/chosen": -1616.0, + "logps/rejected": -1648.0, + "loss": 0.6802, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12890625, + "rewards/margins": 0.021240234375, + "rewards/rejected": -0.150390625, + "step": 170 + }, + { + "epoch": 0.049329294677628734, + "grad_norm": 12.890074709688625, + "learning_rate": 2.4639769452449565e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.8125, + "logps/chosen": -1632.0, + "logps/rejected": -1664.0, + "loss": 0.6982, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.056884765625, + "rewards/margins": 0.0294189453125, + "rewards/rejected": -0.08642578125, + "step": 171 + }, + { + "epoch": 0.049617770085100245, + "grad_norm": 13.723740966557617, + "learning_rate": 2.4783861671469737e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.859375, + "logps/chosen": -1936.0, + "logps/rejected": -1704.0, + "loss": 0.6964, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.1064453125, + "rewards/margins": -0.056396484375, + "rewards/rejected": -0.050048828125, + "step": 172 + }, + { + "epoch": 0.049906245492571756, + "grad_norm": 13.6595813240613, + "learning_rate": 2.492795389048991e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.890625, + "logps/chosen": -1768.0, + "logps/rejected": -1696.0, + "loss": 0.7081, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.173828125, + "rewards/margins": -0.0380859375, + "rewards/rejected": -0.1357421875, + "step": 173 + }, + { + "epoch": 0.050194720900043274, + "grad_norm": 15.6352438826303, + "learning_rate": 2.507204610951009e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.9375, + "logps/chosen": -2048.0, + "logps/rejected": -1792.0, + "loss": 0.7213, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1552734375, + "rewards/margins": -0.10302734375, + "rewards/rejected": -0.052001953125, + "step": 174 + }, + { + "epoch": 0.050483196307514785, + "grad_norm": 15.61004338321909, + "learning_rate": 2.5216138328530255e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.71875, + "logps/chosen": -2040.0, + "logps/rejected": -2000.0, + "loss": 0.7123, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4032.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1201171875, + "rewards/margins": -0.01385498046875, + "rewards/rejected": -0.1064453125, + "step": 175 + }, + { + "epoch": 0.0507716717149863, + "grad_norm": 11.551037242327421, + "learning_rate": 2.5360230547550433e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.875, + "logps/chosen": -1024.0, + "logps/rejected": -1200.0, + "loss": 0.6741, + "loss/demonstration_loss": -2224.0, + "loss/preference_loss": -2224.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0018463134765625, + "rewards/margins": 0.06201171875, + "rewards/rejected": -0.06396484375, + "step": 176 + }, + { + "epoch": 0.05106014712245781, + "grad_norm": 12.946875161977205, + "learning_rate": 2.55043227665706e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.953125, + "logps/chosen": -1592.0, + "logps/rejected": -1520.0, + "loss": 0.6958, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.054931640625, + "rewards/margins": -0.0037689208984375, + "rewards/rejected": -0.05126953125, + "step": 177 + }, + { + "epoch": 0.051348622529929326, + "grad_norm": 16.215660861493703, + "learning_rate": 2.564841498559078e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.734375, + "logps/chosen": -1920.0, + "logps/rejected": -2024.0, + "loss": 0.6995, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.064453125, + "rewards/margins": 0.07958984375, + "rewards/rejected": -0.1435546875, + "step": 178 + }, + { + "epoch": 0.05163709793740084, + "grad_norm": 14.963728613493695, + "learning_rate": 2.5792507204610945e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.921875, + "logps/chosen": -1632.0, + "logps/rejected": -1624.0, + "loss": 0.7036, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.0625, + "rewards/chosen": -0.1435546875, + "rewards/margins": -0.09521484375, + "rewards/rejected": -0.048828125, + "step": 179 + }, + { + "epoch": 0.05192557334487235, + "grad_norm": 16.03304682815614, + "learning_rate": 2.5936599423631123e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.90625, + "logps/chosen": -1720.0, + "logps/rejected": -1696.0, + "loss": 0.704, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08642578125, + "rewards/margins": -0.0006866455078125, + "rewards/rejected": -0.0859375, + "step": 180 + }, + { + "epoch": 0.05221404875234386, + "grad_norm": 12.80502164549832, + "learning_rate": 2.6080691642651296e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.9375, + "logps/chosen": -1904.0, + "logps/rejected": -1680.0, + "loss": 0.6893, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.041259765625, + "rewards/margins": 0.002197265625, + "rewards/rejected": -0.04345703125, + "step": 181 + }, + { + "epoch": 0.05250252415981538, + "grad_norm": 14.76302232613939, + "learning_rate": 2.622478386167147e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.75, + "logps/chosen": -1672.0, + "logps/rejected": -1576.0, + "loss": 0.6946, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0712890625, + "rewards/margins": 0.004913330078125, + "rewards/rejected": -0.076171875, + "step": 182 + }, + { + "epoch": 0.05279099956728689, + "grad_norm": 17.701462545711838, + "learning_rate": 2.636887608069164e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.90625, + "logps/chosen": -1856.0, + "logps/rejected": -1760.0, + "loss": 0.7339, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.189453125, + "rewards/margins": -0.11279296875, + "rewards/rejected": -0.0771484375, + "step": 183 + }, + { + "epoch": 0.0530794749747584, + "grad_norm": 14.62512152000554, + "learning_rate": 2.6512968299711814e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.859375, + "logps/chosen": -1904.0, + "logps/rejected": -1560.0, + "loss": 0.7172, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.09619140625, + "rewards/margins": -0.06689453125, + "rewards/rejected": -0.0294189453125, + "step": 184 + }, + { + "epoch": 0.05336795038222991, + "grad_norm": 14.812223557858013, + "learning_rate": 2.6657060518731986e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.828125, + "logps/chosen": -1936.0, + "logps/rejected": -1528.0, + "loss": 0.7019, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03125, + "rewards/margins": 0.00439453125, + "rewards/rejected": -0.03564453125, + "step": 185 + }, + { + "epoch": 0.05365642578970143, + "grad_norm": 14.926754068736647, + "learning_rate": 2.680115273775216e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.9375, + "logps/chosen": -2008.0, + "logps/rejected": -1864.0, + "loss": 0.7031, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.11865234375, + "rewards/margins": -0.0150146484375, + "rewards/rejected": -0.10400390625, + "step": 186 + }, + { + "epoch": 0.05394490119717294, + "grad_norm": 12.996290251553956, + "learning_rate": 2.694524495677233e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.84375, + "logps/chosen": -1960.0, + "logps/rejected": -1848.0, + "loss": 0.6931, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03759765625, + "rewards/margins": 0.010009765625, + "rewards/rejected": -0.047607421875, + "step": 187 + }, + { + "epoch": 0.05423337660464445, + "grad_norm": 15.284209995890105, + "learning_rate": 2.7089337175792504e-07, + "logits/chosen": 2.671875, + "logits/rejected": 2.75, + "logps/chosen": -1840.0, + "logps/rejected": -1576.0, + "loss": 0.7025, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.07275390625, + "rewards/margins": -0.0150146484375, + "rewards/rejected": -0.0576171875, + "step": 188 + }, + { + "epoch": 0.05452185201211597, + "grad_norm": 12.316054202172852, + "learning_rate": 2.7233429394812677e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.828125, + "logps/chosen": -1416.0, + "logps/rejected": -1472.0, + "loss": 0.7148, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.091796875, + "rewards/margins": -0.06884765625, + "rewards/rejected": -0.023193359375, + "step": 189 + }, + { + "epoch": 0.05481032741958748, + "grad_norm": 13.905673621606834, + "learning_rate": 2.737752161383285e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.828125, + "logps/chosen": -1904.0, + "logps/rejected": -1784.0, + "loss": 0.7106, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.076171875, + "rewards/margins": -0.017578125, + "rewards/rejected": -0.058837890625, + "step": 190 + }, + { + "epoch": 0.05509880282705899, + "grad_norm": 14.504704138091942, + "learning_rate": 2.7521613832853027e-07, + "logits/chosen": 2.734375, + "logits/rejected": 2.765625, + "logps/chosen": -2024.0, + "logps/rejected": -2008.0, + "loss": 0.6959, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4032.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08251953125, + "rewards/margins": 0.0400390625, + "rewards/rejected": -0.12255859375, + "step": 191 + }, + { + "epoch": 0.055387278234530504, + "grad_norm": 14.96693664872217, + "learning_rate": 2.76657060518732e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.859375, + "logps/chosen": -2192.0, + "logps/rejected": -2008.0, + "loss": 0.7009, + "loss/demonstration_loss": -4192.0, + "loss/preference_loss": -4192.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0849609375, + "rewards/margins": -0.0250244140625, + "rewards/rejected": -0.06005859375, + "step": 192 + }, + { + "epoch": 0.05567575364200202, + "grad_norm": 14.123079865036004, + "learning_rate": 2.780979827089337e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.984375, + "logps/chosen": -2224.0, + "logps/rejected": -1808.0, + "loss": 0.6887, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4032.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030029296875, + "rewards/margins": 0.023193359375, + "rewards/rejected": -0.05322265625, + "step": 193 + }, + { + "epoch": 0.055964229049473534, + "grad_norm": 13.516746625425505, + "learning_rate": 2.7953890489913545e-07, + "logits/chosen": 2.796875, + "logits/rejected": 2.796875, + "logps/chosen": -1976.0, + "logps/rejected": -1712.0, + "loss": 0.6777, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04443359375, + "rewards/margins": 0.00689697265625, + "rewards/rejected": -0.05126953125, + "step": 194 + }, + { + "epoch": 0.056252704456945045, + "grad_norm": 13.444320240932006, + "learning_rate": 2.809798270893372e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.859375, + "logps/chosen": -1896.0, + "logps/rejected": -1872.0, + "loss": 0.6724, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.00750732421875, + "rewards/margins": 0.06201171875, + "rewards/rejected": -0.054443359375, + "step": 195 + }, + { + "epoch": 0.056541179864416556, + "grad_norm": 12.252323492786209, + "learning_rate": 2.824207492795389e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.984375, + "logps/chosen": -1824.0, + "logps/rejected": -1752.0, + "loss": 0.6705, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03369140625, + "rewards/margins": 0.038818359375, + "rewards/rejected": -0.07275390625, + "step": 196 + }, + { + "epoch": 0.056829655271888074, + "grad_norm": 12.876626846589938, + "learning_rate": 2.838616714697406e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.8125, + "logps/chosen": -1952.0, + "logps/rejected": -1864.0, + "loss": 0.6958, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0576171875, + "rewards/margins": -0.00689697265625, + "rewards/rejected": -0.05078125, + "step": 197 + }, + { + "epoch": 0.057118130679359586, + "grad_norm": 15.372487534461387, + "learning_rate": 2.8530259365994235e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.890625, + "logps/chosen": -2128.0, + "logps/rejected": -2032.0, + "loss": 0.6931, + "loss/demonstration_loss": -4160.0, + "loss/preference_loss": -4160.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0211181640625, + "rewards/margins": 0.12353515625, + "rewards/rejected": -0.1025390625, + "step": 198 + }, + { + "epoch": 0.0574066060868311, + "grad_norm": 13.89838825564844, + "learning_rate": 2.867435158501441e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.03125, + "logps/chosen": -1600.0, + "logps/rejected": -1432.0, + "loss": 0.694, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.061279296875, + "rewards/margins": -0.02001953125, + "rewards/rejected": -0.041259765625, + "step": 199 + }, + { + "epoch": 0.05769508149430261, + "grad_norm": 12.2390300861769, + "learning_rate": 2.881844380403458e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.984375, + "logps/chosen": -1824.0, + "logps/rejected": -1768.0, + "loss": 0.7135, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.04931640625, + "rewards/margins": -0.046875, + "rewards/rejected": -0.00250244140625, + "step": 200 + }, + { + "epoch": 0.057983556901774126, + "grad_norm": 14.418346101592789, + "learning_rate": 2.8962536023054753e-07, + "logits/chosen": 2.953125, + "logits/rejected": 3.015625, + "logps/chosen": -1800.0, + "logps/rejected": -1712.0, + "loss": 0.6849, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.030029296875, + "rewards/margins": 0.027587890625, + "rewards/rejected": -0.0576171875, + "step": 201 + }, + { + "epoch": 0.05827203230924564, + "grad_norm": 13.16540704902075, + "learning_rate": 2.9106628242074925e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.78125, + "logps/chosen": -1688.0, + "logps/rejected": -1840.0, + "loss": 0.6862, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.023193359375, + "rewards/margins": 0.0556640625, + "rewards/rejected": -0.07861328125, + "step": 202 + }, + { + "epoch": 0.05856050771671715, + "grad_norm": 11.983946095637638, + "learning_rate": 2.9250720461095103e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.046875, + "logps/chosen": -1888.0, + "logps/rejected": -1776.0, + "loss": 0.6957, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03125, + "rewards/margins": -0.010009765625, + "rewards/rejected": -0.021240234375, + "step": 203 + }, + { + "epoch": 0.05884898312418866, + "grad_norm": 11.227781823086811, + "learning_rate": 2.939481268011527e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.84375, + "logps/chosen": -1648.0, + "logps/rejected": -1640.0, + "loss": 0.7103, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0162353515625, + "rewards/margins": -0.01251220703125, + "rewards/rejected": 0.02880859375, + "step": 204 + }, + { + "epoch": 0.05913745853166018, + "grad_norm": 14.438945820821774, + "learning_rate": 2.953890489913545e-07, + "logits/chosen": 2.6875, + "logits/rejected": 2.671875, + "logps/chosen": -1472.0, + "logps/rejected": -1632.0, + "loss": 0.6932, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.023193359375, + "rewards/margins": 0.027587890625, + "rewards/rejected": -0.05078125, + "step": 205 + }, + { + "epoch": 0.05942593393913169, + "grad_norm": 11.6561126649981, + "learning_rate": 2.9682997118155616e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.0, + "logps/chosen": -1640.0, + "logps/rejected": -1680.0, + "loss": 0.7043, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0087890625, + "rewards/margins": 0.01123046875, + "rewards/rejected": -0.02001953125, + "step": 206 + }, + { + "epoch": 0.0597144093466032, + "grad_norm": 13.802370085354822, + "learning_rate": 2.9827089337175794e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.890625, + "logps/chosen": -1896.0, + "logps/rejected": -2000.0, + "loss": 0.6804, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0224609375, + "rewards/margins": 0.04248046875, + "rewards/rejected": -0.02001953125, + "step": 207 + }, + { + "epoch": 0.06000288475407471, + "grad_norm": 13.72589295749893, + "learning_rate": 2.997118155619596e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.984375, + "logps/chosen": -1872.0, + "logps/rejected": -1752.0, + "loss": 0.7146, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0250244140625, + "rewards/margins": -0.050048828125, + "rewards/rejected": 0.0250244140625, + "step": 208 + }, + { + "epoch": 0.06029136016154623, + "grad_norm": 12.338817262211686, + "learning_rate": 3.011527377521614e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.828125, + "logps/chosen": -2208.0, + "logps/rejected": -2256.0, + "loss": 0.6925, + "loss/demonstration_loss": -4480.0, + "loss/preference_loss": -4480.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.003753662109375, + "rewards/margins": -0.0224609375, + "rewards/rejected": 0.0262451171875, + "step": 209 + }, + { + "epoch": 0.06057983556901774, + "grad_norm": 11.614647514316792, + "learning_rate": 3.0259365994236306e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.96875, + "logps/chosen": -1656.0, + "logps/rejected": -1640.0, + "loss": 0.6911, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.01251220703125, + "rewards/margins": -0.0137939453125, + "rewards/rejected": 0.001251220703125, + "step": 210 + }, + { + "epoch": 0.06086831097648925, + "grad_norm": 11.407490945392553, + "learning_rate": 3.0403458213256484e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.90625, + "logps/chosen": -1512.0, + "logps/rejected": -1600.0, + "loss": 0.6812, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0238037109375, + "rewards/margins": 0.0174560546875, + "rewards/rejected": 0.006256103515625, + "step": 211 + }, + { + "epoch": 0.06115678638396077, + "grad_norm": 12.904669304061635, + "learning_rate": 3.054755043227665e-07, + "logits/chosen": 3.0625, + "logits/rejected": 2.984375, + "logps/chosen": -1704.0, + "logps/rejected": -1736.0, + "loss": 0.701, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05517578125, + "rewards/margins": 0.017578125, + "rewards/rejected": 0.03759765625, + "step": 212 + }, + { + "epoch": 0.06144526179143228, + "grad_norm": 11.831078067632724, + "learning_rate": 3.069164265129683e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.828125, + "logps/chosen": -2096.0, + "logps/rejected": -2112.0, + "loss": 0.688, + "loss/demonstration_loss": -4224.0, + "loss/preference_loss": -4224.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.044921875, + "rewards/margins": 0.01123046875, + "rewards/rejected": 0.03369140625, + "step": 213 + }, + { + "epoch": 0.06173373719890379, + "grad_norm": 12.606126784777697, + "learning_rate": 3.0835734870317e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.0, + "logps/chosen": -1488.0, + "logps/rejected": -1352.0, + "loss": 0.7183, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0224609375, + "rewards/margins": 0.016845703125, + "rewards/rejected": 0.005615234375, + "step": 214 + }, + { + "epoch": 0.062022212606375304, + "grad_norm": 10.772112954532906, + "learning_rate": 3.0979827089337174e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.0, + "logps/chosen": -1520.0, + "logps/rejected": -1600.0, + "loss": 0.7125, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.03564453125, + "rewards/margins": -0.0830078125, + "rewards/rejected": 0.047607421875, + "step": 215 + }, + { + "epoch": 0.06231068801384682, + "grad_norm": 11.465071264758075, + "learning_rate": 3.1123919308357347e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.90625, + "logps/chosen": -1776.0, + "logps/rejected": -1872.0, + "loss": 0.6883, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05078125, + "rewards/margins": 0.0050048828125, + "rewards/rejected": 0.045654296875, + "step": 216 + }, + { + "epoch": 0.06259916342131833, + "grad_norm": 14.407193027959861, + "learning_rate": 3.126801152737752e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.953125, + "logps/chosen": -1824.0, + "logps/rejected": -1768.0, + "loss": 0.6899, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04248046875, + "rewards/margins": -0.0087890625, + "rewards/rejected": -0.03369140625, + "step": 217 + }, + { + "epoch": 0.06288763882878985, + "grad_norm": 13.18961999723925, + "learning_rate": 3.141210374639769e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.90625, + "logps/chosen": -1840.0, + "logps/rejected": -1384.0, + "loss": 0.6797, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0576171875, + "rewards/margins": 0.080078125, + "rewards/rejected": -0.0224609375, + "step": 218 + }, + { + "epoch": 0.06317611423626136, + "grad_norm": 13.126494834514967, + "learning_rate": 3.1556195965417865e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.984375, + "logps/chosen": -1664.0, + "logps/rejected": -1376.0, + "loss": 0.7163, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0093994140625, + "rewards/margins": 0.0037384033203125, + "rewards/rejected": 0.005615234375, + "step": 219 + }, + { + "epoch": 0.06346458964373287, + "grad_norm": 14.081995596872781, + "learning_rate": 3.1700288184438037e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.828125, + "logps/chosen": -1888.0, + "logps/rejected": -1512.0, + "loss": 0.7113, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.038818359375, + "rewards/margins": -0.0030975341796875, + "rewards/rejected": -0.03564453125, + "step": 220 + }, + { + "epoch": 0.06375306505120439, + "grad_norm": 12.043954587426272, + "learning_rate": 3.184438040345821e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.9375, + "logps/chosen": -1728.0, + "logps/rejected": -1792.0, + "loss": 0.6764, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.080078125, + "rewards/margins": 0.04931640625, + "rewards/rejected": 0.0306396484375, + "step": 221 + }, + { + "epoch": 0.0640415404586759, + "grad_norm": 13.958981536855855, + "learning_rate": 3.198847262247839e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.9375, + "logps/chosen": -1456.0, + "logps/rejected": -1496.0, + "loss": 0.693, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.083984375, + "rewards/margins": 0.044921875, + "rewards/rejected": 0.038818359375, + "step": 222 + }, + { + "epoch": 0.06433001586614741, + "grad_norm": 12.952665203623601, + "learning_rate": 3.2132564841498555e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.84375, + "logps/chosen": -1432.0, + "logps/rejected": -1664.0, + "loss": 0.7032, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0125732421875, + "rewards/margins": -0.0106201171875, + "rewards/rejected": 0.023193359375, + "step": 223 + }, + { + "epoch": 0.06461849127361892, + "grad_norm": 10.659089739422535, + "learning_rate": 3.2276657060518733e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.953125, + "logps/chosen": -1432.0, + "logps/rejected": -1480.0, + "loss": 0.6884, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.05517578125, + "rewards/margins": 0.025634765625, + "rewards/rejected": 0.0294189453125, + "step": 224 + }, + { + "epoch": 0.06490696668109043, + "grad_norm": 14.239858913396796, + "learning_rate": 3.2420749279538905e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.953125, + "logps/chosen": -1840.0, + "logps/rejected": -1784.0, + "loss": 0.72, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0238037109375, + "rewards/margins": -0.018798828125, + "rewards/rejected": 0.04248046875, + "step": 225 + }, + { + "epoch": 0.06519544208856196, + "grad_norm": 12.56030848447778, + "learning_rate": 3.256484149855908e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.96875, + "logps/chosen": -1272.0, + "logps/rejected": -1328.0, + "loss": 0.6979, + "loss/demonstration_loss": -2608.0, + "loss/preference_loss": -2608.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.034423828125, + "rewards/margins": -0.023193359375, + "rewards/rejected": 0.0576171875, + "step": 226 + }, + { + "epoch": 0.06548391749603347, + "grad_norm": 13.329347590768073, + "learning_rate": 3.270893371757925e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.171875, + "logps/chosen": -1888.0, + "logps/rejected": -1368.0, + "loss": 0.6821, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.056396484375, + "rewards/margins": 0.0419921875, + "rewards/rejected": 0.01434326171875, + "step": 227 + }, + { + "epoch": 0.06577239290350498, + "grad_norm": 13.282746824623697, + "learning_rate": 3.2853025936599423e-07, + "logits/chosen": 2.71875, + "logits/rejected": 2.84375, + "logps/chosen": -1928.0, + "logps/rejected": -1664.0, + "loss": 0.7008, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0537109375, + "rewards/margins": 0.02490234375, + "rewards/rejected": 0.02880859375, + "step": 228 + }, + { + "epoch": 0.06606086831097649, + "grad_norm": 12.940826991838383, + "learning_rate": 3.2997118155619596e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.8125, + "logps/chosen": -1576.0, + "logps/rejected": -1696.0, + "loss": 0.6915, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.083984375, + "rewards/margins": 0.0081787109375, + "rewards/rejected": 0.07568359375, + "step": 229 + }, + { + "epoch": 0.066349343718448, + "grad_norm": 13.69041000016369, + "learning_rate": 3.314121037463977e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.859375, + "logps/chosen": -1584.0, + "logps/rejected": -1456.0, + "loss": 0.682, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.00750732421875, + "rewards/margins": 0.0247802734375, + "rewards/rejected": -0.0172119140625, + "step": 230 + }, + { + "epoch": 0.06663781912591951, + "grad_norm": 11.549184347200583, + "learning_rate": 3.328530259365994e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.015625, + "logps/chosen": -1528.0, + "logps/rejected": -1552.0, + "loss": 0.6798, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.050048828125, + "rewards/margins": 0.017578125, + "rewards/rejected": 0.032470703125, + "step": 231 + }, + { + "epoch": 0.06692629453339102, + "grad_norm": 11.509496305810329, + "learning_rate": 3.3429394812680114e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.9375, + "logps/chosen": -1456.0, + "logps/rejected": -1424.0, + "loss": 0.6863, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.08154296875, + "rewards/margins": 0.033203125, + "rewards/rejected": 0.048095703125, + "step": 232 + }, + { + "epoch": 0.06721476994086255, + "grad_norm": 17.10045964597626, + "learning_rate": 3.3573487031700286e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.796875, + "logps/chosen": -1608.0, + "logps/rejected": -1656.0, + "loss": 0.7194, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.02001953125, + "rewards/margins": -0.01055908203125, + "rewards/rejected": 0.0306396484375, + "step": 233 + }, + { + "epoch": 0.06750324534833406, + "grad_norm": 11.782122908947224, + "learning_rate": 3.371757925072046e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.890625, + "logps/chosen": -1904.0, + "logps/rejected": -1608.0, + "loss": 0.6781, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0673828125, + "rewards/margins": 0.040771484375, + "rewards/rejected": 0.02685546875, + "step": 234 + }, + { + "epoch": 0.06779172075580557, + "grad_norm": 13.939692328911423, + "learning_rate": 3.386167146974063e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.0625, + "logps/chosen": -1864.0, + "logps/rejected": -1568.0, + "loss": 0.7227, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.044921875, + "rewards/margins": -0.0537109375, + "rewards/rejected": 0.0087890625, + "step": 235 + }, + { + "epoch": 0.06808019616327708, + "grad_norm": 12.844284359940602, + "learning_rate": 3.400576368876081e-07, + "logits/chosen": 2.734375, + "logits/rejected": 2.796875, + "logps/chosen": -1904.0, + "logps/rejected": -1624.0, + "loss": 0.7188, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.0137939453125, + "rewards/margins": -0.08154296875, + "rewards/rejected": 0.0673828125, + "step": 236 + }, + { + "epoch": 0.0683686715707486, + "grad_norm": 15.095519333688204, + "learning_rate": 3.4149855907780976e-07, + "logits/chosen": 2.796875, + "logits/rejected": 2.828125, + "logps/chosen": -1624.0, + "logps/rejected": -1560.0, + "loss": 0.6948, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.06396484375, + "rewards/margins": -0.00750732421875, + "rewards/rejected": 0.0712890625, + "step": 237 + }, + { + "epoch": 0.0686571469782201, + "grad_norm": 15.687937466811052, + "learning_rate": 3.4293948126801154e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.96875, + "logps/chosen": -2032.0, + "logps/rejected": -1824.0, + "loss": 0.6926, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.08740234375, + "rewards/margins": 0.016357421875, + "rewards/rejected": 0.0712890625, + "step": 238 + }, + { + "epoch": 0.06894562238569162, + "grad_norm": 13.634207736710202, + "learning_rate": 3.443804034582132e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.96875, + "logps/chosen": -1632.0, + "logps/rejected": -1520.0, + "loss": 0.7124, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.09130859375, + "rewards/margins": -0.00439453125, + "rewards/rejected": 0.095703125, + "step": 239 + }, + { + "epoch": 0.06923409779316313, + "grad_norm": 12.199715076807127, + "learning_rate": 3.45821325648415e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.921875, + "logps/chosen": -1704.0, + "logps/rejected": -1680.0, + "loss": 0.6906, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0859375, + "rewards/margins": 0.045654296875, + "rewards/rejected": 0.0400390625, + "step": 240 + }, + { + "epoch": 0.06952257320063465, + "grad_norm": 14.82092716001698, + "learning_rate": 3.4726224783861667e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.765625, + "logps/chosen": -1632.0, + "logps/rejected": -1632.0, + "loss": 0.6959, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0908203125, + "rewards/margins": 0.00689697265625, + "rewards/rejected": 0.083984375, + "step": 241 + }, + { + "epoch": 0.06981104860810616, + "grad_norm": 12.667383672709043, + "learning_rate": 3.4870317002881845e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.953125, + "logps/chosen": -1416.0, + "logps/rejected": -1472.0, + "loss": 0.7061, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.005645751953125, + "rewards/margins": -0.0380859375, + "rewards/rejected": 0.043701171875, + "step": 242 + }, + { + "epoch": 0.07009952401557767, + "grad_norm": 12.382361001615099, + "learning_rate": 3.501440922190201e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.8125, + "logps/chosen": -1816.0, + "logps/rejected": -1760.0, + "loss": 0.6903, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08642578125, + "rewards/margins": 0.021240234375, + "rewards/rejected": 0.06494140625, + "step": 243 + }, + { + "epoch": 0.07038799942304919, + "grad_norm": 15.859188391017698, + "learning_rate": 3.515850144092219e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.984375, + "logps/chosen": -1568.0, + "logps/rejected": -1416.0, + "loss": 0.6932, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08642578125, + "rewards/margins": 0.032470703125, + "rewards/rejected": 0.0537109375, + "step": 244 + }, + { + "epoch": 0.0706764748305207, + "grad_norm": 13.088822358150253, + "learning_rate": 3.5302593659942357e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.046875, + "logps/chosen": -1616.0, + "logps/rejected": -1736.0, + "loss": 0.6625, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.10205078125, + "rewards/margins": 0.028076171875, + "rewards/rejected": 0.07373046875, + "step": 245 + }, + { + "epoch": 0.07096495023799221, + "grad_norm": 12.277061788936658, + "learning_rate": 3.5446685878962535e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.046875, + "logps/chosen": -1408.0, + "logps/rejected": -1560.0, + "loss": 0.672, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.076171875, + "rewards/margins": 0.0849609375, + "rewards/rejected": -0.0087890625, + "step": 246 + }, + { + "epoch": 0.07125342564546372, + "grad_norm": 12.994224051688281, + "learning_rate": 3.559077809798271e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.9375, + "logps/chosen": -1544.0, + "logps/rejected": -1360.0, + "loss": 0.6937, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0771484375, + "rewards/margins": 0.0181884765625, + "rewards/rejected": 0.058837890625, + "step": 247 + }, + { + "epoch": 0.07154190105293523, + "grad_norm": 11.58354785804806, + "learning_rate": 3.573487031700288e-07, + "logits/chosen": 2.921875, + "logits/rejected": 3.015625, + "logps/chosen": -1616.0, + "logps/rejected": -1304.0, + "loss": 0.6918, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.091796875, + "rewards/margins": 0.001251220703125, + "rewards/rejected": 0.0908203125, + "step": 248 + }, + { + "epoch": 0.07183037646040676, + "grad_norm": 12.754971398553955, + "learning_rate": 3.5878962536023053e-07, + "logits/chosen": 2.703125, + "logits/rejected": 2.703125, + "logps/chosen": -1512.0, + "logps/rejected": -1568.0, + "loss": 0.7312, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.017578125, + "rewards/margins": -0.09130859375, + "rewards/rejected": 0.10888671875, + "step": 249 + }, + { + "epoch": 0.07211885186787827, + "grad_norm": 13.448078587745409, + "learning_rate": 3.6023054755043225e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.9375, + "logps/chosen": -1728.0, + "logps/rejected": -1736.0, + "loss": 0.6868, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.08203125, + "rewards/margins": 0.04833984375, + "rewards/rejected": 0.03369140625, + "step": 250 + }, + { + "epoch": 0.07240732727534978, + "grad_norm": 14.303317136857709, + "learning_rate": 3.61671469740634e-07, + "logits/chosen": 2.890625, + "logits/rejected": 3.015625, + "logps/chosen": -1648.0, + "logps/rejected": -1360.0, + "loss": 0.6739, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.044921875, + "rewards/margins": 0.048583984375, + "rewards/rejected": -0.00372314453125, + "step": 251 + }, + { + "epoch": 0.07269580268282129, + "grad_norm": 11.871220919952881, + "learning_rate": 3.631123919308357e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0, + "logps/chosen": -1376.0, + "logps/rejected": -1352.0, + "loss": 0.7146, + "loss/demonstration_loss": -2736.0, + "loss/preference_loss": -2736.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.038818359375, + "rewards/margins": -0.072265625, + "rewards/rejected": 0.111328125, + "step": 252 + }, + { + "epoch": 0.0729842780902928, + "grad_norm": 12.712874921887503, + "learning_rate": 3.645533141210375e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.0, + "logps/chosen": -1672.0, + "logps/rejected": -1584.0, + "loss": 0.6986, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.09912109375, + "rewards/margins": -0.004058837890625, + "rewards/rejected": 0.10302734375, + "step": 253 + }, + { + "epoch": 0.07327275349776431, + "grad_norm": 11.50468218868833, + "learning_rate": 3.6599423631123916e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.984375, + "logps/chosen": -1288.0, + "logps/rejected": -1536.0, + "loss": 0.6884, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2832.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08447265625, + "rewards/margins": 0.032470703125, + "rewards/rejected": 0.052001953125, + "step": 254 + }, + { + "epoch": 0.07356122890523582, + "grad_norm": 11.121474221622007, + "learning_rate": 3.6743515850144094e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.84375, + "logps/chosen": -1552.0, + "logps/rejected": -1648.0, + "loss": 0.703, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1064453125, + "rewards/margins": 0.0137939453125, + "rewards/rejected": 0.0927734375, + "step": 255 + }, + { + "epoch": 0.07384970431270735, + "grad_norm": 10.93597365996318, + "learning_rate": 3.688760806916426e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.96875, + "logps/chosen": -1872.0, + "logps/rejected": -1600.0, + "loss": 0.6947, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10400390625, + "rewards/margins": 0.0162353515625, + "rewards/rejected": 0.08740234375, + "step": 256 + }, + { + "epoch": 0.07413817972017886, + "grad_norm": 13.389610221343382, + "learning_rate": 3.703170028818444e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.984375, + "logps/chosen": -1760.0, + "logps/rejected": -1504.0, + "loss": 0.6831, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0830078125, + "rewards/margins": 0.0194091796875, + "rewards/rejected": 0.0634765625, + "step": 257 + }, + { + "epoch": 0.07442665512765037, + "grad_norm": 12.54312204571574, + "learning_rate": 3.717579250720461e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.953125, + "logps/chosen": -1616.0, + "logps/rejected": -1472.0, + "loss": 0.6851, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.076171875, + "rewards/margins": 0.0216064453125, + "rewards/rejected": 0.054443359375, + "step": 258 + }, + { + "epoch": 0.07471513053512188, + "grad_norm": 12.143904901542992, + "learning_rate": 3.7319884726224784e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.90625, + "logps/chosen": -1904.0, + "logps/rejected": -1744.0, + "loss": 0.7185, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10009765625, + "rewards/margins": 0.0, + "rewards/rejected": 0.10009765625, + "step": 259 + }, + { + "epoch": 0.0750036059425934, + "grad_norm": 11.113536745358273, + "learning_rate": 3.7463976945244956e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.03125, + "logps/chosen": -2040.0, + "logps/rejected": -1736.0, + "loss": 0.708, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.0927734375, + "rewards/margins": -0.01318359375, + "rewards/rejected": 0.10595703125, + "step": 260 + }, + { + "epoch": 0.0752920813500649, + "grad_norm": 12.038149058643892, + "learning_rate": 3.760806916426513e-07, + "logits/chosen": 2.796875, + "logits/rejected": 2.734375, + "logps/chosen": -1696.0, + "logps/rejected": -1544.0, + "loss": 0.6874, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1201171875, + "rewards/margins": 0.05810546875, + "rewards/rejected": 0.06201171875, + "step": 261 + }, + { + "epoch": 0.07558055675753642, + "grad_norm": 12.973911622755267, + "learning_rate": 3.77521613832853e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.921875, + "logps/chosen": -1592.0, + "logps/rejected": -1568.0, + "loss": 0.6886, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.10888671875, + "rewards/margins": 0.0238037109375, + "rewards/rejected": 0.0849609375, + "step": 262 + }, + { + "epoch": 0.07586903216500793, + "grad_norm": 12.160771283866747, + "learning_rate": 3.7896253602305474e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0, + "logps/chosen": -1760.0, + "logps/rejected": -1696.0, + "loss": 0.6807, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.115234375, + "rewards/margins": 0.032470703125, + "rewards/rejected": 0.08251953125, + "step": 263 + }, + { + "epoch": 0.07615750757247945, + "grad_norm": 15.099553792009583, + "learning_rate": 3.8040345821325647e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.953125, + "logps/chosen": -1896.0, + "logps/rejected": -1792.0, + "loss": 0.6838, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1640625, + "rewards/margins": 0.0244140625, + "rewards/rejected": 0.1396484375, + "step": 264 + }, + { + "epoch": 0.07644598297995096, + "grad_norm": 13.234494901110835, + "learning_rate": 3.818443804034582e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.015625, + "logps/chosen": -1600.0, + "logps/rejected": -1488.0, + "loss": 0.6965, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.109375, + "rewards/margins": 0.0194091796875, + "rewards/rejected": 0.08984375, + "step": 265 + }, + { + "epoch": 0.07673445838742247, + "grad_norm": 12.919941450531079, + "learning_rate": 3.832853025936599e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.859375, + "logps/chosen": -1920.0, + "logps/rejected": -1912.0, + "loss": 0.7354, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.04248046875, + "rewards/margins": -0.0927734375, + "rewards/rejected": 0.134765625, + "step": 266 + }, + { + "epoch": 0.07702293379489399, + "grad_norm": 10.836261820124863, + "learning_rate": 3.8472622478386165e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.984375, + "logps/chosen": -1592.0, + "logps/rejected": -1848.0, + "loss": 0.6782, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1318359375, + "rewards/margins": 0.091796875, + "rewards/rejected": 0.0400390625, + "step": 267 + }, + { + "epoch": 0.0773114092023655, + "grad_norm": 11.856712605255007, + "learning_rate": 3.8616714697406337e-07, + "logits/chosen": 2.9375, + "logits/rejected": 3.03125, + "logps/chosen": -1824.0, + "logps/rejected": -1600.0, + "loss": 0.6873, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.115234375, + "rewards/margins": 0.01312255859375, + "rewards/rejected": 0.10205078125, + "step": 268 + }, + { + "epoch": 0.07759988460983701, + "grad_norm": 12.86560996101754, + "learning_rate": 3.8760806916426515e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.796875, + "logps/chosen": -1536.0, + "logps/rejected": -1656.0, + "loss": 0.7104, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.06494140625, + "rewards/margins": -0.044921875, + "rewards/rejected": 0.1103515625, + "step": 269 + }, + { + "epoch": 0.07788836001730852, + "grad_norm": 13.029873728402967, + "learning_rate": 3.890489913544668e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.125, + "logps/chosen": -1968.0, + "logps/rejected": -1616.0, + "loss": 0.7231, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.07763671875, + "rewards/margins": -0.052490234375, + "rewards/rejected": 0.1298828125, + "step": 270 + }, + { + "epoch": 0.07817683542478003, + "grad_norm": 13.810212926425034, + "learning_rate": 3.904899135446686e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.953125, + "logps/chosen": -1928.0, + "logps/rejected": -1816.0, + "loss": 0.6897, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.140625, + "rewards/margins": 0.00311279296875, + "rewards/rejected": 0.13671875, + "step": 271 + }, + { + "epoch": 0.07846531083225156, + "grad_norm": 12.45664192992779, + "learning_rate": 3.919308357348703e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.96875, + "logps/chosen": -1808.0, + "logps/rejected": -1616.0, + "loss": 0.689, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1201171875, + "rewards/margins": 0.014404296875, + "rewards/rejected": 0.10595703125, + "step": 272 + }, + { + "epoch": 0.07875378623972307, + "grad_norm": 13.677171557938841, + "learning_rate": 3.9337175792507205e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.84375, + "logps/chosen": -2368.0, + "logps/rejected": -2272.0, + "loss": 0.71, + "loss/demonstration_loss": -4640.0, + "loss/preference_loss": -4672.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.1484375, + "rewards/margins": -0.04150390625, + "rewards/rejected": 0.1904296875, + "step": 273 + }, + { + "epoch": 0.07904226164719458, + "grad_norm": 13.26257309492729, + "learning_rate": 3.9481268011527373e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.796875, + "logps/chosen": -1712.0, + "logps/rejected": -1768.0, + "loss": 0.724, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.080078125, + "rewards/margins": -0.095703125, + "rewards/rejected": 0.17578125, + "step": 274 + }, + { + "epoch": 0.07933073705466609, + "grad_norm": 15.789491585011902, + "learning_rate": 3.962536023054755e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.9375, + "logps/chosen": -1688.0, + "logps/rejected": -1856.0, + "loss": 0.6714, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16015625, + "rewards/margins": 0.01123046875, + "rewards/rejected": 0.1484375, + "step": 275 + }, + { + "epoch": 0.0796192124621376, + "grad_norm": 13.224673959063026, + "learning_rate": 3.976945244956772e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.90625, + "logps/chosen": -1776.0, + "logps/rejected": -1984.0, + "loss": 0.6925, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.154296875, + "rewards/margins": 0.0244140625, + "rewards/rejected": 0.1298828125, + "step": 276 + }, + { + "epoch": 0.07990768786960911, + "grad_norm": 14.864306427447907, + "learning_rate": 3.9913544668587896e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.96875, + "logps/chosen": -1472.0, + "logps/rejected": -1512.0, + "loss": 0.7036, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.11962890625, + "rewards/margins": -0.0106201171875, + "rewards/rejected": 0.1298828125, + "step": 277 + }, + { + "epoch": 0.08019616327708062, + "grad_norm": 11.757720526653452, + "learning_rate": 4.0057636887608063e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.9375, + "logps/chosen": -1560.0, + "logps/rejected": -1704.0, + "loss": 0.7144, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1015625, + "rewards/margins": -0.02880859375, + "rewards/rejected": 0.1298828125, + "step": 278 + }, + { + "epoch": 0.08048463868455215, + "grad_norm": 11.903795875944434, + "learning_rate": 4.020172910662824e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.9375, + "logps/chosen": -1680.0, + "logps/rejected": -1576.0, + "loss": 0.6685, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.08251953125, + "rewards/margins": 0.0262451171875, + "rewards/rejected": 0.056396484375, + "step": 279 + }, + { + "epoch": 0.08077311409202366, + "grad_norm": 13.240292839327235, + "learning_rate": 4.0345821325648413e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.96875, + "logps/chosen": -2256.0, + "logps/rejected": -2024.0, + "loss": 0.6991, + "loss/demonstration_loss": -4288.0, + "loss/preference_loss": -4288.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.10009765625, + "rewards/margins": -0.06884765625, + "rewards/rejected": 0.1689453125, + "step": 280 + }, + { + "epoch": 0.08106158949949517, + "grad_norm": 13.952479681228839, + "learning_rate": 4.0489913544668586e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.875, + "logps/chosen": -2000.0, + "logps/rejected": -2000.0, + "loss": 0.6967, + "loss/demonstration_loss": -4016.0, + "loss/preference_loss": -4016.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.212890625, + "rewards/margins": 0.046875, + "rewards/rejected": 0.166015625, + "step": 281 + }, + { + "epoch": 0.08135006490696668, + "grad_norm": 12.462553393885862, + "learning_rate": 4.063400576368876e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.921875, + "logps/chosen": -1328.0, + "logps/rejected": -1312.0, + "loss": 0.688, + "loss/demonstration_loss": -2656.0, + "loss/preference_loss": -2640.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08984375, + "rewards/margins": 0.0074462890625, + "rewards/rejected": 0.08251953125, + "step": 282 + }, + { + "epoch": 0.0816385403144382, + "grad_norm": 16.478243742118174, + "learning_rate": 4.077809798270893e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.96875, + "logps/chosen": -1680.0, + "logps/rejected": -1656.0, + "loss": 0.7126, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.08203125, + "rewards/margins": -0.060791015625, + "rewards/rejected": 0.142578125, + "step": 283 + }, + { + "epoch": 0.0819270157219097, + "grad_norm": 10.096768480393164, + "learning_rate": 4.092219020172911e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.96875, + "logps/chosen": -1760.0, + "logps/rejected": -1576.0, + "loss": 0.7027, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1552734375, + "rewards/margins": 0.0174560546875, + "rewards/rejected": 0.1376953125, + "step": 284 + }, + { + "epoch": 0.08221549112938122, + "grad_norm": 13.419073157123501, + "learning_rate": 4.1066282420749276e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.875, + "logps/chosen": -1872.0, + "logps/rejected": -1856.0, + "loss": 0.7064, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.15234375, + "rewards/margins": -0.03515625, + "rewards/rejected": 0.1875, + "step": 285 + }, + { + "epoch": 0.08250396653685273, + "grad_norm": 14.724515764537559, + "learning_rate": 4.1210374639769454e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.734375, + "logps/chosen": -1792.0, + "logps/rejected": -1824.0, + "loss": 0.6962, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.15234375, + "rewards/margins": 0.0074462890625, + "rewards/rejected": 0.1455078125, + "step": 286 + }, + { + "epoch": 0.08279244194432425, + "grad_norm": 11.764717302995162, + "learning_rate": 4.135446685878962e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.921875, + "logps/chosen": -1312.0, + "logps/rejected": -1592.0, + "loss": 0.6841, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1533203125, + "rewards/margins": -0.010009765625, + "rewards/rejected": 0.1630859375, + "step": 287 + }, + { + "epoch": 0.08308091735179576, + "grad_norm": 11.526199113507058, + "learning_rate": 4.14985590778098e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.046875, + "logps/chosen": -1920.0, + "logps/rejected": -1808.0, + "loss": 0.705, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.22265625, + "rewards/margins": 0.043701171875, + "rewards/rejected": 0.1787109375, + "step": 288 + }, + { + "epoch": 0.08336939275926727, + "grad_norm": 12.810133124144434, + "learning_rate": 4.1642651296829967e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.09375, + "logps/chosen": -1848.0, + "logps/rejected": -1824.0, + "loss": 0.6912, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.10546875, + "rewards/margins": -0.0791015625, + "rewards/rejected": 0.1845703125, + "step": 289 + }, + { + "epoch": 0.08365786816673879, + "grad_norm": 13.345197064838015, + "learning_rate": 4.1786743515850145e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.90625, + "logps/chosen": -1712.0, + "logps/rejected": -1896.0, + "loss": 0.7095, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.181640625, + "rewards/margins": 0.031982421875, + "rewards/rejected": 0.150390625, + "step": 290 + }, + { + "epoch": 0.0839463435742103, + "grad_norm": 10.072933630704417, + "learning_rate": 4.193083573487031e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.828125, + "logps/chosen": -1784.0, + "logps/rejected": -1600.0, + "loss": 0.6717, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1982421875, + "rewards/margins": 0.07763671875, + "rewards/rejected": 0.12060546875, + "step": 291 + }, + { + "epoch": 0.08423481898168181, + "grad_norm": 13.744718323467263, + "learning_rate": 4.207492795389049e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.859375, + "logps/chosen": -1808.0, + "logps/rejected": -1744.0, + "loss": 0.6766, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.140625, + "rewards/margins": 0.0224609375, + "rewards/rejected": 0.11767578125, + "step": 292 + }, + { + "epoch": 0.08452329438915332, + "grad_norm": 11.779287359089318, + "learning_rate": 4.221902017291066e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.046875, + "logps/chosen": -1608.0, + "logps/rejected": -1400.0, + "loss": 0.6865, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1982421875, + "rewards/margins": 0.01434326171875, + "rewards/rejected": 0.18359375, + "step": 293 + }, + { + "epoch": 0.08481176979662484, + "grad_norm": 10.613632492923646, + "learning_rate": 4.2363112391930835e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.90625, + "logps/chosen": -1472.0, + "logps/rejected": -1520.0, + "loss": 0.6968, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.177734375, + "rewards/margins": -0.00628662109375, + "rewards/rejected": 0.18359375, + "step": 294 + }, + { + "epoch": 0.08510024520409636, + "grad_norm": 12.029297966064473, + "learning_rate": 4.250720461095101e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.09375, + "logps/chosen": -1768.0, + "logps/rejected": -1616.0, + "loss": 0.6914, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2294921875, + "rewards/margins": 0.00592041015625, + "rewards/rejected": 0.2236328125, + "step": 295 + }, + { + "epoch": 0.08538872061156787, + "grad_norm": 11.102052930471627, + "learning_rate": 4.265129682997118e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.109375, + "logps/chosen": -1704.0, + "logps/rejected": -1704.0, + "loss": 0.712, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1240234375, + "rewards/margins": -0.046875, + "rewards/rejected": 0.1708984375, + "step": 296 + }, + { + "epoch": 0.08567719601903938, + "grad_norm": 12.503377623561018, + "learning_rate": 4.2795389048991353e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.0, + "logps/chosen": -1992.0, + "logps/rejected": -1632.0, + "loss": 0.684, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2490234375, + "rewards/margins": 0.08251953125, + "rewards/rejected": 0.166015625, + "step": 297 + }, + { + "epoch": 0.08596567142651089, + "grad_norm": 10.381876486993336, + "learning_rate": 4.2939481268011525e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.9375, + "logps/chosen": -1648.0, + "logps/rejected": -1608.0, + "loss": 0.6806, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.158203125, + "rewards/margins": -0.00030517578125, + "rewards/rejected": 0.1591796875, + "step": 298 + }, + { + "epoch": 0.0862541468339824, + "grad_norm": 11.902101328113877, + "learning_rate": 4.30835734870317e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.921875, + "logps/chosen": -1408.0, + "logps/rejected": -1488.0, + "loss": 0.705, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.19921875, + "rewards/margins": -0.041259765625, + "rewards/rejected": 0.240234375, + "step": 299 + }, + { + "epoch": 0.08654262224145391, + "grad_norm": 11.307581926775873, + "learning_rate": 4.322766570605187e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.109375, + "logps/chosen": -1280.0, + "logps/rejected": -1552.0, + "loss": 0.7014, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.12060546875, + "rewards/margins": -0.050048828125, + "rewards/rejected": 0.1708984375, + "step": 300 + }, + { + "epoch": 0.08683109764892542, + "grad_norm": 11.45869248213672, + "learning_rate": 4.3371757925072043e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.953125, + "logps/chosen": -1672.0, + "logps/rejected": -1856.0, + "loss": 0.6917, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1982421875, + "rewards/margins": 0.00567626953125, + "rewards/rejected": 0.1923828125, + "step": 301 + }, + { + "epoch": 0.08711957305639695, + "grad_norm": 17.207787172513388, + "learning_rate": 4.3515850144092216e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.03125, + "logps/chosen": -2048.0, + "logps/rejected": -1728.0, + "loss": 0.6835, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.263671875, + "rewards/margins": 0.01806640625, + "rewards/rejected": 0.24609375, + "step": 302 + }, + { + "epoch": 0.08740804846386846, + "grad_norm": 11.709457249898819, + "learning_rate": 4.365994236311239e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1728.0, + "logps/rejected": -1768.0, + "loss": 0.7097, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.251953125, + "rewards/margins": -0.003814697265625, + "rewards/rejected": 0.255859375, + "step": 303 + }, + { + "epoch": 0.08769652387133997, + "grad_norm": 12.69806809330215, + "learning_rate": 4.3804034582132566e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.9375, + "logps/chosen": -1648.0, + "logps/rejected": -1688.0, + "loss": 0.688, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.236328125, + "rewards/margins": 0.0274658203125, + "rewards/rejected": 0.208984375, + "step": 304 + }, + { + "epoch": 0.08798499927881148, + "grad_norm": 12.446924126675057, + "learning_rate": 4.3948126801152733e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.046875, + "logps/chosen": -1600.0, + "logps/rejected": -1552.0, + "loss": 0.6891, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.185546875, + "rewards/margins": -0.0150146484375, + "rewards/rejected": 0.201171875, + "step": 305 + }, + { + "epoch": 0.088273474686283, + "grad_norm": 13.187359169399343, + "learning_rate": 4.409221902017291e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1616.0, + "logps/rejected": -1640.0, + "loss": 0.7092, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1875, + "rewards/margins": -0.017578125, + "rewards/rejected": 0.205078125, + "step": 306 + }, + { + "epoch": 0.0885619500937545, + "grad_norm": 13.477804732617766, + "learning_rate": 4.423631123919308e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1728.0, + "logps/rejected": -1824.0, + "loss": 0.689, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.1943359375, + "rewards/margins": -0.033203125, + "rewards/rejected": 0.2275390625, + "step": 307 + }, + { + "epoch": 0.08885042550122602, + "grad_norm": 15.056873499408436, + "learning_rate": 4.4380403458213256e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.859375, + "logps/chosen": -1672.0, + "logps/rejected": -1408.0, + "loss": 0.7184, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.19140625, + "rewards/margins": -0.01123046875, + "rewards/rejected": 0.2021484375, + "step": 308 + }, + { + "epoch": 0.08913890090869753, + "grad_norm": 13.019298531625479, + "learning_rate": 4.4524495677233424e-07, + "logits/chosen": 2.953125, + "logits/rejected": 3.0, + "logps/chosen": -1744.0, + "logps/rejected": -1376.0, + "loss": 0.6713, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.263671875, + "rewards/margins": 0.068359375, + "rewards/rejected": 0.1943359375, + "step": 309 + }, + { + "epoch": 0.08942737631616905, + "grad_norm": 15.217809319721413, + "learning_rate": 4.46685878962536e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.921875, + "logps/chosen": -1488.0, + "logps/rejected": -1464.0, + "loss": 0.6623, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2041015625, + "rewards/margins": 0.0576171875, + "rewards/rejected": 0.146484375, + "step": 310 + }, + { + "epoch": 0.08971585172364056, + "grad_norm": 11.224707026465401, + "learning_rate": 4.481268011527377e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.890625, + "logps/chosen": -1632.0, + "logps/rejected": -1768.0, + "loss": 0.6852, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1611328125, + "rewards/margins": -0.0198974609375, + "rewards/rejected": 0.181640625, + "step": 311 + }, + { + "epoch": 0.09000432713111207, + "grad_norm": 13.575381385862697, + "learning_rate": 4.4956772334293947e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1960.0, + "logps/rejected": -1520.0, + "loss": 0.7255, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1826171875, + "rewards/margins": -0.012451171875, + "rewards/rejected": 0.1953125, + "step": 312 + }, + { + "epoch": 0.09029280253858359, + "grad_norm": 11.452838570371567, + "learning_rate": 4.5100864553314114e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.953125, + "logps/chosen": -1752.0, + "logps/rejected": -1688.0, + "loss": 0.6887, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.267578125, + "rewards/margins": 0.043212890625, + "rewards/rejected": 0.224609375, + "step": 313 + }, + { + "epoch": 0.0905812779460551, + "grad_norm": 12.768219472102764, + "learning_rate": 4.524495677233429e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.109375, + "logps/chosen": -2096.0, + "logps/rejected": -1792.0, + "loss": 0.6848, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.248046875, + "rewards/margins": 0.056884765625, + "rewards/rejected": 0.1904296875, + "step": 314 + }, + { + "epoch": 0.09086975335352661, + "grad_norm": 11.908153175706465, + "learning_rate": 4.538904899135447e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.796875, + "logps/chosen": -1656.0, + "logps/rejected": -1456.0, + "loss": 0.6865, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2421875, + "rewards/margins": 0.01263427734375, + "rewards/rejected": 0.23046875, + "step": 315 + }, + { + "epoch": 0.09115822876099812, + "grad_norm": 11.926939631481423, + "learning_rate": 4.5533141210374637e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.0, + "logps/chosen": -1808.0, + "logps/rejected": -1704.0, + "loss": 0.7203, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2275390625, + "rewards/margins": -0.00616455078125, + "rewards/rejected": 0.234375, + "step": 316 + }, + { + "epoch": 0.09144670416846964, + "grad_norm": 13.691197642707056, + "learning_rate": 4.5677233429394815e-07, + "logits/chosen": 2.796875, + "logits/rejected": 2.859375, + "logps/chosen": -1840.0, + "logps/rejected": -1752.0, + "loss": 0.6754, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.251953125, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.1884765625, + "step": 317 + }, + { + "epoch": 0.09173517957594116, + "grad_norm": 12.286636519748805, + "learning_rate": 4.582132564841498e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.984375, + "logps/chosen": -1792.0, + "logps/rejected": -1744.0, + "loss": 0.6899, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2001953125, + "rewards/margins": 0.019287109375, + "rewards/rejected": 0.1806640625, + "step": 318 + }, + { + "epoch": 0.09202365498341267, + "grad_norm": 11.496348608058788, + "learning_rate": 4.596541786743516e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.078125, + "logps/chosen": -1560.0, + "logps/rejected": -1392.0, + "loss": 0.6841, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2236328125, + "rewards/margins": 0.0012969970703125, + "rewards/rejected": 0.22265625, + "step": 319 + }, + { + "epoch": 0.09231213039088418, + "grad_norm": 10.56894773571941, + "learning_rate": 4.610951008645533e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.859375, + "logps/chosen": -1536.0, + "logps/rejected": -1304.0, + "loss": 0.6736, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2353515625, + "rewards/margins": 0.06884765625, + "rewards/rejected": 0.166015625, + "step": 320 + }, + { + "epoch": 0.09260060579835569, + "grad_norm": 9.70495369325244, + "learning_rate": 4.6253602305475505e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.078125, + "logps/chosen": -1816.0, + "logps/rejected": -1752.0, + "loss": 0.676, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.259765625, + "rewards/margins": 0.00189208984375, + "rewards/rejected": 0.2578125, + "step": 321 + }, + { + "epoch": 0.0928890812058272, + "grad_norm": 11.947230054535686, + "learning_rate": 4.639769452449567e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1600.0, + "logps/rejected": -1680.0, + "loss": 0.7113, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2373046875, + "rewards/margins": -0.052734375, + "rewards/rejected": 0.291015625, + "step": 322 + }, + { + "epoch": 0.09317755661329871, + "grad_norm": 12.323761341626186, + "learning_rate": 4.654178674351585e-07, + "logits/chosen": 2.875, + "logits/rejected": 3.015625, + "logps/chosen": -1760.0, + "logps/rejected": -1696.0, + "loss": 0.6863, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2197265625, + "rewards/margins": 0.0115966796875, + "rewards/rejected": 0.2080078125, + "step": 323 + }, + { + "epoch": 0.09346603202077022, + "grad_norm": 12.918962222490473, + "learning_rate": 4.668587896253602e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.96875, + "logps/chosen": -1576.0, + "logps/rejected": -1664.0, + "loss": 0.7018, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1728515625, + "rewards/margins": -0.1103515625, + "rewards/rejected": 0.283203125, + "step": 324 + }, + { + "epoch": 0.09375450742824175, + "grad_norm": 12.673405909506833, + "learning_rate": 4.6829971181556196e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.84375, + "logps/chosen": -2080.0, + "logps/rejected": -2176.0, + "loss": 0.6989, + "loss/demonstration_loss": -4288.0, + "loss/preference_loss": -4288.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.27734375, + "rewards/margins": -0.027587890625, + "rewards/rejected": 0.3046875, + "step": 325 + }, + { + "epoch": 0.09404298283571326, + "grad_norm": 12.277683224572902, + "learning_rate": 4.697406340057637e-07, + "logits/chosen": 2.78125, + "logits/rejected": 2.859375, + "logps/chosen": -1752.0, + "logps/rejected": -1520.0, + "loss": 0.6759, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28515625, + "rewards/margins": 0.0859375, + "rewards/rejected": 0.19921875, + "step": 326 + }, + { + "epoch": 0.09433145824318477, + "grad_norm": 12.256988184227385, + "learning_rate": 4.711815561959654e-07, + "logits/chosen": 2.953125, + "logits/rejected": 3.0, + "logps/chosen": -1464.0, + "logps/rejected": -1440.0, + "loss": 0.7084, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20703125, + "rewards/margins": -0.0081787109375, + "rewards/rejected": 0.21484375, + "step": 327 + }, + { + "epoch": 0.09461993365065628, + "grad_norm": 14.865328761434416, + "learning_rate": 4.7262247838616713e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.90625, + "logps/chosen": -1392.0, + "logps/rejected": -1368.0, + "loss": 0.6826, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21875, + "rewards/margins": 0.0712890625, + "rewards/rejected": 0.146484375, + "step": 328 + }, + { + "epoch": 0.0949084090581278, + "grad_norm": 10.962210847782162, + "learning_rate": 4.7406340057636886e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.953125, + "logps/chosen": -1872.0, + "logps/rejected": -1688.0, + "loss": 0.6869, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.29296875, + "rewards/margins": 0.036376953125, + "rewards/rejected": 0.255859375, + "step": 329 + }, + { + "epoch": 0.0951968844655993, + "grad_norm": 11.93833949929247, + "learning_rate": 4.755043227665706e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.96875, + "logps/chosen": -1872.0, + "logps/rejected": -1840.0, + "loss": 0.6961, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.259765625, + "rewards/margins": -0.0224609375, + "rewards/rejected": 0.283203125, + "step": 330 + }, + { + "epoch": 0.09548535987307082, + "grad_norm": 11.348894472522236, + "learning_rate": 4.769452449567723e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.109375, + "logps/chosen": -1928.0, + "logps/rejected": -1512.0, + "loss": 0.6837, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.328125, + "rewards/margins": 0.0625, + "rewards/rejected": 0.265625, + "step": 331 + }, + { + "epoch": 0.09577383528054233, + "grad_norm": 12.799129514136016, + "learning_rate": 4.78386167146974e-07, + "logits/chosen": 2.890625, + "logits/rejected": 3.0, + "logps/chosen": -1592.0, + "logps/rejected": -1656.0, + "loss": 0.6778, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.248046875, + "rewards/margins": -0.00592041015625, + "rewards/rejected": 0.25390625, + "step": 332 + }, + { + "epoch": 0.09606231068801385, + "grad_norm": 11.369320647741647, + "learning_rate": 4.798270893371757e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.953125, + "logps/chosen": -1880.0, + "logps/rejected": -1624.0, + "loss": 0.6763, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.31640625, + "rewards/margins": 0.043701171875, + "rewards/rejected": 0.2734375, + "step": 333 + }, + { + "epoch": 0.09635078609548536, + "grad_norm": 10.54250252226706, + "learning_rate": 4.812680115273775e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.875, + "logps/chosen": -1592.0, + "logps/rejected": -1664.0, + "loss": 0.6991, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.306640625, + "rewards/margins": -0.0294189453125, + "rewards/rejected": 0.3359375, + "step": 334 + }, + { + "epoch": 0.09663926150295687, + "grad_norm": 11.463685060515383, + "learning_rate": 4.827089337175792e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.96875, + "logps/chosen": -1592.0, + "logps/rejected": -1520.0, + "loss": 0.6895, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.310546875, + "rewards/margins": 0.013916015625, + "rewards/rejected": 0.296875, + "step": 335 + }, + { + "epoch": 0.09692773691042839, + "grad_norm": 9.96117351998309, + "learning_rate": 4.841498559077809e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.8125, + "logps/chosen": -1176.0, + "logps/rejected": -1144.0, + "loss": 0.6863, + "loss/demonstration_loss": -2352.0, + "loss/preference_loss": -2336.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2578125, + "rewards/margins": 0.0269775390625, + "rewards/rejected": 0.2314453125, + "step": 336 + }, + { + "epoch": 0.0972162123178999, + "grad_norm": 12.18025141847497, + "learning_rate": 4.855907780979827e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.96875, + "logps/chosen": -1760.0, + "logps/rejected": -1872.0, + "loss": 0.7124, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.296875, + "rewards/margins": -0.06884765625, + "rewards/rejected": 0.3671875, + "step": 337 + }, + { + "epoch": 0.09750468772537141, + "grad_norm": 12.448832794886972, + "learning_rate": 4.870317002881844e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.96875, + "logps/chosen": -1864.0, + "logps/rejected": -1864.0, + "loss": 0.7123, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.25, + "rewards/margins": -0.0751953125, + "rewards/rejected": 0.32421875, + "step": 338 + }, + { + "epoch": 0.09779316313284292, + "grad_norm": 13.043572555308508, + "learning_rate": 4.884726224783862e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.984375, + "logps/chosen": -1880.0, + "logps/rejected": -1960.0, + "loss": 0.675, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.283203125, + "rewards/margins": -0.0224609375, + "rewards/rejected": 0.3046875, + "step": 339 + }, + { + "epoch": 0.09808163854031444, + "grad_norm": 11.082715012938753, + "learning_rate": 4.899135446685878e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.03125, + "logps/chosen": -1712.0, + "logps/rejected": -1840.0, + "loss": 0.6736, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.02490234375, + "rewards/rejected": 0.296875, + "step": 340 + }, + { + "epoch": 0.09837011394778596, + "grad_norm": 13.193880086049354, + "learning_rate": 4.913544668587896e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.03125, + "logps/chosen": -1536.0, + "logps/rejected": -1752.0, + "loss": 0.719, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.291015625, + "rewards/margins": -0.064453125, + "rewards/rejected": 0.35546875, + "step": 341 + }, + { + "epoch": 0.09865858935525747, + "grad_norm": 10.814716863646092, + "learning_rate": 4.927953890489913e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.984375, + "logps/chosen": -1760.0, + "logps/rejected": -1576.0, + "loss": 0.6821, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.0181884765625, + "rewards/rejected": 0.302734375, + "step": 342 + }, + { + "epoch": 0.09894706476272898, + "grad_norm": 11.309814746059253, + "learning_rate": 4.942363112391931e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.90625, + "logps/chosen": -1680.0, + "logps/rejected": -1504.0, + "loss": 0.7376, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.22265625, + "rewards/margins": -0.10400390625, + "rewards/rejected": 0.326171875, + "step": 343 + }, + { + "epoch": 0.09923554017020049, + "grad_norm": 12.896102464020627, + "learning_rate": 4.956772334293947e-07, + "logits/chosen": 3.0625, + "logits/rejected": 2.921875, + "logps/chosen": -1584.0, + "logps/rejected": -1896.0, + "loss": 0.7078, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.296875, + "rewards/margins": -0.041259765625, + "rewards/rejected": 0.337890625, + "step": 344 + }, + { + "epoch": 0.099524015577672, + "grad_norm": 11.65631367181338, + "learning_rate": 4.971181556195965e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.890625, + "logps/chosen": -1640.0, + "logps/rejected": -1784.0, + "loss": 0.6967, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.298828125, + "rewards/margins": -0.059326171875, + "rewards/rejected": 0.357421875, + "step": 345 + }, + { + "epoch": 0.09981249098514351, + "grad_norm": 13.187069779527267, + "learning_rate": 4.985590778097982e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.90625, + "logps/chosen": -2096.0, + "logps/rejected": -1696.0, + "loss": 0.6722, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.384765625, + "rewards/margins": 0.06787109375, + "rewards/rejected": 0.31640625, + "step": 346 + }, + { + "epoch": 0.10010096639261502, + "grad_norm": 13.327265365798548, + "learning_rate": 5e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0, + "logps/chosen": -1920.0, + "logps/rejected": -1656.0, + "loss": 0.7143, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.337890625, + "rewards/margins": -0.00494384765625, + "rewards/rejected": 0.341796875, + "step": 347 + }, + { + "epoch": 0.10038944180008655, + "grad_norm": 10.045276255832176, + "learning_rate": 4.999998731825629e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.9375, + "logps/chosen": -1416.0, + "logps/rejected": -1264.0, + "loss": 0.6618, + "loss/demonstration_loss": -2704.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.251953125, + "rewards/margins": 0.03955078125, + "rewards/rejected": 0.2119140625, + "step": 348 + }, + { + "epoch": 0.10067791720755806, + "grad_norm": 14.434478135022456, + "learning_rate": 4.999994927303802e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.921875, + "logps/chosen": -1760.0, + "logps/rejected": -1808.0, + "loss": 0.6913, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.318359375, + "rewards/margins": 0.044921875, + "rewards/rejected": 0.2734375, + "step": 349 + }, + { + "epoch": 0.10096639261502957, + "grad_norm": 13.967602744275279, + "learning_rate": 4.99998858643838e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.9375, + "logps/chosen": -1864.0, + "logps/rejected": -1696.0, + "loss": 0.6844, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.283203125, + "rewards/margins": 0.03515625, + "rewards/rejected": 0.2490234375, + "step": 350 + }, + { + "epoch": 0.10125486802250108, + "grad_norm": 12.506510735692627, + "learning_rate": 4.999979709235794e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.90625, + "logps/chosen": -1552.0, + "logps/rejected": -1440.0, + "loss": 0.6941, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.04296875, + "rewards/rejected": 0.29296875, + "step": 351 + }, + { + "epoch": 0.1015433434299726, + "grad_norm": 13.892066219848537, + "learning_rate": 4.999968295705053e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.921875, + "logps/chosen": -1712.0, + "logps/rejected": -1688.0, + "loss": 0.7126, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.341796875, + "rewards/margins": 0.0400390625, + "rewards/rejected": 0.302734375, + "step": 352 + }, + { + "epoch": 0.1018318188374441, + "grad_norm": 13.165475242303698, + "learning_rate": 4.999954345857734e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -2128.0, + "logps/rejected": -1912.0, + "loss": 0.6938, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.341796875, + "rewards/margins": 0.02880859375, + "rewards/rejected": 0.3125, + "step": 353 + }, + { + "epoch": 0.10212029424491562, + "grad_norm": 11.760053768935132, + "learning_rate": 4.999937859707991e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1752.0, + "logps/rejected": -1760.0, + "loss": 0.6834, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2734375, + "rewards/margins": -4.57763671875e-05, + "rewards/rejected": 0.2734375, + "step": 354 + }, + { + "epoch": 0.10240876965238713, + "grad_norm": 12.114893389034815, + "learning_rate": 4.999918837272549e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.90625, + "logps/chosen": -1912.0, + "logps/rejected": -1896.0, + "loss": 0.7156, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.330078125, + "rewards/margins": -0.0537109375, + "rewards/rejected": 0.384765625, + "step": 355 + }, + { + "epoch": 0.10269724505985865, + "grad_norm": 11.264317900552692, + "learning_rate": 4.999897278570708e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0625, + "logps/chosen": -1760.0, + "logps/rejected": -1800.0, + "loss": 0.6838, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.314453125, + "rewards/margins": 0.0274658203125, + "rewards/rejected": 0.287109375, + "step": 356 + }, + { + "epoch": 0.10298572046733016, + "grad_norm": 11.892960347103365, + "learning_rate": 4.99987318362434e-07, + "logits/chosen": 2.953125, + "logits/rejected": 3.0, + "logps/chosen": -1648.0, + "logps/rejected": -1704.0, + "loss": 0.6799, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.318359375, + "rewards/margins": 0.025634765625, + "rewards/rejected": 0.29296875, + "step": 357 + }, + { + "epoch": 0.10327419587480167, + "grad_norm": 11.593657010577436, + "learning_rate": 4.99984655245789e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.96875, + "logps/chosen": -1416.0, + "logps/rejected": -1560.0, + "loss": 0.6794, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.26171875, + "rewards/margins": -0.00439453125, + "rewards/rejected": 0.265625, + "step": 358 + }, + { + "epoch": 0.10356267128227319, + "grad_norm": 12.17415164738371, + "learning_rate": 4.999817385098376e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.9375, + "logps/chosen": -1688.0, + "logps/rejected": -1736.0, + "loss": 0.6893, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.353515625, + "rewards/margins": -0.01513671875, + "rewards/rejected": 0.3671875, + "step": 359 + }, + { + "epoch": 0.1038511466897447, + "grad_norm": 11.024073265841533, + "learning_rate": 4.99978568157539e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.046875, + "logps/chosen": -1800.0, + "logps/rejected": -1664.0, + "loss": 0.7, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.27734375, + "rewards/margins": -0.0263671875, + "rewards/rejected": 0.3046875, + "step": 360 + }, + { + "epoch": 0.10413962209721621, + "grad_norm": 11.535957195902807, + "learning_rate": 4.999751441921096e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.890625, + "logps/chosen": -1688.0, + "logps/rejected": -1648.0, + "loss": 0.6801, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.267578125, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.201171875, + "step": 361 + }, + { + "epoch": 0.10442809750468772, + "grad_norm": 11.570431364911059, + "learning_rate": 4.999714666170232e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.875, + "logps/chosen": -1344.0, + "logps/rejected": -1264.0, + "loss": 0.6957, + "loss/demonstration_loss": -2640.0, + "loss/preference_loss": -2640.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.25390625, + "rewards/margins": 0.0093994140625, + "rewards/rejected": 0.2451171875, + "step": 362 + }, + { + "epoch": 0.10471657291215924, + "grad_norm": 11.775547233530617, + "learning_rate": 4.999675354360108e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.109375, + "logps/chosen": -1784.0, + "logps/rejected": -1680.0, + "loss": 0.6796, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.0712890625, + "rewards/rejected": 0.28125, + "step": 363 + }, + { + "epoch": 0.10500504831963076, + "grad_norm": 13.033084161210898, + "learning_rate": 4.999633506530608e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.96875, + "logps/chosen": -1680.0, + "logps/rejected": -1688.0, + "loss": 0.6883, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.271484375, + "rewards/margins": -0.07470703125, + "rewards/rejected": 0.345703125, + "step": 364 + }, + { + "epoch": 0.10529352372710227, + "grad_norm": 12.278058508261, + "learning_rate": 4.999589122724187e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.953125, + "logps/chosen": -1472.0, + "logps/rejected": -1448.0, + "loss": 0.6927, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2421875, + "rewards/margins": -0.03369140625, + "rewards/rejected": 0.27734375, + "step": 365 + }, + { + "epoch": 0.10558199913457378, + "grad_norm": 11.086831133173982, + "learning_rate": 4.999542202985876e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.859375, + "logps/chosen": -1584.0, + "logps/rejected": -1488.0, + "loss": 0.7029, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.279296875, + "rewards/margins": -0.00811767578125, + "rewards/rejected": 0.287109375, + "step": 366 + }, + { + "epoch": 0.10587047454204529, + "grad_norm": 11.293695231653452, + "learning_rate": 4.999492747363275e-07, + "logits/chosen": 3.078125, + "logits/rejected": 2.984375, + "logps/chosen": -1736.0, + "logps/rejected": -1816.0, + "loss": 0.7023, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.283203125, + "rewards/margins": -0.048828125, + "rewards/rejected": 0.33203125, + "step": 367 + }, + { + "epoch": 0.1061589499495168, + "grad_norm": 11.796173413400084, + "learning_rate": 4.999440755906561e-07, + "logits/chosen": 2.953125, + "logits/rejected": 3.046875, + "logps/chosen": -1552.0, + "logps/rejected": -1376.0, + "loss": 0.6865, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.267578125, + "rewards/margins": 0.0198974609375, + "rewards/rejected": 0.248046875, + "step": 368 + }, + { + "epoch": 0.10644742535698831, + "grad_norm": 10.951040422904933, + "learning_rate": 4.999386228668479e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.953125, + "logps/chosen": -1408.0, + "logps/rejected": -1280.0, + "loss": 0.6767, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2720.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.314453125, + "rewards/margins": 0.0162353515625, + "rewards/rejected": 0.296875, + "step": 369 + }, + { + "epoch": 0.10673590076445982, + "grad_norm": 12.84640075683829, + "learning_rate": 4.999329165704349e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.90625, + "logps/chosen": -1448.0, + "logps/rejected": -1520.0, + "loss": 0.7158, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2412109375, + "rewards/margins": -0.034912109375, + "rewards/rejected": 0.275390625, + "step": 370 + }, + { + "epoch": 0.10702437617193135, + "grad_norm": 12.77330159672173, + "learning_rate": 4.999269567072067e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.921875, + "logps/chosen": -1824.0, + "logps/rejected": -1792.0, + "loss": 0.6525, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.0361328125, + "rewards/rejected": 0.33984375, + "step": 371 + }, + { + "epoch": 0.10731285157940286, + "grad_norm": 11.783193519252135, + "learning_rate": 4.999207432832094e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.0, + "logps/chosen": -1488.0, + "logps/rejected": -1544.0, + "loss": 0.6927, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.28125, + "rewards/margins": 0.00994873046875, + "rewards/rejected": 0.271484375, + "step": 372 + }, + { + "epoch": 0.10760132698687437, + "grad_norm": 11.110919628422778, + "learning_rate": 4.99914276304747e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.84375, + "logps/chosen": -1544.0, + "logps/rejected": -1640.0, + "loss": 0.7225, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.255859375, + "rewards/margins": -0.03369140625, + "rewards/rejected": 0.291015625, + "step": 373 + }, + { + "epoch": 0.10788980239434588, + "grad_norm": 13.277910096141774, + "learning_rate": 4.999075557783804e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.96875, + "logps/chosen": -1584.0, + "logps/rejected": -1520.0, + "loss": 0.6896, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27734375, + "rewards/margins": -0.03564453125, + "rewards/rejected": 0.314453125, + "step": 374 + }, + { + "epoch": 0.1081782778018174, + "grad_norm": 12.917317085902917, + "learning_rate": 4.99900581710928e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.96875, + "logps/chosen": -2464.0, + "logps/rejected": -2256.0, + "loss": 0.694, + "loss/demonstration_loss": -4768.0, + "loss/preference_loss": -4768.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.498046875, + "rewards/margins": -0.01239013671875, + "rewards/rejected": 0.51171875, + "step": 375 + }, + { + "epoch": 0.1084667532092889, + "grad_norm": 12.461358320488536, + "learning_rate": 4.99893354109465e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1568.0, + "logps/rejected": -1616.0, + "loss": 0.6903, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.34375, + "rewards/margins": 0.023193359375, + "rewards/rejected": 0.3203125, + "step": 376 + }, + { + "epoch": 0.10875522861676042, + "grad_norm": 11.716811830286172, + "learning_rate": 4.998858729813244e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1552.0, + "logps/rejected": -1440.0, + "loss": 0.6836, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.0274658203125, + "rewards/rejected": 0.27734375, + "step": 377 + }, + { + "epoch": 0.10904370402423194, + "grad_norm": 17.707198057038767, + "learning_rate": 4.998781383340959e-07, + "logits/chosen": 2.9375, + "logits/rejected": 3.03125, + "logps/chosen": -1776.0, + "logps/rejected": -1536.0, + "loss": 0.7072, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.32421875, + "rewards/margins": 0.018310546875, + "rewards/rejected": 0.3046875, + "step": 378 + }, + { + "epoch": 0.10933217943170345, + "grad_norm": 11.023741104642982, + "learning_rate": 4.998701501756266e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1616.0, + "logps/rejected": -1520.0, + "loss": 0.6949, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.310546875, + "rewards/margins": 0.0172119140625, + "rewards/rejected": 0.294921875, + "step": 379 + }, + { + "epoch": 0.10962065483917496, + "grad_norm": 12.485350390493423, + "learning_rate": 4.998619085140208e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.140625, + "logps/chosen": -1664.0, + "logps/rejected": -1536.0, + "loss": 0.6795, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34375, + "rewards/margins": 0.0498046875, + "rewards/rejected": 0.294921875, + "step": 380 + }, + { + "epoch": 0.10990913024664647, + "grad_norm": 11.193723079995971, + "learning_rate": 4.998534133576402e-07, + "logits/chosen": 2.828125, + "logits/rejected": 2.796875, + "logps/chosen": -1840.0, + "logps/rejected": -1776.0, + "loss": 0.6899, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.0274658203125, + "rewards/rejected": 0.3046875, + "step": 381 + }, + { + "epoch": 0.11019760565411799, + "grad_norm": 11.72892734729227, + "learning_rate": 4.998446647151032e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0, + "logps/chosen": -1256.0, + "logps/rejected": -1568.0, + "loss": 0.7074, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1962890625, + "rewards/margins": -0.047607421875, + "rewards/rejected": 0.244140625, + "step": 382 + }, + { + "epoch": 0.1104860810615895, + "grad_norm": 15.162140841900214, + "learning_rate": 4.998356625952859e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0625, + "logps/chosen": -1552.0, + "logps/rejected": -1432.0, + "loss": 0.6693, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.271484375, + "rewards/margins": 0.021240234375, + "rewards/rejected": 0.25, + "step": 383 + }, + { + "epoch": 0.11077455646906101, + "grad_norm": 14.227548130386323, + "learning_rate": 4.99826407007321e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.921875, + "logps/chosen": -1280.0, + "logps/rejected": -1480.0, + "loss": 0.7054, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2138671875, + "rewards/margins": -0.0576171875, + "rewards/rejected": 0.271484375, + "step": 384 + }, + { + "epoch": 0.11106303187653252, + "grad_norm": 11.090272476920955, + "learning_rate": 4.998168979605988e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.9375, + "logps/chosen": -1704.0, + "logps/rejected": -1488.0, + "loss": 0.6775, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.26171875, + "rewards/margins": 0.037353515625, + "rewards/rejected": 0.2236328125, + "step": 385 + }, + { + "epoch": 0.11135150728400404, + "grad_norm": 13.807018243839257, + "learning_rate": 4.998071354647668e-07, + "logits/chosen": 2.6875, + "logits/rejected": 2.765625, + "logps/chosen": -1824.0, + "logps/rejected": -1720.0, + "loss": 0.713, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.0013427734375, + "rewards/rejected": 0.302734375, + "step": 386 + }, + { + "epoch": 0.11163998269147556, + "grad_norm": 13.193719820615023, + "learning_rate": 4.997971195297292e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.125, + "logps/chosen": -1784.0, + "logps/rejected": -1424.0, + "loss": 0.6612, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.130859375, + "rewards/rejected": 0.201171875, + "step": 387 + }, + { + "epoch": 0.11192845809894707, + "grad_norm": 10.788098958449911, + "learning_rate": 4.997868501656476e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.0, + "logps/chosen": -1472.0, + "logps/rejected": -1552.0, + "loss": 0.6841, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2392578125, + "rewards/margins": -0.045166015625, + "rewards/rejected": 0.28515625, + "step": 388 + }, + { + "epoch": 0.11221693350641858, + "grad_norm": 11.05793384682053, + "learning_rate": 4.997763273829407e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.265625, + "logps/chosen": -1336.0, + "logps/rejected": -1232.0, + "loss": 0.6741, + "loss/demonstration_loss": -2592.0, + "loss/preference_loss": -2592.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23046875, + "rewards/margins": 0.017578125, + "rewards/rejected": 0.212890625, + "step": 389 + }, + { + "epoch": 0.11250540891389009, + "grad_norm": 10.788131766783017, + "learning_rate": 4.997655511922843e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -2032.0, + "logps/rejected": -1968.0, + "loss": 0.6812, + "loss/demonstration_loss": -4048.0, + "loss/preference_loss": -4032.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.453125, + "rewards/margins": 0.027587890625, + "rewards/rejected": 0.42578125, + "step": 390 + }, + { + "epoch": 0.1127938843213616, + "grad_norm": 12.261534202909708, + "learning_rate": 4.997545216046112e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.0, + "logps/chosen": -1648.0, + "logps/rejected": -1768.0, + "loss": 0.6989, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.310546875, + "rewards/margins": -0.02734375, + "rewards/rejected": 0.337890625, + "step": 391 + }, + { + "epoch": 0.11308235972883311, + "grad_norm": 10.75043528463301, + "learning_rate": 4.997432386311114e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.03125, + "logps/chosen": -1392.0, + "logps/rejected": -1496.0, + "loss": 0.7007, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.298828125, + "rewards/margins": 0.05126953125, + "rewards/rejected": 0.248046875, + "step": 392 + }, + { + "epoch": 0.11337083513630462, + "grad_norm": 13.459322288398441, + "learning_rate": 4.99731702283232e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.078125, + "logps/chosen": -1816.0, + "logps/rejected": -1856.0, + "loss": 0.6843, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.388671875, + "rewards/margins": -0.001251220703125, + "rewards/rejected": 0.388671875, + "step": 393 + }, + { + "epoch": 0.11365931054377615, + "grad_norm": 11.69505172687698, + "learning_rate": 4.997199125726769e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.953125, + "logps/chosen": -1824.0, + "logps/rejected": -1728.0, + "loss": 0.7077, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2197265625, + "rewards/margins": -0.0224609375, + "rewards/rejected": 0.2421875, + "step": 394 + }, + { + "epoch": 0.11394778595124766, + "grad_norm": 12.290693170480603, + "learning_rate": 4.997078695114075e-07, + "logits/chosen": 3.0625, + "logits/rejected": 2.96875, + "logps/chosen": -1376.0, + "logps/rejected": -1424.0, + "loss": 0.7065, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2832.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3125, + "rewards/margins": -0.028076171875, + "rewards/rejected": 0.341796875, + "step": 395 + }, + { + "epoch": 0.11423626135871917, + "grad_norm": 11.138301783679744, + "learning_rate": 4.996955731116417e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.03125, + "logps/chosen": -2144.0, + "logps/rejected": -2096.0, + "loss": 0.7006, + "loss/demonstration_loss": -4256.0, + "loss/preference_loss": -4288.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.34765625, + "rewards/margins": -0.03955078125, + "rewards/rejected": 0.38671875, + "step": 396 + }, + { + "epoch": 0.11452473676619068, + "grad_norm": 11.69278126430225, + "learning_rate": 4.996830233858547e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.921875, + "logps/chosen": -1832.0, + "logps/rejected": -1936.0, + "loss": 0.6902, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.27734375, + "rewards/margins": 0.044189453125, + "rewards/rejected": 0.232421875, + "step": 397 + }, + { + "epoch": 0.1148132121736622, + "grad_norm": 11.737666952590862, + "learning_rate": 4.996702203467789e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1168.0, + "logps/rejected": -1336.0, + "loss": 0.6924, + "loss/demonstration_loss": -2528.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.26171875, + "rewards/margins": -0.005523681640625, + "rewards/rejected": 0.267578125, + "step": 398 + }, + { + "epoch": 0.1151016875811337, + "grad_norm": 11.380407152789367, + "learning_rate": 4.996571640074033e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.953125, + "logps/chosen": -1824.0, + "logps/rejected": -1512.0, + "loss": 0.6678, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.10791015625, + "rewards/rejected": 0.30078125, + "step": 399 + }, + { + "epoch": 0.11539016298860522, + "grad_norm": 12.966311830687642, + "learning_rate": 4.996438543809742e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1760.0, + "logps/rejected": -1648.0, + "loss": 0.6995, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.036376953125, + "rewards/rejected": 0.30078125, + "step": 400 + }, + { + "epoch": 0.11567863839607674, + "grad_norm": 11.138684980739974, + "learning_rate": 4.996302914809946e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1416.0, + "logps/rejected": -1408.0, + "loss": 0.6884, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3125, + "rewards/margins": -0.01458740234375, + "rewards/rejected": 0.326171875, + "step": 401 + }, + { + "epoch": 0.11596711380354825, + "grad_norm": 11.039411503538636, + "learning_rate": 4.996164753212247e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.859375, + "logps/chosen": -1736.0, + "logps/rejected": -1784.0, + "loss": 0.678, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.027587890625, + "rewards/rejected": 0.365234375, + "step": 402 + }, + { + "epoch": 0.11625558921101976, + "grad_norm": 12.650995680888709, + "learning_rate": 4.996024059156815e-07, + "logits/chosen": 2.84375, + "logits/rejected": 2.90625, + "logps/chosen": -1512.0, + "logps/rejected": -1840.0, + "loss": 0.6964, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.359375, + "rewards/margins": 0.033203125, + "rewards/rejected": 0.32421875, + "step": 403 + }, + { + "epoch": 0.11654406461849127, + "grad_norm": 10.769179424923426, + "learning_rate": 4.99588083278639e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.09375, + "logps/chosen": -2032.0, + "logps/rejected": -1736.0, + "loss": 0.6774, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.07763671875, + "rewards/rejected": 0.30078125, + "step": 404 + }, + { + "epoch": 0.11683254002596279, + "grad_norm": 11.517230629593275, + "learning_rate": 4.99573507424628e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1600.0, + "logps/rejected": -1672.0, + "loss": 0.7019, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.294921875, + "rewards/margins": -0.0908203125, + "rewards/rejected": 0.384765625, + "step": 405 + }, + { + "epoch": 0.1171210154334343, + "grad_norm": 11.802766660909494, + "learning_rate": 4.995586783684363e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.984375, + "logps/chosen": -1496.0, + "logps/rejected": -1496.0, + "loss": 0.7174, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.296875, + "rewards/margins": 0.002410888671875, + "rewards/rejected": 0.294921875, + "step": 406 + }, + { + "epoch": 0.11740949084090581, + "grad_norm": 11.169480853542668, + "learning_rate": 4.995435961251088e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.984375, + "logps/chosen": -1824.0, + "logps/rejected": -1616.0, + "loss": 0.6655, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.373046875, + "rewards/margins": -0.0107421875, + "rewards/rejected": 0.3828125, + "step": 407 + }, + { + "epoch": 0.11769796624837732, + "grad_norm": 10.27843734823317, + "learning_rate": 4.995282607099467e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.046875, + "logps/chosen": -1576.0, + "logps/rejected": -1640.0, + "loss": 0.6769, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41796875, + "rewards/margins": 0.068359375, + "rewards/rejected": 0.349609375, + "step": 408 + }, + { + "epoch": 0.11798644165584884, + "grad_norm": 12.346267179870837, + "learning_rate": 4.995126721385085e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.796875, + "logps/chosen": -1432.0, + "logps/rejected": -1408.0, + "loss": 0.7251, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.275390625, + "rewards/margins": -0.09912109375, + "rewards/rejected": 0.375, + "step": 409 + }, + { + "epoch": 0.11827491706332036, + "grad_norm": 12.674708500126071, + "learning_rate": 4.994968304266095e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.046875, + "logps/chosen": -1448.0, + "logps/rejected": -1560.0, + "loss": 0.7049, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.24609375, + "rewards/margins": -0.0125732421875, + "rewards/rejected": 0.259765625, + "step": 410 + }, + { + "epoch": 0.11856339247079187, + "grad_norm": 12.675925233095406, + "learning_rate": 4.994807355903217e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.9375, + "logps/chosen": -1624.0, + "logps/rejected": -1496.0, + "loss": 0.6935, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.298828125, + "rewards/margins": 0.029296875, + "rewards/rejected": 0.26953125, + "step": 411 + }, + { + "epoch": 0.11885186787826338, + "grad_norm": 11.25341318054663, + "learning_rate": 4.994643876459737e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1384.0, + "logps/rejected": -1288.0, + "loss": 0.6818, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.42578125, + "rewards/margins": 0.10302734375, + "rewards/rejected": 0.322265625, + "step": 412 + }, + { + "epoch": 0.11914034328573489, + "grad_norm": 11.017715728761027, + "learning_rate": 4.994477866101517e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.96875, + "logps/chosen": -1456.0, + "logps/rejected": -1384.0, + "loss": 0.6552, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2864.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.314453125, + "rewards/margins": 0.06201171875, + "rewards/rejected": 0.251953125, + "step": 413 + }, + { + "epoch": 0.1194288186932064, + "grad_norm": 11.84260987175539, + "learning_rate": 4.994309324996976e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.140625, + "logps/chosen": -1584.0, + "logps/rejected": -1448.0, + "loss": 0.6791, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.30859375, + "rewards/margins": 0.04345703125, + "rewards/rejected": 0.265625, + "step": 414 + }, + { + "epoch": 0.11971729410067791, + "grad_norm": 12.32663216952759, + "learning_rate": 4.994138253317107e-07, + "logits/chosen": 2.75, + "logits/rejected": 2.84375, + "logps/chosen": -1784.0, + "logps/rejected": -1728.0, + "loss": 0.6805, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.369140625, + "rewards/margins": 0.046142578125, + "rewards/rejected": 0.322265625, + "step": 415 + }, + { + "epoch": 0.12000576950814942, + "grad_norm": 14.08264896084647, + "learning_rate": 4.99396465123547e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1608.0, + "logps/rejected": -1464.0, + "loss": 0.7081, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.32421875, + "rewards/margins": -0.0072021484375, + "rewards/rejected": 0.33203125, + "step": 416 + }, + { + "epoch": 0.12029424491562095, + "grad_norm": 11.865145517516, + "learning_rate": 4.99378851892819e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.046875, + "logps/chosen": -1728.0, + "logps/rejected": -1600.0, + "loss": 0.7057, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3828125, + "rewards/margins": 0.040771484375, + "rewards/rejected": 0.341796875, + "step": 417 + }, + { + "epoch": 0.12058272032309246, + "grad_norm": 10.569651738583094, + "learning_rate": 4.99360985657396e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1904.0, + "logps/rejected": -1880.0, + "loss": 0.6597, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.0908203125, + "rewards/rejected": 0.318359375, + "step": 418 + }, + { + "epoch": 0.12087119573056397, + "grad_norm": 10.749124921358813, + "learning_rate": 4.993428664354041e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.109375, + "logps/chosen": -1864.0, + "logps/rejected": -1784.0, + "loss": 0.7182, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.306640625, + "rewards/margins": -0.0218505859375, + "rewards/rejected": 0.328125, + "step": 419 + }, + { + "epoch": 0.12115967113803548, + "grad_norm": 12.045921528662834, + "learning_rate": 4.99324494245226e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.921875, + "logps/chosen": -1992.0, + "logps/rejected": -2048.0, + "loss": 0.662, + "loss/demonstration_loss": -4096.0, + "loss/preference_loss": -4080.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.1103515625, + "rewards/rejected": 0.41015625, + "step": 420 + }, + { + "epoch": 0.121448146545507, + "grad_norm": 10.904436298811431, + "learning_rate": 4.99305869105501e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.8125, + "logps/chosen": -1368.0, + "logps/rejected": -1432.0, + "loss": 0.6655, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2832.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.318359375, + "rewards/margins": 0.001861572265625, + "rewards/rejected": 0.31640625, + "step": 421 + }, + { + "epoch": 0.1217366219529785, + "grad_norm": 12.071667651710214, + "learning_rate": 4.992869910351249e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.09375, + "logps/chosen": -1288.0, + "logps/rejected": -1400.0, + "loss": 0.7188, + "loss/demonstration_loss": -2704.0, + "loss/preference_loss": -2720.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.2333984375, + "rewards/margins": -0.0791015625, + "rewards/rejected": 0.3125, + "step": 422 + }, + { + "epoch": 0.12202509736045002, + "grad_norm": 11.780346960823579, + "learning_rate": 4.992678600532503e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.078125, + "logps/chosen": -1608.0, + "logps/rejected": -1520.0, + "loss": 0.7131, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.314453125, + "rewards/margins": -0.06005859375, + "rewards/rejected": 0.375, + "step": 423 + }, + { + "epoch": 0.12231357276792154, + "grad_norm": 12.224839945194843, + "learning_rate": 4.992484761792865e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.03125, + "logps/chosen": -1552.0, + "logps/rejected": -1736.0, + "loss": 0.6599, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.0654296875, + "rewards/rejected": 0.345703125, + "step": 424 + }, + { + "epoch": 0.12260204817539305, + "grad_norm": 10.88755251542413, + "learning_rate": 4.99228839432899e-07, + "logits/chosen": 3.125, + "logits/rejected": 2.984375, + "logps/chosen": -1608.0, + "logps/rejected": -1760.0, + "loss": 0.6818, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.384765625, + "rewards/margins": 0.04736328125, + "rewards/rejected": 0.337890625, + "step": 425 + }, + { + "epoch": 0.12289052358286456, + "grad_norm": 10.867749018766673, + "learning_rate": 4.992089498340101e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.09375, + "logps/chosen": -1352.0, + "logps/rejected": -1656.0, + "loss": 0.7012, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2890625, + "rewards/margins": -0.022705078125, + "rewards/rejected": 0.310546875, + "step": 426 + }, + { + "epoch": 0.12317899899033607, + "grad_norm": 10.196699378594237, + "learning_rate": 4.991888074027985e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1856.0, + "logps/rejected": -1808.0, + "loss": 0.6611, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.359375, + "rewards/margins": 0.0771484375, + "rewards/rejected": 0.28125, + "step": 427 + }, + { + "epoch": 0.12346747439780759, + "grad_norm": 10.87709180925879, + "learning_rate": 4.991684121596998e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0, + "logps/chosen": -1640.0, + "logps/rejected": -1728.0, + "loss": 0.6871, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.05126953125, + "rewards/rejected": 0.271484375, + "step": 428 + }, + { + "epoch": 0.1237559498052791, + "grad_norm": 11.19434474732812, + "learning_rate": 4.991477641254055e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0, + "logps/chosen": -1456.0, + "logps/rejected": -1528.0, + "loss": 0.6641, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28125, + "rewards/margins": 0.06982421875, + "rewards/rejected": 0.2109375, + "step": 429 + }, + { + "epoch": 0.12404442521275061, + "grad_norm": 11.238345351868436, + "learning_rate": 4.99126863320864e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.09375, + "logps/chosen": -1984.0, + "logps/rejected": -1984.0, + "loss": 0.719, + "loss/demonstration_loss": -4000.0, + "loss/preference_loss": -4016.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.333984375, + "rewards/margins": -0.0556640625, + "rewards/rejected": 0.390625, + "step": 430 + }, + { + "epoch": 0.12433290062022212, + "grad_norm": 11.458387677602374, + "learning_rate": 4.991057097672798e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0625, + "logps/chosen": -1368.0, + "logps/rejected": -1376.0, + "loss": 0.7018, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.263671875, + "rewards/margins": -0.029296875, + "rewards/rejected": 0.29296875, + "step": 431 + }, + { + "epoch": 0.12462137602769364, + "grad_norm": 11.817716504736982, + "learning_rate": 4.990843034861143e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.046875, + "logps/chosen": -1992.0, + "logps/rejected": -1800.0, + "loss": 0.66, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.37890625, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.3125, + "step": 432 + }, + { + "epoch": 0.12490985143516516, + "grad_norm": 11.657289397930844, + "learning_rate": 4.990626444990848e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.90625, + "logps/chosen": -1520.0, + "logps/rejected": -1496.0, + "loss": 0.6896, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.29296875, + "rewards/margins": 0.0223388671875, + "rewards/rejected": 0.26953125, + "step": 433 + }, + { + "epoch": 0.12519832684263665, + "grad_norm": 12.308708717277893, + "learning_rate": 4.990407328281651e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.96875, + "logps/chosen": -1808.0, + "logps/rejected": -1616.0, + "loss": 0.6609, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.1103515625, + "rewards/rejected": 0.298828125, + "step": 434 + }, + { + "epoch": 0.12548680225010816, + "grad_norm": 12.31859895211819, + "learning_rate": 4.990185684955858e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.953125, + "logps/chosen": -1672.0, + "logps/rejected": -1608.0, + "loss": 0.7099, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.287109375, + "rewards/margins": -0.000518798828125, + "rewards/rejected": 0.287109375, + "step": 435 + }, + { + "epoch": 0.1257752776575797, + "grad_norm": 10.812363768756235, + "learning_rate": 4.989961515238333e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -2336.0, + "logps/rejected": -2208.0, + "loss": 0.6683, + "loss/demonstration_loss": -4576.0, + "loss/preference_loss": -4576.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.030029296875, + "rewards/rejected": 0.435546875, + "step": 436 + }, + { + "epoch": 0.12606375306505122, + "grad_norm": 12.895646729122682, + "learning_rate": 4.989734819356503e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.921875, + "logps/chosen": -1864.0, + "logps/rejected": -1712.0, + "loss": 0.6873, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.00567626953125, + "rewards/rejected": 0.388671875, + "step": 437 + }, + { + "epoch": 0.12635222847252273, + "grad_norm": 12.384810735149756, + "learning_rate": 4.989505597540365e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.90625, + "logps/chosen": -1736.0, + "logps/rejected": -1728.0, + "loss": 0.6877, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.318359375, + "rewards/margins": -0.02099609375, + "rewards/rejected": 0.337890625, + "step": 438 + }, + { + "epoch": 0.12664070387999424, + "grad_norm": 11.04595161539796, + "learning_rate": 4.989273850022468e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.09375, + "logps/chosen": -1280.0, + "logps/rejected": -1392.0, + "loss": 0.6914, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.24609375, + "rewards/margins": -0.018798828125, + "rewards/rejected": 0.265625, + "step": 439 + }, + { + "epoch": 0.12692917928746575, + "grad_norm": 12.535500951366224, + "learning_rate": 4.989039577037933e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.96875, + "logps/chosen": -1952.0, + "logps/rejected": -1952.0, + "loss": 0.6918, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.38671875, + "rewards/margins": -0.0081787109375, + "rewards/rejected": 0.39453125, + "step": 440 + }, + { + "epoch": 0.12721765469493726, + "grad_norm": 12.037641912147905, + "learning_rate": 4.988802778824437e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.03125, + "logps/chosen": -1328.0, + "logps/rejected": -1648.0, + "loss": 0.6973, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.0419921875, + "rewards/rejected": 0.29296875, + "step": 441 + }, + { + "epoch": 0.12750613010240877, + "grad_norm": 13.59476767942337, + "learning_rate": 4.988563455622222e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.0, + "logps/chosen": -1752.0, + "logps/rejected": -1520.0, + "loss": 0.6719, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.349609375, + "rewards/margins": -0.00360107421875, + "rewards/rejected": 0.353515625, + "step": 442 + }, + { + "epoch": 0.12779460550988028, + "grad_norm": 12.482000024972319, + "learning_rate": 4.988321607674091e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.9375, + "logps/chosen": -1768.0, + "logps/rejected": -1712.0, + "loss": 0.6641, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.07958984375, + "rewards/rejected": 0.314453125, + "step": 443 + }, + { + "epoch": 0.1280830809173518, + "grad_norm": 11.943922095909839, + "learning_rate": 4.988077235225407e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0, + "logps/chosen": -1912.0, + "logps/rejected": -1912.0, + "loss": 0.6762, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.09130859375, + "rewards/rejected": 0.40234375, + "step": 444 + }, + { + "epoch": 0.1283715563248233, + "grad_norm": 11.076323161978614, + "learning_rate": 4.987830338524098e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -1784.0, + "logps/rejected": -1560.0, + "loss": 0.6735, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34375, + "rewards/margins": 0.01312255859375, + "rewards/rejected": 0.330078125, + "step": 445 + }, + { + "epoch": 0.12866003173229482, + "grad_norm": 11.050690941040932, + "learning_rate": 4.987580917820649e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1632.0, + "logps/rejected": -1600.0, + "loss": 0.6868, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.36328125, + "rewards/margins": -0.005615234375, + "rewards/rejected": 0.369140625, + "step": 446 + }, + { + "epoch": 0.12894850713976633, + "grad_norm": 11.232999306414936, + "learning_rate": 4.987328973368106e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.921875, + "logps/chosen": -1048.0, + "logps/rejected": -1128.0, + "loss": 0.6916, + "loss/demonstration_loss": -2208.0, + "loss/preference_loss": -2208.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2236328125, + "rewards/margins": -0.0186767578125, + "rewards/rejected": 0.2421875, + "step": 447 + }, + { + "epoch": 0.12923698254723784, + "grad_norm": 11.837612777477373, + "learning_rate": 4.987074505422078e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1920.0, + "logps/rejected": -2064.0, + "loss": 0.6935, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4032.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.40625, + "rewards/margins": -0.039794921875, + "rewards/rejected": 0.4453125, + "step": 448 + }, + { + "epoch": 0.12952545795470935, + "grad_norm": 11.471188644583533, + "learning_rate": 4.986817514240734e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.90625, + "logps/chosen": -1920.0, + "logps/rejected": -1808.0, + "loss": 0.6791, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.038330078125, + "rewards/rejected": 0.296875, + "step": 449 + }, + { + "epoch": 0.12981393336218086, + "grad_norm": 12.968585608845181, + "learning_rate": 4.986558000084798e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.953125, + "logps/chosen": -1408.0, + "logps/rejected": -1568.0, + "loss": 0.6886, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.279296875, + "rewards/margins": -0.003387451171875, + "rewards/rejected": 0.283203125, + "step": 450 + }, + { + "epoch": 0.1301024087696524, + "grad_norm": 12.577737137438792, + "learning_rate": 4.98629596321756e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.0625, + "logps/chosen": -1776.0, + "logps/rejected": -1568.0, + "loss": 0.6766, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.400390625, + "rewards/margins": 0.00836181640625, + "rewards/rejected": 0.390625, + "step": 451 + }, + { + "epoch": 0.1303908841771239, + "grad_norm": 12.646107856174552, + "learning_rate": 4.986031403904868e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.03125, + "logps/chosen": -1928.0, + "logps/rejected": -1808.0, + "loss": 0.6656, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4375, + "rewards/margins": 0.0751953125, + "rewards/rejected": 0.36328125, + "step": 452 + }, + { + "epoch": 0.13067935958459542, + "grad_norm": 10.8715534249111, + "learning_rate": 4.985764322415124e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1872.0, + "logps/rejected": -2032.0, + "loss": 0.7211, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.30859375, + "rewards/margins": -0.053955078125, + "rewards/rejected": 0.361328125, + "step": 453 + }, + { + "epoch": 0.13096783499206693, + "grad_norm": 10.66878191952447, + "learning_rate": 4.985494719019297e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.09375, + "logps/chosen": -1848.0, + "logps/rejected": -1904.0, + "loss": 0.6908, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.3828125, + "rewards/margins": -0.045654296875, + "rewards/rejected": 0.427734375, + "step": 454 + }, + { + "epoch": 0.13125631039953845, + "grad_norm": 11.664312218691633, + "learning_rate": 4.985222593990907e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.96875, + "logps/chosen": -1584.0, + "logps/rejected": -1560.0, + "loss": 0.6768, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.263671875, + "rewards/margins": 0.02587890625, + "rewards/rejected": 0.2373046875, + "step": 455 + }, + { + "epoch": 0.13154478580700996, + "grad_norm": 11.827868633518412, + "learning_rate": 4.984947947606038e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.984375, + "logps/chosen": -1632.0, + "logps/rejected": -1768.0, + "loss": 0.6743, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.1171875, + "rewards/rejected": 0.294921875, + "step": 456 + }, + { + "epoch": 0.13183326121448147, + "grad_norm": 13.64045911269315, + "learning_rate": 4.984670780143327e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1640.0, + "logps/rejected": -1536.0, + "loss": 0.6859, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.326171875, + "rewards/margins": -0.02001953125, + "rewards/rejected": 0.345703125, + "step": 457 + }, + { + "epoch": 0.13212173662195298, + "grad_norm": 10.830075394101764, + "learning_rate": 4.984391091883973e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.96875, + "logps/chosen": -1600.0, + "logps/rejected": -1576.0, + "loss": 0.6599, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.365234375, + "rewards/margins": 0.053955078125, + "rewards/rejected": 0.310546875, + "step": 458 + }, + { + "epoch": 0.1324102120294245, + "grad_norm": 11.566196448986432, + "learning_rate": 4.984108883111732e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.96875, + "logps/chosen": -1856.0, + "logps/rejected": -1960.0, + "loss": 0.7008, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3203125, + "rewards/margins": -0.099609375, + "rewards/rejected": 0.419921875, + "step": 459 + }, + { + "epoch": 0.132698687436896, + "grad_norm": 12.89274973770771, + "learning_rate": 4.983824154112913e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0, + "logps/chosen": -1728.0, + "logps/rejected": -1536.0, + "loss": 0.7102, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27734375, + "rewards/margins": -0.034912109375, + "rewards/rejected": 0.3125, + "step": 460 + }, + { + "epoch": 0.1329871628443675, + "grad_norm": 12.851351655799954, + "learning_rate": 4.983536905176387e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.046875, + "logps/chosen": -1488.0, + "logps/rejected": -1424.0, + "loss": 0.6927, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.302734375, + "rewards/margins": -0.009521484375, + "rewards/rejected": 0.3125, + "step": 461 + }, + { + "epoch": 0.13327563825183902, + "grad_norm": 12.24378500561336, + "learning_rate": 4.983247136593578e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.046875, + "logps/chosen": -1728.0, + "logps/rejected": -1656.0, + "loss": 0.6857, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.1142578125, + "rewards/rejected": 0.29296875, + "step": 462 + }, + { + "epoch": 0.13356411365931053, + "grad_norm": 10.466269810760314, + "learning_rate": 4.982954848658469e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.921875, + "logps/chosen": -1960.0, + "logps/rejected": -1768.0, + "loss": 0.6609, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34375, + "rewards/margins": 0.043212890625, + "rewards/rejected": 0.298828125, + "step": 463 + }, + { + "epoch": 0.13385258906678205, + "grad_norm": 11.302917504764295, + "learning_rate": 4.982660041667597e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.921875, + "logps/chosen": -1616.0, + "logps/rejected": -1448.0, + "loss": 0.6852, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.390625, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.326171875, + "step": 464 + }, + { + "epoch": 0.13414106447425356, + "grad_norm": 10.878424035843983, + "learning_rate": 4.982362715920054e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.125, + "logps/chosen": -1616.0, + "logps/rejected": -1136.0, + "loss": 0.6487, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.345703125, + "rewards/margins": 0.162109375, + "rewards/rejected": 0.1826171875, + "step": 465 + }, + { + "epoch": 0.1344295398817251, + "grad_norm": 11.994104282298917, + "learning_rate": 4.982062871717492e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.078125, + "logps/chosen": -1432.0, + "logps/rejected": -1176.0, + "loss": 0.6815, + "loss/demonstration_loss": -2640.0, + "loss/preference_loss": -2640.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.314453125, + "rewards/margins": 0.036865234375, + "rewards/rejected": 0.27734375, + "step": 466 + }, + { + "epoch": 0.1347180152891966, + "grad_norm": 11.114639824143396, + "learning_rate": 4.981760509364112e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1856.0, + "logps/rejected": -1952.0, + "loss": 0.674, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.006256103515625, + "rewards/rejected": 0.40625, + "step": 467 + }, + { + "epoch": 0.13500649069666812, + "grad_norm": 10.894157065756591, + "learning_rate": 4.981455629166674e-07, + "logits/chosen": 3.03125, + "logits/rejected": 2.921875, + "logps/chosen": -1592.0, + "logps/rejected": -1632.0, + "loss": 0.6718, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37109375, + "rewards/margins": 0.07470703125, + "rewards/rejected": 0.296875, + "step": 468 + }, + { + "epoch": 0.13529496610413963, + "grad_norm": 15.32373775505376, + "learning_rate": 4.98114823143449e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.9375, + "logps/chosen": -1712.0, + "logps/rejected": -1616.0, + "loss": 0.7105, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.423828125, + "rewards/margins": 0.076171875, + "rewards/rejected": 0.34765625, + "step": 469 + }, + { + "epoch": 0.13558344151161114, + "grad_norm": 12.684554010628778, + "learning_rate": 4.980838316479427e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.078125, + "logps/chosen": -1568.0, + "logps/rejected": -1376.0, + "loss": 0.6589, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.038818359375, + "rewards/rejected": 0.29296875, + "step": 470 + }, + { + "epoch": 0.13587191691908265, + "grad_norm": 10.479653630459104, + "learning_rate": 4.980525884615907e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.21875, + "logps/chosen": -1568.0, + "logps/rejected": -1664.0, + "loss": 0.7075, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.345703125, + "rewards/margins": -0.0113525390625, + "rewards/rejected": 0.357421875, + "step": 471 + }, + { + "epoch": 0.13616039232655416, + "grad_norm": 12.171377176430834, + "learning_rate": 4.980210936160904e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.015625, + "logps/chosen": -1784.0, + "logps/rejected": -1600.0, + "loss": 0.6942, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3828125, + "rewards/margins": 0.016357421875, + "rewards/rejected": 0.3671875, + "step": 472 + }, + { + "epoch": 0.13644886773402568, + "grad_norm": 13.86944091475449, + "learning_rate": 4.979893471433946e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.046875, + "logps/chosen": -1584.0, + "logps/rejected": -1792.0, + "loss": 0.7258, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3671875, + "rewards/margins": -0.00384521484375, + "rewards/rejected": 0.37109375, + "step": 473 + }, + { + "epoch": 0.1367373431414972, + "grad_norm": 11.511931924315167, + "learning_rate": 4.979573490757112e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.125, + "logps/chosen": -1760.0, + "logps/rejected": -1720.0, + "loss": 0.7013, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.373046875, + "rewards/margins": -0.0126953125, + "rewards/rejected": 0.38671875, + "step": 474 + }, + { + "epoch": 0.1370258185489687, + "grad_norm": 11.802226915711545, + "learning_rate": 4.979250994455038e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.03125, + "logps/chosen": -1824.0, + "logps/rejected": -1712.0, + "loss": 0.7106, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.33203125, + "rewards/margins": -0.08154296875, + "rewards/rejected": 0.412109375, + "step": 475 + }, + { + "epoch": 0.1373142939564402, + "grad_norm": 12.279123869774102, + "learning_rate": 4.978925982854906e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1872.0, + "logps/rejected": -2048.0, + "loss": 0.7271, + "loss/demonstration_loss": -3968.0, + "loss/preference_loss": -3968.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.40625, + "rewards/margins": -0.058837890625, + "rewards/rejected": 0.46484375, + "step": 476 + }, + { + "epoch": 0.13760276936391172, + "grad_norm": 11.293577935133893, + "learning_rate": 4.978598456286455e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.875, + "logps/chosen": -1672.0, + "logps/rejected": -1648.0, + "loss": 0.6981, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26953125, + "rewards/margins": -0.04541015625, + "rewards/rejected": 0.314453125, + "step": 477 + }, + { + "epoch": 0.13789124477138323, + "grad_norm": 10.82962450929594, + "learning_rate": 4.978268415081973e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0625, + "logps/chosen": -1752.0, + "logps/rejected": -1720.0, + "loss": 0.6935, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.400390625, + "rewards/margins": 0.01708984375, + "rewards/rejected": 0.3828125, + "step": 478 + }, + { + "epoch": 0.13817972017885474, + "grad_norm": 13.905992733274704, + "learning_rate": 4.9779358595763e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.171875, + "logps/chosen": -1528.0, + "logps/rejected": -1432.0, + "loss": 0.7242, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3125, + "rewards/margins": -0.0186767578125, + "rewards/rejected": 0.33203125, + "step": 479 + }, + { + "epoch": 0.13846819558632625, + "grad_norm": 10.162473885017894, + "learning_rate": 4.977600790106826e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.046875, + "logps/chosen": -1600.0, + "logps/rejected": -1552.0, + "loss": 0.693, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.34765625, + "rewards/margins": -0.034423828125, + "rewards/rejected": 0.3828125, + "step": 480 + }, + { + "epoch": 0.13875667099379776, + "grad_norm": 13.908433660172133, + "learning_rate": 4.977263207013493e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.25, + "logps/chosen": -1880.0, + "logps/rejected": -1944.0, + "loss": 0.7183, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.00360107421875, + "rewards/rejected": 0.44140625, + "step": 481 + }, + { + "epoch": 0.1390451464012693, + "grad_norm": 13.913766960186566, + "learning_rate": 4.976923110638794e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.109375, + "logps/chosen": -1728.0, + "logps/rejected": -1792.0, + "loss": 0.6802, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.0101318359375, + "rewards/rejected": 0.455078125, + "step": 482 + }, + { + "epoch": 0.13933362180874082, + "grad_norm": 12.677172340535385, + "learning_rate": 4.976580501327767e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1688.0, + "logps/rejected": -1584.0, + "loss": 0.6474, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.330078125, + "step": 483 + }, + { + "epoch": 0.13962209721621233, + "grad_norm": 11.556130226393334, + "learning_rate": 4.976235379428004e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.046875, + "logps/chosen": -1824.0, + "logps/rejected": -1816.0, + "loss": 0.7037, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.375, + "rewards/margins": -0.030029296875, + "rewards/rejected": 0.404296875, + "step": 484 + }, + { + "epoch": 0.13991057262368384, + "grad_norm": 10.399818391535014, + "learning_rate": 4.975887745289646e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.078125, + "logps/chosen": -1400.0, + "logps/rejected": -1560.0, + "loss": 0.694, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.29296875, + "rewards/margins": -0.05322265625, + "rewards/rejected": 0.345703125, + "step": 485 + }, + { + "epoch": 0.14019904803115535, + "grad_norm": 12.129636704268066, + "learning_rate": 4.97553759926538e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1224.0, + "logps/rejected": -1184.0, + "loss": 0.6909, + "loss/demonstration_loss": -2432.0, + "loss/preference_loss": -2448.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.27734375, + "rewards/margins": -0.019775390625, + "rewards/rejected": 0.296875, + "step": 486 + }, + { + "epoch": 0.14048752343862686, + "grad_norm": 11.160443819370817, + "learning_rate": 4.975184941710444e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.9375, + "logps/chosen": -1680.0, + "logps/rejected": -1712.0, + "loss": 0.7097, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.341796875, + "rewards/margins": -0.00286865234375, + "rewards/rejected": 0.345703125, + "step": 487 + }, + { + "epoch": 0.14077599884609837, + "grad_norm": 10.673503276953756, + "learning_rate": 4.974829772982622e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.0625, + "logps/chosen": -1424.0, + "logps/rejected": -1464.0, + "loss": 0.7014, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.404296875, + "rewards/margins": 0.0458984375, + "rewards/rejected": 0.357421875, + "step": 488 + }, + { + "epoch": 0.14106447425356988, + "grad_norm": 11.863813185530908, + "learning_rate": 4.974472093442247e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -1696.0, + "logps/rejected": -1696.0, + "loss": 0.7107, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4296875, + "rewards/margins": -0.04248046875, + "rewards/rejected": 0.47265625, + "step": 489 + }, + { + "epoch": 0.1413529496610414, + "grad_norm": 10.923678398397007, + "learning_rate": 4.9741119034522e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.21875, + "logps/chosen": -1688.0, + "logps/rejected": -1664.0, + "loss": 0.6685, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40234375, + "rewards/margins": 0.04736328125, + "rewards/rejected": 0.35546875, + "step": 490 + }, + { + "epoch": 0.1416414250685129, + "grad_norm": 11.75287871932818, + "learning_rate": 4.973749203377906e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.1875, + "logps/chosen": -1672.0, + "logps/rejected": -1584.0, + "loss": 0.6703, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.345703125, + "rewards/margins": -0.056884765625, + "rewards/rejected": 0.40234375, + "step": 491 + }, + { + "epoch": 0.14192990047598442, + "grad_norm": 10.14461251472448, + "learning_rate": 4.97338399358734e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.171875, + "logps/chosen": -1520.0, + "logps/rejected": -1472.0, + "loss": 0.6993, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.34765625, + "rewards/margins": -0.0291748046875, + "rewards/rejected": 0.376953125, + "step": 492 + }, + { + "epoch": 0.14221837588345593, + "grad_norm": 12.913836665899643, + "learning_rate": 4.973016274451022e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.03125, + "logps/chosen": -2016.0, + "logps/rejected": -2080.0, + "loss": 0.7001, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.361328125, + "rewards/margins": -0.07080078125, + "rewards/rejected": 0.431640625, + "step": 493 + }, + { + "epoch": 0.14250685129092744, + "grad_norm": 12.800683838309105, + "learning_rate": 4.972646046342018e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.046875, + "logps/chosen": -1152.0, + "logps/rejected": -1272.0, + "loss": 0.6843, + "loss/demonstration_loss": -2448.0, + "loss/preference_loss": -2448.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2412109375, + "rewards/margins": -0.005401611328125, + "rewards/rejected": 0.2470703125, + "step": 494 + }, + { + "epoch": 0.14279532669839895, + "grad_norm": 12.285986535124662, + "learning_rate": 4.972273309635936e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1736.0, + "logps/rejected": -1592.0, + "loss": 0.683, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.056396484375, + "rewards/rejected": 0.390625, + "step": 495 + }, + { + "epoch": 0.14308380210587046, + "grad_norm": 10.13839707498339, + "learning_rate": 4.971898064710935e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.09375, + "logps/chosen": -1640.0, + "logps/rejected": -1328.0, + "loss": 0.689, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.330078125, + "rewards/margins": 0.018310546875, + "rewards/rejected": 0.3125, + "step": 496 + }, + { + "epoch": 0.143372277513342, + "grad_norm": 9.980473186457463, + "learning_rate": 4.971520311947717e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1744.0, + "logps/rejected": -1600.0, + "loss": 0.6873, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4375, + "rewards/margins": 0.03369140625, + "rewards/rejected": 0.404296875, + "step": 497 + }, + { + "epoch": 0.1436607529208135, + "grad_norm": 13.023225719010433, + "learning_rate": 4.971140051729522e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.125, + "logps/chosen": -1760.0, + "logps/rejected": -1728.0, + "loss": 0.7256, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.408203125, + "rewards/margins": -0.03759765625, + "rewards/rejected": 0.4453125, + "step": 498 + }, + { + "epoch": 0.14394922832828502, + "grad_norm": 10.743599075053961, + "learning_rate": 4.970757284442144e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.09375, + "logps/chosen": -1592.0, + "logps/rejected": -1544.0, + "loss": 0.71, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.392578125, + "rewards/margins": -0.04052734375, + "rewards/rejected": 0.431640625, + "step": 499 + }, + { + "epoch": 0.14423770373575653, + "grad_norm": 13.06207089049958, + "learning_rate": 4.970372010473914e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.171875, + "logps/chosen": -1544.0, + "logps/rejected": -1352.0, + "loss": 0.6778, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.11181640625, + "rewards/rejected": 0.296875, + "step": 500 + }, + { + "epoch": 0.14452617914322805, + "grad_norm": 11.510145092047358, + "learning_rate": 4.969984230215707e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.109375, + "logps/chosen": -1632.0, + "logps/rejected": -1696.0, + "loss": 0.6606, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.1083984375, + "rewards/rejected": 0.3046875, + "step": 501 + }, + { + "epoch": 0.14481465455069956, + "grad_norm": 10.651575592315242, + "learning_rate": 4.969593944060941e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.1875, + "logps/chosen": -1368.0, + "logps/rejected": -1408.0, + "loss": 0.6675, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.380859375, + "rewards/margins": 0.07470703125, + "rewards/rejected": 0.306640625, + "step": 502 + }, + { + "epoch": 0.14510312995817107, + "grad_norm": 11.880529427851066, + "learning_rate": 4.969201152405579e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.078125, + "logps/chosen": -1696.0, + "logps/rejected": -1600.0, + "loss": 0.6906, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.002685546875, + "rewards/rejected": 0.390625, + "step": 503 + }, + { + "epoch": 0.14539160536564258, + "grad_norm": 10.413485604537877, + "learning_rate": 4.968805855648121e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.9375, + "logps/chosen": -1800.0, + "logps/rejected": -1768.0, + "loss": 0.7117, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.375, + "rewards/margins": -0.09814453125, + "rewards/rejected": 0.474609375, + "step": 504 + }, + { + "epoch": 0.1456800807731141, + "grad_norm": 10.373490660400943, + "learning_rate": 4.968408054189612e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.015625, + "logps/chosen": -1608.0, + "logps/rejected": -1648.0, + "loss": 0.6699, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.357421875, + "rewards/margins": -0.0152587890625, + "rewards/rejected": 0.373046875, + "step": 505 + }, + { + "epoch": 0.1459685561805856, + "grad_norm": 10.339528955237336, + "learning_rate": 4.968007748433638e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.0, + "logps/chosen": -1744.0, + "logps/rejected": -1696.0, + "loss": 0.7034, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.373046875, + "rewards/margins": -0.052978515625, + "rewards/rejected": 0.42578125, + "step": 506 + }, + { + "epoch": 0.1462570315880571, + "grad_norm": 11.132867207070843, + "learning_rate": 4.967604938786324e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.109375, + "logps/chosen": -1536.0, + "logps/rejected": -1552.0, + "loss": 0.6699, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.388671875, + "rewards/margins": 0.08740234375, + "rewards/rejected": 0.30078125, + "step": 507 + }, + { + "epoch": 0.14654550699552862, + "grad_norm": 11.036855249285532, + "learning_rate": 4.967199625656337e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1896.0, + "logps/rejected": -1808.0, + "loss": 0.7117, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.0439453125, + "rewards/rejected": 0.375, + "step": 508 + }, + { + "epoch": 0.14683398240300014, + "grad_norm": 10.460695809387607, + "learning_rate": 4.966791809454885e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.125, + "logps/chosen": -1256.0, + "logps/rejected": -1136.0, + "loss": 0.6721, + "loss/demonstration_loss": -2432.0, + "loss/preference_loss": -2416.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.357421875, + "rewards/margins": 0.078125, + "rewards/rejected": 0.279296875, + "step": 509 + }, + { + "epoch": 0.14712245781047165, + "grad_norm": 9.528923214109243, + "learning_rate": 4.966381490595709e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.015625, + "logps/chosen": -1464.0, + "logps/rejected": -1216.0, + "loss": 0.67, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.318359375, + "rewards/margins": 0.043212890625, + "rewards/rejected": 0.275390625, + "step": 510 + }, + { + "epoch": 0.14741093321794316, + "grad_norm": 11.72081958272131, + "learning_rate": 4.965968669495097e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.1875, + "logps/chosen": -1792.0, + "logps/rejected": -1552.0, + "loss": 0.6562, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.0830078125, + "rewards/rejected": 0.421875, + "step": 511 + }, + { + "epoch": 0.1476994086254147, + "grad_norm": 10.907725844354541, + "learning_rate": 4.965553346571873e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.875, + "logps/chosen": -1792.0, + "logps/rejected": -1504.0, + "loss": 0.6899, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34765625, + "rewards/margins": 0.0264892578125, + "rewards/rejected": 0.322265625, + "step": 512 + }, + { + "epoch": 0.1479878840328862, + "grad_norm": 10.02020249462617, + "learning_rate": 4.965135522247396e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1432.0, + "logps/rejected": -1240.0, + "loss": 0.663, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3671875, + "rewards/margins": 0.0927734375, + "rewards/rejected": 0.275390625, + "step": 513 + }, + { + "epoch": 0.14827635944035772, + "grad_norm": 10.842272867250761, + "learning_rate": 4.964715196945567e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.125, + "logps/chosen": -2192.0, + "logps/rejected": -2224.0, + "loss": 0.7009, + "loss/demonstration_loss": -4480.0, + "loss/preference_loss": -4480.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.48046875, + "rewards/margins": -0.023681640625, + "rewards/rejected": 0.50390625, + "step": 514 + }, + { + "epoch": 0.14856483484782923, + "grad_norm": 9.68066870407659, + "learning_rate": 4.964292371092822e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1744.0, + "logps/rejected": -1520.0, + "loss": 0.6712, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.384765625, + "rewards/margins": 0.036865234375, + "rewards/rejected": 0.34765625, + "step": 515 + }, + { + "epoch": 0.14885331025530074, + "grad_norm": 10.366629315944794, + "learning_rate": 4.963867045118135e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1984.0, + "logps/rejected": -1768.0, + "loss": 0.6433, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.0810546875, + "rewards/rejected": 0.40625, + "step": 516 + }, + { + "epoch": 0.14914178566277225, + "grad_norm": 10.620667795775566, + "learning_rate": 4.963439219453015e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1816.0, + "logps/rejected": -1840.0, + "loss": 0.7, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.439453125, + "rewards/margins": 0.004638671875, + "rewards/rejected": 0.435546875, + "step": 517 + }, + { + "epoch": 0.14943026107024376, + "grad_norm": 10.174217265487732, + "learning_rate": 4.963008894531508e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.09375, + "logps/chosen": -1928.0, + "logps/rejected": -1648.0, + "loss": 0.6749, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.421875, + "rewards/margins": 0.049072265625, + "rewards/rejected": 0.373046875, + "step": 518 + }, + { + "epoch": 0.14971873647771528, + "grad_norm": 13.570250637417114, + "learning_rate": 4.962576070790198e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.109375, + "logps/chosen": -1968.0, + "logps/rejected": -1760.0, + "loss": 0.7209, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.35546875, + "rewards/margins": -0.020751953125, + "rewards/rejected": 0.375, + "step": 519 + }, + { + "epoch": 0.1500072118851868, + "grad_norm": 11.494283585065263, + "learning_rate": 4.962140748668199e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.890625, + "logps/chosen": -1656.0, + "logps/rejected": -1856.0, + "loss": 0.7144, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.45703125, + "rewards/margins": -0.052490234375, + "rewards/rejected": 0.51171875, + "step": 520 + }, + { + "epoch": 0.1502956872926583, + "grad_norm": 10.13905200325323, + "learning_rate": 4.961702928607165e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.09375, + "logps/chosen": -2024.0, + "logps/rejected": -1600.0, + "loss": 0.643, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.169921875, + "rewards/rejected": 0.306640625, + "step": 521 + }, + { + "epoch": 0.1505841627001298, + "grad_norm": 11.13609701771337, + "learning_rate": 4.961262611051278e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.125, + "logps/chosen": -1776.0, + "logps/rejected": -1776.0, + "loss": 0.6756, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.041015625, + "rewards/rejected": 0.451171875, + "step": 522 + }, + { + "epoch": 0.15087263810760132, + "grad_norm": 9.710250967343773, + "learning_rate": 4.960819796447261e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.03125, + "logps/chosen": -1256.0, + "logps/rejected": -1272.0, + "loss": 0.6805, + "loss/demonstration_loss": -2576.0, + "loss/preference_loss": -2560.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.359375, + "rewards/margins": 0.03759765625, + "rewards/rejected": 0.322265625, + "step": 523 + }, + { + "epoch": 0.15116111351507283, + "grad_norm": 13.364431601252312, + "learning_rate": 4.960374485244365e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -2040.0, + "logps/rejected": -1792.0, + "loss": 0.6866, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.0113525390625, + "rewards/rejected": 0.5390625, + "step": 524 + }, + { + "epoch": 0.15144958892254434, + "grad_norm": 10.221468709368152, + "learning_rate": 4.959926677894379e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.125, + "logps/chosen": -1840.0, + "logps/rejected": -2024.0, + "loss": 0.6888, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3920.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.431640625, + "rewards/margins": -0.021240234375, + "rewards/rejected": 0.453125, + "step": 525 + }, + { + "epoch": 0.15173806433001585, + "grad_norm": 10.345780175545148, + "learning_rate": 4.959476374851616e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.125, + "logps/chosen": -1296.0, + "logps/rejected": -1592.0, + "loss": 0.726, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.384765625, + "rewards/margins": -0.07373046875, + "rewards/rejected": 0.45703125, + "step": 526 + }, + { + "epoch": 0.1520265397374874, + "grad_norm": 12.840249468504261, + "learning_rate": 4.959023576572931e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1976.0, + "logps/rejected": -1960.0, + "loss": 0.6823, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3984.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.515625, + "rewards/margins": 0.03662109375, + "rewards/rejected": 0.4765625, + "step": 527 + }, + { + "epoch": 0.1523150151449589, + "grad_norm": 12.082983395582415, + "learning_rate": 4.958568283517702e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -1616.0, + "logps/rejected": -1480.0, + "loss": 0.6873, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.053466796875, + "rewards/rejected": 0.412109375, + "step": 528 + }, + { + "epoch": 0.15260349055243042, + "grad_norm": 13.089640007219462, + "learning_rate": 4.958110496147845e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1544.0, + "logps/rejected": -1504.0, + "loss": 0.7128, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.298828125, + "rewards/margins": -0.046875, + "rewards/rejected": 0.345703125, + "step": 529 + }, + { + "epoch": 0.15289196595990193, + "grad_norm": 11.86952872353304, + "learning_rate": 4.957650214927801e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1960.0, + "logps/rejected": -1992.0, + "loss": 0.6774, + "loss/demonstration_loss": -4000.0, + "loss/preference_loss": -4000.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.021240234375, + "rewards/rejected": 0.44921875, + "step": 530 + }, + { + "epoch": 0.15318044136737344, + "grad_norm": 10.677294276490391, + "learning_rate": 4.957187440324545e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.109375, + "logps/chosen": -1488.0, + "logps/rejected": -1888.0, + "loss": 0.7001, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3203125, + "rewards/margins": -0.04296875, + "rewards/rejected": 0.36328125, + "step": 531 + }, + { + "epoch": 0.15346891677484495, + "grad_norm": 11.546477717803599, + "learning_rate": 4.95672217280758e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.0625, + "logps/chosen": -1448.0, + "logps/rejected": -1536.0, + "loss": 0.7168, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.271484375, + "rewards/margins": -0.0274658203125, + "rewards/rejected": 0.298828125, + "step": 532 + }, + { + "epoch": 0.15375739218231646, + "grad_norm": 13.029540269827725, + "learning_rate": 4.956254412848936e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.234375, + "logps/chosen": -1976.0, + "logps/rejected": -1832.0, + "loss": 0.6652, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.482421875, + "rewards/margins": 0.0576171875, + "rewards/rejected": 0.42578125, + "step": 533 + }, + { + "epoch": 0.15404586758978797, + "grad_norm": 11.632828272662968, + "learning_rate": 4.955784160923176e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.203125, + "logps/chosen": -1968.0, + "logps/rejected": -1832.0, + "loss": 0.7059, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.46484375, + "rewards/margins": -0.003265380859375, + "rewards/rejected": 0.46875, + "step": 534 + }, + { + "epoch": 0.15433434299725948, + "grad_norm": 10.96310161450628, + "learning_rate": 4.955311417507391e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.140625, + "logps/chosen": -1472.0, + "logps/rejected": -1280.0, + "loss": 0.6476, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.375, + "rewards/margins": 0.109375, + "rewards/rejected": 0.265625, + "step": 535 + }, + { + "epoch": 0.154622818404731, + "grad_norm": 10.926793381308396, + "learning_rate": 4.954836183081194e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.078125, + "logps/chosen": -1840.0, + "logps/rejected": -1800.0, + "loss": 0.6913, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4453125, + "rewards/margins": 0.06689453125, + "rewards/rejected": 0.37890625, + "step": 536 + }, + { + "epoch": 0.1549112938122025, + "grad_norm": 12.892540389020454, + "learning_rate": 4.954358458126731e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.1875, + "logps/chosen": -1920.0, + "logps/rejected": -1808.0, + "loss": 0.6653, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.53125, + "rewards/margins": 0.1494140625, + "rewards/rejected": 0.3828125, + "step": 537 + }, + { + "epoch": 0.15519976921967402, + "grad_norm": 10.670508031829, + "learning_rate": 4.953878243128673e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -1864.0, + "logps/rejected": -1712.0, + "loss": 0.6748, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.0810546875, + "rewards/rejected": 0.41015625, + "step": 538 + }, + { + "epoch": 0.15548824462714553, + "grad_norm": 11.223434535741061, + "learning_rate": 4.953395538574218e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.046875, + "logps/chosen": -1800.0, + "logps/rejected": -1920.0, + "loss": 0.7, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.416015625, + "rewards/margins": -0.0264892578125, + "rewards/rejected": 0.44140625, + "step": 539 + }, + { + "epoch": 0.15577672003461704, + "grad_norm": 9.760028605257483, + "learning_rate": 4.952910344953085e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.15625, + "logps/chosen": -1656.0, + "logps/rejected": -1392.0, + "loss": 0.6932, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.375, + "rewards/margins": 0.0859375, + "rewards/rejected": 0.2890625, + "step": 540 + }, + { + "epoch": 0.15606519544208855, + "grad_norm": 10.563072988719771, + "learning_rate": 4.952422662757526e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.046875, + "logps/chosen": -1648.0, + "logps/rejected": -1736.0, + "loss": 0.6512, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.458984375, + "rewards/margins": -0.001953125, + "rewards/rejected": 0.4609375, + "step": 541 + }, + { + "epoch": 0.15635367084956006, + "grad_norm": 12.650461973480182, + "learning_rate": 4.951932492482313e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.234375, + "logps/chosen": -1312.0, + "logps/rejected": -1552.0, + "loss": 0.6984, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.330078125, + "rewards/margins": 0.06005859375, + "rewards/rejected": 0.26953125, + "step": 542 + }, + { + "epoch": 0.1566421462570316, + "grad_norm": 11.042808094789574, + "learning_rate": 4.951439834624742e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.078125, + "logps/chosen": -1432.0, + "logps/rejected": -1384.0, + "loss": 0.7131, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.396484375, + "rewards/margins": 0.0247802734375, + "rewards/rejected": 0.37109375, + "step": 543 + }, + { + "epoch": 0.1569306216645031, + "grad_norm": 10.576030928370736, + "learning_rate": 4.950944689684636e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.1875, + "logps/chosen": -1760.0, + "logps/rejected": -1576.0, + "loss": 0.6729, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.390625, + "rewards/margins": 0.06494140625, + "rewards/rejected": 0.326171875, + "step": 544 + }, + { + "epoch": 0.15721909707197462, + "grad_norm": 14.316960361305092, + "learning_rate": 4.950447058164335e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1648.0, + "logps/rejected": -1552.0, + "loss": 0.7203, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.09814453125, + "rewards/rejected": 0.36328125, + "step": 545 + }, + { + "epoch": 0.15750757247944613, + "grad_norm": 11.134769731051994, + "learning_rate": 4.94994694056871e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1432.0, + "logps/rejected": -1064.0, + "loss": 0.6721, + "loss/demonstration_loss": -2544.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.373046875, + "rewards/margins": 0.1064453125, + "rewards/rejected": 0.265625, + "step": 546 + }, + { + "epoch": 0.15779604788691765, + "grad_norm": 9.849954202160367, + "learning_rate": 4.949444337405149e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.09375, + "logps/chosen": -1976.0, + "logps/rejected": -1880.0, + "loss": 0.7021, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.38671875, + "rewards/margins": -0.029541015625, + "rewards/rejected": 0.416015625, + "step": 547 + }, + { + "epoch": 0.15808452329438916, + "grad_norm": 11.830963170816082, + "learning_rate": 4.948939249183561e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1576.0, + "logps/rejected": -1448.0, + "loss": 0.6794, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.349609375, + "rewards/margins": 0.06494140625, + "rewards/rejected": 0.283203125, + "step": 548 + }, + { + "epoch": 0.15837299870186067, + "grad_norm": 9.84218907015708, + "learning_rate": 4.94843167641638e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.140625, + "logps/chosen": -1216.0, + "logps/rejected": -1336.0, + "loss": 0.6709, + "loss/demonstration_loss": -2576.0, + "loss/preference_loss": -2576.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.33984375, + "rewards/margins": 0.02490234375, + "rewards/rejected": 0.314453125, + "step": 549 + }, + { + "epoch": 0.15866147410933218, + "grad_norm": 11.750362458082861, + "learning_rate": 4.947921619618558e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1776.0, + "logps/rejected": -1744.0, + "loss": 0.6757, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.328125, + "rewards/margins": -0.044921875, + "rewards/rejected": 0.373046875, + "step": 550 + }, + { + "epoch": 0.1589499495168037, + "grad_norm": 11.347953790530728, + "learning_rate": 4.947409079307567e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1640.0, + "logps/rejected": -1568.0, + "loss": 0.6455, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.0869140625, + "rewards/rejected": 0.306640625, + "step": 551 + }, + { + "epoch": 0.1592384249242752, + "grad_norm": 12.639425609580405, + "learning_rate": 4.9468940560034e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.234375, + "logps/chosen": -1616.0, + "logps/rejected": -1496.0, + "loss": 0.6597, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.373046875, + "rewards/margins": 0.04931640625, + "rewards/rejected": 0.32421875, + "step": 552 + }, + { + "epoch": 0.1595269003317467, + "grad_norm": 12.065778851709316, + "learning_rate": 4.946376550228569e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1848.0, + "logps/rejected": -1544.0, + "loss": 0.6545, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.125, + "rewards/rejected": 0.322265625, + "step": 553 + }, + { + "epoch": 0.15981537573921822, + "grad_norm": 10.705187479167527, + "learning_rate": 4.945856562508103e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.96875, + "logps/chosen": -1848.0, + "logps/rejected": -1736.0, + "loss": 0.6621, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5, + "rewards/margins": 0.0546875, + "rewards/rejected": 0.4453125, + "step": 554 + }, + { + "epoch": 0.16010385114668974, + "grad_norm": 11.384753910838244, + "learning_rate": 4.945334093369551e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.203125, + "logps/chosen": -1808.0, + "logps/rejected": -1816.0, + "loss": 0.7045, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.33203125, + "rewards/margins": -0.0086669921875, + "rewards/rejected": 0.33984375, + "step": 555 + }, + { + "epoch": 0.16039232655416125, + "grad_norm": 13.531740260839971, + "learning_rate": 4.944809143342978e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.125, + "logps/chosen": -1768.0, + "logps/rejected": -1792.0, + "loss": 0.681, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.45703125, + "rewards/margins": 0.003997802734375, + "rewards/rejected": 0.453125, + "step": 556 + }, + { + "epoch": 0.16068080196163276, + "grad_norm": 10.678839024047571, + "learning_rate": 4.944281712960966e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1264.0, + "logps/rejected": -1240.0, + "loss": 0.725, + "loss/demonstration_loss": -2528.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.26171875, + "rewards/margins": -0.06396484375, + "rewards/rejected": 0.326171875, + "step": 557 + }, + { + "epoch": 0.1609692773691043, + "grad_norm": 10.782655491579556, + "learning_rate": 4.943751802758615e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.109375, + "logps/chosen": -1680.0, + "logps/rejected": -1368.0, + "loss": 0.6752, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.515625, + "rewards/margins": 0.1396484375, + "rewards/rejected": 0.376953125, + "step": 558 + }, + { + "epoch": 0.1612577527765758, + "grad_norm": 11.41244309532753, + "learning_rate": 4.94321941327354e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.28125, + "logps/chosen": -2176.0, + "logps/rejected": -1832.0, + "loss": 0.6719, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4048.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.10888671875, + "rewards/rejected": 0.45703125, + "step": 559 + }, + { + "epoch": 0.16154622818404732, + "grad_norm": 13.550724062033822, + "learning_rate": 4.94268454504587e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.203125, + "logps/chosen": -1696.0, + "logps/rejected": -1424.0, + "loss": 0.709, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.38671875, + "rewards/margins": 0.06591796875, + "rewards/rejected": 0.322265625, + "step": 560 + }, + { + "epoch": 0.16183470359151883, + "grad_norm": 11.742548621328552, + "learning_rate": 4.942147198618252e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.03125, + "logps/chosen": -1704.0, + "logps/rejected": -1936.0, + "loss": 0.7133, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.41796875, + "rewards/margins": -0.06982421875, + "rewards/rejected": 0.48828125, + "step": 561 + }, + { + "epoch": 0.16212317899899034, + "grad_norm": 11.073705088880684, + "learning_rate": 4.941607374535842e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.109375, + "logps/chosen": -1896.0, + "logps/rejected": -1872.0, + "loss": 0.6747, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.06005859375, + "rewards/rejected": 0.451171875, + "step": 562 + }, + { + "epoch": 0.16241165440646185, + "grad_norm": 15.90612267262837, + "learning_rate": 4.941065073346315e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.203125, + "logps/chosen": -1568.0, + "logps/rejected": -1528.0, + "loss": 0.6974, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.375, + "rewards/margins": -0.0235595703125, + "rewards/rejected": 0.3984375, + "step": 563 + }, + { + "epoch": 0.16270012981393336, + "grad_norm": 13.352533989655855, + "learning_rate": 4.940520295599858e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1712.0, + "logps/rejected": -1544.0, + "loss": 0.6791, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.453125, + "rewards/margins": 0.0859375, + "rewards/rejected": 0.3671875, + "step": 564 + }, + { + "epoch": 0.16298860522140488, + "grad_norm": 13.705771691596883, + "learning_rate": 4.939973041849167e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.0625, + "logps/chosen": -1616.0, + "logps/rejected": -1584.0, + "loss": 0.6919, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.0771484375, + "rewards/rejected": 0.33203125, + "step": 565 + }, + { + "epoch": 0.1632770806288764, + "grad_norm": 10.531353691490725, + "learning_rate": 4.939423312649454e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.28125, + "logps/chosen": -1808.0, + "logps/rejected": -1720.0, + "loss": 0.6892, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.546875, + "rewards/margins": 0.142578125, + "rewards/rejected": 0.40234375, + "step": 566 + }, + { + "epoch": 0.1635655560363479, + "grad_norm": 10.918313592091286, + "learning_rate": 4.93887110855844e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0625, + "logps/chosen": -1144.0, + "logps/rejected": -1384.0, + "loss": 0.6977, + "loss/demonstration_loss": -2560.0, + "loss/preference_loss": -2560.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32421875, + "rewards/margins": 0.00628662109375, + "rewards/rejected": 0.318359375, + "step": 567 + }, + { + "epoch": 0.1638540314438194, + "grad_norm": 10.036954175094754, + "learning_rate": 4.938316430136359e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -992.0, + "logps/rejected": -1288.0, + "loss": 0.6988, + "loss/demonstration_loss": -2304.0, + "loss/preference_loss": -2320.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3046875, + "rewards/margins": -0.04541015625, + "rewards/rejected": 0.349609375, + "step": 568 + }, + { + "epoch": 0.16414250685129092, + "grad_norm": 11.911551589305803, + "learning_rate": 4.937759277945954e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.125, + "logps/chosen": -1640.0, + "logps/rejected": -1584.0, + "loss": 0.7003, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.003631591796875, + "rewards/rejected": 0.349609375, + "step": 569 + }, + { + "epoch": 0.16443098225876243, + "grad_norm": 10.256236683486325, + "learning_rate": 4.937199652552477e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.078125, + "logps/chosen": -1904.0, + "logps/rejected": -1496.0, + "loss": 0.65, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.53125, + "rewards/margins": 0.1689453125, + "rewards/rejected": 0.36328125, + "step": 570 + }, + { + "epoch": 0.16471945766623394, + "grad_norm": 10.931009748407062, + "learning_rate": 4.936637554523691e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -2208.0, + "logps/rejected": -2096.0, + "loss": 0.6785, + "loss/demonstration_loss": -4352.0, + "loss/preference_loss": -4352.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.453125, + "rewards/margins": -0.0223388671875, + "rewards/rejected": 0.474609375, + "step": 571 + }, + { + "epoch": 0.16500793307370545, + "grad_norm": 12.206924178661245, + "learning_rate": 4.936072984429866e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.96875, + "logps/chosen": -1672.0, + "logps/rejected": -1600.0, + "loss": 0.6725, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.0089111328125, + "rewards/rejected": 0.38671875, + "step": 572 + }, + { + "epoch": 0.165296408481177, + "grad_norm": 11.448393418558922, + "learning_rate": 4.935505942843781e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.0, + "logps/chosen": -1600.0, + "logps/rejected": -1624.0, + "loss": 0.7087, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4375, + "rewards/margins": 0.0361328125, + "rewards/rejected": 0.40234375, + "step": 573 + }, + { + "epoch": 0.1655848838886485, + "grad_norm": 11.696136392422394, + "learning_rate": 4.934936430340724e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.078125, + "logps/chosen": -1328.0, + "logps/rejected": -1360.0, + "loss": 0.7256, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2736.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.353515625, + "rewards/margins": -0.07568359375, + "rewards/rejected": 0.4296875, + "step": 574 + }, + { + "epoch": 0.16587335929612002, + "grad_norm": 13.479571754239823, + "learning_rate": 4.934364447498484e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.03125, + "logps/chosen": -1888.0, + "logps/rejected": -1744.0, + "loss": 0.6829, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.42578125, + "rewards/margins": 0.0301513671875, + "rewards/rejected": 0.39453125, + "step": 575 + }, + { + "epoch": 0.16616183470359153, + "grad_norm": 12.215958988474764, + "learning_rate": 4.933789994897362e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.8125, + "logps/chosen": -2064.0, + "logps/rejected": -2048.0, + "loss": 0.702, + "loss/demonstration_loss": -4160.0, + "loss/preference_loss": -4160.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.004150390625, + "rewards/rejected": 0.494140625, + "step": 576 + }, + { + "epoch": 0.16645031011106304, + "grad_norm": 11.011025757107882, + "learning_rate": 4.933213073120163e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.078125, + "logps/chosen": -1640.0, + "logps/rejected": -1712.0, + "loss": 0.6902, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.4921875, + "rewards/margins": -0.048095703125, + "rewards/rejected": 0.54296875, + "step": 577 + }, + { + "epoch": 0.16673878551853455, + "grad_norm": 13.528056439672381, + "learning_rate": 4.932633682752199e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.890625, + "logps/chosen": -1776.0, + "logps/rejected": -1624.0, + "loss": 0.7017, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.384765625, + "rewards/margins": -0.03662109375, + "rewards/rejected": 0.419921875, + "step": 578 + }, + { + "epoch": 0.16702726092600606, + "grad_norm": 10.212937356123117, + "learning_rate": 4.932051824381281e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.140625, + "logps/chosen": -1840.0, + "logps/rejected": -1696.0, + "loss": 0.6677, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.416015625, + "rewards/margins": 0.050537109375, + "rewards/rejected": 0.365234375, + "step": 579 + }, + { + "epoch": 0.16731573633347757, + "grad_norm": 11.950478176164427, + "learning_rate": 4.931467498597728e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.078125, + "logps/chosen": -1936.0, + "logps/rejected": -1712.0, + "loss": 0.6985, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.32421875, + "rewards/margins": -0.0274658203125, + "rewards/rejected": 0.3515625, + "step": 580 + }, + { + "epoch": 0.16760421174094908, + "grad_norm": 12.326379214219001, + "learning_rate": 4.930880705994362e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.046875, + "logps/chosen": -1576.0, + "logps/rejected": -1632.0, + "loss": 0.6763, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.0205078125, + "rewards/rejected": 0.4296875, + "step": 581 + }, + { + "epoch": 0.1678926871484206, + "grad_norm": 11.451542913511348, + "learning_rate": 4.930291447166509e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0625, + "logps/chosen": -1784.0, + "logps/rejected": -1672.0, + "loss": 0.6608, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.435546875, + "rewards/margins": 0.05712890625, + "rewards/rejected": 0.37890625, + "step": 582 + }, + { + "epoch": 0.1681811625558921, + "grad_norm": 11.333245878375362, + "learning_rate": 4.929699722711993e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0, + "logps/chosen": -1696.0, + "logps/rejected": -1760.0, + "loss": 0.6874, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.412109375, + "rewards/margins": -0.007568359375, + "rewards/rejected": 0.419921875, + "step": 583 + }, + { + "epoch": 0.16846963796336362, + "grad_norm": 11.50176149428191, + "learning_rate": 4.929105533231143e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.984375, + "logps/chosen": -1824.0, + "logps/rejected": -1816.0, + "loss": 0.6827, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.03369140625, + "rewards/rejected": 0.4375, + "step": 584 + }, + { + "epoch": 0.16875811337083513, + "grad_norm": 12.006637934597457, + "learning_rate": 4.928508879326787e-07, + "logits/chosen": 2.90625, + "logits/rejected": 3.0, + "logps/chosen": -1584.0, + "logps/rejected": -1616.0, + "loss": 0.6583, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.115234375, + "rewards/rejected": 0.294921875, + "step": 585 + }, + { + "epoch": 0.16904658877830664, + "grad_norm": 11.300936879759172, + "learning_rate": 4.927909761604254e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -1768.0, + "logps/rejected": -1800.0, + "loss": 0.71, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.494140625, + "rewards/margins": 0.0137939453125, + "rewards/rejected": 0.48046875, + "step": 586 + }, + { + "epoch": 0.16933506418577815, + "grad_norm": 11.018490069183825, + "learning_rate": 4.927308180671375e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.09375, + "logps/chosen": -1776.0, + "logps/rejected": -1664.0, + "loss": 0.6876, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.486328125, + "rewards/margins": -0.037109375, + "rewards/rejected": 0.5234375, + "step": 587 + }, + { + "epoch": 0.1696235395932497, + "grad_norm": 10.37678633527871, + "learning_rate": 4.926704137138473e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1408.0, + "logps/rejected": -1520.0, + "loss": 0.7058, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.30859375, + "rewards/margins": -0.0693359375, + "rewards/rejected": 0.37890625, + "step": 588 + }, + { + "epoch": 0.1699120150007212, + "grad_norm": 11.352274123987936, + "learning_rate": 4.926097631618378e-07, + "logits/chosen": 2.765625, + "logits/rejected": 2.859375, + "logps/chosen": -1472.0, + "logps/rejected": -1488.0, + "loss": 0.69, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.0947265625, + "rewards/rejected": 0.412109375, + "step": 589 + }, + { + "epoch": 0.1702004904081927, + "grad_norm": 9.35952563279581, + "learning_rate": 4.925488664726413e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.09375, + "logps/chosen": -1488.0, + "logps/rejected": -1520.0, + "loss": 0.6826, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33203125, + "rewards/margins": -0.00543212890625, + "rewards/rejected": 0.337890625, + "step": 590 + }, + { + "epoch": 0.17048896581566422, + "grad_norm": 10.682379410068636, + "learning_rate": 4.924877237080397e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.140625, + "logps/chosen": -1872.0, + "logps/rejected": -1688.0, + "loss": 0.6798, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5390625, + "rewards/margins": 0.05615234375, + "rewards/rejected": 0.48046875, + "step": 591 + }, + { + "epoch": 0.17077744122313573, + "grad_norm": 10.630494997162494, + "learning_rate": 4.924263349300649e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1664.0, + "logps/rejected": -1608.0, + "loss": 0.6752, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.051025390625, + "rewards/rejected": 0.447265625, + "step": 592 + }, + { + "epoch": 0.17106591663060725, + "grad_norm": 10.758262688015513, + "learning_rate": 4.923647002009983e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.1875, + "logps/chosen": -2128.0, + "logps/rejected": -1688.0, + "loss": 0.6545, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.494140625, + "rewards/margins": 0.126953125, + "rewards/rejected": 0.3671875, + "step": 593 + }, + { + "epoch": 0.17135439203807876, + "grad_norm": 11.000373621627954, + "learning_rate": 4.923028195833706e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.0625, + "logps/chosen": -1584.0, + "logps/rejected": -1472.0, + "loss": 0.7069, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3203125, + "rewards/margins": -0.0194091796875, + "rewards/rejected": 0.33984375, + "step": 594 + }, + { + "epoch": 0.17164286744555027, + "grad_norm": 11.158457197307875, + "learning_rate": 4.922406931399623e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1600.0, + "logps/rejected": -1544.0, + "loss": 0.6865, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.380859375, + "rewards/margins": -0.059326171875, + "rewards/rejected": 0.44140625, + "step": 595 + }, + { + "epoch": 0.17193134285302178, + "grad_norm": 10.187945197873365, + "learning_rate": 4.921783209338031e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.125, + "logps/chosen": -1376.0, + "logps/rejected": -1096.0, + "loss": 0.6556, + "loss/demonstration_loss": -2512.0, + "loss/preference_loss": -2496.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3515625, + "rewards/margins": 0.11865234375, + "rewards/rejected": 0.232421875, + "step": 596 + }, + { + "epoch": 0.1722198182604933, + "grad_norm": 11.45351416140906, + "learning_rate": 4.921157030281719e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.203125, + "logps/chosen": -1832.0, + "logps/rejected": -1920.0, + "loss": 0.6691, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.515625, + "rewards/margins": 0.0004425048828125, + "rewards/rejected": 0.515625, + "step": 597 + }, + { + "epoch": 0.1725082936679648, + "grad_norm": 12.095494544379603, + "learning_rate": 4.920528394865973e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1288.0, + "logps/rejected": -1448.0, + "loss": 0.6811, + "loss/demonstration_loss": -2768.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3828125, + "rewards/margins": 0.08837890625, + "rewards/rejected": 0.29296875, + "step": 598 + }, + { + "epoch": 0.1727967690754363, + "grad_norm": 11.033353921402762, + "learning_rate": 4.919897303728565e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.890625, + "logps/chosen": -1408.0, + "logps/rejected": -1552.0, + "loss": 0.6855, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32421875, + "rewards/margins": 0.052978515625, + "rewards/rejected": 0.271484375, + "step": 599 + }, + { + "epoch": 0.17308524448290782, + "grad_norm": 13.282549704639296, + "learning_rate": 4.919263757509765e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.125, + "logps/chosen": -1888.0, + "logps/rejected": -1904.0, + "loss": 0.7312, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.63671875, + "rewards/margins": -0.0140380859375, + "rewards/rejected": 0.6484375, + "step": 600 + }, + { + "epoch": 0.17337371989037934, + "grad_norm": 10.464614874282104, + "learning_rate": 4.91862775685233e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1480.0, + "logps/rejected": -1560.0, + "loss": 0.6664, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.04638671875, + "rewards/rejected": 0.3984375, + "step": 601 + }, + { + "epoch": 0.17366219529785085, + "grad_norm": 11.163597359640665, + "learning_rate": 4.917989302401507e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1880.0, + "logps/rejected": -1936.0, + "loss": 0.6786, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.0283203125, + "rewards/rejected": 0.5078125, + "step": 602 + }, + { + "epoch": 0.17395067070532236, + "grad_norm": 13.816618698580243, + "learning_rate": 4.917348394805034e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.0, + "logps/chosen": -1520.0, + "logps/rejected": -1464.0, + "loss": 0.666, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.37890625, + "step": 603 + }, + { + "epoch": 0.1742391461127939, + "grad_norm": 12.745696270952374, + "learning_rate": 4.916705034713136e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.984375, + "logps/chosen": -1720.0, + "logps/rejected": -1816.0, + "loss": 0.6978, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.0419921875, + "rewards/rejected": 0.408203125, + "step": 604 + }, + { + "epoch": 0.1745276215202654, + "grad_norm": 11.53009242379067, + "learning_rate": 4.916059222778529e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1312.0, + "logps/rejected": -1320.0, + "loss": 0.7162, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2672.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3984375, + "rewards/margins": 0.03271484375, + "rewards/rejected": 0.3671875, + "step": 605 + }, + { + "epoch": 0.17481609692773692, + "grad_norm": 9.584784133565641, + "learning_rate": 4.915410959656414e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -1512.0, + "logps/rejected": -1656.0, + "loss": 0.6971, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.341796875, + "rewards/margins": -0.058837890625, + "rewards/rejected": 0.400390625, + "step": 606 + }, + { + "epoch": 0.17510457233520843, + "grad_norm": 11.434226357045482, + "learning_rate": 4.914760246004477e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.96875, + "logps/chosen": -1592.0, + "logps/rejected": -1760.0, + "loss": 0.6572, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.328125, + "rewards/margins": 0.025634765625, + "rewards/rejected": 0.302734375, + "step": 607 + }, + { + "epoch": 0.17539304774267994, + "grad_norm": 11.784044376966978, + "learning_rate": 4.914107082482897e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.109375, + "logps/chosen": -1864.0, + "logps/rejected": -1808.0, + "loss": 0.6782, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.515625, + "rewards/margins": -0.0205078125, + "rewards/rejected": 0.53515625, + "step": 608 + }, + { + "epoch": 0.17568152315015145, + "grad_norm": 11.845626202546088, + "learning_rate": 4.91345146975433e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.109375, + "logps/chosen": -1760.0, + "logps/rejected": -1824.0, + "loss": 0.692, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.045166015625, + "rewards/rejected": 0.431640625, + "step": 609 + }, + { + "epoch": 0.17596999855762296, + "grad_norm": 11.290759515734107, + "learning_rate": 4.912793408483925e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.125, + "logps/chosen": -2048.0, + "logps/rejected": -2080.0, + "loss": 0.6589, + "loss/demonstration_loss": -4160.0, + "loss/preference_loss": -4160.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5234375, + "rewards/margins": 0.1103515625, + "rewards/rejected": 0.412109375, + "step": 610 + }, + { + "epoch": 0.17625847396509448, + "grad_norm": 11.386363632287404, + "learning_rate": 4.912132899339309e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.109375, + "logps/chosen": -1832.0, + "logps/rejected": -2008.0, + "loss": 0.6796, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.498046875, + "rewards/margins": -0.0283203125, + "rewards/rejected": 0.52734375, + "step": 611 + }, + { + "epoch": 0.176546949372566, + "grad_norm": 10.247241951203067, + "learning_rate": 4.911469942990593e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.984375, + "logps/chosen": -1360.0, + "logps/rejected": -1392.0, + "loss": 0.6765, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.416015625, + "rewards/margins": 0.07177734375, + "rewards/rejected": 0.34375, + "step": 612 + }, + { + "epoch": 0.1768354247800375, + "grad_norm": 11.623852092176875, + "learning_rate": 4.910804540110377e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.03125, + "logps/chosen": -1832.0, + "logps/rejected": -1752.0, + "loss": 0.6556, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.09521484375, + "rewards/rejected": 0.359375, + "step": 613 + }, + { + "epoch": 0.177123900187509, + "grad_norm": 11.36474654173702, + "learning_rate": 4.910136691373734e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1816.0, + "logps/rejected": -1856.0, + "loss": 0.6914, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5, + "rewards/margins": 0.00396728515625, + "rewards/rejected": 0.498046875, + "step": 614 + }, + { + "epoch": 0.17741237559498052, + "grad_norm": 11.105889579445893, + "learning_rate": 4.909466397458225e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1728.0, + "logps/rejected": -1792.0, + "loss": 0.7299, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.4375, + "rewards/margins": -0.0301513671875, + "rewards/rejected": 0.46875, + "step": 615 + }, + { + "epoch": 0.17770085100245203, + "grad_norm": 10.543388807209881, + "learning_rate": 4.90879365904389e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1640.0, + "logps/rejected": -1728.0, + "loss": 0.7005, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.365234375, + "rewards/margins": -0.04541015625, + "rewards/rejected": 0.41015625, + "step": 616 + }, + { + "epoch": 0.17798932640992354, + "grad_norm": 11.728949498890294, + "learning_rate": 4.908118476813246e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.1875, + "logps/chosen": -1792.0, + "logps/rejected": -1688.0, + "loss": 0.6853, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.546875, + "rewards/margins": 0.1044921875, + "rewards/rejected": 0.44140625, + "step": 617 + }, + { + "epoch": 0.17827780181739505, + "grad_norm": 9.667782757607464, + "learning_rate": 4.907440851451296e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.0, + "logps/chosen": -1416.0, + "logps/rejected": -1528.0, + "loss": 0.6843, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.0198974609375, + "rewards/rejected": 0.443359375, + "step": 618 + }, + { + "epoch": 0.1785662772248666, + "grad_norm": 12.104784553070072, + "learning_rate": 4.906760783645516e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -2064.0, + "logps/rejected": -2192.0, + "loss": 0.7223, + "loss/demonstration_loss": -4320.0, + "loss/preference_loss": -4320.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.57421875, + "rewards/margins": -0.068359375, + "rewards/rejected": 0.640625, + "step": 619 + }, + { + "epoch": 0.1788547526323381, + "grad_norm": 12.267576427240048, + "learning_rate": 4.906078274085861e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.078125, + "logps/chosen": -1744.0, + "logps/rejected": -1568.0, + "loss": 0.6486, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.302734375, + "step": 620 + }, + { + "epoch": 0.17914322803980962, + "grad_norm": 11.778584569235743, + "learning_rate": 4.905393323464763e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1624.0, + "logps/rejected": -1616.0, + "loss": 0.7095, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.39453125, + "rewards/margins": -0.03564453125, + "rewards/rejected": 0.431640625, + "step": 621 + }, + { + "epoch": 0.17943170344728113, + "grad_norm": 10.696015523265535, + "learning_rate": 4.904705932477135e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.109375, + "logps/chosen": -2112.0, + "logps/rejected": -2024.0, + "loss": 0.6873, + "loss/demonstration_loss": -4192.0, + "loss/preference_loss": -4192.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.002197265625, + "rewards/rejected": 0.490234375, + "step": 622 + }, + { + "epoch": 0.17972017885475264, + "grad_norm": 12.117510568408344, + "learning_rate": 4.904016101820359e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.109375, + "logps/chosen": -1760.0, + "logps/rejected": -1568.0, + "loss": 0.6965, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.10009765625, + "rewards/rejected": 0.30859375, + "step": 623 + }, + { + "epoch": 0.18000865426222415, + "grad_norm": 11.436652194332098, + "learning_rate": 4.903323832194296e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.984375, + "logps/chosen": -1440.0, + "logps/rejected": -1448.0, + "loss": 0.6726, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.048583984375, + "rewards/rejected": 0.3984375, + "step": 624 + }, + { + "epoch": 0.18029712966969566, + "grad_norm": 10.386732486544162, + "learning_rate": 4.902629124301282e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1456.0, + "logps/rejected": -1456.0, + "loss": 0.6946, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.5, + "rewards/margins": -0.01007080078125, + "rewards/rejected": 0.51171875, + "step": 625 + }, + { + "epoch": 0.18058560507716717, + "grad_norm": 10.337238726866888, + "learning_rate": 4.901931978846125e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1656.0, + "logps/rejected": -1760.0, + "loss": 0.7064, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.443359375, + "rewards/margins": -0.04150390625, + "rewards/rejected": 0.484375, + "step": 626 + }, + { + "epoch": 0.18087408048463868, + "grad_norm": 11.682357066808725, + "learning_rate": 4.901232396536105e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1880.0, + "logps/rejected": -1624.0, + "loss": 0.6703, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.08251953125, + "rewards/rejected": 0.515625, + "step": 627 + }, + { + "epoch": 0.1811625558921102, + "grad_norm": 10.479362323288035, + "learning_rate": 4.90053037808098e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.0625, + "logps/chosen": -1704.0, + "logps/rejected": -1576.0, + "loss": 0.6888, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40625, + "rewards/margins": 0.0400390625, + "rewards/rejected": 0.3671875, + "step": 628 + }, + { + "epoch": 0.1814510312995817, + "grad_norm": 11.814134280954951, + "learning_rate": 4.899825924192972e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.203125, + "logps/chosen": -1832.0, + "logps/rejected": -1800.0, + "loss": 0.6847, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.042236328125, + "rewards/rejected": 0.427734375, + "step": 629 + }, + { + "epoch": 0.18173950670705322, + "grad_norm": 11.627789784074366, + "learning_rate": 4.899119035586778e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.1875, + "logps/chosen": -1512.0, + "logps/rejected": -1704.0, + "loss": 0.7087, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.48828125, + "rewards/margins": -0.029052734375, + "rewards/rejected": 0.51953125, + "step": 630 + }, + { + "epoch": 0.18202798211452473, + "grad_norm": 12.232679814842381, + "learning_rate": 4.898409712979565e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1192.0, + "logps/rejected": -1160.0, + "loss": 0.6701, + "loss/demonstration_loss": -2384.0, + "loss/preference_loss": -2384.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.400390625, + "rewards/margins": 0.0269775390625, + "rewards/rejected": 0.373046875, + "step": 631 + }, + { + "epoch": 0.18231645752199624, + "grad_norm": 10.452932775255238, + "learning_rate": 4.897697957090968e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1744.0, + "logps/rejected": -1728.0, + "loss": 0.6725, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.0224609375, + "rewards/rejected": 0.439453125, + "step": 632 + }, + { + "epoch": 0.18260493292946775, + "grad_norm": 13.91406027162934, + "learning_rate": 4.896983768643091e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.140625, + "logps/chosen": -2400.0, + "logps/rejected": -2288.0, + "loss": 0.6609, + "loss/demonstration_loss": -4768.0, + "loss/preference_loss": -4736.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.1796875, + "rewards/rejected": 0.447265625, + "step": 633 + }, + { + "epoch": 0.1828934083369393, + "grad_norm": 11.955134698840961, + "learning_rate": 4.896267148360509e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -2128.0, + "logps/rejected": -1760.0, + "loss": 0.6721, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3920.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.484375, + "rewards/margins": 0.0299072265625, + "rewards/rejected": 0.455078125, + "step": 634 + }, + { + "epoch": 0.1831818837444108, + "grad_norm": 11.61545895947558, + "learning_rate": 4.895548096970259e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.171875, + "logps/chosen": -1496.0, + "logps/rejected": -1608.0, + "loss": 0.692, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.4375, + "rewards/margins": -0.018310546875, + "rewards/rejected": 0.45703125, + "step": 635 + }, + { + "epoch": 0.1834703591518823, + "grad_norm": 11.61130649929906, + "learning_rate": 4.894826615201849e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.078125, + "logps/chosen": -1784.0, + "logps/rejected": -1656.0, + "loss": 0.7145, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.435546875, + "rewards/margins": -0.005767822265625, + "rewards/rejected": 0.44140625, + "step": 636 + }, + { + "epoch": 0.18375883455935382, + "grad_norm": 10.595527960954731, + "learning_rate": 4.894102703787249e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.203125, + "logps/chosen": -1808.0, + "logps/rejected": -1584.0, + "loss": 0.6552, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.0927734375, + "rewards/rejected": 0.38671875, + "step": 637 + }, + { + "epoch": 0.18404730996682533, + "grad_norm": 10.43162983484717, + "learning_rate": 4.893376363460896e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.203125, + "logps/chosen": -1632.0, + "logps/rejected": -1648.0, + "loss": 0.6871, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.466796875, + "rewards/margins": -0.00634765625, + "rewards/rejected": 0.474609375, + "step": 638 + }, + { + "epoch": 0.18433578537429685, + "grad_norm": 11.031885564984536, + "learning_rate": 4.892647594959691e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.09375, + "logps/chosen": -1520.0, + "logps/rejected": -1800.0, + "loss": 0.6968, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.44140625, + "rewards/margins": -0.01513671875, + "rewards/rejected": 0.45703125, + "step": 639 + }, + { + "epoch": 0.18462426078176836, + "grad_norm": 10.853815900375052, + "learning_rate": 4.891916399022999e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.15625, + "logps/chosen": -1632.0, + "logps/rejected": -1512.0, + "loss": 0.6938, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.427734375, + "rewards/margins": 0.0439453125, + "rewards/rejected": 0.3828125, + "step": 640 + }, + { + "epoch": 0.18491273618923987, + "grad_norm": 11.618782679854233, + "learning_rate": 4.891182776392647e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.03125, + "logps/chosen": -1736.0, + "logps/rejected": -1488.0, + "loss": 0.6592, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.08544921875, + "rewards/rejected": 0.369140625, + "step": 641 + }, + { + "epoch": 0.18520121159671138, + "grad_norm": 11.376716137136828, + "learning_rate": 4.890446727812924e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.109375, + "logps/chosen": -1848.0, + "logps/rejected": -1744.0, + "loss": 0.6695, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5625, + "rewards/margins": 0.1298828125, + "rewards/rejected": 0.431640625, + "step": 642 + }, + { + "epoch": 0.1854896870041829, + "grad_norm": 9.360828919284325, + "learning_rate": 4.889708254030581e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.96875, + "logps/chosen": -1664.0, + "logps/rejected": -1592.0, + "loss": 0.6411, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.484375, + "rewards/margins": 0.1328125, + "rewards/rejected": 0.353515625, + "step": 643 + }, + { + "epoch": 0.1857781624116544, + "grad_norm": 9.707904199639916, + "learning_rate": 4.888967355794829e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1728.0, + "logps/rejected": -1568.0, + "loss": 0.6674, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.10546875, + "rewards/rejected": 0.3046875, + "step": 644 + }, + { + "epoch": 0.1860666378191259, + "grad_norm": 11.937288760650981, + "learning_rate": 4.888224033857337e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.140625, + "logps/chosen": -1792.0, + "logps/rejected": -1808.0, + "loss": 0.6746, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.00341796875, + "rewards/rejected": 0.515625, + "step": 645 + }, + { + "epoch": 0.18635511322659742, + "grad_norm": 12.499561749000547, + "learning_rate": 4.887478288972234e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.171875, + "logps/chosen": -1680.0, + "logps/rejected": -1584.0, + "loss": 0.6901, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.451171875, + "rewards/margins": -0.01611328125, + "rewards/rejected": 0.466796875, + "step": 646 + }, + { + "epoch": 0.18664358863406894, + "grad_norm": 11.264640436866037, + "learning_rate": 4.88673012189611e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.109375, + "logps/chosen": -1648.0, + "logps/rejected": -1664.0, + "loss": 0.6767, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.0, + "rewards/rejected": 0.4296875, + "step": 647 + }, + { + "epoch": 0.18693206404154045, + "grad_norm": 11.338102487716444, + "learning_rate": 4.885979533388009e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.015625, + "logps/chosen": -1576.0, + "logps/rejected": -1720.0, + "loss": 0.7008, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.388671875, + "rewards/margins": 0.0228271484375, + "rewards/rejected": 0.3671875, + "step": 648 + }, + { + "epoch": 0.18722053944901196, + "grad_norm": 11.314978052708812, + "learning_rate": 4.885226524209432e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.109375, + "logps/chosen": -1624.0, + "logps/rejected": -1848.0, + "loss": 0.709, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.52734375, + "rewards/margins": 0.00408935546875, + "rewards/rejected": 0.5234375, + "step": 649 + }, + { + "epoch": 0.1875090148564835, + "grad_norm": 9.312481855897774, + "learning_rate": 4.884471095124337e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.09375, + "logps/chosen": -1304.0, + "logps/rejected": -1192.0, + "loss": 0.6609, + "loss/demonstration_loss": -2544.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40234375, + "rewards/margins": 0.056884765625, + "rewards/rejected": 0.34375, + "step": 650 + }, + { + "epoch": 0.187797490263955, + "grad_norm": 11.218547720768596, + "learning_rate": 4.883713246899137e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.109375, + "logps/chosen": -2000.0, + "logps/rejected": -1896.0, + "loss": 0.6613, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.62109375, + "rewards/margins": 0.09814453125, + "rewards/rejected": 0.5234375, + "step": 651 + }, + { + "epoch": 0.18808596567142652, + "grad_norm": 10.364791336068805, + "learning_rate": 4.882952980302699e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.09375, + "logps/chosen": -1504.0, + "logps/rejected": -1536.0, + "loss": 0.6946, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.333984375, + "rewards/margins": 0.01141357421875, + "rewards/rejected": 0.322265625, + "step": 652 + }, + { + "epoch": 0.18837444107889803, + "grad_norm": 11.563087481367752, + "learning_rate": 4.882190296106343e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.078125, + "logps/chosen": -976.0, + "logps/rejected": -1112.0, + "loss": 0.6973, + "loss/demonstration_loss": -2112.0, + "loss/preference_loss": -2112.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.21484375, + "rewards/margins": -0.0400390625, + "rewards/rejected": 0.25390625, + "step": 653 + }, + { + "epoch": 0.18866291648636954, + "grad_norm": 11.412774475255304, + "learning_rate": 4.881425195083842e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1592.0, + "logps/rejected": -1288.0, + "loss": 0.6699, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.08447265625, + "rewards/rejected": 0.36328125, + "step": 654 + }, + { + "epoch": 0.18895139189384105, + "grad_norm": 11.356964004110743, + "learning_rate": 4.880657678011422e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.03125, + "logps/chosen": -1672.0, + "logps/rejected": -1432.0, + "loss": 0.647, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.515625, + "rewards/margins": 0.10986328125, + "rewards/rejected": 0.404296875, + "step": 655 + }, + { + "epoch": 0.18923986730131256, + "grad_norm": 10.400530437105829, + "learning_rate": 4.87988774566776e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1600.0, + "logps/rejected": -1544.0, + "loss": 0.6767, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.060791015625, + "rewards/rejected": 0.33203125, + "step": 656 + }, + { + "epoch": 0.18952834270878408, + "grad_norm": 11.673465836776696, + "learning_rate": 4.879115398833981e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.171875, + "logps/chosen": -1864.0, + "logps/rejected": -1608.0, + "loss": 0.6953, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.49609375, + "rewards/margins": 0.0556640625, + "rewards/rejected": 0.439453125, + "step": 657 + }, + { + "epoch": 0.1898168181162556, + "grad_norm": 9.708019364590635, + "learning_rate": 4.878340638293663e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1352.0, + "logps/rejected": -1568.0, + "loss": 0.6778, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3203125, + "rewards/margins": 0.029296875, + "rewards/rejected": 0.291015625, + "step": 658 + }, + { + "epoch": 0.1901052935237271, + "grad_norm": 11.595573804063251, + "learning_rate": 4.87756346483283e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1616.0, + "logps/rejected": -1440.0, + "loss": 0.7052, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.439453125, + "rewards/margins": 0.00701904296875, + "rewards/rejected": 0.431640625, + "step": 659 + }, + { + "epoch": 0.1903937689311986, + "grad_norm": 12.563441180207436, + "learning_rate": 4.876783879239955e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.953125, + "logps/chosen": -1664.0, + "logps/rejected": -1608.0, + "loss": 0.6707, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.486328125, + "rewards/margins": 0.12158203125, + "rewards/rejected": 0.365234375, + "step": 660 + }, + { + "epoch": 0.19068224433867012, + "grad_norm": 10.065375150408824, + "learning_rate": 4.876001882305959e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1808.0, + "logps/rejected": -1696.0, + "loss": 0.6614, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.458984375, + "step": 661 + }, + { + "epoch": 0.19097071974614163, + "grad_norm": 12.160144473767362, + "learning_rate": 4.875217474824209e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.03125, + "logps/chosen": -1256.0, + "logps/rejected": -1312.0, + "loss": 0.7008, + "loss/demonstration_loss": -2608.0, + "loss/preference_loss": -2608.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.490234375, + "rewards/margins": -0.0230712890625, + "rewards/rejected": 0.51171875, + "step": 662 + }, + { + "epoch": 0.19125919515361314, + "grad_norm": 11.04545219522318, + "learning_rate": 4.874430657590517e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -1648.0, + "logps/rejected": -1544.0, + "loss": 0.6885, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.453125, + "rewards/margins": 0.033935546875, + "rewards/rejected": 0.41796875, + "step": 663 + }, + { + "epoch": 0.19154767056108465, + "grad_norm": 11.569681034757473, + "learning_rate": 4.87364143140314e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0625, + "logps/chosen": -1992.0, + "logps/rejected": -1624.0, + "loss": 0.6685, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.396484375, + "step": 664 + }, + { + "epoch": 0.1918361459685562, + "grad_norm": 9.64977957417266, + "learning_rate": 4.87284979706278e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1680.0, + "logps/rejected": -1344.0, + "loss": 0.6656, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.038818359375, + "rewards/rejected": 0.314453125, + "step": 665 + }, + { + "epoch": 0.1921246213760277, + "grad_norm": 10.14867875139304, + "learning_rate": 4.87205575537258e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1584.0, + "logps/rejected": -1400.0, + "loss": 0.6777, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.0849609375, + "rewards/rejected": 0.37109375, + "step": 666 + }, + { + "epoch": 0.19241309678349922, + "grad_norm": 11.971018077768433, + "learning_rate": 4.871259307138128e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1720.0, + "logps/rejected": -1824.0, + "loss": 0.6723, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.06494140625, + "rewards/rejected": 0.470703125, + "step": 667 + }, + { + "epoch": 0.19270157219097073, + "grad_norm": 11.784758492952502, + "learning_rate": 4.870460453167451e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1552.0, + "logps/rejected": -1680.0, + "loss": 0.714, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.007568359375, + "rewards/rejected": 0.412109375, + "step": 668 + }, + { + "epoch": 0.19299004759844224, + "grad_norm": 11.36392329957994, + "learning_rate": 4.869659194271019e-07, + "logits/chosen": 3.078125, + "logits/rejected": 2.953125, + "logps/chosen": -1680.0, + "logps/rejected": -1680.0, + "loss": 0.6641, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.05126953125, + "rewards/rejected": 0.439453125, + "step": 669 + }, + { + "epoch": 0.19327852300591375, + "grad_norm": 9.674440585539758, + "learning_rate": 4.86885553126174e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -1744.0, + "logps/rejected": -1776.0, + "loss": 0.6851, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.0498046875, + "rewards/rejected": 0.470703125, + "step": 670 + }, + { + "epoch": 0.19356699841338526, + "grad_norm": 10.617435786761819, + "learning_rate": 4.868049464954962e-07, + "logits/chosen": 3.0625, + "logits/rejected": 2.984375, + "logps/chosen": -1496.0, + "logps/rejected": -1440.0, + "loss": 0.679, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.05224609375, + "rewards/rejected": 0.4296875, + "step": 671 + }, + { + "epoch": 0.19385547382085677, + "grad_norm": 11.911754871557925, + "learning_rate": 4.867240996168471e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.015625, + "logps/chosen": -1288.0, + "logps/rejected": -1352.0, + "loss": 0.6447, + "loss/demonstration_loss": -2672.0, + "loss/preference_loss": -2672.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.390625, + "rewards/margins": 0.09423828125, + "rewards/rejected": 0.296875, + "step": 672 + }, + { + "epoch": 0.19414394922832828, + "grad_norm": 11.207796371810625, + "learning_rate": 4.866430125722491e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.125, + "logps/chosen": -1528.0, + "logps/rejected": -1472.0, + "loss": 0.6838, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.41796875, + "rewards/margins": -0.034912109375, + "rewards/rejected": 0.453125, + "step": 673 + }, + { + "epoch": 0.1944324246357998, + "grad_norm": 10.63240051661404, + "learning_rate": 4.865616854439681e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1720.0, + "logps/rejected": -1712.0, + "loss": 0.7007, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.470703125, + "rewards/margins": -0.03271484375, + "rewards/rejected": 0.50390625, + "step": 674 + }, + { + "epoch": 0.1947209000432713, + "grad_norm": 9.946930580897428, + "learning_rate": 4.864801183145138e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.125, + "logps/chosen": -1784.0, + "logps/rejected": -1448.0, + "loss": 0.6475, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.140625, + "rewards/rejected": 0.36328125, + "step": 675 + }, + { + "epoch": 0.19500937545074282, + "grad_norm": 10.578052170219387, + "learning_rate": 4.863983112666393e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.234375, + "logps/chosen": -1464.0, + "logps/rejected": -1488.0, + "loss": 0.6772, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.388671875, + "rewards/margins": 0.01177978515625, + "rewards/rejected": 0.375, + "step": 676 + }, + { + "epoch": 0.19529785085821433, + "grad_norm": 10.711801539836088, + "learning_rate": 4.863162643833411e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1704.0, + "logps/rejected": -1480.0, + "loss": 0.675, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.515625, + "rewards/margins": 0.08154296875, + "rewards/rejected": 0.435546875, + "step": 677 + }, + { + "epoch": 0.19558632626568584, + "grad_norm": 11.920460055325549, + "learning_rate": 4.862339777478587e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.265625, + "logps/chosen": -2176.0, + "logps/rejected": -2040.0, + "loss": 0.6811, + "loss/demonstration_loss": -4288.0, + "loss/preference_loss": -4256.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.041015625, + "rewards/rejected": 0.52734375, + "step": 678 + }, + { + "epoch": 0.19587480167315735, + "grad_norm": 12.109920260590938, + "learning_rate": 4.861514514436755e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1840.0, + "logps/rejected": -1744.0, + "loss": 0.6787, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.05615234375, + "rewards/rejected": 0.404296875, + "step": 679 + }, + { + "epoch": 0.1961632770806289, + "grad_norm": 9.723362757321727, + "learning_rate": 4.860686855545175e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.21875, + "logps/chosen": -1616.0, + "logps/rejected": -1576.0, + "loss": 0.6614, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.053955078125, + "rewards/rejected": 0.388671875, + "step": 680 + }, + { + "epoch": 0.1964517524881004, + "grad_norm": 12.179933799639938, + "learning_rate": 4.859856801643542e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1392.0, + "logps/rejected": -1352.0, + "loss": 0.7133, + "loss/demonstration_loss": -2768.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.32421875, + "rewards/margins": -0.01226806640625, + "rewards/rejected": 0.3359375, + "step": 681 + }, + { + "epoch": 0.1967402278955719, + "grad_norm": 9.742605886305688, + "learning_rate": 4.859024353573975e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.109375, + "logps/chosen": -1840.0, + "logps/rejected": -1560.0, + "loss": 0.6739, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.1181640625, + "rewards/rejected": 0.361328125, + "step": 682 + }, + { + "epoch": 0.19702870330304342, + "grad_norm": 10.591025629184088, + "learning_rate": 4.858189512181027e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1728.0, + "logps/rejected": -1744.0, + "loss": 0.6952, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.0091552734375, + "rewards/rejected": 0.50390625, + "step": 683 + }, + { + "epoch": 0.19731717871051493, + "grad_norm": 10.851007533404983, + "learning_rate": 4.857352278311679e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.078125, + "logps/chosen": -1608.0, + "logps/rejected": -1536.0, + "loss": 0.679, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.111328125, + "rewards/rejected": 0.390625, + "step": 684 + }, + { + "epoch": 0.19760565411798645, + "grad_norm": 11.948588514401317, + "learning_rate": 4.856512652815335e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.265625, + "logps/chosen": -1656.0, + "logps/rejected": -1536.0, + "loss": 0.6744, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.134765625, + "rewards/rejected": 0.41015625, + "step": 685 + }, + { + "epoch": 0.19789412952545796, + "grad_norm": 10.018760469895629, + "learning_rate": 4.85567063654383e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1784.0, + "logps/rejected": -1760.0, + "loss": 0.689, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.474609375, + "rewards/margins": -0.0108642578125, + "rewards/rejected": 0.484375, + "step": 686 + }, + { + "epoch": 0.19818260493292947, + "grad_norm": 14.102078162572806, + "learning_rate": 4.854826230351425e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -1504.0, + "logps/rejected": -1488.0, + "loss": 0.6993, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.482421875, + "step": 687 + }, + { + "epoch": 0.19847108034040098, + "grad_norm": 10.799319125408367, + "learning_rate": 4.853979435094798e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.046875, + "logps/chosen": -1456.0, + "logps/rejected": -1648.0, + "loss": 0.6768, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4375, + "rewards/margins": -0.02587890625, + "rewards/rejected": 0.462890625, + "step": 688 + }, + { + "epoch": 0.1987595557478725, + "grad_norm": 11.440696500060202, + "learning_rate": 4.853130251633061e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1776.0, + "logps/rejected": -1856.0, + "loss": 0.6583, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.53125, + "rewards/margins": 0.11181640625, + "rewards/rejected": 0.419921875, + "step": 689 + }, + { + "epoch": 0.199048031155344, + "grad_norm": 11.920239418737804, + "learning_rate": 4.852278680827741e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.1875, + "logps/chosen": -1712.0, + "logps/rejected": -1752.0, + "loss": 0.7269, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.439453125, + "rewards/margins": -0.1640625, + "rewards/rejected": 0.60546875, + "step": 690 + }, + { + "epoch": 0.1993365065628155, + "grad_norm": 10.259290554511898, + "learning_rate": 4.851424723542793e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.09375, + "logps/chosen": -1600.0, + "logps/rejected": -1312.0, + "loss": 0.6951, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41796875, + "rewards/margins": 0.07568359375, + "rewards/rejected": 0.341796875, + "step": 691 + }, + { + "epoch": 0.19962498197028702, + "grad_norm": 11.312589039583735, + "learning_rate": 4.850568380644587e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1296.0, + "logps/rejected": -1336.0, + "loss": 0.6781, + "loss/demonstration_loss": -2656.0, + "loss/preference_loss": -2656.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.34765625, + "rewards/margins": -0.02685546875, + "rewards/rejected": 0.375, + "step": 692 + }, + { + "epoch": 0.19991345737775854, + "grad_norm": 12.263397013496872, + "learning_rate": 4.849709653001921e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1440.0, + "logps/rejected": -1440.0, + "loss": 0.6759, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5, + "rewards/margins": 0.099609375, + "rewards/rejected": 0.40234375, + "step": 693 + }, + { + "epoch": 0.20020193278523005, + "grad_norm": 10.957957872409663, + "learning_rate": 4.848848541486005e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.03125, + "logps/chosen": -1560.0, + "logps/rejected": -1536.0, + "loss": 0.688, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.416015625, + "rewards/margins": -0.034423828125, + "rewards/rejected": 0.451171875, + "step": 694 + }, + { + "epoch": 0.20049040819270159, + "grad_norm": 10.271050783069304, + "learning_rate": 4.847985046970471e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1432.0, + "logps/rejected": -1424.0, + "loss": 0.7087, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.020263671875, + "rewards/rejected": 0.42578125, + "step": 695 + }, + { + "epoch": 0.2007788836001731, + "grad_norm": 11.926952083717993, + "learning_rate": 4.847119170331369e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1784.0, + "logps/rejected": -1664.0, + "loss": 0.6847, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.00732421875, + "rewards/rejected": 0.447265625, + "step": 696 + }, + { + "epoch": 0.2010673590076446, + "grad_norm": 10.520748430768954, + "learning_rate": 4.846250912447164e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1576.0, + "logps/rejected": -1528.0, + "loss": 0.696, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.380859375, + "rewards/margins": -0.0274658203125, + "rewards/rejected": 0.408203125, + "step": 697 + }, + { + "epoch": 0.20135583441511612, + "grad_norm": 10.999228935581225, + "learning_rate": 4.84538027419874e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1720.0, + "logps/rejected": -1632.0, + "loss": 0.6985, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.59375, + "rewards/margins": 0.028076171875, + "rewards/rejected": 0.56640625, + "step": 698 + }, + { + "epoch": 0.20164430982258763, + "grad_norm": 11.836114288510027, + "learning_rate": 4.844507256469392e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1944.0, + "logps/rejected": -1872.0, + "loss": 0.6912, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5859375, + "rewards/margins": -0.0157470703125, + "rewards/rejected": 0.6015625, + "step": 699 + }, + { + "epoch": 0.20193278523005914, + "grad_norm": 11.081049944785299, + "learning_rate": 4.843631860144831e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.3125, + "logps/chosen": -1416.0, + "logps/rejected": -1360.0, + "loss": 0.7131, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.0107421875, + "rewards/rejected": 0.43359375, + "step": 700 + }, + { + "epoch": 0.20222126063753065, + "grad_norm": 11.318437277350487, + "learning_rate": 4.842754086113183e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.125, + "logps/chosen": -1480.0, + "logps/rejected": -1608.0, + "loss": 0.6798, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.37890625, + "rewards/margins": -0.043701171875, + "rewards/rejected": 0.421875, + "step": 701 + }, + { + "epoch": 0.20250973604500216, + "grad_norm": 11.823880302215205, + "learning_rate": 4.841873935264982e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.1875, + "logps/chosen": -1816.0, + "logps/rejected": -1720.0, + "loss": 0.6569, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.033203125, + "rewards/rejected": 0.5546875, + "step": 702 + }, + { + "epoch": 0.20279821145247368, + "grad_norm": 9.605328610704674, + "learning_rate": 4.840991408493177e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1888.0, + "logps/rejected": -1704.0, + "loss": 0.6494, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.66015625, + "rewards/margins": 0.1640625, + "rewards/rejected": 0.49609375, + "step": 703 + }, + { + "epoch": 0.2030866868599452, + "grad_norm": 9.842920213456006, + "learning_rate": 4.840106506693127e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.09375, + "logps/chosen": -1400.0, + "logps/rejected": -1352.0, + "loss": 0.6995, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.421875, + "rewards/margins": -0.0361328125, + "rewards/rejected": 0.458984375, + "step": 704 + }, + { + "epoch": 0.2033751622674167, + "grad_norm": 13.645161505423774, + "learning_rate": 4.839219230762598e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -2128.0, + "logps/rejected": -1952.0, + "loss": 0.6902, + "loss/demonstration_loss": -4160.0, + "loss/preference_loss": -4160.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.64453125, + "rewards/margins": 0.05419921875, + "rewards/rejected": 0.58984375, + "step": 705 + }, + { + "epoch": 0.2036636376748882, + "grad_norm": 14.426707028631657, + "learning_rate": 4.838329581601768e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.03125, + "logps/chosen": -1400.0, + "logps/rejected": -1328.0, + "loss": 0.6826, + "loss/demonstration_loss": -2768.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.451171875, + "rewards/margins": 0.06494140625, + "rewards/rejected": 0.38671875, + "step": 706 + }, + { + "epoch": 0.20395211308235972, + "grad_norm": 12.634909182552262, + "learning_rate": 4.837437560113221e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.1875, + "logps/chosen": -1760.0, + "logps/rejected": -1160.0, + "loss": 0.6858, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.474609375, + "rewards/margins": 0.10302734375, + "rewards/rejected": 0.37109375, + "step": 707 + }, + { + "epoch": 0.20424058848983123, + "grad_norm": 12.134777792175141, + "learning_rate": 4.836543167201947e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.046875, + "logps/chosen": -1568.0, + "logps/rejected": -1584.0, + "loss": 0.6671, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.08203125, + "rewards/rejected": 0.42578125, + "step": 708 + }, + { + "epoch": 0.20452906389730274, + "grad_norm": 10.734087179446105, + "learning_rate": 4.835646403775344e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1536.0, + "logps/rejected": -1416.0, + "loss": 0.6769, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.404296875, + "rewards/margins": 0.0235595703125, + "rewards/rejected": 0.380859375, + "step": 709 + }, + { + "epoch": 0.20481753930477425, + "grad_norm": 10.968610477029062, + "learning_rate": 4.834747270743214e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.109375, + "logps/chosen": -2016.0, + "logps/rejected": -1960.0, + "loss": 0.6611, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4032.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.05029296875, + "rewards/rejected": 0.51953125, + "step": 710 + }, + { + "epoch": 0.2051060147122458, + "grad_norm": 9.880917503093556, + "learning_rate": 4.833845769017762e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1640.0, + "logps/rejected": -1648.0, + "loss": 0.6823, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.024658203125, + "rewards/rejected": 0.52734375, + "step": 711 + }, + { + "epoch": 0.2053944901197173, + "grad_norm": 10.754545624863882, + "learning_rate": 4.832941899513599e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.09375, + "logps/chosen": -1968.0, + "logps/rejected": -1848.0, + "loss": 0.6762, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.64453125, + "rewards/margins": 0.1279296875, + "rewards/rejected": 0.515625, + "step": 712 + }, + { + "epoch": 0.20568296552718882, + "grad_norm": 9.816562259232935, + "learning_rate": 4.832035663147733e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1616.0, + "logps/rejected": -1512.0, + "loss": 0.6635, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.6796875, + "rewards/margins": 0.05908203125, + "rewards/rejected": 0.62109375, + "step": 713 + }, + { + "epoch": 0.20597144093466033, + "grad_norm": 9.781413257592055, + "learning_rate": 4.831127060839579e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.203125, + "logps/chosen": -1760.0, + "logps/rejected": -1360.0, + "loss": 0.68, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5859375, + "rewards/margins": 0.15625, + "rewards/rejected": 0.4296875, + "step": 714 + }, + { + "epoch": 0.20625991634213184, + "grad_norm": 9.729851726983869, + "learning_rate": 4.830216093510951e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1608.0, + "logps/rejected": -1648.0, + "loss": 0.6412, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6015625, + "rewards/margins": 0.08056640625, + "rewards/rejected": 0.5234375, + "step": 715 + }, + { + "epoch": 0.20654839174960335, + "grad_norm": 13.884011002386666, + "learning_rate": 4.829302762086058e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.25, + "logps/chosen": -1760.0, + "logps/rejected": -1560.0, + "loss": 0.6591, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.0263671875, + "rewards/rejected": 0.494140625, + "step": 716 + }, + { + "epoch": 0.20683686715707486, + "grad_norm": 11.022701363854448, + "learning_rate": 4.828387067491514e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1960.0, + "logps/rejected": -2040.0, + "loss": 0.6531, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.625, + "rewards/margins": 0.034423828125, + "rewards/rejected": 0.58984375, + "step": 717 + }, + { + "epoch": 0.20712534256454637, + "grad_norm": 11.196729465565406, + "learning_rate": 4.827469010656325e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1576.0, + "logps/rejected": -1336.0, + "loss": 0.6645, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.1279296875, + "rewards/rejected": 0.462890625, + "step": 718 + }, + { + "epoch": 0.20741381797201788, + "grad_norm": 10.716179400833628, + "learning_rate": 4.826548592511897e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1496.0, + "logps/rejected": -1200.0, + "loss": 0.6959, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2720.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3515625, + "rewards/margins": 0.02392578125, + "rewards/rejected": 0.328125, + "step": 719 + }, + { + "epoch": 0.2077022933794894, + "grad_norm": 10.811632977912513, + "learning_rate": 4.825625813992032e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1968.0, + "logps/rejected": -1648.0, + "loss": 0.6373, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.70703125, + "rewards/margins": 0.1259765625, + "rewards/rejected": 0.578125, + "step": 720 + }, + { + "epoch": 0.2079907687869609, + "grad_norm": 12.174280535165268, + "learning_rate": 4.824700676032922e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.171875, + "logps/chosen": -1776.0, + "logps/rejected": -1784.0, + "loss": 0.6888, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.04541015625, + "rewards/rejected": 0.50390625, + "step": 721 + }, + { + "epoch": 0.20827924419443242, + "grad_norm": 11.795632333869737, + "learning_rate": 4.823773179573158e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.125, + "logps/chosen": -1608.0, + "logps/rejected": -1704.0, + "loss": 0.707, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5078125, + "rewards/margins": -0.0185546875, + "rewards/rejected": 0.5234375, + "step": 722 + }, + { + "epoch": 0.20856771960190393, + "grad_norm": 9.078251008803598, + "learning_rate": 4.822843325553721e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.203125, + "logps/chosen": -1448.0, + "logps/rejected": -1424.0, + "loss": 0.6699, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5234375, + "rewards/margins": 0.072265625, + "rewards/rejected": 0.451171875, + "step": 723 + }, + { + "epoch": 0.20885619500937544, + "grad_norm": 10.222274664567664, + "learning_rate": 4.821911114917986e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1856.0, + "logps/rejected": -1840.0, + "loss": 0.6993, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.494140625, + "rewards/margins": -0.029296875, + "rewards/rejected": 0.5234375, + "step": 724 + }, + { + "epoch": 0.20914467041684695, + "grad_norm": 14.201053835839705, + "learning_rate": 4.820976548611717e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.140625, + "logps/chosen": -1792.0, + "logps/rejected": -1704.0, + "loss": 0.678, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6328125, + "rewards/margins": 0.18359375, + "rewards/rejected": 0.447265625, + "step": 725 + }, + { + "epoch": 0.2094331458243185, + "grad_norm": 10.649266406239299, + "learning_rate": 4.820039627583066e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1824.0, + "logps/rejected": -1784.0, + "loss": 0.6754, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.0810546875, + "rewards/rejected": 0.4765625, + "step": 726 + }, + { + "epoch": 0.20972162123179, + "grad_norm": 10.88402246312366, + "learning_rate": 4.819100352782581e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1656.0, + "logps/rejected": -1648.0, + "loss": 0.6921, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.0625, + "rewards/rejected": 0.498046875, + "step": 727 + }, + { + "epoch": 0.2100100966392615, + "grad_norm": 12.457949993147707, + "learning_rate": 4.81815872516319e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.203125, + "logps/chosen": -1656.0, + "logps/rejected": -1456.0, + "loss": 0.6895, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.017333984375, + "rewards/rejected": 0.51953125, + "step": 728 + }, + { + "epoch": 0.21029857204673302, + "grad_norm": 10.745960850335821, + "learning_rate": 4.817214745680212e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1528.0, + "logps/rejected": -1536.0, + "loss": 0.6649, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.6875, + "rewards/margins": 0.033935546875, + "rewards/rejected": 0.65625, + "step": 729 + }, + { + "epoch": 0.21058704745420453, + "grad_norm": 10.316746638279156, + "learning_rate": 4.816268415291352e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1600.0, + "logps/rejected": -1648.0, + "loss": 0.6833, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.42578125, + "rewards/margins": -0.08203125, + "rewards/rejected": 0.5078125, + "step": 730 + }, + { + "epoch": 0.21087552286167605, + "grad_norm": 11.870927244951593, + "learning_rate": 4.815319734956699e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -1936.0, + "logps/rejected": -1912.0, + "loss": 0.7054, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.50390625, + "rewards/margins": -0.1025390625, + "rewards/rejected": 0.60546875, + "step": 731 + }, + { + "epoch": 0.21116399826914756, + "grad_norm": 9.826235486019833, + "learning_rate": 4.814368705638726e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1416.0, + "logps/rejected": -1512.0, + "loss": 0.6851, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.357421875, + "rewards/margins": -0.009765625, + "rewards/rejected": 0.3671875, + "step": 732 + }, + { + "epoch": 0.21145247367661907, + "grad_norm": 10.981168775767602, + "learning_rate": 4.813415328302292e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.09375, + "logps/chosen": -1512.0, + "logps/rejected": -1448.0, + "loss": 0.6843, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.033935546875, + "rewards/rejected": 0.384765625, + "step": 733 + }, + { + "epoch": 0.21174094908409058, + "grad_norm": 11.419392207529663, + "learning_rate": 4.812459603914635e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.1875, + "logps/chosen": -1728.0, + "logps/rejected": -1672.0, + "loss": 0.685, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.546875, + "rewards/margins": 0.033935546875, + "rewards/rejected": 0.515625, + "step": 734 + }, + { + "epoch": 0.2120294244915621, + "grad_norm": 10.514347986848705, + "learning_rate": 4.811501533445374e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1688.0, + "logps/rejected": -1888.0, + "loss": 0.6918, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.515625, + "rewards/margins": 0.0224609375, + "rewards/rejected": 0.4921875, + "step": 735 + }, + { + "epoch": 0.2123178998990336, + "grad_norm": 10.770207230190788, + "learning_rate": 4.810541117866511e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1880.0, + "logps/rejected": -1464.0, + "loss": 0.675, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.1611328125, + "rewards/rejected": 0.427734375, + "step": 736 + }, + { + "epoch": 0.2126063753065051, + "grad_norm": 10.095393334059818, + "learning_rate": 4.809578358152423e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1384.0, + "logps/rejected": -1416.0, + "loss": 0.689, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39453125, + "rewards/margins": -0.0267333984375, + "rewards/rejected": 0.421875, + "step": 737 + }, + { + "epoch": 0.21289485071397662, + "grad_norm": 11.734817131212226, + "learning_rate": 4.808613255279871e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.25, + "logps/chosen": -2128.0, + "logps/rejected": -1784.0, + "loss": 0.6477, + "loss/demonstration_loss": -3968.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.1611328125, + "rewards/rejected": 0.3984375, + "step": 738 + }, + { + "epoch": 0.21318332612144814, + "grad_norm": 9.666998956048381, + "learning_rate": 4.807645810227988e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -1960.0, + "logps/rejected": -1992.0, + "loss": 0.677, + "loss/demonstration_loss": -4000.0, + "loss/preference_loss": -4000.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.546875, + "rewards/margins": 0.0458984375, + "rewards/rejected": 0.5, + "step": 739 + }, + { + "epoch": 0.21347180152891965, + "grad_norm": 17.63609448211562, + "learning_rate": 4.806676023978285e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.0625, + "logps/chosen": -1672.0, + "logps/rejected": -1792.0, + "loss": 0.7662, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.49609375, + "rewards/margins": -0.12255859375, + "rewards/rejected": 0.6171875, + "step": 740 + }, + { + "epoch": 0.21376027693639119, + "grad_norm": 11.49331490384248, + "learning_rate": 4.80570389751465e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.0625, + "logps/chosen": -1680.0, + "logps/rejected": -1520.0, + "loss": 0.7121, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.00201416015625, + "rewards/rejected": 0.375, + "step": 741 + }, + { + "epoch": 0.2140487523438627, + "grad_norm": 11.136275680249597, + "learning_rate": 4.804729431823343e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.09375, + "logps/chosen": -1512.0, + "logps/rejected": -1384.0, + "loss": 0.7009, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.482421875, + "rewards/margins": 0.0245361328125, + "rewards/rejected": 0.45703125, + "step": 742 + }, + { + "epoch": 0.2143372277513342, + "grad_norm": 9.9162938139212, + "learning_rate": 4.803752627892997e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.171875, + "logps/chosen": -1160.0, + "logps/rejected": -1120.0, + "loss": 0.6873, + "loss/demonstration_loss": -2320.0, + "loss/preference_loss": -2304.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.32421875, + "rewards/margins": 0.04052734375, + "rewards/rejected": 0.283203125, + "step": 743 + }, + { + "epoch": 0.21462570315880572, + "grad_norm": 12.526651487820564, + "learning_rate": 4.80277348671462e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.375, + "logps/chosen": -1944.0, + "logps/rejected": -2064.0, + "loss": 0.6979, + "loss/demonstration_loss": -4080.0, + "loss/preference_loss": -4080.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.609375, + "rewards/margins": -0.0174560546875, + "rewards/rejected": 0.625, + "step": 744 + }, + { + "epoch": 0.21491417856627723, + "grad_norm": 11.414316313391137, + "learning_rate": 4.801792009281588e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1472.0, + "logps/rejected": -1384.0, + "loss": 0.6796, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.1025390625, + "rewards/rejected": 0.359375, + "step": 745 + }, + { + "epoch": 0.21520265397374874, + "grad_norm": 10.906583280987837, + "learning_rate": 4.800808196589649e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.09375, + "logps/chosen": -1888.0, + "logps/rejected": -1720.0, + "loss": 0.6701, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.11083984375, + "rewards/rejected": 0.447265625, + "step": 746 + }, + { + "epoch": 0.21549112938122025, + "grad_norm": 9.726929692644552, + "learning_rate": 4.799822049636919e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -2008.0, + "logps/rejected": -1584.0, + "loss": 0.6833, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.05419921875, + "rewards/rejected": 0.49609375, + "step": 747 + }, + { + "epoch": 0.21577960478869176, + "grad_norm": 10.07858396947618, + "learning_rate": 4.798833569423885e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1472.0, + "logps/rejected": -1288.0, + "loss": 0.6721, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.453125, + "rewards/margins": 0.0859375, + "rewards/rejected": 0.3671875, + "step": 748 + }, + { + "epoch": 0.21606808019616328, + "grad_norm": 12.567324884876328, + "learning_rate": 4.797842756953396e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1896.0, + "logps/rejected": -1600.0, + "loss": 0.6531, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.64453125, + "rewards/margins": 0.162109375, + "rewards/rejected": 0.482421875, + "step": 749 + }, + { + "epoch": 0.2163565556036348, + "grad_norm": 11.011619684047062, + "learning_rate": 4.796849613230675e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1696.0, + "logps/rejected": -1664.0, + "loss": 0.7045, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.008056640625, + "rewards/rejected": 0.5, + "step": 750 + }, + { + "epoch": 0.2166450310111063, + "grad_norm": 11.627923604077534, + "learning_rate": 4.795854139263301e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1808.0, + "logps/rejected": -1736.0, + "loss": 0.6995, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.0018310546875, + "rewards/rejected": 0.54296875, + "step": 751 + }, + { + "epoch": 0.2169335064185778, + "grad_norm": 10.372329713893112, + "learning_rate": 4.794856336061224e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1240.0, + "logps/rejected": -1432.0, + "loss": 0.7021, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2720.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.498046875, + "rewards/margins": -0.03564453125, + "rewards/rejected": 0.53125, + "step": 752 + }, + { + "epoch": 0.21722198182604932, + "grad_norm": 9.674311966203964, + "learning_rate": 4.793856204636755e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.265625, + "logps/chosen": -1224.0, + "logps/rejected": -1232.0, + "loss": 0.7018, + "loss/demonstration_loss": -2480.0, + "loss/preference_loss": -2480.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.31640625, + "rewards/margins": -0.03173828125, + "rewards/rejected": 0.34765625, + "step": 753 + }, + { + "epoch": 0.21751045723352083, + "grad_norm": 10.927400718305211, + "learning_rate": 4.792853746004566e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1784.0, + "logps/rejected": -2048.0, + "loss": 0.6879, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.546875, + "rewards/margins": -0.031982421875, + "rewards/rejected": 0.578125, + "step": 754 + }, + { + "epoch": 0.21779893264099234, + "grad_norm": 10.678696466368375, + "learning_rate": 4.79184896118169e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.15625, + "logps/chosen": -1792.0, + "logps/rejected": -1728.0, + "loss": 0.6694, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.490234375, + "rewards/margins": 0.037353515625, + "rewards/rejected": 0.453125, + "step": 755 + }, + { + "epoch": 0.21808740804846388, + "grad_norm": 10.42729407783746, + "learning_rate": 4.790841851187523e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -1640.0, + "logps/rejected": -1480.0, + "loss": 0.6675, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4140625, + "rewards/margins": 0.033447265625, + "rewards/rejected": 0.380859375, + "step": 756 + }, + { + "epoch": 0.2183758834559354, + "grad_norm": 11.294229027280197, + "learning_rate": 4.789832417043817e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.078125, + "logps/chosen": -1808.0, + "logps/rejected": -1856.0, + "loss": 0.6999, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.47265625, + "rewards/margins": -0.06005859375, + "rewards/rejected": 0.53125, + "step": 757 + }, + { + "epoch": 0.2186643588634069, + "grad_norm": 10.129031655286697, + "learning_rate": 4.788820659774682e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.09375, + "logps/chosen": -1448.0, + "logps/rejected": -1552.0, + "loss": 0.696, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.57421875, + "rewards/margins": 0.0084228515625, + "rewards/rejected": 0.56640625, + "step": 758 + }, + { + "epoch": 0.21895283427087842, + "grad_norm": 10.160425206339376, + "learning_rate": 4.787806580406588e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1456.0, + "logps/rejected": -1448.0, + "loss": 0.6815, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.02685546875, + "rewards/rejected": 0.43359375, + "step": 759 + }, + { + "epoch": 0.21924130967834993, + "grad_norm": 10.706784668951755, + "learning_rate": 4.786790179968354e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -1864.0, + "logps/rejected": -1872.0, + "loss": 0.681, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.578125, + "rewards/margins": 0.037109375, + "rewards/rejected": 0.5390625, + "step": 760 + }, + { + "epoch": 0.21952978508582144, + "grad_norm": 10.483601920873845, + "learning_rate": 4.785771459491164e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.1875, + "logps/chosen": -1424.0, + "logps/rejected": -1216.0, + "loss": 0.6832, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2688.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.0654296875, + "rewards/rejected": 0.453125, + "step": 761 + }, + { + "epoch": 0.21981826049329295, + "grad_norm": 12.278500643181989, + "learning_rate": 4.784750420008545e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -1704.0, + "logps/rejected": -1632.0, + "loss": 0.6714, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.03857421875, + "rewards/rejected": 0.50390625, + "step": 762 + }, + { + "epoch": 0.22010673590076446, + "grad_norm": 11.456260638109258, + "learning_rate": 4.783727062556386e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1544.0, + "logps/rejected": -1576.0, + "loss": 0.6907, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.51953125, + "rewards/margins": -0.08544921875, + "rewards/rejected": 0.60546875, + "step": 763 + }, + { + "epoch": 0.22039521130823597, + "grad_norm": 9.891737451836569, + "learning_rate": 4.782701388172922e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.34375, + "logps/chosen": -1984.0, + "logps/rejected": -1784.0, + "loss": 0.639, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.59375, + "rewards/margins": 0.2421875, + "rewards/rejected": 0.349609375, + "step": 764 + }, + { + "epoch": 0.22068368671570748, + "grad_norm": 12.995043382884173, + "learning_rate": 4.781673397898739e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.046875, + "logps/chosen": -1464.0, + "logps/rejected": -1424.0, + "loss": 0.733, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.478515625, + "rewards/margins": -0.03662109375, + "rewards/rejected": 0.515625, + "step": 765 + }, + { + "epoch": 0.220972162123179, + "grad_norm": 10.688877235024968, + "learning_rate": 4.780643092776776e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -2240.0, + "logps/rejected": -1976.0, + "loss": 0.6799, + "loss/demonstration_loss": -4288.0, + "loss/preference_loss": -4288.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6796875, + "rewards/margins": 0.10498046875, + "rewards/rejected": 0.57421875, + "step": 766 + }, + { + "epoch": 0.2212606375306505, + "grad_norm": 10.702590110214809, + "learning_rate": 4.779610473852317e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.265625, + "logps/chosen": -2128.0, + "logps/rejected": -1920.0, + "loss": 0.6915, + "loss/demonstration_loss": -4096.0, + "loss/preference_loss": -4096.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.498046875, + "step": 767 + }, + { + "epoch": 0.22154911293812202, + "grad_norm": 12.544771235530325, + "learning_rate": 4.778575542172994e-07, + "logits/chosen": 2.9375, + "logits/rejected": 3.0, + "logps/chosen": -2096.0, + "logps/rejected": -1576.0, + "loss": 0.6801, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.062255859375, + "rewards/rejected": 0.49609375, + "step": 768 + }, + { + "epoch": 0.22183758834559353, + "grad_norm": 10.264003607868487, + "learning_rate": 4.777538298788787e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.34375, + "logps/chosen": -2096.0, + "logps/rejected": -1824.0, + "loss": 0.6567, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3968.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.734375, + "rewards/margins": 0.1845703125, + "rewards/rejected": 0.55078125, + "step": 769 + }, + { + "epoch": 0.22212606375306504, + "grad_norm": 11.248404200359586, + "learning_rate": 4.77649874475202e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.15625, + "logps/chosen": -1624.0, + "logps/rejected": -1648.0, + "loss": 0.6671, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.458984375, + "rewards/margins": -0.0186767578125, + "rewards/rejected": 0.478515625, + "step": 770 + }, + { + "epoch": 0.22241453916053655, + "grad_norm": 9.383476495257506, + "learning_rate": 4.775456881117363e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1592.0, + "logps/rejected": -1472.0, + "loss": 0.653, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.11474609375, + "rewards/rejected": 0.345703125, + "step": 771 + }, + { + "epoch": 0.2227030145680081, + "grad_norm": 10.416448032488953, + "learning_rate": 4.774412708941825e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1840.0, + "logps/rejected": -1528.0, + "loss": 0.6714, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.45703125, + "rewards/margins": 0.1259765625, + "rewards/rejected": 0.33203125, + "step": 772 + }, + { + "epoch": 0.2229914899754796, + "grad_norm": 10.282109197122923, + "learning_rate": 4.773366229284762e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.203125, + "logps/chosen": -1504.0, + "logps/rejected": -1496.0, + "loss": 0.69, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.0625, + "rewards/chosen": 0.515625, + "rewards/margins": -0.0927734375, + "rewards/rejected": 0.609375, + "step": 773 + }, + { + "epoch": 0.2232799653829511, + "grad_norm": 11.55962934443606, + "learning_rate": 4.77231744320787e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1656.0, + "logps/rejected": -1616.0, + "loss": 0.6917, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.490234375, + "rewards/margins": 0.109375, + "rewards/rejected": 0.380859375, + "step": 774 + }, + { + "epoch": 0.22356844079042262, + "grad_norm": 11.33381977631596, + "learning_rate": 4.771266351775181e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.0625, + "logps/chosen": -1864.0, + "logps/rejected": -1680.0, + "loss": 0.6583, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.64453125, + "rewards/margins": 0.1396484375, + "rewards/rejected": 0.50390625, + "step": 775 + }, + { + "epoch": 0.22385691619789413, + "grad_norm": 11.409222112409122, + "learning_rate": 4.77021295605307e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.375, + "logps/chosen": -1856.0, + "logps/rejected": -1920.0, + "loss": 0.6959, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5546875, + "rewards/margins": -0.02978515625, + "rewards/rejected": 0.58203125, + "step": 776 + }, + { + "epoch": 0.22414539160536565, + "grad_norm": 10.512079896468661, + "learning_rate": 4.769157257110249e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1208.0, + "logps/rejected": -1208.0, + "loss": 0.6825, + "loss/demonstration_loss": -2448.0, + "loss/preference_loss": -2448.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.380859375, + "rewards/margins": 0.023681640625, + "rewards/rejected": 0.357421875, + "step": 777 + }, + { + "epoch": 0.22443386701283716, + "grad_norm": 10.270088913394073, + "learning_rate": 4.7680992560177655e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.25, + "logps/chosen": -1784.0, + "logps/rejected": -2016.0, + "loss": 0.7098, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.46484375, + "rewards/margins": -0.059326171875, + "rewards/rejected": 0.5234375, + "step": 778 + }, + { + "epoch": 0.22472234242030867, + "grad_norm": 11.021085673216014, + "learning_rate": 4.767038953849004e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.265625, + "logps/chosen": -1560.0, + "logps/rejected": -1536.0, + "loss": 0.6766, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.053955078125, + "rewards/rejected": 0.392578125, + "step": 779 + }, + { + "epoch": 0.22501081782778018, + "grad_norm": 11.74429204541403, + "learning_rate": 4.7659763516796834e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1640.0, + "logps/rejected": -1632.0, + "loss": 0.6841, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.494140625, + "rewards/margins": -0.0185546875, + "rewards/rejected": 0.51171875, + "step": 780 + }, + { + "epoch": 0.2252992932352517, + "grad_norm": 10.637778976965793, + "learning_rate": 4.7649114505878554e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.203125, + "logps/chosen": -1648.0, + "logps/rejected": -1480.0, + "loss": 0.6826, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.578125, + "rewards/margins": 0.10595703125, + "rewards/rejected": 0.47265625, + "step": 781 + }, + { + "epoch": 0.2255877686427232, + "grad_norm": 10.944275520852242, + "learning_rate": 4.763844251653902e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1480.0, + "logps/rejected": -1488.0, + "loss": 0.685, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.365234375, + "rewards/margins": -0.1005859375, + "rewards/rejected": 0.466796875, + "step": 782 + }, + { + "epoch": 0.2258762440501947, + "grad_norm": 10.12640516157928, + "learning_rate": 4.7627747559605425e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.109375, + "logps/chosen": -1576.0, + "logps/rejected": -1664.0, + "loss": 0.7059, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.53125, + "rewards/margins": -0.006591796875, + "rewards/rejected": 0.5390625, + "step": 783 + }, + { + "epoch": 0.22616471945766622, + "grad_norm": 9.480851705203083, + "learning_rate": 4.76170296459282e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.1875, + "logps/chosen": -884.0, + "logps/rejected": -1152.0, + "loss": 0.6787, + "loss/demonstration_loss": -2064.0, + "loss/preference_loss": -2064.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.232421875, + "rewards/margins": -0.0002899169921875, + "rewards/rejected": 0.232421875, + "step": 784 + }, + { + "epoch": 0.22645319486513774, + "grad_norm": 10.301037507342453, + "learning_rate": 4.760628878638109e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.28125, + "logps/chosen": -1528.0, + "logps/rejected": -1552.0, + "loss": 0.6366, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.47265625, + "rewards/margins": 0.045166015625, + "rewards/rejected": 0.427734375, + "step": 785 + }, + { + "epoch": 0.22674167027260925, + "grad_norm": 11.505669544271175, + "learning_rate": 4.759552499186113e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1496.0, + "logps/rejected": -1416.0, + "loss": 0.696, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.345703125, + "rewards/margins": -0.06884765625, + "rewards/rejected": 0.416015625, + "step": 786 + }, + { + "epoch": 0.22703014568008079, + "grad_norm": 8.843767392481718, + "learning_rate": 4.7584738273288615e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.28125, + "logps/chosen": -1792.0, + "logps/rejected": -1512.0, + "loss": 0.6517, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.1474609375, + "rewards/rejected": 0.36328125, + "step": 787 + }, + { + "epoch": 0.2273186210875523, + "grad_norm": 11.870974186373632, + "learning_rate": 4.757392864160709e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1800.0, + "logps/rejected": -1768.0, + "loss": 0.7079, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.408203125, + "rewards/margins": -0.068359375, + "rewards/rejected": 0.4765625, + "step": 788 + }, + { + "epoch": 0.2276070964950238, + "grad_norm": 10.021499097336493, + "learning_rate": 4.756309610778336e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.203125, + "logps/chosen": -1776.0, + "logps/rejected": -1624.0, + "loss": 0.6849, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.474609375, + "rewards/margins": 0.08837890625, + "rewards/rejected": 0.384765625, + "step": 789 + }, + { + "epoch": 0.22789557190249532, + "grad_norm": 11.042010895080205, + "learning_rate": 4.7552240682807466e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.25, + "logps/chosen": -2128.0, + "logps/rejected": -1936.0, + "loss": 0.6693, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4096.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.66015625, + "rewards/margins": 0.1484375, + "rewards/rejected": 0.51171875, + "step": 790 + }, + { + "epoch": 0.22818404730996683, + "grad_norm": 9.818454430936878, + "learning_rate": 4.754136237769264e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.296875, + "logps/chosen": -1608.0, + "logps/rejected": -1560.0, + "loss": 0.7092, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.515625, + "rewards/margins": 0.0003662109375, + "rewards/rejected": 0.515625, + "step": 791 + }, + { + "epoch": 0.22847252271743834, + "grad_norm": 9.404236270857183, + "learning_rate": 4.753046120347538e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.4375, + "logps/chosen": -1768.0, + "logps/rejected": -1640.0, + "loss": 0.6659, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5390625, + "rewards/margins": 0.0576171875, + "rewards/rejected": 0.482421875, + "step": 792 + }, + { + "epoch": 0.22876099812490985, + "grad_norm": 11.664474784377166, + "learning_rate": 4.751953717121534e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.125, + "logps/chosen": -1784.0, + "logps/rejected": -1712.0, + "loss": 0.6915, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.11181640625, + "rewards/rejected": 0.396484375, + "step": 793 + }, + { + "epoch": 0.22904947353238136, + "grad_norm": 9.591498649773056, + "learning_rate": 4.7508590291995387e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.34375, + "logps/chosen": -1464.0, + "logps/rejected": -1448.0, + "loss": 0.6516, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.078125, + "rewards/rejected": 0.38671875, + "step": 794 + }, + { + "epoch": 0.22933794893985288, + "grad_norm": 10.361518173349966, + "learning_rate": 4.749762057692157e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.359375, + "logps/chosen": -1696.0, + "logps/rejected": -1872.0, + "loss": 0.6879, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.06005859375, + "rewards/rejected": 0.51171875, + "step": 795 + }, + { + "epoch": 0.2296264243473244, + "grad_norm": 8.818637581966785, + "learning_rate": 4.748662803712309e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1552.0, + "logps/rejected": -1264.0, + "loss": 0.6754, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2864.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.10205078125, + "rewards/rejected": 0.40625, + "step": 796 + }, + { + "epoch": 0.2299148997547959, + "grad_norm": 13.867856803535817, + "learning_rate": 4.7475612683752307e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.265625, + "logps/chosen": -1592.0, + "logps/rejected": -1728.0, + "loss": 0.7239, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.48046875, + "rewards/margins": -0.0252685546875, + "rewards/rejected": 0.50390625, + "step": 797 + }, + { + "epoch": 0.2302033751622674, + "grad_norm": 9.794300973888253, + "learning_rate": 4.7464574527984746e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.34375, + "logps/chosen": -1848.0, + "logps/rejected": -1648.0, + "loss": 0.6559, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6796875, + "rewards/margins": 0.142578125, + "rewards/rejected": 0.53515625, + "step": 798 + }, + { + "epoch": 0.23049185056973892, + "grad_norm": 10.7887130072238, + "learning_rate": 4.7453513581019045e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.078125, + "logps/chosen": -2064.0, + "logps/rejected": -1792.0, + "loss": 0.6571, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3920.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.64453125, + "rewards/margins": 0.169921875, + "rewards/rejected": 0.474609375, + "step": 799 + }, + { + "epoch": 0.23078032597721043, + "grad_norm": 9.787222627532174, + "learning_rate": 4.744242985407697e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1736.0, + "logps/rejected": -1536.0, + "loss": 0.6699, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.09716796875, + "rewards/rejected": 0.455078125, + "step": 800 + }, + { + "epoch": 0.23106880138468194, + "grad_norm": 12.4120820523628, + "learning_rate": 4.7431323358403397e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.21875, + "logps/chosen": -1856.0, + "logps/rejected": -1776.0, + "loss": 0.6675, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.484375, + "rewards/margins": 0.040283203125, + "rewards/rejected": 0.443359375, + "step": 801 + }, + { + "epoch": 0.23135727679215348, + "grad_norm": 12.132228232391645, + "learning_rate": 4.742019410526632e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.171875, + "logps/chosen": -1544.0, + "logps/rejected": -1696.0, + "loss": 0.7286, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.494140625, + "rewards/margins": -0.07275390625, + "rewards/rejected": 0.56640625, + "step": 802 + }, + { + "epoch": 0.231645752199625, + "grad_norm": 10.83451110623162, + "learning_rate": 4.740904210595679e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -1984.0, + "logps/rejected": -1672.0, + "loss": 0.6667, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.142578125, + "rewards/rejected": 0.421875, + "step": 803 + }, + { + "epoch": 0.2319342276070965, + "grad_norm": 13.565057108496916, + "learning_rate": 4.739786737178895e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.203125, + "logps/chosen": -1376.0, + "logps/rejected": -1488.0, + "loss": 0.7132, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.435546875, + "rewards/margins": 0.01513671875, + "rewards/rejected": 0.419921875, + "step": 804 + }, + { + "epoch": 0.23222270301456802, + "grad_norm": 11.679900047201592, + "learning_rate": 4.7386669914100026e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.1875, + "logps/chosen": -1512.0, + "logps/rejected": -1560.0, + "loss": 0.6829, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5625, + "rewards/margins": 0.033935546875, + "rewards/rejected": 0.52734375, + "step": 805 + }, + { + "epoch": 0.23251117842203953, + "grad_norm": 12.37063049798951, + "learning_rate": 4.7375449744250264e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1496.0, + "logps/rejected": -1456.0, + "loss": 0.6904, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.458984375, + "rewards/margins": 0.060546875, + "rewards/rejected": 0.3984375, + "step": 806 + }, + { + "epoch": 0.23279965382951104, + "grad_norm": 11.037190300011083, + "learning_rate": 4.7364206873622974e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.09375, + "logps/chosen": -1480.0, + "logps/rejected": -1368.0, + "loss": 0.6632, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.0712890625, + "rewards/rejected": 0.390625, + "step": 807 + }, + { + "epoch": 0.23308812923698255, + "grad_norm": 11.362658973782954, + "learning_rate": 4.7352941313624495e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1888.0, + "logps/rejected": -1680.0, + "loss": 0.6901, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.0146484375, + "rewards/rejected": 0.62109375, + "step": 808 + }, + { + "epoch": 0.23337660464445406, + "grad_norm": 10.122933636489352, + "learning_rate": 4.7341653075684186e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.328125, + "logps/chosen": -1936.0, + "logps/rejected": -1792.0, + "loss": 0.6382, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.126953125, + "rewards/rejected": 0.392578125, + "step": 809 + }, + { + "epoch": 0.23366508005192557, + "grad_norm": 8.839221635012823, + "learning_rate": 4.73303421712544e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -1464.0, + "logps/rejected": -1280.0, + "loss": 0.6655, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.484375, + "rewards/margins": 0.09130859375, + "rewards/rejected": 0.392578125, + "step": 810 + }, + { + "epoch": 0.23395355545939708, + "grad_norm": 10.980117329676265, + "learning_rate": 4.7319008611810504e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1696.0, + "logps/rejected": -1680.0, + "loss": 0.6497, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.498046875, + "rewards/margins": -0.01019287109375, + "rewards/rejected": 0.5078125, + "step": 811 + }, + { + "epoch": 0.2342420308668686, + "grad_norm": 11.545790571748764, + "learning_rate": 4.730765240885084e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.28125, + "logps/chosen": -1728.0, + "logps/rejected": -1408.0, + "loss": 0.6575, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.1416015625, + "rewards/rejected": 0.333984375, + "step": 812 + }, + { + "epoch": 0.2345305062743401, + "grad_norm": 11.506886564916305, + "learning_rate": 4.7296273573896726e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.125, + "logps/chosen": -1832.0, + "logps/rejected": -1856.0, + "loss": 0.6799, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.4140625, + "rewards/margins": -0.0205078125, + "rewards/rejected": 0.435546875, + "step": 813 + }, + { + "epoch": 0.23481898168181162, + "grad_norm": 9.950823895843715, + "learning_rate": 4.7284872118492436e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1928.0, + "logps/rejected": -1824.0, + "loss": 0.6603, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.578125, + "rewards/margins": 0.181640625, + "rewards/rejected": 0.396484375, + "step": 814 + }, + { + "epoch": 0.23510745708928313, + "grad_norm": 11.477741502553926, + "learning_rate": 4.72734480542052e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.3125, + "logps/chosen": -1976.0, + "logps/rejected": -1984.0, + "loss": 0.6876, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4016.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.65234375, + "rewards/margins": 0.06982421875, + "rewards/rejected": 0.58203125, + "step": 815 + }, + { + "epoch": 0.23539593249675464, + "grad_norm": 10.805987323200057, + "learning_rate": 4.7262001392625186e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.265625, + "logps/chosen": -1936.0, + "logps/rejected": -1640.0, + "loss": 0.6753, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.546875, + "rewards/margins": 0.008056640625, + "rewards/rejected": 0.5390625, + "step": 816 + }, + { + "epoch": 0.23568440790422618, + "grad_norm": 9.73096815758516, + "learning_rate": 4.725053214536547e-07, + "logits/chosen": 3.421875, + "logits/rejected": 3.34375, + "logps/chosen": -1864.0, + "logps/rejected": -1736.0, + "loss": 0.6801, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.06787109375, + "rewards/rejected": 0.439453125, + "step": 817 + }, + { + "epoch": 0.2359728833116977, + "grad_norm": 9.548384861756809, + "learning_rate": 4.723904032406206e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.140625, + "logps/chosen": -1616.0, + "logps/rejected": -1552.0, + "loss": 0.6224, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.70703125, + "rewards/margins": 0.21484375, + "rewards/rejected": 0.4921875, + "step": 818 + }, + { + "epoch": 0.2362613587191692, + "grad_norm": 10.49081696720962, + "learning_rate": 4.722752594037388e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.375, + "logps/chosen": -1672.0, + "logps/rejected": -1816.0, + "loss": 0.6826, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.466796875, + "rewards/margins": -0.0025482177734375, + "rewards/rejected": 0.46875, + "step": 819 + }, + { + "epoch": 0.2365498341266407, + "grad_norm": 11.108943157346348, + "learning_rate": 4.7215989005982714e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1320.0, + "logps/rejected": -1440.0, + "loss": 0.6708, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.52734375, + "rewards/margins": -0.01123046875, + "rewards/rejected": 0.5390625, + "step": 820 + }, + { + "epoch": 0.23683830953411222, + "grad_norm": 9.613600091322146, + "learning_rate": 4.7204429532593235e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.25, + "logps/chosen": -1568.0, + "logps/rejected": -1344.0, + "loss": 0.6649, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5390625, + "rewards/margins": 0.1474609375, + "rewards/rejected": 0.390625, + "step": 821 + }, + { + "epoch": 0.23712678494158373, + "grad_norm": 11.886570874261434, + "learning_rate": 4.719284753193299e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.171875, + "logps/chosen": -1664.0, + "logps/rejected": -1408.0, + "loss": 0.6855, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.1181640625, + "rewards/rejected": 0.44140625, + "step": 822 + }, + { + "epoch": 0.23741526034905525, + "grad_norm": 10.050707587002151, + "learning_rate": 4.718124301575238e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.34375, + "logps/chosen": -1464.0, + "logps/rejected": -1368.0, + "loss": 0.6705, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2864.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.37109375, + "rewards/margins": 0.030517578125, + "rewards/rejected": 0.33984375, + "step": 823 + }, + { + "epoch": 0.23770373575652676, + "grad_norm": 9.994920041113383, + "learning_rate": 4.7169615995824637e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.28125, + "logps/chosen": -1208.0, + "logps/rejected": -1112.0, + "loss": 0.6497, + "loss/demonstration_loss": -2352.0, + "loss/preference_loss": -2352.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.04931640625, + "rewards/rejected": 0.34375, + "step": 824 + }, + { + "epoch": 0.23799221116399827, + "grad_norm": 12.190246233526418, + "learning_rate": 4.7157966483945835e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.03125, + "logps/chosen": -1560.0, + "logps/rejected": -1904.0, + "loss": 0.7321, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5859375, + "rewards/margins": -0.0703125, + "rewards/rejected": 0.65625, + "step": 825 + }, + { + "epoch": 0.23828068657146978, + "grad_norm": 11.557976031747671, + "learning_rate": 4.7146294491934865e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.265625, + "logps/chosen": -1760.0, + "logps/rejected": -1648.0, + "loss": 0.6562, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.1533203125, + "rewards/rejected": 0.412109375, + "step": 826 + }, + { + "epoch": 0.2385691619789413, + "grad_norm": 10.536740357072409, + "learning_rate": 4.713460003163342e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1656.0, + "logps/rejected": -1576.0, + "loss": 0.6671, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.1025390625, + "rewards/rejected": 0.52734375, + "step": 827 + }, + { + "epoch": 0.2388576373864128, + "grad_norm": 10.934009091586816, + "learning_rate": 4.7122883114905997e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.265625, + "logps/chosen": -2008.0, + "logps/rejected": -2016.0, + "loss": 0.7086, + "loss/demonstration_loss": -4080.0, + "loss/preference_loss": -4080.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.58203125, + "rewards/margins": -0.04052734375, + "rewards/rejected": 0.62109375, + "step": 828 + }, + { + "epoch": 0.2391461127938843, + "grad_norm": 9.512945452235257, + "learning_rate": 4.711114375363987e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1448.0, + "logps/rejected": -1232.0, + "loss": 0.6549, + "loss/demonstration_loss": -2736.0, + "loss/preference_loss": -2720.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.453125, + "rewards/margins": 0.09228515625, + "rewards/rejected": 0.361328125, + "step": 829 + }, + { + "epoch": 0.23943458820135582, + "grad_norm": 12.986326945426041, + "learning_rate": 4.7099381959745077e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.328125, + "logps/chosen": -1944.0, + "logps/rejected": -1928.0, + "loss": 0.6951, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.671875, + "rewards/margins": -0.0303955078125, + "rewards/rejected": 0.69921875, + "step": 830 + }, + { + "epoch": 0.23972306360882734, + "grad_norm": 11.889624527518347, + "learning_rate": 4.708759774515444e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1968.0, + "logps/rejected": -1704.0, + "loss": 0.6827, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.625, + "rewards/margins": 0.03515625, + "rewards/rejected": 0.58984375, + "step": 831 + }, + { + "epoch": 0.24001153901629885, + "grad_norm": 9.671930946902412, + "learning_rate": 4.7075791121823487e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.296875, + "logps/chosen": -1480.0, + "logps/rejected": -1392.0, + "loss": 0.6969, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.0751953125, + "rewards/rejected": 0.4140625, + "step": 832 + }, + { + "epoch": 0.24030001442377039, + "grad_norm": 11.575294438296122, + "learning_rate": 4.7063962101730524e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.296875, + "logps/chosen": -1696.0, + "logps/rejected": -1608.0, + "loss": 0.7339, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.53515625, + "rewards/margins": -0.03515625, + "rewards/rejected": 0.5703125, + "step": 833 + }, + { + "epoch": 0.2405884898312419, + "grad_norm": 11.089579624254114, + "learning_rate": 4.7052110696876545e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.046875, + "logps/chosen": -1096.0, + "logps/rejected": -1096.0, + "loss": 0.7113, + "loss/demonstration_loss": -2240.0, + "loss/preference_loss": -2240.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.39453125, + "rewards/margins": -0.043701171875, + "rewards/rejected": 0.439453125, + "step": 834 + }, + { + "epoch": 0.2408769652387134, + "grad_norm": 10.85282376573568, + "learning_rate": 4.704023691928528e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.25, + "logps/chosen": -1864.0, + "logps/rejected": -1656.0, + "loss": 0.627, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.703125, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.57421875, + "step": 835 + }, + { + "epoch": 0.24116544064618492, + "grad_norm": 8.973876395452272, + "learning_rate": 4.702834078100314e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -2040.0, + "logps/rejected": -1760.0, + "loss": 0.653, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.1083984375, + "rewards/rejected": 0.52734375, + "step": 836 + }, + { + "epoch": 0.24145391605365643, + "grad_norm": 9.85279863086701, + "learning_rate": 4.701642229409922e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -1616.0, + "logps/rejected": -1784.0, + "loss": 0.6949, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5, + "rewards/margins": -0.00848388671875, + "rewards/rejected": 0.5078125, + "step": 837 + }, + { + "epoch": 0.24174239146112794, + "grad_norm": 11.679607802472916, + "learning_rate": 4.7004481470665305e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -1680.0, + "logps/rejected": -1752.0, + "loss": 0.6858, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.55078125, + "rewards/margins": -0.0302734375, + "rewards/rejected": 0.58203125, + "step": 838 + }, + { + "epoch": 0.24203086686859945, + "grad_norm": 10.908930535095074, + "learning_rate": 4.6992518322815835e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.34375, + "logps/chosen": -1672.0, + "logps/rejected": -1712.0, + "loss": 0.7009, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.040771484375, + "rewards/rejected": 0.470703125, + "step": 839 + }, + { + "epoch": 0.24231934227607096, + "grad_norm": 10.542724071414089, + "learning_rate": 4.698053286268788e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.328125, + "logps/chosen": -1384.0, + "logps/rejected": -1368.0, + "loss": 0.6782, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4375, + "rewards/margins": 0.0059814453125, + "rewards/rejected": 0.431640625, + "step": 840 + }, + { + "epoch": 0.24260781768354248, + "grad_norm": 12.455285302848992, + "learning_rate": 4.6968525102441175e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.171875, + "logps/chosen": -1664.0, + "logps/rejected": -1688.0, + "loss": 0.6554, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.0079345703125, + "rewards/rejected": 0.47265625, + "step": 841 + }, + { + "epoch": 0.242896293091014, + "grad_norm": 10.842203346965643, + "learning_rate": 4.695649505425807e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -2008.0, + "logps/rejected": -2128.0, + "loss": 0.7206, + "loss/demonstration_loss": -4224.0, + "loss/preference_loss": -4192.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.69140625, + "rewards/margins": -0.0147705078125, + "rewards/rejected": 0.70703125, + "step": 842 + }, + { + "epoch": 0.2431847684984855, + "grad_norm": 8.700913787891386, + "learning_rate": 4.694444273034351e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.375, + "logps/chosen": -1720.0, + "logps/rejected": -1728.0, + "loss": 0.676, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.625, + "rewards/margins": 0.059814453125, + "rewards/rejected": 0.56640625, + "step": 843 + }, + { + "epoch": 0.243473243905957, + "grad_norm": 9.87637023456039, + "learning_rate": 4.6932368142925076e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1456.0, + "logps/rejected": -1200.0, + "loss": 0.6865, + "loss/demonstration_loss": -2704.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.466796875, + "rewards/margins": 0.076171875, + "rewards/rejected": 0.390625, + "step": 844 + }, + { + "epoch": 0.24376171931342852, + "grad_norm": 10.08791195452272, + "learning_rate": 4.6920271304252893e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.09375, + "logps/chosen": -1208.0, + "logps/rejected": -1152.0, + "loss": 0.657, + "loss/demonstration_loss": -2400.0, + "loss/preference_loss": -2400.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.42578125, + "rewards/margins": 0.062255859375, + "rewards/rejected": 0.36328125, + "step": 845 + }, + { + "epoch": 0.24405019472090003, + "grad_norm": 10.539318405924906, + "learning_rate": 4.6908152226599696e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.21875, + "logps/chosen": -1384.0, + "logps/rejected": -1464.0, + "loss": 0.6971, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.017822265625, + "rewards/rejected": 0.458984375, + "step": 846 + }, + { + "epoch": 0.24433867012837154, + "grad_norm": 9.2156612814942, + "learning_rate": 4.689601092226075e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.203125, + "logps/chosen": -1968.0, + "logps/rejected": -1784.0, + "loss": 0.6456, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.734375, + "rewards/margins": 0.21875, + "rewards/rejected": 0.515625, + "step": 847 + }, + { + "epoch": 0.24462714553584308, + "grad_norm": 11.090165248234777, + "learning_rate": 4.688384740355391e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1784.0, + "logps/rejected": -1888.0, + "loss": 0.7018, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.66015625, + "rewards/margins": 0.109375, + "rewards/rejected": 0.5546875, + "step": 848 + }, + { + "epoch": 0.2449156209433146, + "grad_norm": 10.142164098617345, + "learning_rate": 4.687166168281953e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.875, + "logps/chosen": -1776.0, + "logps/rejected": -1712.0, + "loss": 0.6528, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.46875, + "rewards/margins": 0.06689453125, + "rewards/rejected": 0.400390625, + "step": 849 + }, + { + "epoch": 0.2452040963507861, + "grad_norm": 10.812163238919457, + "learning_rate": 4.685945377242051e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1920.0, + "logps/rejected": -1824.0, + "loss": 0.678, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5625, + "rewards/margins": 0.058837890625, + "rewards/rejected": 0.50390625, + "step": 850 + }, + { + "epoch": 0.24549257175825762, + "grad_norm": 12.585251297162053, + "learning_rate": 4.6847223684742255e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.25, + "logps/chosen": -2064.0, + "logps/rejected": -1816.0, + "loss": 0.6902, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3920.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.11962890625, + "rewards/rejected": 0.51953125, + "step": 851 + }, + { + "epoch": 0.24578104716572913, + "grad_norm": 10.985520203554621, + "learning_rate": 4.6834971432192673e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -1760.0, + "logps/rejected": -1592.0, + "loss": 0.628, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.57421875, + "rewards/margins": 0.12255859375, + "rewards/rejected": 0.453125, + "step": 852 + }, + { + "epoch": 0.24606952257320064, + "grad_norm": 10.043944675552694, + "learning_rate": 4.6822697027202164e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1592.0, + "logps/rejected": -1744.0, + "loss": 0.6982, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.55078125, + "rewards/margins": -0.031494140625, + "rewards/rejected": 0.58203125, + "step": 853 + }, + { + "epoch": 0.24635799798067215, + "grad_norm": 10.652715841911137, + "learning_rate": 4.681040048222359e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.328125, + "logps/chosen": -1848.0, + "logps/rejected": -1672.0, + "loss": 0.5963, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.61328125, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.486328125, + "step": 854 + }, + { + "epoch": 0.24664647338814366, + "grad_norm": 10.6133898281545, + "learning_rate": 4.6798081809732286e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.34375, + "logps/chosen": -1896.0, + "logps/rejected": -1800.0, + "loss": 0.6921, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.609375, + "rewards/margins": 0.03466796875, + "rewards/rejected": 0.57421875, + "step": 855 + }, + { + "epoch": 0.24693494879561517, + "grad_norm": 11.894926314541017, + "learning_rate": 4.6785741022226026e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.09375, + "logps/chosen": -1232.0, + "logps/rejected": -1360.0, + "loss": 0.6938, + "loss/demonstration_loss": -2624.0, + "loss/preference_loss": -2640.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.373046875, + "rewards/margins": -0.08642578125, + "rewards/rejected": 0.4609375, + "step": 856 + }, + { + "epoch": 0.24722342420308668, + "grad_norm": 12.40048594469965, + "learning_rate": 4.677337813222503e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1112.0, + "logps/rejected": -1208.0, + "loss": 0.7119, + "loss/demonstration_loss": -2368.0, + "loss/preference_loss": -2368.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.49609375, + "rewards/margins": 0.0084228515625, + "rewards/rejected": 0.486328125, + "step": 857 + }, + { + "epoch": 0.2475118996105582, + "grad_norm": 10.194155743754278, + "learning_rate": 4.6760993152271944e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1352.0, + "logps/rejected": -1392.0, + "loss": 0.6817, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46875, + "rewards/margins": 0.0888671875, + "rewards/rejected": 0.380859375, + "step": 858 + }, + { + "epoch": 0.2478003750180297, + "grad_norm": 11.86449775886594, + "learning_rate": 4.674858609493181e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.21875, + "logps/chosen": -1512.0, + "logps/rejected": -1704.0, + "loss": 0.6878, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.490234375, + "rewards/margins": -0.0252685546875, + "rewards/rejected": 0.515625, + "step": 859 + }, + { + "epoch": 0.24808885042550122, + "grad_norm": 10.846217791068653, + "learning_rate": 4.6736156972792074e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1920.0, + "logps/rejected": -1792.0, + "loss": 0.6471, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6015625, + "rewards/margins": 0.125, + "rewards/rejected": 0.4765625, + "step": 860 + }, + { + "epoch": 0.24837732583297273, + "grad_norm": 10.196511433060962, + "learning_rate": 4.672370579846259e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.203125, + "logps/chosen": -1768.0, + "logps/rejected": -1656.0, + "loss": 0.6444, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.59375, + "rewards/margins": 0.1435546875, + "rewards/rejected": 0.451171875, + "step": 861 + }, + { + "epoch": 0.24866580124044424, + "grad_norm": 9.97485184345224, + "learning_rate": 4.6711232584575543e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.203125, + "logps/chosen": -1504.0, + "logps/rejected": -1384.0, + "loss": 0.7112, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.53125, + "rewards/margins": 0.00506591796875, + "rewards/rejected": 0.52734375, + "step": 862 + }, + { + "epoch": 0.24895427664791578, + "grad_norm": 10.291910474956397, + "learning_rate": 4.6698737343785523e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1872.0, + "logps/rejected": -1640.0, + "loss": 0.624, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.58203125, + "rewards/margins": 0.09375, + "rewards/rejected": 0.48828125, + "step": 863 + }, + { + "epoch": 0.2492427520553873, + "grad_norm": 10.245392198155434, + "learning_rate": 4.6686220088769437e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.09375, + "logps/chosen": -1400.0, + "logps/rejected": -1208.0, + "loss": 0.6788, + "loss/demonstration_loss": -2640.0, + "loss/preference_loss": -2624.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3125, + "rewards/margins": 0.07373046875, + "rewards/rejected": 0.2373046875, + "step": 864 + }, + { + "epoch": 0.2495312274628588, + "grad_norm": 10.928937576672332, + "learning_rate": 4.667368083222652e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0, + "logps/chosen": -1472.0, + "logps/rejected": -1568.0, + "loss": 0.6732, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.490234375, + "rewards/margins": 0.05810546875, + "rewards/rejected": 0.431640625, + "step": 865 + }, + { + "epoch": 0.2498197028703303, + "grad_norm": 9.788955597305009, + "learning_rate": 4.666111958687836e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -2096.0, + "logps/rejected": -1736.0, + "loss": 0.6232, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.70703125, + "rewards/margins": 0.30859375, + "rewards/rejected": 0.3984375, + "step": 866 + }, + { + "epoch": 0.2501081782778018, + "grad_norm": 10.294339369715468, + "learning_rate": 4.664853636546884e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1496.0, + "logps/rejected": -1560.0, + "loss": 0.6798, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.4140625, + "step": 867 + }, + { + "epoch": 0.2503966536852733, + "grad_norm": 11.441166234335428, + "learning_rate": 4.6635931180764114e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.3125, + "logps/chosen": -1576.0, + "logps/rejected": -1504.0, + "loss": 0.6924, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.0142822265625, + "rewards/rejected": 0.474609375, + "step": 868 + }, + { + "epoch": 0.2506851290927448, + "grad_norm": 11.168272849852537, + "learning_rate": 4.662330404555266e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.328125, + "logps/chosen": -1592.0, + "logps/rejected": -1648.0, + "loss": 0.6674, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.033203125, + "rewards/rejected": 0.53515625, + "step": 869 + }, + { + "epoch": 0.25097360450021633, + "grad_norm": 10.391460529116333, + "learning_rate": 4.6610654972645205e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1552.0, + "logps/rejected": -1472.0, + "loss": 0.6923, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.0216064453125, + "rewards/rejected": 0.515625, + "step": 870 + }, + { + "epoch": 0.2512620799076879, + "grad_norm": 10.478185318363, + "learning_rate": 4.6597983974874715e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.09375, + "logps/chosen": -1696.0, + "logps/rejected": -1384.0, + "loss": 0.6859, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6171875, + "rewards/margins": 0.154296875, + "rewards/rejected": 0.46484375, + "step": 871 + }, + { + "epoch": 0.2515505553151594, + "grad_norm": 10.68005738922613, + "learning_rate": 4.6585291065096433e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1496.0, + "logps/rejected": -1640.0, + "loss": 0.6859, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.058349609375, + "rewards/rejected": 0.486328125, + "step": 872 + }, + { + "epoch": 0.2518390307226309, + "grad_norm": 12.0253173496019, + "learning_rate": 4.657257625618782e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1392.0, + "logps/rejected": -1304.0, + "loss": 0.7023, + "loss/demonstration_loss": -2736.0, + "loss/preference_loss": -2736.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.0517578125, + "rewards/rejected": 0.35546875, + "step": 873 + }, + { + "epoch": 0.25212750613010243, + "grad_norm": 10.849421679069927, + "learning_rate": 4.655983956104854e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1608.0, + "logps/rejected": -1656.0, + "loss": 0.6778, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.484375, + "rewards/margins": 0.04150390625, + "rewards/rejected": 0.443359375, + "step": 874 + }, + { + "epoch": 0.25241598153757394, + "grad_norm": 11.229638318892812, + "learning_rate": 4.6547080992600476e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -1784.0, + "logps/rejected": -1632.0, + "loss": 0.624, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.09521484375, + "rewards/rejected": 0.57421875, + "step": 875 + }, + { + "epoch": 0.25270445694504545, + "grad_norm": 10.287746748360936, + "learning_rate": 4.6534300563787707e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -2096.0, + "logps/rejected": -2064.0, + "loss": 0.6576, + "loss/demonstration_loss": -4256.0, + "loss/preference_loss": -4224.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7109375, + "rewards/margins": 0.0595703125, + "rewards/rejected": 0.65234375, + "step": 876 + }, + { + "epoch": 0.25299293235251696, + "grad_norm": 11.070799087421612, + "learning_rate": 4.6521498287576477e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.234375, + "logps/chosen": -1520.0, + "logps/rejected": -1560.0, + "loss": 0.7085, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.034423828125, + "rewards/rejected": 0.39453125, + "step": 877 + }, + { + "epoch": 0.2532814077599885, + "grad_norm": 10.319685160440125, + "learning_rate": 4.6508674176955196e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.25, + "logps/chosen": -1352.0, + "logps/rejected": -1648.0, + "loss": 0.6921, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.435546875, + "rewards/margins": -0.08154296875, + "rewards/rejected": 0.515625, + "step": 878 + }, + { + "epoch": 0.25356988316746, + "grad_norm": 10.737086190550391, + "learning_rate": 4.6495828244934443e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1616.0, + "logps/rejected": -1624.0, + "loss": 0.6835, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4453125, + "rewards/margins": -0.00537109375, + "rewards/rejected": 0.451171875, + "step": 879 + }, + { + "epoch": 0.2538583585749315, + "grad_norm": 10.46080949185506, + "learning_rate": 4.6482960504546916e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -1448.0, + "logps/rejected": -1464.0, + "loss": 0.6647, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.484375, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.419921875, + "step": 880 + }, + { + "epoch": 0.254146833982403, + "grad_norm": 10.104147094205809, + "learning_rate": 4.647007096884744e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.25, + "logps/chosen": -1608.0, + "logps/rejected": -1520.0, + "loss": 0.6442, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6640625, + "rewards/margins": 0.1552734375, + "rewards/rejected": 0.5078125, + "step": 881 + }, + { + "epoch": 0.2544353093898745, + "grad_norm": 10.37551174282143, + "learning_rate": 4.6457159650912975e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.21875, + "logps/chosen": -1880.0, + "logps/rejected": -1624.0, + "loss": 0.6432, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.60546875, + "rewards/margins": 0.142578125, + "rewards/rejected": 0.462890625, + "step": 882 + }, + { + "epoch": 0.25472378479734603, + "grad_norm": 12.063460518113907, + "learning_rate": 4.6444226563842547e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.1875, + "logps/chosen": -1600.0, + "logps/rejected": -1664.0, + "loss": 0.7275, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.001953125, + "rewards/rejected": 0.55859375, + "step": 883 + }, + { + "epoch": 0.25501226020481754, + "grad_norm": 10.208007822708796, + "learning_rate": 4.643127172075729e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1672.0, + "logps/rejected": -1448.0, + "loss": 0.6816, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.52734375, + "rewards/margins": 0.0166015625, + "rewards/rejected": 0.51171875, + "step": 884 + }, + { + "epoch": 0.25530073561228905, + "grad_norm": 12.909647109162057, + "learning_rate": 4.641829513480041e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.125, + "logps/chosen": -1816.0, + "logps/rejected": -1800.0, + "loss": 0.7251, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5078125, + "rewards/margins": -0.060302734375, + "rewards/rejected": 0.56640625, + "step": 885 + }, + { + "epoch": 0.25558921101976056, + "grad_norm": 12.562323003347004, + "learning_rate": 4.640529681913715e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1816.0, + "logps/rejected": -1752.0, + "loss": 0.6373, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.1015625, + "rewards/rejected": 0.466796875, + "step": 886 + }, + { + "epoch": 0.2558776864272321, + "grad_norm": 9.988680779972267, + "learning_rate": 4.639227678695483e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.28125, + "logps/chosen": -1440.0, + "logps/rejected": -1552.0, + "loss": 0.6855, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.384765625, + "step": 887 + }, + { + "epoch": 0.2561661618347036, + "grad_norm": 10.56465139256315, + "learning_rate": 4.6379235051462784e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.28125, + "logps/chosen": -1600.0, + "logps/rejected": -1568.0, + "loss": 0.704, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.46875, + "rewards/margins": 0.006561279296875, + "rewards/rejected": 0.462890625, + "step": 888 + }, + { + "epoch": 0.2564546372421751, + "grad_norm": 10.520984044908994, + "learning_rate": 4.6366171625892356e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1152.0, + "logps/rejected": -1360.0, + "loss": 0.7077, + "loss/demonstration_loss": -2560.0, + "loss/preference_loss": -2560.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.373046875, + "rewards/margins": -0.1376953125, + "rewards/rejected": 0.51171875, + "step": 889 + }, + { + "epoch": 0.2567431126496466, + "grad_norm": 10.858736566901005, + "learning_rate": 4.635308652349692e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.078125, + "logps/chosen": -1776.0, + "logps/rejected": -1400.0, + "loss": 0.6575, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.1630859375, + "rewards/rejected": 0.435546875, + "step": 890 + }, + { + "epoch": 0.2570315880571181, + "grad_norm": 10.964177166043617, + "learning_rate": 4.6339979757551827e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1464.0, + "logps/rejected": -1416.0, + "loss": 0.7239, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.045166015625, + "rewards/rejected": 0.34765625, + "step": 891 + }, + { + "epoch": 0.25732006346458963, + "grad_norm": 9.945796855872725, + "learning_rate": 4.6326851341354414e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.28125, + "logps/chosen": -1784.0, + "logps/rejected": -1432.0, + "loss": 0.6578, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.11767578125, + "rewards/rejected": 0.34765625, + "step": 892 + }, + { + "epoch": 0.25760853887206114, + "grad_norm": 9.452711379895465, + "learning_rate": 4.631370128822396e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1560.0, + "logps/rejected": -1440.0, + "loss": 0.6603, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.060791015625, + "rewards/rejected": 0.369140625, + "step": 893 + }, + { + "epoch": 0.25789701427953265, + "grad_norm": 11.44738970038878, + "learning_rate": 4.630052961150173e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1584.0, + "logps/rejected": -1664.0, + "loss": 0.6769, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.08056640625, + "rewards/rejected": 0.515625, + "step": 894 + }, + { + "epoch": 0.25818548968700417, + "grad_norm": 12.360665721167132, + "learning_rate": 4.6287336324550894e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1856.0, + "logps/rejected": -1872.0, + "loss": 0.7023, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.453125, + "rewards/margins": -0.0595703125, + "rewards/rejected": 0.51171875, + "step": 895 + }, + { + "epoch": 0.2584739650944757, + "grad_norm": 12.862114412363448, + "learning_rate": 4.627412144075658e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -2016.0, + "logps/rejected": -1904.0, + "loss": 0.6666, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3984.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.6640625, + "rewards/margins": 0.0162353515625, + "rewards/rejected": 0.64453125, + "step": 896 + }, + { + "epoch": 0.2587624405019472, + "grad_norm": 11.352679163643932, + "learning_rate": 4.6260884973525805e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.125, + "logps/chosen": -1688.0, + "logps/rejected": -1488.0, + "loss": 0.6733, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6171875, + "rewards/margins": 0.087890625, + "rewards/rejected": 0.52734375, + "step": 897 + }, + { + "epoch": 0.2590509159094187, + "grad_norm": 10.478473507027283, + "learning_rate": 4.624762693628748e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.3125, + "logps/chosen": -2000.0, + "logps/rejected": -2008.0, + "loss": 0.7024, + "loss/demonstration_loss": -4096.0, + "loss/preference_loss": -4096.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.76171875, + "rewards/margins": 0.031494140625, + "rewards/rejected": 0.73046875, + "step": 898 + }, + { + "epoch": 0.2593393913168902, + "grad_norm": 10.86230041069102, + "learning_rate": 4.623434734249242e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.296875, + "logps/chosen": -1656.0, + "logps/rejected": -1672.0, + "loss": 0.6584, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.52734375, + "rewards/margins": -0.0069580078125, + "rewards/rejected": 0.53515625, + "step": 899 + }, + { + "epoch": 0.2596278667243617, + "grad_norm": 9.61391318321402, + "learning_rate": 4.6221046205613286e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1688.0, + "logps/rejected": -1592.0, + "loss": 0.6589, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.08349609375, + "rewards/rejected": 0.37109375, + "step": 900 + }, + { + "epoch": 0.2599163421318333, + "grad_norm": 10.264457826451379, + "learning_rate": 4.620772353914461e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.21875, + "logps/chosen": -1672.0, + "logps/rejected": -1504.0, + "loss": 0.6906, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.384765625, + "rewards/margins": 0.0101318359375, + "rewards/rejected": 0.373046875, + "step": 901 + }, + { + "epoch": 0.2602048175393048, + "grad_norm": 14.32097661463869, + "learning_rate": 4.6194379356602766e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.3125, + "logps/chosen": -1296.0, + "logps/rejected": -1360.0, + "loss": 0.7379, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2688.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.314453125, + "rewards/margins": -0.08154296875, + "rewards/rejected": 0.396484375, + "step": 902 + }, + { + "epoch": 0.2604932929467763, + "grad_norm": 11.86760106922937, + "learning_rate": 4.6181013671525955e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1848.0, + "logps/rejected": -1928.0, + "loss": 0.7009, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6484375, + "rewards/margins": -0.0115966796875, + "rewards/rejected": 0.66015625, + "step": 903 + }, + { + "epoch": 0.2607817683542478, + "grad_norm": 10.844722032987535, + "learning_rate": 4.616762649747419e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.203125, + "logps/chosen": -2144.0, + "logps/rejected": -1824.0, + "loss": 0.7081, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4032.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5625, + "rewards/margins": -0.08642578125, + "rewards/rejected": 0.6484375, + "step": 904 + }, + { + "epoch": 0.26107024376171933, + "grad_norm": 10.291725140539793, + "learning_rate": 4.615421784802928e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.28125, + "logps/chosen": -1888.0, + "logps/rejected": -1808.0, + "loss": 0.6825, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6953125, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.546875, + "step": 905 + }, + { + "epoch": 0.26135871916919085, + "grad_norm": 10.583078078739183, + "learning_rate": 4.614078773679484e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.34375, + "logps/chosen": -1800.0, + "logps/rejected": -1744.0, + "loss": 0.687, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.043701171875, + "rewards/rejected": 0.5859375, + "step": 906 + }, + { + "epoch": 0.26164719457666236, + "grad_norm": 10.922708497340988, + "learning_rate": 4.612733617739625e-07, + "logits/chosen": 3.421875, + "logits/rejected": 3.34375, + "logps/chosen": -1648.0, + "logps/rejected": -1640.0, + "loss": 0.6925, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5546875, + "rewards/margins": 0.03857421875, + "rewards/rejected": 0.515625, + "step": 907 + }, + { + "epoch": 0.26193566998413387, + "grad_norm": 11.682761380305216, + "learning_rate": 4.6113863183480637e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.265625, + "logps/chosen": -1728.0, + "logps/rejected": -1456.0, + "loss": 0.6741, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.103515625, + "rewards/rejected": 0.494140625, + "step": 908 + }, + { + "epoch": 0.2622241453916054, + "grad_norm": 11.09898031546255, + "learning_rate": 4.61003687687169e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.328125, + "logps/chosen": -1704.0, + "logps/rejected": -1600.0, + "loss": 0.6779, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.53125, + "rewards/margins": 0.1318359375, + "rewards/rejected": 0.400390625, + "step": 909 + }, + { + "epoch": 0.2625126207990769, + "grad_norm": 10.851917078634518, + "learning_rate": 4.6086852946795646e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.3125, + "logps/chosen": -1696.0, + "logps/rejected": -1632.0, + "loss": 0.6374, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.69140625, + "rewards/margins": 0.158203125, + "rewards/rejected": 0.53125, + "step": 910 + }, + { + "epoch": 0.2628010962065484, + "grad_norm": 11.697947647800692, + "learning_rate": 4.607331573142921e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -2032.0, + "logps/rejected": -1712.0, + "loss": 0.6476, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.64453125, + "rewards/margins": 0.10107421875, + "rewards/rejected": 0.54296875, + "step": 911 + }, + { + "epoch": 0.2630895716140199, + "grad_norm": 10.335457072007767, + "learning_rate": 4.605975713635163e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.390625, + "logps/chosen": -1792.0, + "logps/rejected": -1896.0, + "loss": 0.7025, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.408203125, + "rewards/margins": -0.009521484375, + "rewards/rejected": 0.41796875, + "step": 912 + }, + { + "epoch": 0.2633780470214914, + "grad_norm": 10.896962704997595, + "learning_rate": 4.604617717531865e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.296875, + "logps/chosen": -1736.0, + "logps/rejected": -1760.0, + "loss": 0.7059, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.040283203125, + "rewards/rejected": 0.45703125, + "step": 913 + }, + { + "epoch": 0.26366652242896294, + "grad_norm": 10.72063677224275, + "learning_rate": 4.603257586210766e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1480.0, + "logps/rejected": -1704.0, + "loss": 0.7031, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.0074462890625, + "rewards/rejected": 0.439453125, + "step": 914 + }, + { + "epoch": 0.26395499783643445, + "grad_norm": 10.624299723002819, + "learning_rate": 4.601895321051774e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1584.0, + "logps/rejected": -1392.0, + "loss": 0.6585, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.10595703125, + "rewards/rejected": 0.408203125, + "step": 915 + }, + { + "epoch": 0.26424347324390596, + "grad_norm": 12.922084550213027, + "learning_rate": 4.6005309234369605e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1336.0, + "logps/rejected": -1408.0, + "loss": 0.6757, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.494140625, + "rewards/margins": 0.031982421875, + "rewards/rejected": 0.4609375, + "step": 916 + }, + { + "epoch": 0.26453194865137747, + "grad_norm": 10.549364403217549, + "learning_rate": 4.5991643947505605e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1624.0, + "logps/rejected": -1528.0, + "loss": 0.7162, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40625, + "rewards/margins": -0.016357421875, + "rewards/rejected": 0.421875, + "step": 917 + }, + { + "epoch": 0.264820424058849, + "grad_norm": 11.725558179523782, + "learning_rate": 4.5977957363789717e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1304.0, + "logps/rejected": -1448.0, + "loss": 0.7201, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.48828125, + "rewards/margins": -0.095703125, + "rewards/rejected": 0.58203125, + "step": 918 + }, + { + "epoch": 0.2651088994663205, + "grad_norm": 11.237955345790029, + "learning_rate": 4.5964249497107515e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.34375, + "logps/chosen": -1360.0, + "logps/rejected": -1136.0, + "loss": 0.6854, + "loss/demonstration_loss": -2528.0, + "loss/preference_loss": -2544.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.412109375, + "rewards/margins": -0.017333984375, + "rewards/rejected": 0.4296875, + "step": 919 + }, + { + "epoch": 0.265397374873792, + "grad_norm": 12.127182650464324, + "learning_rate": 4.5950520361366174e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.046875, + "logps/chosen": -1368.0, + "logps/rejected": -1288.0, + "loss": 0.6876, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2688.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3515625, + "rewards/margins": 0.04345703125, + "rewards/rejected": 0.30859375, + "step": 920 + }, + { + "epoch": 0.2656858502812635, + "grad_norm": 11.588734078301309, + "learning_rate": 4.5936769970494453e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -2080.0, + "logps/rejected": -1752.0, + "loss": 0.6776, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.60546875, + "rewards/margins": 0.12451171875, + "rewards/rejected": 0.482421875, + "step": 921 + }, + { + "epoch": 0.265974325688735, + "grad_norm": 11.915185126797628, + "learning_rate": 4.592299833844266e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1960.0, + "logps/rejected": -1680.0, + "loss": 0.6293, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.65625, + "rewards/margins": 0.173828125, + "rewards/rejected": 0.48046875, + "step": 922 + }, + { + "epoch": 0.26626280109620654, + "grad_norm": 13.688771694467569, + "learning_rate": 4.5909205479182657e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.203125, + "logps/chosen": -1576.0, + "logps/rejected": -1640.0, + "loss": 0.6945, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.1953125, + "rewards/rejected": 0.33984375, + "step": 923 + }, + { + "epoch": 0.26655127650367805, + "grad_norm": 10.70851650584107, + "learning_rate": 4.589539140670784e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.109375, + "logps/chosen": -1192.0, + "logps/rejected": -1256.0, + "loss": 0.6708, + "loss/demonstration_loss": -2496.0, + "loss/preference_loss": -2480.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.0771484375, + "rewards/rejected": 0.365234375, + "step": 924 + }, + { + "epoch": 0.26683975191114956, + "grad_norm": 11.299161331986914, + "learning_rate": 4.5881556135033147e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.28125, + "logps/chosen": -1640.0, + "logps/rejected": -1400.0, + "loss": 0.6642, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.52734375, + "rewards/margins": 0.0028533935546875, + "rewards/rejected": 0.5234375, + "step": 925 + }, + { + "epoch": 0.26712822731862107, + "grad_norm": 12.550149018593496, + "learning_rate": 4.5867699678194994e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1904.0, + "logps/rejected": -1552.0, + "loss": 0.686, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.396484375, + "rewards/margins": 0.00347900390625, + "rewards/rejected": 0.392578125, + "step": 926 + }, + { + "epoch": 0.2674167027260926, + "grad_norm": 10.269958355210981, + "learning_rate": 4.585382205025131e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.359375, + "logps/chosen": -1968.0, + "logps/rejected": -1960.0, + "loss": 0.6758, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3984.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.58203125, + "rewards/margins": 0.0625, + "rewards/rejected": 0.51953125, + "step": 927 + }, + { + "epoch": 0.2677051781335641, + "grad_norm": 11.59885762877722, + "learning_rate": 4.583992326528149e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1840.0, + "logps/rejected": -1832.0, + "loss": 0.6812, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5234375, + "rewards/margins": 0.0150146484375, + "rewards/rejected": 0.5078125, + "step": 928 + }, + { + "epoch": 0.2679936535410356, + "grad_norm": 11.847260015510527, + "learning_rate": 4.582600333738641e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -1520.0, + "logps/rejected": -1704.0, + "loss": 0.7462, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.337890625, + "rewards/margins": -0.1845703125, + "rewards/rejected": 0.5234375, + "step": 929 + }, + { + "epoch": 0.2682821289485071, + "grad_norm": 10.907285003407916, + "learning_rate": 4.581206228068838e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.21875, + "logps/chosen": -1816.0, + "logps/rejected": -1832.0, + "loss": 0.65, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.609375, + "rewards/margins": 0.09912109375, + "rewards/rejected": 0.51171875, + "step": 930 + }, + { + "epoch": 0.2685706043559786, + "grad_norm": 10.38710655330245, + "learning_rate": 4.5798100109331154e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.28125, + "logps/chosen": -1944.0, + "logps/rejected": -1896.0, + "loss": 0.6765, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.53515625, + "rewards/margins": -0.0223388671875, + "rewards/rejected": 0.55859375, + "step": 931 + }, + { + "epoch": 0.2688590797634502, + "grad_norm": 10.529417558703228, + "learning_rate": 4.578411683747991e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1280.0, + "logps/rejected": -1496.0, + "loss": 0.659, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.38671875, + "rewards/margins": 0.02587890625, + "rewards/rejected": 0.359375, + "step": 932 + }, + { + "epoch": 0.2691475551709217, + "grad_norm": 11.385953152697091, + "learning_rate": 4.577011247932122e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.265625, + "logps/chosen": -1680.0, + "logps/rejected": -1744.0, + "loss": 0.7181, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.390625, + "rewards/margins": -0.01300048828125, + "rewards/rejected": 0.404296875, + "step": 933 + }, + { + "epoch": 0.2694360305783932, + "grad_norm": 13.21984684209804, + "learning_rate": 4.5756087049063077e-07, + "logits/chosen": 3.125, + "logits/rejected": 2.9375, + "logps/chosen": -1368.0, + "logps/rejected": -1528.0, + "loss": 0.6995, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.31640625, + "rewards/margins": -0.061279296875, + "rewards/rejected": 0.376953125, + "step": 934 + }, + { + "epoch": 0.2697245059858647, + "grad_norm": 11.010340775086508, + "learning_rate": 4.574204056093481e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -1600.0, + "logps/rejected": -1496.0, + "loss": 0.6628, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.326171875, + "rewards/margins": -0.0419921875, + "rewards/rejected": 0.369140625, + "step": 935 + }, + { + "epoch": 0.27001298139333624, + "grad_norm": 9.700924017479833, + "learning_rate": 4.572797302918715e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.234375, + "logps/chosen": -1480.0, + "logps/rejected": -1648.0, + "loss": 0.7125, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.41015625, + "rewards/margins": -0.046630859375, + "rewards/rejected": 0.455078125, + "step": 936 + }, + { + "epoch": 0.27030145680080775, + "grad_norm": 11.545608732035248, + "learning_rate": 4.571388446809216e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1600.0, + "logps/rejected": -1464.0, + "loss": 0.7067, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.0020751953125, + "rewards/rejected": 0.5, + "step": 937 + }, + { + "epoch": 0.27058993220827926, + "grad_norm": 10.09897713589775, + "learning_rate": 4.569977489194324e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.359375, + "logps/chosen": -2112.0, + "logps/rejected": -2112.0, + "loss": 0.6516, + "loss/demonstration_loss": -4256.0, + "loss/preference_loss": -4256.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.453125, + "rewards/margins": 0.08740234375, + "rewards/rejected": 0.365234375, + "step": 938 + }, + { + "epoch": 0.27087840761575077, + "grad_norm": 12.28518616478594, + "learning_rate": 4.5685644315055126e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.0, + "logps/chosen": -1600.0, + "logps/rejected": -1456.0, + "loss": 0.703, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.0751953125, + "rewards/rejected": 0.40234375, + "step": 939 + }, + { + "epoch": 0.2711668830232223, + "grad_norm": 11.479374518551127, + "learning_rate": 4.567149275176383e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.25, + "logps/chosen": -1472.0, + "logps/rejected": -1552.0, + "loss": 0.6702, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.0400390625, + "rewards/rejected": 0.419921875, + "step": 940 + }, + { + "epoch": 0.2714553584306938, + "grad_norm": 11.180617721697066, + "learning_rate": 4.565732021642668e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.328125, + "logps/chosen": -1608.0, + "logps/rejected": -1464.0, + "loss": 0.6783, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.478515625, + "rewards/margins": -0.002593994140625, + "rewards/rejected": 0.482421875, + "step": 941 + }, + { + "epoch": 0.2717438338381653, + "grad_norm": 9.936490901949039, + "learning_rate": 4.5643126723422267e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1536.0, + "logps/rejected": -1280.0, + "loss": 0.6754, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.458984375, + "rewards/margins": 0.09814453125, + "rewards/rejected": 0.361328125, + "step": 942 + }, + { + "epoch": 0.2720323092456368, + "grad_norm": 12.882232756444198, + "learning_rate": 4.562891228715046e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.15625, + "logps/chosen": -1624.0, + "logps/rejected": -1376.0, + "loss": 0.6752, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.1669921875, + "rewards/rejected": 0.287109375, + "step": 943 + }, + { + "epoch": 0.2723207846531083, + "grad_norm": 11.051097521419868, + "learning_rate": 4.561467692203235e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1640.0, + "logps/rejected": -1696.0, + "loss": 0.7046, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.431640625, + "rewards/margins": -0.0341796875, + "rewards/rejected": 0.46484375, + "step": 944 + }, + { + "epoch": 0.27260926006057984, + "grad_norm": 11.649599458378562, + "learning_rate": 4.560042064251029e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1472.0, + "logps/rejected": -1640.0, + "loss": 0.692, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.43359375, + "rewards/margins": 0.001190185546875, + "rewards/rejected": 0.431640625, + "step": 945 + }, + { + "epoch": 0.27289773546805135, + "grad_norm": 10.657568505395876, + "learning_rate": 4.558614346304783e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1936.0, + "logps/rejected": -1664.0, + "loss": 0.649, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5546875, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.423828125, + "step": 946 + }, + { + "epoch": 0.27318621087552286, + "grad_norm": 9.471539729341357, + "learning_rate": 4.5571845398129747e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.359375, + "logps/chosen": -1600.0, + "logps/rejected": -1632.0, + "loss": 0.6711, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4375, + "rewards/margins": 0.0125732421875, + "rewards/rejected": 0.42578125, + "step": 947 + }, + { + "epoch": 0.2734746862829944, + "grad_norm": 12.005934480957475, + "learning_rate": 4.5557526462261986e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1848.0, + "logps/rejected": -1544.0, + "loss": 0.7048, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.490234375, + "rewards/margins": 0.08837890625, + "rewards/rejected": 0.40234375, + "step": 948 + }, + { + "epoch": 0.2737631616904659, + "grad_norm": 11.659956622386694, + "learning_rate": 4.5543186669971665e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1488.0, + "logps/rejected": -1544.0, + "loss": 0.6498, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.435546875, + "rewards/margins": 0.0546875, + "rewards/rejected": 0.380859375, + "step": 949 + }, + { + "epoch": 0.2740516370979374, + "grad_norm": 10.912399181192729, + "learning_rate": 4.552882603580708e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.140625, + "logps/chosen": -1552.0, + "logps/rejected": -1704.0, + "loss": 0.707, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.01043701171875, + "rewards/rejected": 0.41015625, + "step": 950 + }, + { + "epoch": 0.2743401125054089, + "grad_norm": 10.121750567145934, + "learning_rate": 4.5514444574337646e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.265625, + "logps/chosen": -1472.0, + "logps/rejected": -1520.0, + "loss": 0.6638, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6015625, + "rewards/margins": 0.11962890625, + "rewards/rejected": 0.482421875, + "step": 951 + }, + { + "epoch": 0.2746285879128804, + "grad_norm": 9.948714820656734, + "learning_rate": 4.550004230015394e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.125, + "logps/chosen": -1712.0, + "logps/rejected": -1656.0, + "loss": 0.6656, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.09326171875, + "rewards/rejected": 0.376953125, + "step": 952 + }, + { + "epoch": 0.27491706332035193, + "grad_norm": 11.61056973275793, + "learning_rate": 4.548561922786763e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.125, + "logps/chosen": -1656.0, + "logps/rejected": -1768.0, + "loss": 0.6881, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.37109375, + "rewards/margins": -0.039794921875, + "rewards/rejected": 0.412109375, + "step": 953 + }, + { + "epoch": 0.27520553872782344, + "grad_norm": 10.309604798565307, + "learning_rate": 4.54711753721115e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.296875, + "logps/chosen": -1696.0, + "logps/rejected": -1408.0, + "loss": 0.6804, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.076171875, + "rewards/rejected": 0.259765625, + "step": 954 + }, + { + "epoch": 0.27549401413529495, + "grad_norm": 10.1682469776998, + "learning_rate": 4.545671074753941e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.25, + "logps/chosen": -1800.0, + "logps/rejected": -1680.0, + "loss": 0.6782, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.03759765625, + "rewards/rejected": 0.373046875, + "step": 955 + }, + { + "epoch": 0.27578248954276646, + "grad_norm": 11.504953234766683, + "learning_rate": 4.5442225368826285e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -1696.0, + "logps/rejected": -1624.0, + "loss": 0.7076, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.416015625, + "rewards/margins": 0.006439208984375, + "rewards/rejected": 0.408203125, + "step": 956 + }, + { + "epoch": 0.276070964950238, + "grad_norm": 12.168369097902977, + "learning_rate": 4.542771925066812e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1520.0, + "logps/rejected": -1624.0, + "loss": 0.6718, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.060302734375, + "rewards/rejected": 0.443359375, + "step": 957 + }, + { + "epoch": 0.2763594403577095, + "grad_norm": 10.488007577533148, + "learning_rate": 4.541319240778194e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.125, + "logps/chosen": -2176.0, + "logps/rejected": -2096.0, + "loss": 0.6818, + "loss/demonstration_loss": -4320.0, + "loss/preference_loss": -4320.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.59375, + "rewards/margins": 0.125, + "rewards/rejected": 0.470703125, + "step": 958 + }, + { + "epoch": 0.276647915765181, + "grad_norm": 10.48390299437886, + "learning_rate": 4.539864485490581e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.296875, + "logps/chosen": -1768.0, + "logps/rejected": -1816.0, + "loss": 0.6907, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.01324462890625, + "rewards/rejected": 0.3984375, + "step": 959 + }, + { + "epoch": 0.2769363911726525, + "grad_norm": 13.463700722711927, + "learning_rate": 4.538407660679879e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.25, + "logps/chosen": -1384.0, + "logps/rejected": -1336.0, + "loss": 0.7136, + "loss/demonstration_loss": -2752.0, + "loss/preference_loss": -2752.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.369140625, + "rewards/margins": 0.00616455078125, + "rewards/rejected": 0.36328125, + "step": 960 + }, + { + "epoch": 0.277224866580124, + "grad_norm": 12.609660001285349, + "learning_rate": 4.5369487678240946e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1648.0, + "logps/rejected": -1616.0, + "loss": 0.7378, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.443359375, + "rewards/margins": -0.02978515625, + "rewards/rejected": 0.47265625, + "step": 961 + }, + { + "epoch": 0.27751334198759553, + "grad_norm": 10.27041151151368, + "learning_rate": 4.5354878084033313e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1752.0, + "logps/rejected": -1760.0, + "loss": 0.6745, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.125, + "rewards/rejected": 0.373046875, + "step": 962 + }, + { + "epoch": 0.2778018173950671, + "grad_norm": 11.385464655029894, + "learning_rate": 4.5340247838997917e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.28125, + "logps/chosen": -1824.0, + "logps/rejected": -1760.0, + "loss": 0.6702, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.02880859375, + "rewards/rejected": 0.474609375, + "step": 963 + }, + { + "epoch": 0.2780902928025386, + "grad_norm": 11.05099343854332, + "learning_rate": 4.532559695797771e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1640.0, + "logps/rejected": -1544.0, + "loss": 0.7059, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.06494140625, + "rewards/rejected": 0.3828125, + "step": 964 + }, + { + "epoch": 0.2783787682100101, + "grad_norm": 11.707483514684181, + "learning_rate": 4.531092545583659e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.265625, + "logps/chosen": -1960.0, + "logps/rejected": -1752.0, + "loss": 0.6795, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.390625, + "rewards/margins": 0.030517578125, + "rewards/rejected": 0.361328125, + "step": 965 + }, + { + "epoch": 0.27866724361748163, + "grad_norm": 12.576312068577296, + "learning_rate": 4.5296233347459377e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1544.0, + "logps/rejected": -1352.0, + "loss": 0.691, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.051025390625, + "rewards/rejected": 0.25390625, + "step": 966 + }, + { + "epoch": 0.27895571902495314, + "grad_norm": 12.064003132879426, + "learning_rate": 4.52815206477518e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.203125, + "logps/chosen": -1784.0, + "logps/rejected": -1976.0, + "loss": 0.7139, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.484375, + "rewards/margins": 0.024169921875, + "rewards/rejected": 0.4609375, + "step": 967 + }, + { + "epoch": 0.27924419443242465, + "grad_norm": 9.566324066539504, + "learning_rate": 4.5266787371640464e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.28125, + "logps/chosen": -1392.0, + "logps/rejected": -1600.0, + "loss": 0.6783, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.423828125, + "rewards/margins": 0.0419921875, + "rewards/rejected": 0.3828125, + "step": 968 + }, + { + "epoch": 0.27953266983989616, + "grad_norm": 12.405435269443675, + "learning_rate": 4.5252033534072867e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.203125, + "logps/chosen": -1600.0, + "logps/rejected": -1640.0, + "loss": 0.6738, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33984375, + "rewards/margins": 0.1083984375, + "rewards/rejected": 0.2314453125, + "step": 969 + }, + { + "epoch": 0.2798211452473677, + "grad_norm": 11.69870452479298, + "learning_rate": 4.523725915001735e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -1616.0, + "logps/rejected": -1624.0, + "loss": 0.6736, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.07861328125, + "rewards/rejected": 0.38671875, + "step": 970 + }, + { + "epoch": 0.2801096206548392, + "grad_norm": 9.115535838774612, + "learning_rate": 4.522246423446312e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1736.0, + "logps/rejected": -1608.0, + "loss": 0.6614, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.466796875, + "rewards/margins": 0.130859375, + "rewards/rejected": 0.3359375, + "step": 971 + }, + { + "epoch": 0.2803980960623107, + "grad_norm": 10.938440071405513, + "learning_rate": 4.520764880242021e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.234375, + "logps/chosen": -1736.0, + "logps/rejected": -1392.0, + "loss": 0.6569, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.423828125, + "rewards/margins": 0.12109375, + "rewards/rejected": 0.302734375, + "step": 972 + }, + { + "epoch": 0.2806865714697822, + "grad_norm": 9.953261382665383, + "learning_rate": 4.519281286891943e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1336.0, + "logps/rejected": -1352.0, + "loss": 0.6762, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2720.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.263671875, + "rewards/margins": -0.021484375, + "rewards/rejected": 0.28515625, + "step": 973 + }, + { + "epoch": 0.2809750468772537, + "grad_norm": 10.376730985362087, + "learning_rate": 4.5177956449012454e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.34375, + "logps/chosen": -1824.0, + "logps/rejected": -1776.0, + "loss": 0.699, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4453125, + "rewards/margins": 0.02001953125, + "rewards/rejected": 0.42578125, + "step": 974 + }, + { + "epoch": 0.28126352228472523, + "grad_norm": 12.997210794188563, + "learning_rate": 4.516307955777169e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.140625, + "logps/chosen": -1632.0, + "logps/rejected": -1656.0, + "loss": 0.7223, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.375, + "rewards/margins": -0.041015625, + "rewards/rejected": 0.416015625, + "step": 975 + }, + { + "epoch": 0.28155199769219674, + "grad_norm": 10.317418362600666, + "learning_rate": 4.514818221029034e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1624.0, + "logps/rejected": -1912.0, + "loss": 0.6409, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.0966796875, + "rewards/rejected": 0.439453125, + "step": 976 + }, + { + "epoch": 0.28184047309966825, + "grad_norm": 12.375737232105662, + "learning_rate": 4.513326442168235e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -1864.0, + "logps/rejected": -2000.0, + "loss": 0.6981, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.4453125, + "rewards/margins": -0.0673828125, + "rewards/rejected": 0.51171875, + "step": 977 + }, + { + "epoch": 0.28212894850713977, + "grad_norm": 11.796289569828206, + "learning_rate": 4.511832620708239e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1568.0, + "logps/rejected": -1608.0, + "loss": 0.6755, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.35546875, + "rewards/margins": 0.0869140625, + "rewards/rejected": 0.267578125, + "step": 978 + }, + { + "epoch": 0.2824174239146113, + "grad_norm": 13.30386279288329, + "learning_rate": 4.510336758164589e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.28125, + "logps/chosen": -1448.0, + "logps/rejected": -1656.0, + "loss": 0.7361, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.3359375, + "rewards/margins": -0.1240234375, + "rewards/rejected": 0.458984375, + "step": 979 + }, + { + "epoch": 0.2827058993220828, + "grad_norm": 9.743310505386415, + "learning_rate": 4.508838856054896e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.359375, + "logps/chosen": -1408.0, + "logps/rejected": -1264.0, + "loss": 0.669, + "loss/demonstration_loss": -2704.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.380859375, + "rewards/margins": 0.016845703125, + "rewards/rejected": 0.36328125, + "step": 980 + }, + { + "epoch": 0.2829943747295543, + "grad_norm": 9.850574818600123, + "learning_rate": 4.50733891589884e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1592.0, + "logps/rejected": -1424.0, + "loss": 0.6611, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3984375, + "rewards/margins": 0.0947265625, + "rewards/rejected": 0.3046875, + "step": 981 + }, + { + "epoch": 0.2832828501370258, + "grad_norm": 12.486022893711196, + "learning_rate": 4.5058369392181707e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.28125, + "logps/chosen": -1984.0, + "logps/rejected": -1824.0, + "loss": 0.7059, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.134765625, + "rewards/rejected": 0.3828125, + "step": 982 + }, + { + "epoch": 0.2835713255444973, + "grad_norm": 10.926752997714912, + "learning_rate": 4.504332927536702e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.140625, + "logps/chosen": -1728.0, + "logps/rejected": -1648.0, + "loss": 0.6906, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.45703125, + "rewards/margins": -0.007415771484375, + "rewards/rejected": 0.462890625, + "step": 983 + }, + { + "epoch": 0.28385980095196883, + "grad_norm": 11.624839981421283, + "learning_rate": 4.502826882380313e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.296875, + "logps/chosen": -1744.0, + "logps/rejected": -1616.0, + "loss": 0.6613, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.0546875, + "rewards/rejected": 0.322265625, + "step": 984 + }, + { + "epoch": 0.28414827635944034, + "grad_norm": 13.974501302543205, + "learning_rate": 4.501318805276947e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0625, + "logps/chosen": -1728.0, + "logps/rejected": -1704.0, + "loss": 0.6823, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.0673828125, + "rewards/rejected": 0.341796875, + "step": 985 + }, + { + "epoch": 0.28443675176691186, + "grad_norm": 10.14777759121773, + "learning_rate": 4.4998086977566067e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1440.0, + "logps/rejected": -1496.0, + "loss": 0.6658, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.359375, + "rewards/margins": 0.0179443359375, + "rewards/rejected": 0.33984375, + "step": 986 + }, + { + "epoch": 0.28472522717438337, + "grad_norm": 9.416608047664688, + "learning_rate": 4.4982965613513566e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.125, + "logps/chosen": -1264.0, + "logps/rejected": -1048.0, + "loss": 0.6708, + "loss/demonstration_loss": -2352.0, + "loss/preference_loss": -2336.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31640625, + "rewards/margins": 0.0849609375, + "rewards/rejected": 0.2314453125, + "step": 987 + }, + { + "epoch": 0.2850137025818549, + "grad_norm": 10.413555532071907, + "learning_rate": 4.4967823975953185e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.203125, + "logps/chosen": -1456.0, + "logps/rejected": -1584.0, + "loss": 0.6857, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.271484375, + "rewards/margins": 0.053955078125, + "rewards/rejected": 0.2177734375, + "step": 988 + }, + { + "epoch": 0.2853021779893264, + "grad_norm": 9.715969066531516, + "learning_rate": 4.495266208024671e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.109375, + "logps/chosen": -1504.0, + "logps/rejected": -1544.0, + "loss": 0.6699, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.365234375, + "rewards/margins": 0.03955078125, + "rewards/rejected": 0.32421875, + "step": 989 + }, + { + "epoch": 0.2855906533967979, + "grad_norm": 10.59297515613841, + "learning_rate": 4.493747994177649e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.296875, + "logps/chosen": -1864.0, + "logps/rejected": -1784.0, + "loss": 0.6391, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.486328125, + "rewards/margins": 0.1689453125, + "rewards/rejected": 0.318359375, + "step": 990 + }, + { + "epoch": 0.2858791288042694, + "grad_norm": 10.350104899989711, + "learning_rate": 4.49222775759454e-07, + "logits/chosen": 2.9375, + "logits/rejected": 3.015625, + "logps/chosen": -1448.0, + "logps/rejected": -1232.0, + "loss": 0.6608, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.08837890625, + "rewards/rejected": 0.244140625, + "step": 991 + }, + { + "epoch": 0.2861676042117409, + "grad_norm": 10.78936749452285, + "learning_rate": 4.4907054998176843e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1464.0, + "logps/rejected": -1192.0, + "loss": 0.66, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2672.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.349609375, + "rewards/margins": 0.158203125, + "rewards/rejected": 0.1923828125, + "step": 992 + }, + { + "epoch": 0.2864560796192125, + "grad_norm": 10.437666605710469, + "learning_rate": 4.4891812223914714e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -1560.0, + "logps/rejected": -1464.0, + "loss": 0.6668, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.087890625, + "rewards/rejected": 0.322265625, + "step": 993 + }, + { + "epoch": 0.286744555026684, + "grad_norm": 11.278607243510134, + "learning_rate": 4.487654926862343e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.28125, + "logps/chosen": -1608.0, + "logps/rejected": -1560.0, + "loss": 0.6757, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.359375, + "rewards/margins": 0.09814453125, + "rewards/rejected": 0.259765625, + "step": 994 + }, + { + "epoch": 0.2870330304341555, + "grad_norm": 11.662771886704727, + "learning_rate": 4.486126614778785e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.171875, + "logps/chosen": -1640.0, + "logps/rejected": -1592.0, + "loss": 0.6931, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.333984375, + "rewards/margins": 0.0264892578125, + "rewards/rejected": 0.30859375, + "step": 995 + }, + { + "epoch": 0.287321505841627, + "grad_norm": 11.15241005953297, + "learning_rate": 4.4845962876913303e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1680.0, + "logps/rejected": -1592.0, + "loss": 0.6566, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37109375, + "rewards/margins": 0.05517578125, + "rewards/rejected": 0.31640625, + "step": 996 + }, + { + "epoch": 0.28760998124909853, + "grad_norm": 10.711424116386574, + "learning_rate": 4.4830639471525555e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.34375, + "logps/chosen": -1952.0, + "logps/rejected": -1968.0, + "loss": 0.6738, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.365234375, + "rewards/margins": 0.09130859375, + "rewards/rejected": 0.2734375, + "step": 997 + }, + { + "epoch": 0.28789845665657005, + "grad_norm": 10.068915704386708, + "learning_rate": 4.4815295947170824e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.25, + "logps/chosen": -1408.0, + "logps/rejected": -1184.0, + "loss": 0.6385, + "loss/demonstration_loss": -2624.0, + "loss/preference_loss": -2608.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.17578125, + "rewards/rejected": 0.1455078125, + "step": 998 + }, + { + "epoch": 0.28818693206404156, + "grad_norm": 10.757947704091364, + "learning_rate": 4.479993231941571e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1416.0, + "logps/rejected": -1504.0, + "loss": 0.6823, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.376953125, + "rewards/margins": -0.022705078125, + "rewards/rejected": 0.3984375, + "step": 999 + }, + { + "epoch": 0.28847540747151307, + "grad_norm": 10.722620086975104, + "learning_rate": 4.4784548603847214e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.046875, + "logps/chosen": -1832.0, + "logps/rejected": -1720.0, + "loss": 0.6542, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.0625, + "rewards/rejected": 0.2431640625, + "step": 1000 + }, + { + "epoch": 0.2887638828789846, + "grad_norm": 11.129193803210033, + "learning_rate": 4.4769144816072743e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -1448.0, + "logps/rejected": -1004.0, + "loss": 0.6658, + "loss/demonstration_loss": -2496.0, + "loss/preference_loss": -2464.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37890625, + "rewards/margins": 0.185546875, + "rewards/rejected": 0.193359375, + "step": 1001 + }, + { + "epoch": 0.2890523582864561, + "grad_norm": 10.561876252521115, + "learning_rate": 4.475372097172003e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1896.0, + "logps/rejected": -2064.0, + "loss": 0.6937, + "loss/demonstration_loss": -4000.0, + "loss/preference_loss": -4000.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.416015625, + "rewards/margins": 0.020751953125, + "rewards/rejected": 0.39453125, + "step": 1002 + }, + { + "epoch": 0.2893408336939276, + "grad_norm": 11.559118321164453, + "learning_rate": 4.4738277086437183e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -2128.0, + "logps/rejected": -1960.0, + "loss": 0.678, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.34375, + "rewards/margins": -0.0269775390625, + "rewards/rejected": 0.37109375, + "step": 1003 + }, + { + "epoch": 0.2896293091013991, + "grad_norm": 11.268206051285386, + "learning_rate": 4.472281317589263e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1768.0, + "logps/rejected": -1648.0, + "loss": 0.6608, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.0712890625, + "rewards/rejected": 0.359375, + "step": 1004 + }, + { + "epoch": 0.2899177845088706, + "grad_norm": 10.803036286773263, + "learning_rate": 4.4707329255775115e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1768.0, + "logps/rejected": -1920.0, + "loss": 0.7422, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.421875, + "rewards/margins": -0.01507568359375, + "rewards/rejected": 0.4375, + "step": 1005 + }, + { + "epoch": 0.29020625991634214, + "grad_norm": 8.933277300630738, + "learning_rate": 4.4691825341793706e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1320.0, + "logps/rejected": -1168.0, + "loss": 0.6586, + "loss/demonstration_loss": -2512.0, + "loss/preference_loss": -2496.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.251953125, + "rewards/margins": 0.05517578125, + "rewards/rejected": 0.1962890625, + "step": 1006 + }, + { + "epoch": 0.29049473532381365, + "grad_norm": 11.798056181397305, + "learning_rate": 4.4676301449677713e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.21875, + "logps/chosen": -1528.0, + "logps/rejected": -1864.0, + "loss": 0.6841, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.51953125, + "rewards/margins": -0.0213623046875, + "rewards/rejected": 0.54296875, + "step": 1007 + }, + { + "epoch": 0.29078321073128516, + "grad_norm": 10.69219364693706, + "learning_rate": 4.4660757595176745e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.125, + "logps/chosen": -1496.0, + "logps/rejected": -1736.0, + "loss": 0.6692, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.345703125, + "rewards/margins": 0.0673828125, + "rewards/rejected": 0.279296875, + "step": 1008 + }, + { + "epoch": 0.29107168613875667, + "grad_norm": 11.004899148027592, + "learning_rate": 4.4645193794060655e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.28125, + "logps/chosen": -1552.0, + "logps/rejected": -1752.0, + "loss": 0.6768, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3984375, + "rewards/margins": 0.11328125, + "rewards/rejected": 0.28515625, + "step": 1009 + }, + { + "epoch": 0.2913601615462282, + "grad_norm": 11.283551697444814, + "learning_rate": 4.4629610062119544e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.203125, + "logps/chosen": -1872.0, + "logps/rejected": -1824.0, + "loss": 0.652, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.49609375, + "rewards/margins": 0.1064453125, + "rewards/rejected": 0.388671875, + "step": 1010 + }, + { + "epoch": 0.2916486369536997, + "grad_norm": 10.597379611134045, + "learning_rate": 4.461400641516371e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.125, + "logps/chosen": -1624.0, + "logps/rejected": -1704.0, + "loss": 0.6859, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.380859375, + "rewards/margins": 0.037353515625, + "rewards/rejected": 0.34375, + "step": 1011 + }, + { + "epoch": 0.2919371123611712, + "grad_norm": 12.275416407080403, + "learning_rate": 4.459838286902368e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.15625, + "logps/chosen": -1584.0, + "logps/rejected": -1736.0, + "loss": 0.7167, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.310546875, + "rewards/margins": -0.040771484375, + "rewards/rejected": 0.3515625, + "step": 1012 + }, + { + "epoch": 0.2922255877686427, + "grad_norm": 10.867720526624751, + "learning_rate": 4.4582739439550153e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.125, + "logps/chosen": -1272.0, + "logps/rejected": -1304.0, + "loss": 0.6702, + "loss/demonstration_loss": -2608.0, + "loss/preference_loss": -2592.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.099609375, + "rewards/rejected": 0.2353515625, + "step": 1013 + }, + { + "epoch": 0.2925140631761142, + "grad_norm": 12.282673187851413, + "learning_rate": 4.456707614261401e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.125, + "logps/chosen": -1568.0, + "logps/rejected": -1680.0, + "loss": 0.6844, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.43359375, + "rewards/margins": 0.08203125, + "rewards/rejected": 0.3515625, + "step": 1014 + }, + { + "epoch": 0.29280253858358574, + "grad_norm": 11.63405925402273, + "learning_rate": 4.4551392994106275e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.09375, + "logps/chosen": -1640.0, + "logps/rejected": -1592.0, + "loss": 0.7138, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.35546875, + "rewards/margins": 0.007354736328125, + "rewards/rejected": 0.34765625, + "step": 1015 + }, + { + "epoch": 0.29309101399105725, + "grad_norm": 11.777567280944552, + "learning_rate": 4.453569000993813e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1856.0, + "logps/rejected": -1616.0, + "loss": 0.6879, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.34375, + "rewards/margins": 0.0311279296875, + "rewards/rejected": 0.310546875, + "step": 1016 + }, + { + "epoch": 0.29337948939852876, + "grad_norm": 11.136777305828762, + "learning_rate": 4.4519967206040877e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.25, + "logps/chosen": -1648.0, + "logps/rejected": -1440.0, + "loss": 0.6465, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4453125, + "rewards/margins": 0.177734375, + "rewards/rejected": 0.267578125, + "step": 1017 + }, + { + "epoch": 0.29366796480600027, + "grad_norm": 11.510737567604044, + "learning_rate": 4.4504224598365916e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.015625, + "logps/chosen": -1712.0, + "logps/rejected": -1736.0, + "loss": 0.6938, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.423828125, + "rewards/margins": -0.005889892578125, + "rewards/rejected": 0.4296875, + "step": 1018 + }, + { + "epoch": 0.2939564402134718, + "grad_norm": 11.077878080350898, + "learning_rate": 4.4488462202884733e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1472.0, + "logps/rejected": -1360.0, + "loss": 0.6751, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2864.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.240234375, + "rewards/margins": -0.0269775390625, + "rewards/rejected": 0.267578125, + "step": 1019 + }, + { + "epoch": 0.2942449156209433, + "grad_norm": 18.88115469119005, + "learning_rate": 4.447268003558892e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.09375, + "logps/chosen": -1768.0, + "logps/rejected": -1616.0, + "loss": 0.6963, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.408203125, + "rewards/margins": -0.0361328125, + "rewards/rejected": 0.443359375, + "step": 1020 + }, + { + "epoch": 0.2945333910284148, + "grad_norm": 10.9235990367153, + "learning_rate": 4.445687811249009e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.1875, + "logps/chosen": -1928.0, + "logps/rejected": -1632.0, + "loss": 0.655, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.1328125, + "rewards/rejected": 0.287109375, + "step": 1021 + }, + { + "epoch": 0.2948218664358863, + "grad_norm": 11.314173501306511, + "learning_rate": 4.444105644961994e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.109375, + "logps/chosen": -1664.0, + "logps/rejected": -1528.0, + "loss": 0.6829, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4453125, + "rewards/margins": 0.030029296875, + "rewards/rejected": 0.416015625, + "step": 1022 + }, + { + "epoch": 0.2951103418433578, + "grad_norm": 9.003809028651654, + "learning_rate": 4.442521506303015e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.328125, + "logps/chosen": -2112.0, + "logps/rejected": -1912.0, + "loss": 0.687, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.006195068359375, + "rewards/rejected": 0.388671875, + "step": 1023 + }, + { + "epoch": 0.2953988172508294, + "grad_norm": 14.692557678898737, + "learning_rate": 4.440935396879245e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.15625, + "logps/chosen": -1320.0, + "logps/rejected": -1384.0, + "loss": 0.681, + "loss/demonstration_loss": -2736.0, + "loss/preference_loss": -2736.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.314453125, + "rewards/margins": 0.0242919921875, + "rewards/rejected": 0.2890625, + "step": 1024 + }, + { + "epoch": 0.2956872926583009, + "grad_norm": 11.638718887654086, + "learning_rate": 4.4393473182998544e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.21875, + "logps/chosen": -1488.0, + "logps/rejected": -1472.0, + "loss": 0.7111, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.35546875, + "rewards/margins": -0.0008392333984375, + "rewards/rejected": 0.35546875, + "step": 1025 + }, + { + "epoch": 0.2959757680657724, + "grad_norm": 12.953793471008641, + "learning_rate": 4.4377572721760105e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1816.0, + "logps/rejected": -1880.0, + "loss": 0.7134, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.431640625, + "rewards/margins": 0.044921875, + "rewards/rejected": 0.38671875, + "step": 1026 + }, + { + "epoch": 0.2962642434732439, + "grad_norm": 11.176485915743152, + "learning_rate": 4.436165260120879e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.40625, + "logps/chosen": -1952.0, + "logps/rejected": -1872.0, + "loss": 0.7111, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.341796875, + "rewards/margins": -0.03125, + "rewards/rejected": 0.373046875, + "step": 1027 + }, + { + "epoch": 0.29655271888071544, + "grad_norm": 10.059378284593764, + "learning_rate": 4.434571283749618e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1592.0, + "logps/rejected": -1608.0, + "loss": 0.6696, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.421875, + "rewards/margins": 0.0081787109375, + "rewards/rejected": 0.412109375, + "step": 1028 + }, + { + "epoch": 0.29684119428818695, + "grad_norm": 10.670638904777086, + "learning_rate": 4.4329753446793806e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1632.0, + "logps/rejected": -1624.0, + "loss": 0.6876, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.388671875, + "rewards/margins": -0.00177001953125, + "rewards/rejected": 0.388671875, + "step": 1029 + }, + { + "epoch": 0.29712966969565846, + "grad_norm": 11.14820599901039, + "learning_rate": 4.4313774445293097e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1896.0, + "logps/rejected": -1880.0, + "loss": 0.6791, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.10009765625, + "rewards/rejected": 0.48828125, + "step": 1030 + }, + { + "epoch": 0.29741814510312997, + "grad_norm": 9.454338091620606, + "learning_rate": 4.4297775849205365e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1424.0, + "logps/rejected": -1352.0, + "loss": 0.6728, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3203125, + "rewards/margins": 0.059326171875, + "rewards/rejected": 0.259765625, + "step": 1031 + }, + { + "epoch": 0.2977066205106015, + "grad_norm": 12.41192727049395, + "learning_rate": 4.428175767476184e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1880.0, + "logps/rejected": -2008.0, + "loss": 0.6824, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.046142578125, + "rewards/rejected": 0.4609375, + "step": 1032 + }, + { + "epoch": 0.297995095918073, + "grad_norm": 12.377032880347572, + "learning_rate": 4.426571993821359e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.296875, + "logps/chosen": -1368.0, + "logps/rejected": -1432.0, + "loss": 0.6934, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2832.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.26171875, + "rewards/margins": -0.057373046875, + "rewards/rejected": 0.318359375, + "step": 1033 + }, + { + "epoch": 0.2982835713255445, + "grad_norm": 9.95151473774507, + "learning_rate": 4.424966265583152e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -1632.0, + "logps/rejected": -1744.0, + "loss": 0.681, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.07421875, + "rewards/rejected": 0.40625, + "step": 1034 + }, + { + "epoch": 0.298572046733016, + "grad_norm": 11.367216827256211, + "learning_rate": 4.423358584390639e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.28125, + "logps/chosen": -1688.0, + "logps/rejected": -1688.0, + "loss": 0.7254, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.333984375, + "rewards/margins": -0.0908203125, + "rewards/rejected": 0.42578125, + "step": 1035 + }, + { + "epoch": 0.29886052214048753, + "grad_norm": 11.74417259921719, + "learning_rate": 4.4217489518748753e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1464.0, + "logps/rejected": -1264.0, + "loss": 0.7052, + "loss/demonstration_loss": -2752.0, + "loss/preference_loss": -2752.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2451171875, + "rewards/margins": 0.021240234375, + "rewards/rejected": 0.2236328125, + "step": 1036 + }, + { + "epoch": 0.29914899754795904, + "grad_norm": 10.66541803953039, + "learning_rate": 4.4201373696688967e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.25, + "logps/chosen": -1824.0, + "logps/rejected": -1840.0, + "loss": 0.6599, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.138671875, + "rewards/rejected": 0.31640625, + "step": 1037 + }, + { + "epoch": 0.29943747295543055, + "grad_norm": 11.47063053001984, + "learning_rate": 4.4185238394077167e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.328125, + "logps/chosen": -1856.0, + "logps/rejected": -2000.0, + "loss": 0.6921, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.0169677734375, + "rewards/rejected": 0.43359375, + "step": 1038 + }, + { + "epoch": 0.29972594836290206, + "grad_norm": 11.531669306695642, + "learning_rate": 4.4169083627283264e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1344.0, + "logps/rejected": -1280.0, + "loss": 0.6667, + "loss/demonstration_loss": -2672.0, + "loss/preference_loss": -2656.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3828125, + "rewards/margins": 0.08203125, + "rewards/rejected": 0.30078125, + "step": 1039 + }, + { + "epoch": 0.3000144237703736, + "grad_norm": 12.85030395789688, + "learning_rate": 4.41529094126969e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -2040.0, + "logps/rejected": -1808.0, + "loss": 0.697, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.58203125, + "rewards/margins": 0.01953125, + "rewards/rejected": 0.5625, + "step": 1040 + }, + { + "epoch": 0.3003028991778451, + "grad_norm": 10.156820907951008, + "learning_rate": 4.413671576672745e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.3125, + "logps/chosen": -1576.0, + "logps/rejected": -1600.0, + "loss": 0.6551, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.0859375, + "rewards/rejected": 0.412109375, + "step": 1041 + }, + { + "epoch": 0.3005913745853166, + "grad_norm": 10.96774179539688, + "learning_rate": 4.412050270580402e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1336.0, + "logps/rejected": -1568.0, + "loss": 0.6876, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.380859375, + "rewards/margins": 0.021728515625, + "rewards/rejected": 0.359375, + "step": 1042 + }, + { + "epoch": 0.3008798499927881, + "grad_norm": 10.26203078171827, + "learning_rate": 4.4104270246375397e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.1875, + "logps/chosen": -1824.0, + "logps/rejected": -1776.0, + "loss": 0.6624, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.4765625, + "step": 1043 + }, + { + "epoch": 0.3011683254002596, + "grad_norm": 10.545231165689852, + "learning_rate": 4.4088018404910043e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.046875, + "logps/chosen": -1816.0, + "logps/rejected": -1816.0, + "loss": 0.6857, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.423828125, + "rewards/margins": -0.049072265625, + "rewards/rejected": 0.474609375, + "step": 1044 + }, + { + "epoch": 0.30145680080773113, + "grad_norm": 11.567764862834972, + "learning_rate": 4.407174719789611e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1656.0, + "logps/rejected": -1576.0, + "loss": 0.6888, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.41796875, + "rewards/margins": 0.023681640625, + "rewards/rejected": 0.39453125, + "step": 1045 + }, + { + "epoch": 0.30174527621520264, + "grad_norm": 12.007998303641164, + "learning_rate": 4.405545664184136e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.09375, + "logps/chosen": -1552.0, + "logps/rejected": -1608.0, + "loss": 0.6815, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.462890625, + "rewards/margins": -0.01025390625, + "rewards/rejected": 0.47265625, + "step": 1046 + }, + { + "epoch": 0.30203375162267415, + "grad_norm": 9.960620502492322, + "learning_rate": 4.403914675327322e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.203125, + "logps/chosen": -1608.0, + "logps/rejected": -1696.0, + "loss": 0.6492, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.478515625, + "rewards/margins": 0.15625, + "rewards/rejected": 0.322265625, + "step": 1047 + }, + { + "epoch": 0.30232222703014566, + "grad_norm": 10.199605205688002, + "learning_rate": 4.402281754873871e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1496.0, + "logps/rejected": -1440.0, + "loss": 0.6937, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.41796875, + "rewards/margins": 0.016357421875, + "rewards/rejected": 0.40234375, + "step": 1048 + }, + { + "epoch": 0.3026107024376172, + "grad_norm": 13.242753505198397, + "learning_rate": 4.4006469044804454e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.109375, + "logps/chosen": -1680.0, + "logps/rejected": -1544.0, + "loss": 0.6695, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.546875, + "rewards/margins": 0.1806640625, + "rewards/rejected": 0.3671875, + "step": 1049 + }, + { + "epoch": 0.3028991778450887, + "grad_norm": 10.868260554685914, + "learning_rate": 4.399010125805666e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.203125, + "logps/chosen": -1936.0, + "logps/rejected": -2008.0, + "loss": 0.6995, + "loss/demonstration_loss": -4000.0, + "loss/preference_loss": -3984.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.03759765625, + "rewards/rejected": 0.48046875, + "step": 1050 + }, + { + "epoch": 0.3031876532525602, + "grad_norm": 10.449313221078908, + "learning_rate": 4.397371420510108e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.109375, + "logps/chosen": -2008.0, + "logps/rejected": -1792.0, + "loss": 0.6232, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.57421875, + "rewards/margins": 0.2275390625, + "rewards/rejected": 0.345703125, + "step": 1051 + }, + { + "epoch": 0.3034761286600317, + "grad_norm": 9.545849479115878, + "learning_rate": 4.3957307902563043e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.375, + "logps/chosen": -1808.0, + "logps/rejected": -1696.0, + "loss": 0.6603, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5546875, + "rewards/margins": 0.08056640625, + "rewards/rejected": 0.474609375, + "step": 1052 + }, + { + "epoch": 0.3037646040675032, + "grad_norm": 10.071343070601001, + "learning_rate": 4.394088236708738e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.09375, + "logps/chosen": -1392.0, + "logps/rejected": -1384.0, + "loss": 0.6742, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32421875, + "rewards/margins": 0.04736328125, + "rewards/rejected": 0.27734375, + "step": 1053 + }, + { + "epoch": 0.3040530794749748, + "grad_norm": 10.793802524685072, + "learning_rate": 4.392443761533846e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.03125, + "logps/chosen": -1328.0, + "logps/rejected": -1416.0, + "loss": 0.6731, + "loss/demonstration_loss": -2768.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.341796875, + "rewards/margins": 0.09375, + "rewards/rejected": 0.2470703125, + "step": 1054 + }, + { + "epoch": 0.3043415548824463, + "grad_norm": 10.460205711825303, + "learning_rate": 4.3907973664000113e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.375, + "logps/chosen": -2080.0, + "logps/rejected": -1968.0, + "loss": 0.6514, + "loss/demonstration_loss": -4080.0, + "loss/preference_loss": -4080.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.0693359375, + "rewards/rejected": 0.390625, + "step": 1055 + }, + { + "epoch": 0.3046300302899178, + "grad_norm": 9.927944464628002, + "learning_rate": 4.389149052977568e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -1664.0, + "logps/rejected": -1728.0, + "loss": 0.7085, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.001983642578125, + "rewards/rejected": 0.40625, + "step": 1056 + }, + { + "epoch": 0.3049185056973893, + "grad_norm": 10.118494224451947, + "learning_rate": 4.387498822938795e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1544.0, + "logps/rejected": -1328.0, + "loss": 0.6917, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.0947265625, + "rewards/rejected": 0.32421875, + "step": 1057 + }, + { + "epoch": 0.30520698110486083, + "grad_norm": 11.446795630909975, + "learning_rate": 4.385846677957916e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1560.0, + "logps/rejected": -1496.0, + "loss": 0.7209, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.369140625, + "rewards/margins": -0.0419921875, + "rewards/rejected": 0.41015625, + "step": 1058 + }, + { + "epoch": 0.30549545651233234, + "grad_norm": 9.83134059349555, + "learning_rate": 4.3841926197110967e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.1875, + "logps/chosen": -1248.0, + "logps/rejected": -1496.0, + "loss": 0.6356, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.416015625, + "rewards/margins": 0.06884765625, + "rewards/rejected": 0.345703125, + "step": 1059 + }, + { + "epoch": 0.30578393191980385, + "grad_norm": 8.975430158885743, + "learning_rate": 4.382536649876445e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.1875, + "logps/chosen": -1904.0, + "logps/rejected": -1888.0, + "loss": 0.6712, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.40234375, + "rewards/margins": 0.0751953125, + "rewards/rejected": 0.328125, + "step": 1060 + }, + { + "epoch": 0.30607240732727536, + "grad_norm": 9.725347570040716, + "learning_rate": 4.3808787701340075e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.203125, + "logps/chosen": -2016.0, + "logps/rejected": -1896.0, + "loss": 0.6513, + "loss/demonstration_loss": -3968.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.52734375, + "rewards/margins": 0.1298828125, + "rewards/rejected": 0.3984375, + "step": 1061 + }, + { + "epoch": 0.3063608827347469, + "grad_norm": 9.902936543435803, + "learning_rate": 4.3792189821657695e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.125, + "logps/chosen": -1856.0, + "logps/rejected": -1696.0, + "loss": 0.6813, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.050048828125, + "rewards/rejected": 0.4140625, + "step": 1062 + }, + { + "epoch": 0.3066493581422184, + "grad_norm": 9.27244034248201, + "learning_rate": 4.3775572876556504e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -1216.0, + "logps/rejected": -1200.0, + "loss": 0.6893, + "loss/demonstration_loss": -2448.0, + "loss/preference_loss": -2448.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.30078125, + "rewards/margins": 0.02490234375, + "rewards/rejected": 0.275390625, + "step": 1063 + }, + { + "epoch": 0.3069378335496899, + "grad_norm": 11.271431667081364, + "learning_rate": 4.3758936882895046e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.140625, + "logps/chosen": -1648.0, + "logps/rejected": -1832.0, + "loss": 0.6499, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.1796875, + "rewards/rejected": 0.33203125, + "step": 1064 + }, + { + "epoch": 0.3072263089571614, + "grad_norm": 9.5791873118906, + "learning_rate": 4.374228185755121e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1936.0, + "logps/rejected": -1656.0, + "loss": 0.672, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.45703125, + "step": 1065 + }, + { + "epoch": 0.3075147843646329, + "grad_norm": 12.241701521904064, + "learning_rate": 4.372560781742216e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.09375, + "logps/chosen": -1528.0, + "logps/rejected": -1488.0, + "loss": 0.6649, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.365234375, + "rewards/margins": 0.171875, + "rewards/rejected": 0.1943359375, + "step": 1066 + }, + { + "epoch": 0.30780325977210443, + "grad_norm": 10.937796100133534, + "learning_rate": 4.370891477942439e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.125, + "logps/chosen": -1632.0, + "logps/rejected": -1384.0, + "loss": 0.673, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.091796875, + "rewards/rejected": 0.427734375, + "step": 1067 + }, + { + "epoch": 0.30809173517957594, + "grad_norm": 11.827708482688212, + "learning_rate": 4.369220276049362e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -2048.0, + "logps/rejected": -1944.0, + "loss": 0.6866, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.6484375, + "rewards/margins": 0.0147705078125, + "rewards/rejected": 0.63671875, + "step": 1068 + }, + { + "epoch": 0.30838021058704745, + "grad_norm": 10.209626008337633, + "learning_rate": 4.3675471777584867e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1696.0, + "logps/rejected": -1704.0, + "loss": 0.6394, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.451171875, + "rewards/margins": 0.05908203125, + "rewards/rejected": 0.392578125, + "step": 1069 + }, + { + "epoch": 0.30866868599451897, + "grad_norm": 10.813931493175227, + "learning_rate": 4.3658721847672374e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.3125, + "logps/chosen": -1352.0, + "logps/rejected": -1320.0, + "loss": 0.6512, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.466796875, + "rewards/margins": 0.1923828125, + "rewards/rejected": 0.275390625, + "step": 1070 + }, + { + "epoch": 0.3089571614019905, + "grad_norm": 12.781476936616123, + "learning_rate": 4.3641952987749604e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1616.0, + "logps/rejected": -1440.0, + "loss": 0.7042, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.427734375, + "rewards/margins": 0.04345703125, + "rewards/rejected": 0.384765625, + "step": 1071 + }, + { + "epoch": 0.309245636809462, + "grad_norm": 10.480854534217741, + "learning_rate": 4.362516521482923e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -1472.0, + "logps/rejected": -1456.0, + "loss": 0.6577, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.45703125, + "rewards/margins": 0.16015625, + "rewards/rejected": 0.296875, + "step": 1072 + }, + { + "epoch": 0.3095341122169335, + "grad_norm": 9.74032546417054, + "learning_rate": 4.3608358545943105e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.265625, + "logps/chosen": -2160.0, + "logps/rejected": -1656.0, + "loss": 0.637, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.76171875, + "rewards/margins": 0.263671875, + "rewards/rejected": 0.498046875, + "step": 1073 + }, + { + "epoch": 0.309822587624405, + "grad_norm": 11.741331183220632, + "learning_rate": 4.3591532998142266e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1856.0, + "logps/rejected": -1696.0, + "loss": 0.6885, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.09033203125, + "rewards/rejected": 0.40234375, + "step": 1074 + }, + { + "epoch": 0.3101110630318765, + "grad_norm": 9.98462904954572, + "learning_rate": 4.3574688588496896e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1024.0, + "logps/rejected": -1168.0, + "loss": 0.6736, + "loss/demonstration_loss": -2224.0, + "loss/preference_loss": -2224.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.041259765625, + "rewards/rejected": 0.265625, + "step": 1075 + }, + { + "epoch": 0.31039953843934803, + "grad_norm": 10.81813828702515, + "learning_rate": 4.3557825334096306e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1616.0, + "logps/rejected": -1616.0, + "loss": 0.6821, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.0262451171875, + "rewards/rejected": 0.337890625, + "step": 1076 + }, + { + "epoch": 0.31068801384681954, + "grad_norm": 10.156942234471277, + "learning_rate": 4.354094325204894e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1336.0, + "logps/rejected": -1248.0, + "loss": 0.6641, + "loss/demonstration_loss": -2624.0, + "loss/preference_loss": -2608.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.310546875, + "rewards/margins": 0.06103515625, + "rewards/rejected": 0.25, + "step": 1077 + }, + { + "epoch": 0.31097648925429106, + "grad_norm": 12.540158102646249, + "learning_rate": 4.352404235948233e-07, + "logits/chosen": 3.453125, + "logits/rejected": 3.328125, + "logps/chosen": -1584.0, + "logps/rejected": -1720.0, + "loss": 0.7334, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.482421875, + "rewards/margins": -0.0732421875, + "rewards/rejected": 0.5546875, + "step": 1078 + }, + { + "epoch": 0.31126496466176257, + "grad_norm": 9.958180223700081, + "learning_rate": 4.350712267354311e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.359375, + "logps/chosen": -1392.0, + "logps/rejected": -1280.0, + "loss": 0.661, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.1396484375, + "rewards/rejected": 0.33203125, + "step": 1079 + }, + { + "epoch": 0.3115534400692341, + "grad_norm": 11.095589732013739, + "learning_rate": 4.3490184211396963e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.1875, + "logps/chosen": -1640.0, + "logps/rejected": -1872.0, + "loss": 0.6682, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.036376953125, + "rewards/rejected": 0.443359375, + "step": 1080 + }, + { + "epoch": 0.3118419154767056, + "grad_norm": 10.62922833344852, + "learning_rate": 4.347322699022863e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -2000.0, + "logps/rejected": -1712.0, + "loss": 0.655, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.62109375, + "rewards/margins": 0.12109375, + "rewards/rejected": 0.5, + "step": 1081 + }, + { + "epoch": 0.3121303908841771, + "grad_norm": 10.809439703632, + "learning_rate": 4.3456251027241876e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.421875, + "logps/chosen": -1640.0, + "logps/rejected": -1480.0, + "loss": 0.673, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.0869140625, + "rewards/rejected": 0.32421875, + "step": 1082 + }, + { + "epoch": 0.3124188662916486, + "grad_norm": 11.558772145283713, + "learning_rate": 4.343925633965949e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1520.0, + "logps/rejected": -1712.0, + "loss": 0.7331, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.447265625, + "rewards/margins": -0.09716796875, + "rewards/rejected": 0.54296875, + "step": 1083 + }, + { + "epoch": 0.3127073416991201, + "grad_norm": 9.600416604496514, + "learning_rate": 4.342224294472326e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -2040.0, + "logps/rejected": -1784.0, + "loss": 0.653, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.796875, + "rewards/margins": 0.271484375, + "rewards/rejected": 0.5234375, + "step": 1084 + }, + { + "epoch": 0.3129958171065917, + "grad_norm": 11.641016808151411, + "learning_rate": 4.3405210859693935e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1480.0, + "logps/rejected": -1504.0, + "loss": 0.6728, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.04248046875, + "rewards/rejected": 0.26171875, + "step": 1085 + }, + { + "epoch": 0.3132842925140632, + "grad_norm": 11.879649323097299, + "learning_rate": 4.3388160101851244e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1928.0, + "logps/rejected": -2128.0, + "loss": 0.7119, + "loss/demonstration_loss": -4096.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5234375, + "rewards/margins": -0.06103515625, + "rewards/rejected": 0.5859375, + "step": 1086 + }, + { + "epoch": 0.3135727679215347, + "grad_norm": 9.323380293363014, + "learning_rate": 4.337109068849386e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.3125, + "logps/chosen": -1824.0, + "logps/rejected": -1776.0, + "loss": 0.6538, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.490234375, + "rewards/margins": 0.0546875, + "rewards/rejected": 0.435546875, + "step": 1087 + }, + { + "epoch": 0.3138612433290062, + "grad_norm": 11.202696120029435, + "learning_rate": 4.335400263693937e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.28125, + "logps/chosen": -1576.0, + "logps/rejected": -1472.0, + "loss": 0.6697, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.150390625, + "rewards/rejected": 0.369140625, + "step": 1088 + }, + { + "epoch": 0.31414971873647773, + "grad_norm": 9.943585237587477, + "learning_rate": 4.3336895964524276e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.296875, + "logps/chosen": -1832.0, + "logps/rejected": -1864.0, + "loss": 0.6633, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.6171875, + "rewards/margins": -0.0380859375, + "rewards/rejected": 0.65625, + "step": 1089 + }, + { + "epoch": 0.31443819414394925, + "grad_norm": 10.660608823952572, + "learning_rate": 4.3319770688603975e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.3125, + "logps/chosen": -1584.0, + "logps/rejected": -1728.0, + "loss": 0.7169, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.45703125, + "rewards/margins": -0.03515625, + "rewards/rejected": 0.4921875, + "step": 1090 + }, + { + "epoch": 0.31472666955142076, + "grad_norm": 9.962700899735973, + "learning_rate": 4.3302626826552733e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1600.0, + "logps/rejected": -1712.0, + "loss": 0.6934, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.455078125, + "rewards/margins": -0.0269775390625, + "rewards/rejected": 0.482421875, + "step": 1091 + }, + { + "epoch": 0.31501514495889227, + "grad_norm": 10.959556846793634, + "learning_rate": 4.3285464395763694e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.15625, + "logps/chosen": -1824.0, + "logps/rejected": -1816.0, + "loss": 0.6589, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6328125, + "rewards/margins": 0.11572265625, + "rewards/rejected": 0.51953125, + "step": 1092 + }, + { + "epoch": 0.3153036203663638, + "grad_norm": 10.07789154443979, + "learning_rate": 4.3268283413648786e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.078125, + "logps/chosen": -1576.0, + "logps/rejected": -1384.0, + "loss": 0.6336, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.1923828125, + "rewards/rejected": 0.271484375, + "step": 1093 + }, + { + "epoch": 0.3155920957738353, + "grad_norm": 11.767494127444492, + "learning_rate": 4.325108389763883e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.140625, + "logps/chosen": -1696.0, + "logps/rejected": -1936.0, + "loss": 0.7125, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.40234375, + "rewards/margins": 0.03955078125, + "rewards/rejected": 0.36328125, + "step": 1094 + }, + { + "epoch": 0.3158805711813068, + "grad_norm": 12.36861140594807, + "learning_rate": 4.3233865865183396e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.09375, + "logps/chosen": -1608.0, + "logps/rejected": -1424.0, + "loss": 0.6807, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3359375, + "rewards/margins": -0.0146484375, + "rewards/rejected": 0.3515625, + "step": 1095 + }, + { + "epoch": 0.3161690465887783, + "grad_norm": 10.625365820116428, + "learning_rate": 4.321662933375085e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.15625, + "logps/chosen": -2320.0, + "logps/rejected": -2208.0, + "loss": 0.6663, + "loss/demonstration_loss": -4608.0, + "loss/preference_loss": -4608.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.8515625, + "rewards/margins": 0.047119140625, + "rewards/rejected": 0.8046875, + "step": 1096 + }, + { + "epoch": 0.3164575219962498, + "grad_norm": 10.689177670928903, + "learning_rate": 4.3199374320828357e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.296875, + "logps/chosen": -1872.0, + "logps/rejected": -1736.0, + "loss": 0.6848, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.484375, + "rewards/margins": 0.08251953125, + "rewards/rejected": 0.400390625, + "step": 1097 + }, + { + "epoch": 0.31674599740372134, + "grad_norm": 11.486609689176712, + "learning_rate": 4.3182100843921794e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1824.0, + "logps/rejected": -1984.0, + "loss": 0.6559, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.57421875, + "rewards/margins": 0.09716796875, + "rewards/rejected": 0.4765625, + "step": 1098 + }, + { + "epoch": 0.31703447281119285, + "grad_norm": 10.913265392848576, + "learning_rate": 4.3164808920555783e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1808.0, + "logps/rejected": -1672.0, + "loss": 0.694, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.0302734375, + "rewards/rejected": 0.439453125, + "step": 1099 + }, + { + "epoch": 0.31732294821866436, + "grad_norm": 11.751779066090771, + "learning_rate": 4.3147498568273674e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1840.0, + "logps/rejected": -1632.0, + "loss": 0.6732, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.09765625, + "rewards/rejected": 0.33203125, + "step": 1100 + }, + { + "epoch": 0.31761142362613587, + "grad_norm": 11.018008193645517, + "learning_rate": 4.3130169804637497e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.265625, + "logps/chosen": -2000.0, + "logps/rejected": -1912.0, + "loss": 0.6755, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.453125, + "rewards/margins": -0.010498046875, + "rewards/rejected": 0.462890625, + "step": 1101 + }, + { + "epoch": 0.3178998990336074, + "grad_norm": 11.12508340848376, + "learning_rate": 4.311282264722796e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1216.0, + "logps/rejected": -1024.0, + "loss": 0.6775, + "loss/demonstration_loss": -2288.0, + "loss/preference_loss": -2272.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3671875, + "rewards/margins": 0.07470703125, + "rewards/rejected": 0.291015625, + "step": 1102 + }, + { + "epoch": 0.3181883744410789, + "grad_norm": 10.817337869101193, + "learning_rate": 4.3095457113644456e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.15625, + "logps/chosen": -1376.0, + "logps/rejected": -1592.0, + "loss": 0.7084, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.443359375, + "rewards/margins": -0.0625, + "rewards/rejected": 0.5078125, + "step": 1103 + }, + { + "epoch": 0.3184768498485504, + "grad_norm": 11.285619032441735, + "learning_rate": 4.3078073221504997e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.0625, + "logps/chosen": -1640.0, + "logps/rejected": -1576.0, + "loss": 0.6654, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.028076171875, + "rewards/rejected": 0.4921875, + "step": 1104 + }, + { + "epoch": 0.3187653252560219, + "grad_norm": 9.543128841084139, + "learning_rate": 4.3060670988446226e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.203125, + "logps/chosen": -1080.0, + "logps/rejected": -892.0, + "loss": 0.6893, + "loss/demonstration_loss": -1992.0, + "loss/preference_loss": -1976.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2197265625, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.0908203125, + "step": 1105 + }, + { + "epoch": 0.3190538006634934, + "grad_norm": 10.96143869200322, + "learning_rate": 4.304325043212339e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.28125, + "logps/chosen": -1720.0, + "logps/rejected": -1832.0, + "loss": 0.6803, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5, + "rewards/margins": 0.0133056640625, + "rewards/rejected": 0.48828125, + "step": 1106 + }, + { + "epoch": 0.31934227607096494, + "grad_norm": 10.958561735749933, + "learning_rate": 4.302581157021034e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1664.0, + "logps/rejected": -1680.0, + "loss": 0.6748, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.466796875, + "rewards/margins": 0.029541015625, + "rewards/rejected": 0.4375, + "step": 1107 + }, + { + "epoch": 0.31963075147843645, + "grad_norm": 9.249602364134974, + "learning_rate": 4.300835442039949e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.203125, + "logps/chosen": -1400.0, + "logps/rejected": -1344.0, + "loss": 0.6719, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.3984375, + "step": 1108 + }, + { + "epoch": 0.31991922688590796, + "grad_norm": 10.054629788603743, + "learning_rate": 4.299087900040181e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.34375, + "logps/chosen": -1816.0, + "logps/rejected": -1664.0, + "loss": 0.6697, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.05712890625, + "rewards/rejected": 0.478515625, + "step": 1109 + }, + { + "epoch": 0.32020770229337947, + "grad_norm": 9.498590751331028, + "learning_rate": 4.2973385327946796e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.1875, + "logps/chosen": -1624.0, + "logps/rejected": -1544.0, + "loss": 0.6692, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4375, + "rewards/margins": 0.055419921875, + "rewards/rejected": 0.3828125, + "step": 1110 + }, + { + "epoch": 0.320496177700851, + "grad_norm": 9.865754437646652, + "learning_rate": 4.295587342078247e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1752.0, + "logps/rejected": -1736.0, + "loss": 0.6766, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.55859375, + "rewards/margins": -0.020263671875, + "rewards/rejected": 0.578125, + "step": 1111 + }, + { + "epoch": 0.3207846531083225, + "grad_norm": 9.605724895544492, + "learning_rate": 4.2938343296675365e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1608.0, + "logps/rejected": -1520.0, + "loss": 0.6618, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.10986328125, + "rewards/rejected": 0.462890625, + "step": 1112 + }, + { + "epoch": 0.321073128515794, + "grad_norm": 11.046802334784493, + "learning_rate": 4.2920794973410476e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.25, + "logps/chosen": -1752.0, + "logps/rejected": -1400.0, + "loss": 0.6615, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.17578125, + "rewards/rejected": 0.2021484375, + "step": 1113 + }, + { + "epoch": 0.3213616039232655, + "grad_norm": 11.953063907399423, + "learning_rate": 4.290322846879126e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.203125, + "logps/chosen": -1344.0, + "logps/rejected": -1496.0, + "loss": 0.6938, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40234375, + "rewards/margins": -0.006134033203125, + "rewards/rejected": 0.408203125, + "step": 1114 + }, + { + "epoch": 0.3216500793307371, + "grad_norm": 13.186866721082433, + "learning_rate": 4.2885643800639657e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1936.0, + "logps/rejected": -1568.0, + "loss": 0.6515, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.6015625, + "rewards/margins": 0.15234375, + "rewards/rejected": 0.447265625, + "step": 1115 + }, + { + "epoch": 0.3219385547382086, + "grad_norm": 9.39494627570994, + "learning_rate": 4.2868040986795985e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.328125, + "logps/chosen": -1864.0, + "logps/rejected": -1824.0, + "loss": 0.6553, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5546875, + "rewards/margins": 0.060791015625, + "rewards/rejected": 0.494140625, + "step": 1116 + }, + { + "epoch": 0.3222270301456801, + "grad_norm": 9.498534393571315, + "learning_rate": 4.2850420045118993e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.359375, + "logps/chosen": -2008.0, + "logps/rejected": -2080.0, + "loss": 0.6542, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.53125, + "rewards/margins": -0.00372314453125, + "rewards/rejected": 0.53515625, + "step": 1117 + }, + { + "epoch": 0.3225155055531516, + "grad_norm": 11.915042788541308, + "learning_rate": 4.283278099348584e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.203125, + "logps/chosen": -1576.0, + "logps/rejected": -1720.0, + "loss": 0.6633, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.049072265625, + "rewards/rejected": 0.45703125, + "step": 1118 + }, + { + "epoch": 0.3228039809606231, + "grad_norm": 10.987924428308617, + "learning_rate": 4.2815123849792024e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.265625, + "logps/chosen": -1552.0, + "logps/rejected": -1360.0, + "loss": 0.7063, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.453125, + "rewards/margins": 0.0164794921875, + "rewards/rejected": 0.4375, + "step": 1119 + }, + { + "epoch": 0.32309245636809464, + "grad_norm": 10.160955119843878, + "learning_rate": 4.279744863195142e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.34375, + "logps/chosen": -1632.0, + "logps/rejected": -1680.0, + "loss": 0.6783, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4140625, + "rewards/margins": -0.0147705078125, + "rewards/rejected": 0.427734375, + "step": 1120 + }, + { + "epoch": 0.32338093177556615, + "grad_norm": 11.225946867998706, + "learning_rate": 4.277975535789623e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.203125, + "logps/chosen": -1488.0, + "logps/rejected": -1200.0, + "loss": 0.6802, + "loss/demonstration_loss": -2752.0, + "loss/preference_loss": -2720.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5, + "rewards/margins": 0.11181640625, + "rewards/rejected": 0.388671875, + "step": 1121 + }, + { + "epoch": 0.32366940718303766, + "grad_norm": 9.448584759046966, + "learning_rate": 4.276204404557698e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1480.0, + "logps/rejected": -1488.0, + "loss": 0.6595, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.390625, + "rewards/margins": 0.02197265625, + "rewards/rejected": 0.369140625, + "step": 1122 + }, + { + "epoch": 0.32395788259050917, + "grad_norm": 10.568033714034645, + "learning_rate": 4.2744314712962516e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.21875, + "logps/chosen": -1688.0, + "logps/rejected": -1704.0, + "loss": 0.652, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.546875, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.482421875, + "step": 1123 + }, + { + "epoch": 0.3242463579979807, + "grad_norm": 11.01463413733453, + "learning_rate": 4.2726567378039926e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1528.0, + "logps/rejected": -1392.0, + "loss": 0.6906, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5625, + "rewards/margins": 0.0130615234375, + "rewards/rejected": 0.55078125, + "step": 1124 + }, + { + "epoch": 0.3245348334054522, + "grad_norm": 10.402394081822598, + "learning_rate": 4.2708802058814586e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.140625, + "logps/chosen": -1656.0, + "logps/rejected": -1608.0, + "loss": 0.6529, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.09814453125, + "rewards/rejected": 0.4609375, + "step": 1125 + }, + { + "epoch": 0.3248233088129237, + "grad_norm": 11.03117060755873, + "learning_rate": 4.269101877331011e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.125, + "logps/chosen": -1792.0, + "logps/rejected": -1880.0, + "loss": 0.7263, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4375, + "rewards/margins": -0.0103759765625, + "rewards/rejected": 0.447265625, + "step": 1126 + }, + { + "epoch": 0.3251117842203952, + "grad_norm": 10.644485543976, + "learning_rate": 4.267321753956835e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -2352.0, + "logps/rejected": -2288.0, + "loss": 0.6652, + "loss/demonstration_loss": -4704.0, + "loss/preference_loss": -4704.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.76171875, + "rewards/margins": 0.080078125, + "rewards/rejected": 0.6796875, + "step": 1127 + }, + { + "epoch": 0.32540025962786673, + "grad_norm": 10.39211679108361, + "learning_rate": 4.265539837564936e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.15625, + "logps/chosen": -1504.0, + "logps/rejected": -1544.0, + "loss": 0.6548, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.0537109375, + "rewards/rejected": 0.39453125, + "step": 1128 + }, + { + "epoch": 0.32568873503533824, + "grad_norm": 11.48942717572012, + "learning_rate": 4.263756129963138e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.109375, + "logps/chosen": -1728.0, + "logps/rejected": -1608.0, + "loss": 0.68, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.60546875, + "rewards/margins": 0.11328125, + "rewards/rejected": 0.4921875, + "step": 1129 + }, + { + "epoch": 0.32597721044280975, + "grad_norm": 10.052919449483925, + "learning_rate": 4.261970632961084e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.296875, + "logps/chosen": -1360.0, + "logps/rejected": -1408.0, + "loss": 0.6945, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.0625, + "rewards/chosen": 0.41015625, + "rewards/margins": -0.08056640625, + "rewards/rejected": 0.4921875, + "step": 1130 + }, + { + "epoch": 0.32626568585028126, + "grad_norm": 9.4922394428165, + "learning_rate": 4.2601833483702297e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.25, + "logps/chosen": -1544.0, + "logps/rejected": -1584.0, + "loss": 0.6729, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.0693359375, + "rewards/rejected": 0.384765625, + "step": 1131 + }, + { + "epoch": 0.3265541612577528, + "grad_norm": 11.038417929831315, + "learning_rate": 4.258394278003847e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.265625, + "logps/chosen": -2024.0, + "logps/rejected": -1888.0, + "loss": 0.6564, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.515625, + "rewards/margins": 0.10302734375, + "rewards/rejected": 0.4140625, + "step": 1132 + }, + { + "epoch": 0.3268426366652243, + "grad_norm": 9.891690587284414, + "learning_rate": 4.2566034236770186e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.125, + "logps/chosen": -2224.0, + "logps/rejected": -1992.0, + "loss": 0.633, + "loss/demonstration_loss": -4288.0, + "loss/preference_loss": -4288.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6328125, + "rewards/margins": 0.1357421875, + "rewards/rejected": 0.498046875, + "step": 1133 + }, + { + "epoch": 0.3271311120726958, + "grad_norm": 9.223117953425657, + "learning_rate": 4.2548107872066364e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.3125, + "logps/chosen": -1672.0, + "logps/rejected": -1656.0, + "loss": 0.6504, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.12353515625, + "rewards/rejected": 0.357421875, + "step": 1134 + }, + { + "epoch": 0.3274195874801673, + "grad_norm": 11.155238346951988, + "learning_rate": 4.2530163704114006e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.125, + "logps/chosen": -1544.0, + "logps/rejected": -1616.0, + "loss": 0.6556, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.041748046875, + "rewards/rejected": 0.3515625, + "step": 1135 + }, + { + "epoch": 0.3277080628876388, + "grad_norm": 9.190078299527917, + "learning_rate": 4.2512201751118194e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.234375, + "logps/chosen": -1432.0, + "logps/rejected": -1048.0, + "loss": 0.6297, + "loss/demonstration_loss": -2512.0, + "loss/preference_loss": -2496.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33984375, + "rewards/margins": 0.1630859375, + "rewards/rejected": 0.17578125, + "step": 1136 + }, + { + "epoch": 0.32799653829511033, + "grad_norm": 11.349245867175151, + "learning_rate": 4.249422203130201e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.3125, + "logps/chosen": -1752.0, + "logps/rejected": -1640.0, + "loss": 0.6863, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4140625, + "rewards/margins": 0.00201416015625, + "rewards/rejected": 0.412109375, + "step": 1137 + }, + { + "epoch": 0.32828501370258184, + "grad_norm": 9.911915662201984, + "learning_rate": 4.2476224562906616e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1664.0, + "logps/rejected": -1480.0, + "loss": 0.6906, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.396484375, + "rewards/margins": 0.06591796875, + "rewards/rejected": 0.33203125, + "step": 1138 + }, + { + "epoch": 0.32857348911005335, + "grad_norm": 10.13273038028613, + "learning_rate": 4.245820936419115e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.078125, + "logps/chosen": -1648.0, + "logps/rejected": -1488.0, + "loss": 0.6768, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.0576171875, + "rewards/rejected": 0.318359375, + "step": 1139 + }, + { + "epoch": 0.32886196451752486, + "grad_norm": 11.821421491096762, + "learning_rate": 4.2440176453432734e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1856.0, + "logps/rejected": -1648.0, + "loss": 0.6696, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.484375, + "rewards/margins": 0.1171875, + "rewards/rejected": 0.3671875, + "step": 1140 + }, + { + "epoch": 0.3291504399249964, + "grad_norm": 11.201923070900376, + "learning_rate": 4.2422125848926485e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.75, + "logps/chosen": -1384.0, + "logps/rejected": -1528.0, + "loss": 0.6661, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.326171875, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.1982421875, + "step": 1141 + }, + { + "epoch": 0.3294389153324679, + "grad_norm": 12.043609996358251, + "learning_rate": 4.240405756898543e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.1875, + "logps/chosen": -1856.0, + "logps/rejected": -1688.0, + "loss": 0.6896, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.044189453125, + "rewards/rejected": 0.376953125, + "step": 1142 + }, + { + "epoch": 0.3297273907399394, + "grad_norm": 10.50533730843436, + "learning_rate": 4.2385971631940566e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.296875, + "logps/chosen": -1312.0, + "logps/rejected": -1272.0, + "loss": 0.682, + "loss/demonstration_loss": -2608.0, + "loss/preference_loss": -2608.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.318359375, + "rewards/margins": 0.09619140625, + "rewards/rejected": 0.2216796875, + "step": 1143 + }, + { + "epoch": 0.3300158661474109, + "grad_norm": 11.808515536777824, + "learning_rate": 4.236786805614079e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.265625, + "logps/chosen": -1600.0, + "logps/rejected": -1432.0, + "loss": 0.7482, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.34375, + "rewards/margins": -0.0654296875, + "rewards/rejected": 0.408203125, + "step": 1144 + }, + { + "epoch": 0.3303043415548824, + "grad_norm": 12.242090967844083, + "learning_rate": 4.2349746859952894e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -1352.0, + "logps/rejected": -1320.0, + "loss": 0.7022, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5, + "rewards/margins": 0.12451171875, + "rewards/rejected": 0.375, + "step": 1145 + }, + { + "epoch": 0.330592816962354, + "grad_norm": 10.853033861983233, + "learning_rate": 4.233160806176155e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.1875, + "logps/chosen": -1912.0, + "logps/rejected": -1584.0, + "loss": 0.6714, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.453125, + "rewards/margins": -0.00799560546875, + "rewards/rejected": 0.4609375, + "step": 1146 + }, + { + "epoch": 0.3308812923698255, + "grad_norm": 9.736436329938586, + "learning_rate": 4.2313451679969283e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.3125, + "logps/chosen": -1880.0, + "logps/rejected": -1808.0, + "loss": 0.691, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.48046875, + "step": 1147 + }, + { + "epoch": 0.331169767777297, + "grad_norm": 10.167725352894609, + "learning_rate": 4.229527773299645e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.15625, + "logps/chosen": -1616.0, + "logps/rejected": -1592.0, + "loss": 0.6571, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.1318359375, + "rewards/rejected": 0.373046875, + "step": 1148 + }, + { + "epoch": 0.3314582431847685, + "grad_norm": 11.660646932229469, + "learning_rate": 4.2277086239281256e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.265625, + "logps/chosen": -1952.0, + "logps/rejected": -1704.0, + "loss": 0.7161, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5625, + "rewards/margins": 0.0966796875, + "rewards/rejected": 0.466796875, + "step": 1149 + }, + { + "epoch": 0.33174671859224003, + "grad_norm": 13.041432753936327, + "learning_rate": 4.225887721727968e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.109375, + "logps/chosen": -1856.0, + "logps/rejected": -1920.0, + "loss": 0.6829, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5234375, + "rewards/margins": 0.1171875, + "rewards/rejected": 0.408203125, + "step": 1150 + }, + { + "epoch": 0.33203519399971154, + "grad_norm": 9.965438308467057, + "learning_rate": 4.2240650685465493e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.96875, + "logps/chosen": -1264.0, + "logps/rejected": -1312.0, + "loss": 0.6862, + "loss/demonstration_loss": -2624.0, + "loss/preference_loss": -2608.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.07421875, + "rewards/rejected": 0.373046875, + "step": 1151 + }, + { + "epoch": 0.33232366940718305, + "grad_norm": 8.49525303603664, + "learning_rate": 4.2222406662330233e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.09375, + "logps/chosen": -1784.0, + "logps/rejected": -1456.0, + "loss": 0.609, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.578125, + "rewards/margins": 0.2353515625, + "rewards/rejected": 0.341796875, + "step": 1152 + }, + { + "epoch": 0.33261214481465456, + "grad_norm": 10.693450766992868, + "learning_rate": 4.2204145166383185e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1560.0, + "logps/rejected": -1552.0, + "loss": 0.7242, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.41015625, + "rewards/margins": -0.039306640625, + "rewards/rejected": 0.44921875, + "step": 1153 + }, + { + "epoch": 0.3329006202221261, + "grad_norm": 10.248540609208248, + "learning_rate": 4.218586621615136e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -1632.0, + "logps/rejected": -1616.0, + "loss": 0.6858, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5625, + "rewards/margins": 0.016357421875, + "rewards/rejected": 0.546875, + "step": 1154 + }, + { + "epoch": 0.3331890956295976, + "grad_norm": 13.241247193449121, + "learning_rate": 4.216756983017946e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.21875, + "logps/chosen": -1488.0, + "logps/rejected": -1528.0, + "loss": 0.649, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.16015625, + "rewards/rejected": 0.3046875, + "step": 1155 + }, + { + "epoch": 0.3334775710370691, + "grad_norm": 11.303538123352176, + "learning_rate": 4.2149256027029914e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1488.0, + "logps/rejected": -1488.0, + "loss": 0.6941, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4375, + "rewards/margins": 0.01019287109375, + "rewards/rejected": 0.427734375, + "step": 1156 + }, + { + "epoch": 0.3337660464445406, + "grad_norm": 11.556663034465197, + "learning_rate": 4.2130924825282777e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1656.0, + "logps/rejected": -1584.0, + "loss": 0.6186, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.27734375, + "rewards/margins": 0.1708984375, + "rewards/rejected": 0.10546875, + "step": 1157 + }, + { + "epoch": 0.3340545218520121, + "grad_norm": 10.527528895772924, + "learning_rate": 4.211257624353579e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1840.0, + "logps/rejected": -1656.0, + "loss": 0.6434, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.52734375, + "rewards/margins": 0.08203125, + "rewards/rejected": 0.4453125, + "step": 1158 + }, + { + "epoch": 0.33434299725948363, + "grad_norm": 9.83585989965221, + "learning_rate": 4.2094210300404306e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.1875, + "logps/chosen": -1976.0, + "logps/rejected": -2032.0, + "loss": 0.6603, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4048.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.11328125, + "rewards/rejected": 0.451171875, + "step": 1159 + }, + { + "epoch": 0.33463147266695514, + "grad_norm": 11.189821775386244, + "learning_rate": 4.2075827014521304e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.21875, + "logps/chosen": -1400.0, + "logps/rejected": -1472.0, + "loss": 0.7112, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.0142822265625, + "rewards/rejected": 0.318359375, + "step": 1160 + }, + { + "epoch": 0.33491994807442665, + "grad_norm": 9.65783390216607, + "learning_rate": 4.2057426404537357e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.265625, + "logps/chosen": -1544.0, + "logps/rejected": -1768.0, + "loss": 0.6645, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.546875, + "rewards/margins": 0.0262451171875, + "rewards/rejected": 0.51953125, + "step": 1161 + }, + { + "epoch": 0.33520842348189817, + "grad_norm": 10.798214811995859, + "learning_rate": 4.2039008489120604e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.15625, + "logps/chosen": -1656.0, + "logps/rejected": -1640.0, + "loss": 0.6792, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5234375, + "rewards/margins": 0.0712890625, + "rewards/rejected": 0.453125, + "step": 1162 + }, + { + "epoch": 0.3354968988893697, + "grad_norm": 13.42015298250084, + "learning_rate": 4.202057328695675e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1624.0, + "logps/rejected": -1480.0, + "loss": 0.6995, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.421875, + "rewards/margins": -0.01116943359375, + "rewards/rejected": 0.43359375, + "step": 1163 + }, + { + "epoch": 0.3357853742968412, + "grad_norm": 11.301053398264328, + "learning_rate": 4.200212081674904e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.09375, + "logps/chosen": -1408.0, + "logps/rejected": -1440.0, + "loss": 0.707, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.421875, + "rewards/margins": -0.0159912109375, + "rewards/rejected": 0.4375, + "step": 1164 + }, + { + "epoch": 0.3360738497043127, + "grad_norm": 11.201672563859537, + "learning_rate": 4.198365109721823e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.125, + "logps/chosen": -1560.0, + "logps/rejected": -1456.0, + "loss": 0.6524, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.52734375, + "rewards/margins": 0.10986328125, + "rewards/rejected": 0.41796875, + "step": 1165 + }, + { + "epoch": 0.3363623251117842, + "grad_norm": 11.269175375241158, + "learning_rate": 4.196516414710258e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1592.0, + "logps/rejected": -1680.0, + "loss": 0.6998, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.45703125, + "rewards/margins": -0.0625, + "rewards/rejected": 0.51953125, + "step": 1166 + }, + { + "epoch": 0.3366508005192557, + "grad_norm": 11.47215685353217, + "learning_rate": 4.194665998515783e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1912.0, + "logps/rejected": -1728.0, + "loss": 0.7243, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.53125, + "rewards/margins": -0.0537109375, + "rewards/rejected": 0.5859375, + "step": 1167 + }, + { + "epoch": 0.33693927592672723, + "grad_norm": 11.38601269178146, + "learning_rate": 4.192813863015719e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1832.0, + "logps/rejected": -1680.0, + "loss": 0.679, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.03955078125, + "rewards/rejected": 0.46875, + "step": 1168 + }, + { + "epoch": 0.33722775133419874, + "grad_norm": 11.147829750320977, + "learning_rate": 4.19096001008913e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.046875, + "logps/chosen": -1616.0, + "logps/rejected": -1480.0, + "loss": 0.7078, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.408203125, + "rewards/margins": -0.0849609375, + "rewards/rejected": 0.494140625, + "step": 1169 + }, + { + "epoch": 0.33751622674167026, + "grad_norm": 10.76755103946222, + "learning_rate": 4.189104441616823e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.171875, + "logps/chosen": -1344.0, + "logps/rejected": -1144.0, + "loss": 0.6754, + "loss/demonstration_loss": -2528.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.08740234375, + "rewards/rejected": 0.33203125, + "step": 1170 + }, + { + "epoch": 0.33780470214914177, + "grad_norm": 10.199894516434862, + "learning_rate": 4.187247159481345e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1976.0, + "logps/rejected": -1920.0, + "loss": 0.6576, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.53125, + "step": 1171 + }, + { + "epoch": 0.3380931775566133, + "grad_norm": 11.033576485906467, + "learning_rate": 4.185388165566983e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.125, + "logps/chosen": -1584.0, + "logps/rejected": -1536.0, + "loss": 0.6862, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5546875, + "rewards/margins": 0.07666015625, + "rewards/rejected": 0.4765625, + "step": 1172 + }, + { + "epoch": 0.3383816529640848, + "grad_norm": 10.797517689260367, + "learning_rate": 4.1835274617597596e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -1552.0, + "logps/rejected": -1376.0, + "loss": 0.6643, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.173828125, + "rewards/rejected": 0.361328125, + "step": 1173 + }, + { + "epoch": 0.3386701283715563, + "grad_norm": 9.96713535535957, + "learning_rate": 4.181665049947433e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1672.0, + "logps/rejected": -1784.0, + "loss": 0.6404, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.49609375, + "rewards/margins": 0.185546875, + "rewards/rejected": 0.310546875, + "step": 1174 + }, + { + "epoch": 0.3389586037790278, + "grad_norm": 11.159489652410565, + "learning_rate": 4.17980093201949e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -1904.0, + "logps/rejected": -1616.0, + "loss": 0.6763, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.427734375, + "rewards/margins": 0.0673828125, + "rewards/rejected": 0.359375, + "step": 1175 + }, + { + "epoch": 0.3392470791864994, + "grad_norm": 10.487960293414185, + "learning_rate": 4.1779351098671573e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1648.0, + "logps/rejected": -1432.0, + "loss": 0.6453, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.453125, + "rewards/margins": 0.12109375, + "rewards/rejected": 0.33203125, + "step": 1176 + }, + { + "epoch": 0.3395355545939709, + "grad_norm": 12.10421145151164, + "learning_rate": 4.176067585383382e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1872.0, + "logps/rejected": -2016.0, + "loss": 0.7471, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.66796875, + "rewards/margins": -0.012451171875, + "rewards/rejected": 0.6796875, + "step": 1177 + }, + { + "epoch": 0.3398240300014424, + "grad_norm": 10.665589122195815, + "learning_rate": 4.174198360462841e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.234375, + "logps/chosen": -1664.0, + "logps/rejected": -1520.0, + "loss": 0.6653, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49609375, + "rewards/margins": 0.12353515625, + "rewards/rejected": 0.37109375, + "step": 1178 + }, + { + "epoch": 0.3401125054089139, + "grad_norm": 9.440306643909677, + "learning_rate": 4.1723274370019373e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1944.0, + "logps/rejected": -1976.0, + "loss": 0.6567, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3984.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.66015625, + "rewards/margins": 0.0966796875, + "rewards/rejected": 0.5625, + "step": 1179 + }, + { + "epoch": 0.3404009808163854, + "grad_norm": 10.707031687381827, + "learning_rate": 4.170454816898798e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1648.0, + "logps/rejected": -1328.0, + "loss": 0.6641, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5390625, + "rewards/margins": 0.0859375, + "rewards/rejected": 0.455078125, + "step": 1180 + }, + { + "epoch": 0.34068945622385693, + "grad_norm": 12.083004587730889, + "learning_rate": 4.1685805020532683e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.296875, + "logps/chosen": -1720.0, + "logps/rejected": -1728.0, + "loss": 0.6876, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.45703125, + "rewards/margins": 0.10205078125, + "rewards/rejected": 0.353515625, + "step": 1181 + }, + { + "epoch": 0.34097793163132845, + "grad_norm": 10.79867636871633, + "learning_rate": 4.166704494366916e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.296875, + "logps/chosen": -1800.0, + "logps/rejected": -1552.0, + "loss": 0.6362, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.578125, + "rewards/margins": 0.15234375, + "rewards/rejected": 0.423828125, + "step": 1182 + }, + { + "epoch": 0.34126640703879996, + "grad_norm": 11.607845874521145, + "learning_rate": 4.164826795743025e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.15625, + "logps/chosen": -1472.0, + "logps/rejected": -1736.0, + "loss": 0.7083, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.46875, + "rewards/margins": -0.09033203125, + "rewards/rejected": 0.55859375, + "step": 1183 + }, + { + "epoch": 0.34155488244627147, + "grad_norm": 10.992957628024477, + "learning_rate": 4.1629474080865936e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -1440.0, + "logps/rejected": -1432.0, + "loss": 0.6772, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.08203125, + "rewards/rejected": 0.326171875, + "step": 1184 + }, + { + "epoch": 0.341843357853743, + "grad_norm": 9.783187473727502, + "learning_rate": 4.161066333304336e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.265625, + "logps/chosen": -1688.0, + "logps/rejected": -1688.0, + "loss": 0.6606, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.435546875, + "rewards/margins": 0.115234375, + "rewards/rejected": 0.3203125, + "step": 1185 + }, + { + "epoch": 0.3421318332612145, + "grad_norm": 9.444421637467595, + "learning_rate": 4.159183573304675e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.15625, + "logps/chosen": -1256.0, + "logps/rejected": -1240.0, + "loss": 0.6806, + "loss/demonstration_loss": -2528.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37890625, + "rewards/margins": 0.037109375, + "rewards/rejected": 0.341796875, + "step": 1186 + }, + { + "epoch": 0.342420308668686, + "grad_norm": 10.596573558664689, + "learning_rate": 4.157299129997748e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.015625, + "logps/chosen": -1424.0, + "logps/rejected": -1336.0, + "loss": 0.6703, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.12353515625, + "rewards/rejected": 0.32421875, + "step": 1187 + }, + { + "epoch": 0.3427087840761575, + "grad_norm": 10.232235543749564, + "learning_rate": 4.155413005295394e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.0625, + "logps/chosen": -2040.0, + "logps/rejected": -1928.0, + "loss": 0.6551, + "loss/demonstration_loss": -4016.0, + "loss/preference_loss": -4000.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.140625, + "rewards/rejected": 0.32421875, + "step": 1188 + }, + { + "epoch": 0.342997259483629, + "grad_norm": 8.869981902020646, + "learning_rate": 4.1535252011111633e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.09375, + "logps/chosen": -1496.0, + "logps/rejected": -1632.0, + "loss": 0.6584, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.458984375, + "rewards/margins": -0.002471923828125, + "rewards/rejected": 0.4609375, + "step": 1189 + }, + { + "epoch": 0.34328573489110054, + "grad_norm": 11.175947012805695, + "learning_rate": 4.151635719360307e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.25, + "logps/chosen": -1672.0, + "logps/rejected": -1520.0, + "loss": 0.739, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.326171875, + "rewards/margins": -0.06494140625, + "rewards/rejected": 0.390625, + "step": 1190 + }, + { + "epoch": 0.34357421029857205, + "grad_norm": 10.717793583926355, + "learning_rate": 4.149744561959779e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.15625, + "logps/chosen": -1496.0, + "logps/rejected": -1616.0, + "loss": 0.6863, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.049560546875, + "rewards/rejected": 0.421875, + "step": 1191 + }, + { + "epoch": 0.34386268570604356, + "grad_norm": 13.148875037968324, + "learning_rate": 4.1478517308282324e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.0625, + "logps/chosen": -1496.0, + "logps/rejected": -1696.0, + "loss": 0.7001, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.294921875, + "rewards/margins": -0.018798828125, + "rewards/rejected": 0.314453125, + "step": 1192 + }, + { + "epoch": 0.34415116111351507, + "grad_norm": 10.163981515009384, + "learning_rate": 4.14595722788602e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1776.0, + "logps/rejected": -1712.0, + "loss": 0.6955, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.03125, + "rewards/rejected": 0.306640625, + "step": 1193 + }, + { + "epoch": 0.3444396365209866, + "grad_norm": 10.32738887342833, + "learning_rate": 4.14406105505519e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1288.0, + "logps/rejected": -1136.0, + "loss": 0.6482, + "loss/demonstration_loss": -2464.0, + "loss/preference_loss": -2448.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.30078125, + "rewards/margins": 0.0634765625, + "rewards/rejected": 0.2373046875, + "step": 1194 + }, + { + "epoch": 0.3447281119284581, + "grad_norm": 10.384757746187962, + "learning_rate": 4.142163214259484e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.234375, + "logps/chosen": -1488.0, + "logps/rejected": -1448.0, + "loss": 0.674, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.478515625, + "rewards/margins": 0.1279296875, + "rewards/rejected": 0.3515625, + "step": 1195 + }, + { + "epoch": 0.3450165873359296, + "grad_norm": 11.667809095858859, + "learning_rate": 4.140263707424337e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.0, + "logps/chosen": -1592.0, + "logps/rejected": -1560.0, + "loss": 0.6945, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.029052734375, + "rewards/rejected": 0.458984375, + "step": 1196 + }, + { + "epoch": 0.3453050627434011, + "grad_norm": 12.822285800547911, + "learning_rate": 4.1383625364768736e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.078125, + "logps/chosen": -1672.0, + "logps/rejected": -1280.0, + "loss": 0.7132, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.490234375, + "rewards/margins": 0.0849609375, + "rewards/rejected": 0.404296875, + "step": 1197 + }, + { + "epoch": 0.3455935381508726, + "grad_norm": 11.00139319274122, + "learning_rate": 4.136459703345907e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.109375, + "logps/chosen": -1432.0, + "logps/rejected": -1392.0, + "loss": 0.679, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5234375, + "rewards/margins": -0.003509521484375, + "rewards/rejected": 0.52734375, + "step": 1198 + }, + { + "epoch": 0.34588201355834414, + "grad_norm": 10.006378299767858, + "learning_rate": 4.134555209961936e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1896.0, + "logps/rejected": -1584.0, + "loss": 0.6124, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.47265625, + "rewards/margins": 0.2451171875, + "rewards/rejected": 0.2275390625, + "step": 1199 + }, + { + "epoch": 0.34617048896581565, + "grad_norm": 10.228589565980382, + "learning_rate": 4.1326490582571444e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.171875, + "logps/chosen": -1640.0, + "logps/rejected": -1656.0, + "loss": 0.6747, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.0203857421875, + "rewards/rejected": 0.427734375, + "step": 1200 + }, + { + "epoch": 0.34645896437328716, + "grad_norm": 11.063630002430141, + "learning_rate": 4.1307412501653987e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.15625, + "logps/chosen": -1856.0, + "logps/rejected": -1648.0, + "loss": 0.6999, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.58203125, + "rewards/margins": 0.078125, + "rewards/rejected": 0.50390625, + "step": 1201 + }, + { + "epoch": 0.34674743978075867, + "grad_norm": 9.52957027784144, + "learning_rate": 4.128831787622246e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.1875, + "logps/chosen": -1336.0, + "logps/rejected": -1336.0, + "loss": 0.6465, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.16015625, + "rewards/rejected": 0.232421875, + "step": 1202 + }, + { + "epoch": 0.3470359151882302, + "grad_norm": 13.360297381685506, + "learning_rate": 4.12692067256491e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.1875, + "logps/chosen": -1936.0, + "logps/rejected": -1912.0, + "loss": 0.6837, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.5, + "rewards/margins": -0.038818359375, + "rewards/rejected": 0.5390625, + "step": 1203 + }, + { + "epoch": 0.3473243905957017, + "grad_norm": 10.034674232099421, + "learning_rate": 4.125007906932294e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1808.0, + "logps/rejected": -1632.0, + "loss": 0.6771, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.04345703125, + "rewards/rejected": 0.5234375, + "step": 1204 + }, + { + "epoch": 0.3476128660031732, + "grad_norm": 11.015711643987565, + "learning_rate": 4.1230934926649736e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.171875, + "logps/chosen": -1992.0, + "logps/rejected": -2128.0, + "loss": 0.6714, + "loss/demonstration_loss": -4192.0, + "loss/preference_loss": -4192.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.73046875, + "rewards/margins": 0.0712890625, + "rewards/rejected": 0.66015625, + "step": 1205 + }, + { + "epoch": 0.3479013414106447, + "grad_norm": 9.351317176476279, + "learning_rate": 4.1211774317051973e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -2000.0, + "logps/rejected": -1992.0, + "loss": 0.666, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4048.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6953125, + "rewards/margins": 0.177734375, + "rewards/rejected": 0.51953125, + "step": 1206 + }, + { + "epoch": 0.3481898168181163, + "grad_norm": 11.503135769328699, + "learning_rate": 4.119259725996886e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.109375, + "logps/chosen": -1624.0, + "logps/rejected": -1528.0, + "loss": 0.6959, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.0125732421875, + "rewards/rejected": 0.3515625, + "step": 1207 + }, + { + "epoch": 0.3484782922255878, + "grad_norm": 11.166304634273079, + "learning_rate": 4.1173403774856264e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -1552.0, + "logps/rejected": -1600.0, + "loss": 0.7046, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5625, + "rewards/margins": 0.04296875, + "rewards/rejected": 0.51953125, + "step": 1208 + }, + { + "epoch": 0.3487667676330593, + "grad_norm": 11.294867745634383, + "learning_rate": 4.115419388118674e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1472.0, + "logps/rejected": -1544.0, + "loss": 0.6943, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.361328125, + "rewards/margins": -0.037109375, + "rewards/rejected": 0.3984375, + "step": 1209 + }, + { + "epoch": 0.3490552430405308, + "grad_norm": 9.142591171113695, + "learning_rate": 4.113496759844948e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.078125, + "logps/chosen": -1176.0, + "logps/rejected": -1256.0, + "loss": 0.662, + "loss/demonstration_loss": -2464.0, + "loss/preference_loss": -2448.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.326171875, + "rewards/margins": 0.03466796875, + "rewards/rejected": 0.29296875, + "step": 1210 + }, + { + "epoch": 0.3493437184480023, + "grad_norm": 12.197461037548551, + "learning_rate": 4.111572494615031e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.15625, + "logps/chosen": -1824.0, + "logps/rejected": -2176.0, + "loss": 0.7894, + "loss/demonstration_loss": -4048.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.39453125, + "rewards/margins": -0.13671875, + "rewards/rejected": 0.53125, + "step": 1211 + }, + { + "epoch": 0.34963219385547384, + "grad_norm": 10.72024228935816, + "learning_rate": 4.1096465943811666e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.03125, + "logps/chosen": -1464.0, + "logps/rejected": -1440.0, + "loss": 0.6882, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.404296875, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.33203125, + "step": 1212 + }, + { + "epoch": 0.34992066926294535, + "grad_norm": 12.549032756973414, + "learning_rate": 4.1077190610972555e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.796875, + "logps/chosen": -1280.0, + "logps/rejected": -1608.0, + "loss": 0.6478, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.373046875, + "rewards/margins": 0.1142578125, + "rewards/rejected": 0.2578125, + "step": 1213 + }, + { + "epoch": 0.35020914467041686, + "grad_norm": 10.641341868219293, + "learning_rate": 4.1057898967188575e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1584.0, + "logps/rejected": -1504.0, + "loss": 0.6714, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40234375, + "rewards/margins": 0.1181640625, + "rewards/rejected": 0.283203125, + "step": 1214 + }, + { + "epoch": 0.35049762007788837, + "grad_norm": 10.321655524550449, + "learning_rate": 4.1038591032031853e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.109375, + "logps/chosen": -1376.0, + "logps/rejected": -1336.0, + "loss": 0.6794, + "loss/demonstration_loss": -2736.0, + "loss/preference_loss": -2736.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.267578125, + "rewards/margins": 0.0458984375, + "rewards/rejected": 0.220703125, + "step": 1215 + }, + { + "epoch": 0.3507860954853599, + "grad_norm": 10.209069340440468, + "learning_rate": 4.101926682509106e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.1875, + "logps/chosen": -1760.0, + "logps/rejected": -1736.0, + "loss": 0.6542, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.1328125, + "rewards/rejected": 0.279296875, + "step": 1216 + }, + { + "epoch": 0.3510745708928314, + "grad_norm": 11.255312610539976, + "learning_rate": 4.0999926365971354e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.078125, + "logps/chosen": -1504.0, + "logps/rejected": -1376.0, + "loss": 0.6752, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.431640625, + "rewards/margins": 0.1025390625, + "rewards/rejected": 0.328125, + "step": 1217 + }, + { + "epoch": 0.3513630463003029, + "grad_norm": 10.609593452090905, + "learning_rate": 4.098056967429441e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.0625, + "logps/chosen": -1440.0, + "logps/rejected": -1288.0, + "loss": 0.6724, + "loss/demonstration_loss": -2768.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.41796875, + "rewards/margins": 0.0498046875, + "rewards/rejected": 0.3671875, + "step": 1218 + }, + { + "epoch": 0.3516515217077744, + "grad_norm": 11.616146803065622, + "learning_rate": 4.096119676969834e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.046875, + "logps/chosen": -1384.0, + "logps/rejected": -1400.0, + "loss": 0.6874, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.10693359375, + "rewards/rejected": 0.2470703125, + "step": 1219 + }, + { + "epoch": 0.35193999711524593, + "grad_norm": 11.51356497614385, + "learning_rate": 4.0941807671837736e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.265625, + "logps/chosen": -1880.0, + "logps/rejected": -1720.0, + "loss": 0.6271, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7109375, + "rewards/margins": 0.2041015625, + "rewards/rejected": 0.5078125, + "step": 1220 + }, + { + "epoch": 0.35222847252271744, + "grad_norm": 11.442237706173277, + "learning_rate": 4.0922402400383594e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.0625, + "logps/chosen": -1752.0, + "logps/rejected": -1536.0, + "loss": 0.6962, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.34375, + "rewards/margins": -0.007659912109375, + "rewards/rejected": 0.3515625, + "step": 1221 + }, + { + "epoch": 0.35251694793018895, + "grad_norm": 11.433299502337318, + "learning_rate": 4.0902980975023333e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1392.0, + "logps/rejected": -1256.0, + "loss": 0.6847, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2672.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.349609375, + "rewards/margins": 0.0791015625, + "rewards/rejected": 0.26953125, + "step": 1222 + }, + { + "epoch": 0.35280542333766046, + "grad_norm": 10.504823701963634, + "learning_rate": 4.088354341546075e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.203125, + "logps/chosen": -1840.0, + "logps/rejected": -1752.0, + "loss": 0.7106, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.43359375, + "step": 1223 + }, + { + "epoch": 0.353093898745132, + "grad_norm": 10.53658315043906, + "learning_rate": 4.086408974141603e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.203125, + "logps/chosen": -1336.0, + "logps/rejected": -1432.0, + "loss": 0.6942, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2275390625, + "rewards/margins": -0.07177734375, + "rewards/rejected": 0.298828125, + "step": 1224 + }, + { + "epoch": 0.3533823741526035, + "grad_norm": 11.020135901959646, + "learning_rate": 4.084461997262568e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.953125, + "logps/chosen": -2008.0, + "logps/rejected": -1920.0, + "loss": 0.6665, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3968.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.1650390625, + "rewards/rejected": 0.37890625, + "step": 1225 + }, + { + "epoch": 0.353670849560075, + "grad_norm": 10.18382192812381, + "learning_rate": 4.0825134128842553e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.078125, + "logps/chosen": -1464.0, + "logps/rejected": -1536.0, + "loss": 0.7198, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.392578125, + "rewards/margins": -0.01519775390625, + "rewards/rejected": 0.408203125, + "step": 1226 + }, + { + "epoch": 0.3539593249675465, + "grad_norm": 11.09982612299353, + "learning_rate": 4.0805632229835805e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.0625, + "logps/chosen": -1872.0, + "logps/rejected": -1880.0, + "loss": 0.6848, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.042724609375, + "rewards/rejected": 0.44921875, + "step": 1227 + }, + { + "epoch": 0.354247800375018, + "grad_norm": 11.124833218998273, + "learning_rate": 4.0786114295390893e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.15625, + "logps/chosen": -1512.0, + "logps/rejected": -1512.0, + "loss": 0.7101, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.30078125, + "rewards/margins": -0.037353515625, + "rewards/rejected": 0.337890625, + "step": 1228 + }, + { + "epoch": 0.35453627578248953, + "grad_norm": 10.652363995478082, + "learning_rate": 4.076658034530953e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.09375, + "logps/chosen": -1392.0, + "logps/rejected": -1392.0, + "loss": 0.6691, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34765625, + "rewards/margins": 0.029296875, + "rewards/rejected": 0.318359375, + "step": 1229 + }, + { + "epoch": 0.35482475118996104, + "grad_norm": 10.1175322010966, + "learning_rate": 4.0747030399409663e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.0, + "logps/chosen": -1768.0, + "logps/rejected": -1792.0, + "loss": 0.6733, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.392578125, + "rewards/margins": 0.04833984375, + "rewards/rejected": 0.34375, + "step": 1230 + }, + { + "epoch": 0.35511322659743255, + "grad_norm": 13.285611126719353, + "learning_rate": 4.072746447752551e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1864.0, + "logps/rejected": -1824.0, + "loss": 0.7203, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.39453125, + "rewards/margins": -0.09375, + "rewards/rejected": 0.48828125, + "step": 1231 + }, + { + "epoch": 0.35540170200490406, + "grad_norm": 9.94212127679558, + "learning_rate": 4.070788259950745e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.25, + "logps/chosen": -1696.0, + "logps/rejected": -1752.0, + "loss": 0.6833, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.427734375, + "rewards/margins": -0.0250244140625, + "rewards/rejected": 0.453125, + "step": 1232 + }, + { + "epoch": 0.3556901774123756, + "grad_norm": 9.695569175614988, + "learning_rate": 4.068828478522208e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1736.0, + "logps/rejected": -1512.0, + "loss": 0.6272, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.2578125, + "rewards/rejected": 0.341796875, + "step": 1233 + }, + { + "epoch": 0.3559786528198471, + "grad_norm": 11.552814260861226, + "learning_rate": 4.066867105455216e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1312.0, + "logps/rejected": -1192.0, + "loss": 0.6837, + "loss/demonstration_loss": -2544.0, + "loss/preference_loss": -2544.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.349609375, + "rewards/margins": 0.02587890625, + "rewards/rejected": 0.32421875, + "step": 1234 + }, + { + "epoch": 0.3562671282273186, + "grad_norm": 10.74267503893525, + "learning_rate": 4.0649041427396593e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0625, + "logps/chosen": -1944.0, + "logps/rejected": -2144.0, + "loss": 0.6954, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.421875, + "rewards/margins": -0.0001220703125, + "rewards/rejected": 0.423828125, + "step": 1235 + }, + { + "epoch": 0.3565556036347901, + "grad_norm": 14.278704332197602, + "learning_rate": 4.062939592367041e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.109375, + "logps/chosen": -1536.0, + "logps/rejected": -1552.0, + "loss": 0.6797, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.373046875, + "rewards/margins": -0.0517578125, + "rewards/rejected": 0.42578125, + "step": 1236 + }, + { + "epoch": 0.3568440790422617, + "grad_norm": 10.087521814145758, + "learning_rate": 4.060973456330474e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.921875, + "logps/chosen": -1608.0, + "logps/rejected": -1656.0, + "loss": 0.6661, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.28515625, + "rewards/margins": 0.119140625, + "rewards/rejected": 0.166015625, + "step": 1237 + }, + { + "epoch": 0.3571325544497332, + "grad_norm": 10.03717005152117, + "learning_rate": 4.0590057366246835e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.125, + "logps/chosen": -1480.0, + "logps/rejected": -1608.0, + "loss": 0.7037, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25390625, + "rewards/margins": -0.039794921875, + "rewards/rejected": 0.29296875, + "step": 1238 + }, + { + "epoch": 0.3574210298572047, + "grad_norm": 15.489038957949608, + "learning_rate": 4.057036435245996e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.09375, + "logps/chosen": -1632.0, + "logps/rejected": -1640.0, + "loss": 0.6624, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.134765625, + "rewards/rejected": 0.30859375, + "step": 1239 + }, + { + "epoch": 0.3577095052646762, + "grad_norm": 11.184135470319777, + "learning_rate": 4.0550655541923475e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -1768.0, + "logps/rejected": -1544.0, + "loss": 0.6364, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.384765625, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.255859375, + "step": 1240 + }, + { + "epoch": 0.3579979806721477, + "grad_norm": 11.938426000634484, + "learning_rate": 4.0530930954632736e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1720.0, + "logps/rejected": -1552.0, + "loss": 0.6837, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.515625, + "rewards/margins": 0.2001953125, + "rewards/rejected": 0.31640625, + "step": 1241 + }, + { + "epoch": 0.35828645607961923, + "grad_norm": 12.457016555737873, + "learning_rate": 4.0511190610599123e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.015625, + "logps/chosen": -2016.0, + "logps/rejected": -1992.0, + "loss": 0.7114, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5625, + "rewards/margins": 0.11279296875, + "rewards/rejected": 0.44921875, + "step": 1242 + }, + { + "epoch": 0.35857493148709074, + "grad_norm": 11.537539904187014, + "learning_rate": 4.049143452984999e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.125, + "logps/chosen": -1656.0, + "logps/rejected": -1744.0, + "loss": 0.7536, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3046875, + "rewards/margins": -0.134765625, + "rewards/rejected": 0.439453125, + "step": 1243 + }, + { + "epoch": 0.35886340689456225, + "grad_norm": 11.670549779837993, + "learning_rate": 4.0471662732428665e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.046875, + "logps/chosen": -1520.0, + "logps/rejected": -1368.0, + "loss": 0.709, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16796875, + "rewards/margins": -0.0615234375, + "rewards/rejected": 0.228515625, + "step": 1244 + }, + { + "epoch": 0.35915188230203376, + "grad_norm": 13.655157700573195, + "learning_rate": 4.045187523839441e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.875, + "logps/chosen": -1272.0, + "logps/rejected": -1240.0, + "loss": 0.6927, + "loss/demonstration_loss": -2528.0, + "loss/preference_loss": -2512.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1787109375, + "rewards/margins": 0.130859375, + "rewards/rejected": 0.048095703125, + "step": 1245 + }, + { + "epoch": 0.3594403577095053, + "grad_norm": 12.46107323221246, + "learning_rate": 4.0432072067822434e-07, + "logits/chosen": 3.078125, + "logits/rejected": 2.96875, + "logps/chosen": -1480.0, + "logps/rejected": -1544.0, + "loss": 0.6813, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.1142578125, + "rewards/rejected": 0.1904296875, + "step": 1246 + }, + { + "epoch": 0.3597288331169768, + "grad_norm": 10.244311602395129, + "learning_rate": 4.041225324080382e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0, + "logps/chosen": -1704.0, + "logps/rejected": -1672.0, + "loss": 0.6378, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.396484375, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.265625, + "step": 1247 + }, + { + "epoch": 0.3600173085244483, + "grad_norm": 11.917569739552388, + "learning_rate": 4.039241877744556e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1936.0, + "logps/rejected": -1912.0, + "loss": 0.6782, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3828125, + "rewards/margins": 0.0888671875, + "rewards/rejected": 0.294921875, + "step": 1248 + }, + { + "epoch": 0.3603057839319198, + "grad_norm": 11.093510006923282, + "learning_rate": 4.037256869787049e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.109375, + "logps/chosen": -1904.0, + "logps/rejected": -2000.0, + "loss": 0.6832, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.294921875, + "rewards/margins": 0.095703125, + "rewards/rejected": 0.2001953125, + "step": 1249 + }, + { + "epoch": 0.3605942593393913, + "grad_norm": 12.588195066679393, + "learning_rate": 4.035270302221732e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.15625, + "logps/chosen": -1536.0, + "logps/rejected": -1672.0, + "loss": 0.705, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.421875, + "rewards/margins": 0.0947265625, + "rewards/rejected": 0.328125, + "step": 1250 + }, + { + "epoch": 0.36088273474686283, + "grad_norm": 11.501715676588695, + "learning_rate": 4.0332821770640535e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0625, + "logps/chosen": -1384.0, + "logps/rejected": -1336.0, + "loss": 0.6939, + "loss/demonstration_loss": -2736.0, + "loss/preference_loss": -2736.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1611328125, + "rewards/margins": -0.037109375, + "rewards/rejected": 0.19921875, + "step": 1251 + }, + { + "epoch": 0.36117121015433434, + "grad_norm": 11.251298637465409, + "learning_rate": 4.031292496331047e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.953125, + "logps/chosen": -1616.0, + "logps/rejected": -1784.0, + "loss": 0.6865, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2099609375, + "rewards/margins": -0.0286865234375, + "rewards/rejected": 0.2392578125, + "step": 1252 + }, + { + "epoch": 0.36145968556180585, + "grad_norm": 13.118916015022132, + "learning_rate": 4.0293012620413224e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.03125, + "logps/chosen": -1464.0, + "logps/rejected": -1408.0, + "loss": 0.7103, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2734375, + "rewards/margins": -0.02001953125, + "rewards/rejected": 0.29296875, + "step": 1253 + }, + { + "epoch": 0.36174816096927737, + "grad_norm": 9.931187199086231, + "learning_rate": 4.027308476215064e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1624.0, + "logps/rejected": -1456.0, + "loss": 0.6636, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.259765625, + "rewards/margins": 0.01287841796875, + "rewards/rejected": 0.24609375, + "step": 1254 + }, + { + "epoch": 0.3620366363767489, + "grad_norm": 10.89305215170687, + "learning_rate": 4.0253141408740325e-07, + "logits/chosen": 2.953125, + "logits/rejected": 2.96875, + "logps/chosen": -1832.0, + "logps/rejected": -1720.0, + "loss": 0.6681, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2734375, + "rewards/margins": 0.12451171875, + "rewards/rejected": 0.1484375, + "step": 1255 + }, + { + "epoch": 0.3623251117842204, + "grad_norm": 11.713311771807788, + "learning_rate": 4.02331825804156e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.078125, + "logps/chosen": -1696.0, + "logps/rejected": -1720.0, + "loss": 0.7479, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.33203125, + "rewards/margins": -0.146484375, + "rewards/rejected": 0.478515625, + "step": 1256 + }, + { + "epoch": 0.3626135871916919, + "grad_norm": 9.437006245338177, + "learning_rate": 4.0213208297425486e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.140625, + "logps/chosen": -1736.0, + "logps/rejected": -1528.0, + "loss": 0.6428, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.439453125, + "rewards/margins": 0.259765625, + "rewards/rejected": 0.1787109375, + "step": 1257 + }, + { + "epoch": 0.3629020625991634, + "grad_norm": 12.573423922064407, + "learning_rate": 4.019321858003468e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.1875, + "logps/chosen": -1496.0, + "logps/rejected": -1464.0, + "loss": 0.7266, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.255859375, + "rewards/margins": -0.0810546875, + "rewards/rejected": 0.3359375, + "step": 1258 + }, + { + "epoch": 0.3631905380066349, + "grad_norm": 11.801353247488077, + "learning_rate": 4.017321344852354e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0625, + "logps/chosen": -1288.0, + "logps/rejected": -1240.0, + "loss": 0.6866, + "loss/demonstration_loss": -2560.0, + "loss/preference_loss": -2544.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.26171875, + "rewards/margins": 0.0908203125, + "rewards/rejected": 0.1708984375, + "step": 1259 + }, + { + "epoch": 0.36347901341410643, + "grad_norm": 10.531897234328193, + "learning_rate": 4.015319292318806e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.078125, + "logps/chosen": -1608.0, + "logps/rejected": -1328.0, + "loss": 0.6207, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.416015625, + "rewards/margins": 0.271484375, + "rewards/rejected": 0.14453125, + "step": 1260 + }, + { + "epoch": 0.36376748882157794, + "grad_norm": 12.955951911397776, + "learning_rate": 4.013315702433986e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.109375, + "logps/chosen": -1752.0, + "logps/rejected": -1680.0, + "loss": 0.6813, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.00030517578125, + "rewards/rejected": 0.39453125, + "step": 1261 + }, + { + "epoch": 0.36405596422904946, + "grad_norm": 11.923890955328783, + "learning_rate": 4.0113105772306143e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1704.0, + "logps/rejected": -1712.0, + "loss": 0.6939, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2373046875, + "rewards/margins": 0.05322265625, + "rewards/rejected": 0.1845703125, + "step": 1262 + }, + { + "epoch": 0.36434443963652097, + "grad_norm": 11.33155065346091, + "learning_rate": 4.00930391874297e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.984375, + "logps/chosen": -1624.0, + "logps/rejected": -1544.0, + "loss": 0.7014, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2236328125, + "rewards/margins": -0.026123046875, + "rewards/rejected": 0.25, + "step": 1263 + }, + { + "epoch": 0.3646329150439925, + "grad_norm": 11.857339437582848, + "learning_rate": 4.007295729006888e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.9375, + "logps/chosen": -1808.0, + "logps/rejected": -1840.0, + "loss": 0.6733, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.369140625, + "rewards/margins": 0.07666015625, + "rewards/rejected": 0.291015625, + "step": 1264 + }, + { + "epoch": 0.364921390451464, + "grad_norm": 12.113647886242498, + "learning_rate": 4.0052860100597535e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1472.0, + "logps/rejected": -1432.0, + "loss": 0.6879, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.25390625, + "rewards/margins": 0.038818359375, + "rewards/rejected": 0.21484375, + "step": 1265 + }, + { + "epoch": 0.3652098658589355, + "grad_norm": 11.206608483338035, + "learning_rate": 4.003274763940509e-07, + "logits/chosen": 3.0625, + "logits/rejected": 2.984375, + "logps/chosen": -1880.0, + "logps/rejected": -1800.0, + "loss": 0.6611, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40234375, + "rewards/margins": 0.1435546875, + "rewards/rejected": 0.2578125, + "step": 1266 + }, + { + "epoch": 0.365498341266407, + "grad_norm": 12.267812826918988, + "learning_rate": 4.0012619926896414e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.984375, + "logps/chosen": -1728.0, + "logps/rejected": -1720.0, + "loss": 0.6833, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.0634765625, + "rewards/rejected": 0.34765625, + "step": 1267 + }, + { + "epoch": 0.3657868166738786, + "grad_norm": 10.650627825858022, + "learning_rate": 3.999247698349187e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.875, + "logps/chosen": -1752.0, + "logps/rejected": -1688.0, + "loss": 0.6709, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.1318359375, + "rewards/rejected": 0.373046875, + "step": 1268 + }, + { + "epoch": 0.3660752920813501, + "grad_norm": 12.068252124892172, + "learning_rate": 3.9972318829627275e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.046875, + "logps/chosen": -2096.0, + "logps/rejected": -1744.0, + "loss": 0.6227, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.2021484375, + "rewards/rejected": 0.294921875, + "step": 1269 + }, + { + "epoch": 0.3663637674888216, + "grad_norm": 11.604748372620438, + "learning_rate": 3.9952145485753864e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.015625, + "logps/chosen": -1464.0, + "logps/rejected": -1560.0, + "loss": 0.7193, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30859375, + "rewards/margins": 0.050537109375, + "rewards/rejected": 0.2578125, + "step": 1270 + }, + { + "epoch": 0.3666522428962931, + "grad_norm": 9.853525963837496, + "learning_rate": 3.9931956972338295e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.921875, + "logps/chosen": -1208.0, + "logps/rejected": -1184.0, + "loss": 0.6489, + "loss/demonstration_loss": -2416.0, + "loss/preference_loss": -2400.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.275390625, + "rewards/margins": 0.1279296875, + "rewards/rejected": 0.1474609375, + "step": 1271 + }, + { + "epoch": 0.3669407183037646, + "grad_norm": 10.253161845846412, + "learning_rate": 3.991175330986261e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.03125, + "logps/chosen": -1808.0, + "logps/rejected": -1752.0, + "loss": 0.6408, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5625, + "rewards/margins": 0.212890625, + "rewards/rejected": 0.3515625, + "step": 1272 + }, + { + "epoch": 0.36722919371123613, + "grad_norm": 10.603741507959823, + "learning_rate": 3.989153451882422e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.15625, + "logps/chosen": -1736.0, + "logps/rejected": -1440.0, + "loss": 0.6688, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1923828125, + "rewards/margins": 0.00311279296875, + "rewards/rejected": 0.1884765625, + "step": 1273 + }, + { + "epoch": 0.36751766911870765, + "grad_norm": 11.008733143182452, + "learning_rate": 3.9871300619735905e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1552.0, + "logps/rejected": -1408.0, + "loss": 0.6758, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.251953125, + "rewards/margins": 0.07421875, + "rewards/rejected": 0.1787109375, + "step": 1274 + }, + { + "epoch": 0.36780614452617916, + "grad_norm": 11.961868030763327, + "learning_rate": 3.9851051633125733e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0625, + "logps/chosen": -1856.0, + "logps/rejected": -1824.0, + "loss": 0.6711, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.287109375, + "step": 1275 + }, + { + "epoch": 0.36809461993365067, + "grad_norm": 10.89029653309017, + "learning_rate": 3.983078757953711e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.0625, + "logps/chosen": -1832.0, + "logps/rejected": -1864.0, + "loss": 0.6665, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.1298828125, + "rewards/rejected": 0.361328125, + "step": 1276 + }, + { + "epoch": 0.3683830953411222, + "grad_norm": 13.884388927635102, + "learning_rate": 3.981050847952871e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.078125, + "logps/chosen": -1800.0, + "logps/rejected": -1816.0, + "loss": 0.7044, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.330078125, + "rewards/margins": 0.0262451171875, + "rewards/rejected": 0.3046875, + "step": 1277 + }, + { + "epoch": 0.3686715707485937, + "grad_norm": 11.871776413169744, + "learning_rate": 3.979021435367449e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1480.0, + "logps/rejected": -1520.0, + "loss": 0.6681, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.06396484375, + "rewards/rejected": 0.314453125, + "step": 1278 + }, + { + "epoch": 0.3689600461560652, + "grad_norm": 10.313029121317424, + "learning_rate": 3.9769905222563647e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.984375, + "logps/chosen": -1744.0, + "logps/rejected": -1704.0, + "loss": 0.6875, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2734375, + "rewards/margins": 0.07470703125, + "rewards/rejected": 0.19921875, + "step": 1279 + }, + { + "epoch": 0.3692485215635367, + "grad_norm": 10.362736003116298, + "learning_rate": 3.974958110680059e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.03125, + "logps/chosen": -1384.0, + "logps/rejected": -1240.0, + "loss": 0.6716, + "loss/demonstration_loss": -2656.0, + "loss/preference_loss": -2640.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2412109375, + "rewards/margins": 0.09521484375, + "rewards/rejected": 0.1455078125, + "step": 1280 + }, + { + "epoch": 0.3695369969710082, + "grad_norm": 10.520858644058022, + "learning_rate": 3.9729242027004937e-07, + "logits/chosen": 2.90625, + "logits/rejected": 2.96875, + "logps/chosen": -1864.0, + "logps/rejected": -1952.0, + "loss": 0.7301, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.271484375, + "rewards/margins": -0.018798828125, + "rewards/rejected": 0.291015625, + "step": 1281 + }, + { + "epoch": 0.36982547237847974, + "grad_norm": 10.502517234981248, + "learning_rate": 3.9708888003811487e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1752.0, + "logps/rejected": -1824.0, + "loss": 0.7171, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.0625, + "rewards/chosen": 0.1982421875, + "rewards/margins": -0.134765625, + "rewards/rejected": 0.33203125, + "step": 1282 + }, + { + "epoch": 0.37011394778595125, + "grad_norm": 12.677737589428888, + "learning_rate": 3.96885190578702e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.09375, + "logps/chosen": -1304.0, + "logps/rejected": -1376.0, + "loss": 0.673, + "loss/demonstration_loss": -2704.0, + "loss/preference_loss": -2688.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2353515625, + "rewards/margins": 0.064453125, + "rewards/rejected": 0.1708984375, + "step": 1283 + }, + { + "epoch": 0.37040242319342276, + "grad_norm": 10.529804641501595, + "learning_rate": 3.9668135209846177e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.109375, + "logps/chosen": -1864.0, + "logps/rejected": -1672.0, + "loss": 0.6664, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.361328125, + "rewards/margins": 0.08544921875, + "rewards/rejected": 0.275390625, + "step": 1284 + }, + { + "epoch": 0.37069089860089427, + "grad_norm": 12.156094409288738, + "learning_rate": 3.964773648041964e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0, + "logps/chosen": -1944.0, + "logps/rejected": -2080.0, + "loss": 0.6691, + "loss/demonstration_loss": -4048.0, + "loss/preference_loss": -4048.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.349609375, + "rewards/margins": 0.06005859375, + "rewards/rejected": 0.2890625, + "step": 1285 + }, + { + "epoch": 0.3709793740083658, + "grad_norm": 10.640789097759042, + "learning_rate": 3.9627322890285903e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.171875, + "logps/chosen": -1488.0, + "logps/rejected": -1208.0, + "loss": 0.6634, + "loss/demonstration_loss": -2736.0, + "loss/preference_loss": -2720.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.28125, + "rewards/margins": 0.1513671875, + "rewards/rejected": 0.130859375, + "step": 1286 + }, + { + "epoch": 0.3712678494158373, + "grad_norm": 12.30840419428015, + "learning_rate": 3.960689446015536e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0625, + "logps/chosen": -2048.0, + "logps/rejected": -1880.0, + "loss": 0.6749, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3984.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.515625, + "rewards/margins": 0.002685546875, + "rewards/rejected": 0.51171875, + "step": 1287 + }, + { + "epoch": 0.3715563248233088, + "grad_norm": 10.693499044062825, + "learning_rate": 3.958645121075347e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.140625, + "logps/chosen": -1792.0, + "logps/rejected": -1752.0, + "loss": 0.6469, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.42578125, + "rewards/margins": 0.2490234375, + "rewards/rejected": 0.177734375, + "step": 1288 + }, + { + "epoch": 0.3718448002307803, + "grad_norm": 10.103059828947796, + "learning_rate": 3.9565993162820685e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.984375, + "logps/chosen": -1688.0, + "logps/rejected": -1504.0, + "loss": 0.668, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.287109375, + "rewards/margins": -0.04150390625, + "rewards/rejected": 0.328125, + "step": 1289 + }, + { + "epoch": 0.3721332756382518, + "grad_norm": 11.99122284873429, + "learning_rate": 3.9545520337112546e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.109375, + "logps/chosen": -1936.0, + "logps/rejected": -1672.0, + "loss": 0.712, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19140625, + "rewards/margins": 0.0242919921875, + "rewards/rejected": 0.1669921875, + "step": 1290 + }, + { + "epoch": 0.37242175104572334, + "grad_norm": 10.35646483085471, + "learning_rate": 3.952503275439951e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.109375, + "logps/chosen": -1712.0, + "logps/rejected": -1504.0, + "loss": 0.6732, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.259765625, + "rewards/margins": 0.18359375, + "rewards/rejected": 0.0771484375, + "step": 1291 + }, + { + "epoch": 0.37271022645319485, + "grad_norm": 11.217378813959431, + "learning_rate": 3.950453043546706e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.015625, + "logps/chosen": -1920.0, + "logps/rejected": -1872.0, + "loss": 0.6913, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.310546875, + "rewards/margins": -0.001373291015625, + "rewards/rejected": 0.3125, + "step": 1292 + }, + { + "epoch": 0.37299870186066636, + "grad_norm": 12.198469021608128, + "learning_rate": 3.948401340111559e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.140625, + "logps/chosen": -1472.0, + "logps/rejected": -1552.0, + "loss": 0.6628, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30078125, + "rewards/margins": 0.0184326171875, + "rewards/rejected": 0.283203125, + "step": 1293 + }, + { + "epoch": 0.37328717726813787, + "grad_norm": 11.45192956024991, + "learning_rate": 3.946348167216046e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.984375, + "logps/chosen": -1760.0, + "logps/rejected": -1680.0, + "loss": 0.6688, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.294921875, + "rewards/margins": 0.05224609375, + "rewards/rejected": 0.2412109375, + "step": 1294 + }, + { + "epoch": 0.3735756526756094, + "grad_norm": 11.761034050668208, + "learning_rate": 3.94429352694319e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1752.0, + "logps/rejected": -1576.0, + "loss": 0.6743, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.234375, + "rewards/margins": 0.091796875, + "rewards/rejected": 0.142578125, + "step": 1295 + }, + { + "epoch": 0.3738641280830809, + "grad_norm": 12.946000588423907, + "learning_rate": 3.9422374213775065e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1520.0, + "logps/rejected": -1504.0, + "loss": 0.6896, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.255859375, + "rewards/margins": 0.10791015625, + "rewards/rejected": 0.1474609375, + "step": 1296 + }, + { + "epoch": 0.3741526034905524, + "grad_norm": 10.195209424163581, + "learning_rate": 3.940179852604995e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0, + "logps/chosen": -1264.0, + "logps/rejected": -1536.0, + "loss": 0.6448, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.212890625, + "rewards/margins": 0.076171875, + "rewards/rejected": 0.1376953125, + "step": 1297 + }, + { + "epoch": 0.3744410788980239, + "grad_norm": 10.588385115520685, + "learning_rate": 3.9381208227131406e-07, + "logits/chosen": 2.921875, + "logits/rejected": 2.9375, + "logps/chosen": -1864.0, + "logps/rejected": -1632.0, + "loss": 0.6175, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.396484375, + "rewards/margins": 0.2431640625, + "rewards/rejected": 0.154296875, + "step": 1298 + }, + { + "epoch": 0.3747295543054955, + "grad_norm": 12.250245377682738, + "learning_rate": 3.93606033379091e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.90625, + "logps/chosen": -1488.0, + "logps/rejected": -1664.0, + "loss": 0.6581, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.25390625, + "rewards/margins": 0.04443359375, + "rewards/rejected": 0.2099609375, + "step": 1299 + }, + { + "epoch": 0.375018029712967, + "grad_norm": 12.313145270385382, + "learning_rate": 3.933998387928751e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.0, + "logps/chosen": -1800.0, + "logps/rejected": -1760.0, + "loss": 0.6779, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2890625, + "rewards/margins": 0.07958984375, + "rewards/rejected": 0.2099609375, + "step": 1300 + }, + { + "epoch": 0.3753065051204385, + "grad_norm": 12.785025271343446, + "learning_rate": 3.931934987218589e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.953125, + "logps/chosen": -1472.0, + "logps/rejected": -1680.0, + "loss": 0.7155, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2333984375, + "rewards/margins": -0.01153564453125, + "rewards/rejected": 0.2451171875, + "step": 1301 + }, + { + "epoch": 0.37559498052791, + "grad_norm": 11.500316437536918, + "learning_rate": 3.9298701337538255e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.140625, + "logps/chosen": -1680.0, + "logps/rejected": -1496.0, + "loss": 0.689, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.265625, + "rewards/margins": 0.10546875, + "rewards/rejected": 0.16015625, + "step": 1302 + }, + { + "epoch": 0.3758834559353815, + "grad_norm": 13.908586601294262, + "learning_rate": 3.927803829629336e-07, + "logits/chosen": 2.921875, + "logits/rejected": 3.046875, + "logps/chosen": -2224.0, + "logps/rejected": -2040.0, + "loss": 0.7136, + "loss/demonstration_loss": -4288.0, + "loss/preference_loss": -4288.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.2431640625, + "rewards/margins": -0.050537109375, + "rewards/rejected": 0.29296875, + "step": 1303 + }, + { + "epoch": 0.37617193134285304, + "grad_norm": 12.665459028622772, + "learning_rate": 3.925736076941467e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.203125, + "logps/chosen": -1696.0, + "logps/rejected": -1536.0, + "loss": 0.6698, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.244140625, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.171875, + "step": 1304 + }, + { + "epoch": 0.37646040675032455, + "grad_norm": 11.599603626351664, + "learning_rate": 3.9236668777880355e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.125, + "logps/chosen": -1768.0, + "logps/rejected": -1888.0, + "loss": 0.6906, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2109375, + "rewards/margins": 0.02099609375, + "rewards/rejected": 0.1904296875, + "step": 1305 + }, + { + "epoch": 0.37674888215779606, + "grad_norm": 11.086085065575784, + "learning_rate": 3.9215962342683266e-07, + "logits/chosen": 2.96875, + "logits/rejected": 3.03125, + "logps/chosen": -1664.0, + "logps/rejected": -1544.0, + "loss": 0.6872, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1552734375, + "rewards/margins": -0.04833984375, + "rewards/rejected": 0.203125, + "step": 1306 + }, + { + "epoch": 0.3770373575652676, + "grad_norm": 10.464538772184198, + "learning_rate": 3.91952414848309e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0625, + "logps/chosen": -1832.0, + "logps/rejected": -1864.0, + "loss": 0.683, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.205078125, + "rewards/margins": -0.033203125, + "rewards/rejected": 0.23828125, + "step": 1307 + }, + { + "epoch": 0.3773258329727391, + "grad_norm": 10.559774990618118, + "learning_rate": 3.9174506225345373e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1824.0, + "logps/rejected": -1624.0, + "loss": 0.6612, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3125, + "rewards/margins": 0.1611328125, + "rewards/rejected": 0.15234375, + "step": 1308 + }, + { + "epoch": 0.3776143083802106, + "grad_norm": 11.501113010606543, + "learning_rate": 3.915375658526343e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.890625, + "logps/chosen": -1632.0, + "logps/rejected": -1720.0, + "loss": 0.6979, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1025390625, + "rewards/margins": -0.027587890625, + "rewards/rejected": 0.1298828125, + "step": 1309 + }, + { + "epoch": 0.3779027837876821, + "grad_norm": 12.792367570953155, + "learning_rate": 3.9132992585636406e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1096.0, + "logps/rejected": -1304.0, + "loss": 0.7051, + "loss/demonstration_loss": -2400.0, + "loss/preference_loss": -2416.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.107421875, + "rewards/margins": -0.04248046875, + "rewards/rejected": 0.150390625, + "step": 1310 + }, + { + "epoch": 0.3781912591951536, + "grad_norm": 11.664082855726287, + "learning_rate": 3.911221424753019e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1504.0, + "logps/rejected": -1480.0, + "loss": 0.6755, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.212890625, + "rewards/margins": 0.06689453125, + "rewards/rejected": 0.1455078125, + "step": 1311 + }, + { + "epoch": 0.37847973460262513, + "grad_norm": 10.933817609782864, + "learning_rate": 3.909142159202523e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.1875, + "logps/chosen": -1800.0, + "logps/rejected": -1776.0, + "loss": 0.6661, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1474609375, + "rewards/margins": 0.06982421875, + "rewards/rejected": 0.07763671875, + "step": 1312 + }, + { + "epoch": 0.37876821001009664, + "grad_norm": 12.337783587066738, + "learning_rate": 3.9070614640216503e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.09375, + "logps/chosen": -1344.0, + "logps/rejected": -1496.0, + "loss": 0.7125, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2864.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1748046875, + "rewards/margins": -0.1064453125, + "rewards/rejected": 0.28125, + "step": 1313 + }, + { + "epoch": 0.37905668541756815, + "grad_norm": 10.192238319533546, + "learning_rate": 3.904979341321348e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -2048.0, + "logps/rejected": -1800.0, + "loss": 0.6696, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.34375, + "rewards/margins": 0.173828125, + "rewards/rejected": 0.1689453125, + "step": 1314 + }, + { + "epoch": 0.37934516082503966, + "grad_norm": 13.807593183105208, + "learning_rate": 3.902895793214011e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.0, + "logps/chosen": -1576.0, + "logps/rejected": -1720.0, + "loss": 0.7178, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33984375, + "rewards/margins": -0.0042724609375, + "rewards/rejected": 0.34375, + "step": 1315 + }, + { + "epoch": 0.3796336362325112, + "grad_norm": 12.336995914001294, + "learning_rate": 3.900810821813482e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.0625, + "logps/chosen": -1976.0, + "logps/rejected": -1912.0, + "loss": 0.6672, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3920.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.12060546875, + "rewards/rejected": 0.35546875, + "step": 1316 + }, + { + "epoch": 0.3799221116399827, + "grad_norm": 11.07192044233282, + "learning_rate": 3.898724429235046e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.15625, + "logps/chosen": -1960.0, + "logps/rejected": -2016.0, + "loss": 0.6863, + "loss/demonstration_loss": -4000.0, + "loss/preference_loss": -4016.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33203125, + "rewards/margins": -0.0087890625, + "rewards/rejected": 0.341796875, + "step": 1317 + }, + { + "epoch": 0.3802105870474542, + "grad_norm": 10.16552468544801, + "learning_rate": 3.8966366175954323e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.96875, + "logps/chosen": -1664.0, + "logps/rejected": -1800.0, + "loss": 0.671, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32421875, + "rewards/margins": 0.1669921875, + "rewards/rejected": 0.15625, + "step": 1318 + }, + { + "epoch": 0.3804990624549257, + "grad_norm": 12.337054319881316, + "learning_rate": 3.8945473890128066e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.15625, + "logps/chosen": -2144.0, + "logps/rejected": -1832.0, + "loss": 0.6798, + "loss/demonstration_loss": -4000.0, + "loss/preference_loss": -4000.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.328125, + "rewards/margins": 0.10986328125, + "rewards/rejected": 0.2177734375, + "step": 1319 + }, + { + "epoch": 0.3807875378623972, + "grad_norm": 11.415406563828665, + "learning_rate": 3.8924567456067747e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.140625, + "logps/chosen": -1888.0, + "logps/rejected": -1616.0, + "loss": 0.6793, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.13671875, + "rewards/rejected": 0.203125, + "step": 1320 + }, + { + "epoch": 0.38107601326986873, + "grad_norm": 10.206101516211046, + "learning_rate": 3.8903646894983765e-07, + "logits/chosen": 2.8125, + "logits/rejected": 2.859375, + "logps/chosen": -1720.0, + "logps/rejected": -1552.0, + "loss": 0.6549, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1943359375, + "rewards/margins": 0.0595703125, + "rewards/rejected": 0.134765625, + "step": 1321 + }, + { + "epoch": 0.38136448867734024, + "grad_norm": 10.014761563192863, + "learning_rate": 3.8882712228100854e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.03125, + "logps/chosen": -1296.0, + "logps/rejected": -1448.0, + "loss": 0.6841, + "loss/demonstration_loss": -2768.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1484375, + "rewards/margins": 0.00048828125, + "rewards/rejected": 0.1484375, + "step": 1322 + }, + { + "epoch": 0.38165296408481175, + "grad_norm": 11.060297814267088, + "learning_rate": 3.8861763476658074e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.984375, + "logps/chosen": -1928.0, + "logps/rejected": -1952.0, + "loss": 0.6707, + "loss/demonstration_loss": -3920.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.0498046875, + "rewards/rejected": 0.28515625, + "step": 1323 + }, + { + "epoch": 0.38194143949228326, + "grad_norm": 11.12051303946696, + "learning_rate": 3.884080066190874e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.0, + "logps/chosen": -1856.0, + "logps/rejected": -1904.0, + "loss": 0.6624, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.126953125, + "rewards/rejected": 0.1806640625, + "step": 1324 + }, + { + "epoch": 0.3822299148997548, + "grad_norm": 11.992897182442796, + "learning_rate": 3.8819823805120474e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1888.0, + "logps/rejected": -2048.0, + "loss": 0.7081, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1943359375, + "rewards/margins": -0.0625, + "rewards/rejected": 0.2578125, + "step": 1325 + }, + { + "epoch": 0.3825183903072263, + "grad_norm": 12.041075268194472, + "learning_rate": 3.879883292757511e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.109375, + "logps/chosen": -1808.0, + "logps/rejected": -1776.0, + "loss": 0.723, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2021484375, + "rewards/margins": -0.0712890625, + "rewards/rejected": 0.2734375, + "step": 1326 + }, + { + "epoch": 0.3828068657146978, + "grad_norm": 12.625450253386129, + "learning_rate": 3.8777828050568735e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1776.0, + "logps/rejected": -1640.0, + "loss": 0.6385, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.375, + "rewards/margins": 0.2197265625, + "rewards/rejected": 0.1552734375, + "step": 1327 + }, + { + "epoch": 0.3830953411221693, + "grad_norm": 11.825084181263108, + "learning_rate": 3.875680919541162e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.078125, + "logps/chosen": -1576.0, + "logps/rejected": -1504.0, + "loss": 0.6569, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.208984375, + "rewards/margins": 0.026123046875, + "rewards/rejected": 0.1826171875, + "step": 1328 + }, + { + "epoch": 0.3833838165296409, + "grad_norm": 9.84833110363681, + "learning_rate": 3.873577638342823e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.078125, + "logps/chosen": -2080.0, + "logps/rejected": -1864.0, + "loss": 0.6588, + "loss/demonstration_loss": -3968.0, + "loss/preference_loss": -3968.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.12158203125, + "rewards/rejected": 0.2138671875, + "step": 1329 + }, + { + "epoch": 0.3836722919371124, + "grad_norm": 9.924558047227473, + "learning_rate": 3.871472963595717e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.015625, + "logps/chosen": -1896.0, + "logps/rejected": -1832.0, + "loss": 0.6385, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.45703125, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.310546875, + "step": 1330 + }, + { + "epoch": 0.3839607673445839, + "grad_norm": 11.812324278447832, + "learning_rate": 3.86936689743512e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1592.0, + "logps/rejected": -1376.0, + "loss": 0.6623, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.271484375, + "rewards/margins": 0.1650390625, + "rewards/rejected": 0.10546875, + "step": 1331 + }, + { + "epoch": 0.3842492427520554, + "grad_norm": 12.33186081272834, + "learning_rate": 3.867259441997721e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.90625, + "logps/chosen": -1760.0, + "logps/rejected": -1448.0, + "loss": 0.674, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14453125, + "rewards/margins": 0.109375, + "rewards/rejected": 0.03515625, + "step": 1332 + }, + { + "epoch": 0.3845377181595269, + "grad_norm": 11.100998626421617, + "learning_rate": 3.865150599421615e-07, + "logits/chosen": 2.734375, + "logits/rejected": 2.84375, + "logps/chosen": -1696.0, + "logps/rejected": -1816.0, + "loss": 0.6913, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.291015625, + "rewards/margins": 0.1083984375, + "rewards/rejected": 0.1826171875, + "step": 1333 + }, + { + "epoch": 0.38482619356699843, + "grad_norm": 10.026343311711983, + "learning_rate": 3.863040371846307e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.96875, + "logps/chosen": -1120.0, + "logps/rejected": -1120.0, + "loss": 0.6519, + "loss/demonstration_loss": -2240.0, + "loss/preference_loss": -2224.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.072265625, + "rewards/margins": 0.1298828125, + "rewards/rejected": -0.056884765625, + "step": 1334 + }, + { + "epoch": 0.38511466897446994, + "grad_norm": 9.305094915348548, + "learning_rate": 3.860928761412705e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.078125, + "logps/chosen": -1720.0, + "logps/rejected": -1760.0, + "loss": 0.6697, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.13671875, + "rewards/margins": 0.048828125, + "rewards/rejected": 0.08740234375, + "step": 1335 + }, + { + "epoch": 0.38540314438194145, + "grad_norm": 10.899976416896294, + "learning_rate": 3.8588157702631235e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.109375, + "logps/chosen": -1352.0, + "logps/rejected": -1184.0, + "loss": 0.6993, + "loss/demonstration_loss": -2560.0, + "loss/preference_loss": -2560.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.17578125, + "rewards/margins": 0.01324462890625, + "rewards/rejected": 0.1630859375, + "step": 1336 + }, + { + "epoch": 0.38569161978941296, + "grad_norm": 11.667203214597203, + "learning_rate": 3.8567014005412733e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1456.0, + "logps/rejected": -1552.0, + "loss": 0.668, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.166015625, + "rewards/rejected": 0.1416015625, + "step": 1337 + }, + { + "epoch": 0.3859800951968845, + "grad_norm": 12.892471100540002, + "learning_rate": 3.854585654392267e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.109375, + "logps/chosen": -1720.0, + "logps/rejected": -1624.0, + "loss": 0.713, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.365234375, + "rewards/margins": 0.1416015625, + "rewards/rejected": 0.2236328125, + "step": 1338 + }, + { + "epoch": 0.386268570604356, + "grad_norm": 11.061038781517992, + "learning_rate": 3.8524685339626123e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.078125, + "logps/chosen": -1712.0, + "logps/rejected": -1344.0, + "loss": 0.7069, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.1259765625, + "rewards/rejected": 0.1962890625, + "step": 1339 + }, + { + "epoch": 0.3865570460118275, + "grad_norm": 12.102606742884733, + "learning_rate": 3.8503500414002116e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.203125, + "logps/chosen": -2024.0, + "logps/rejected": -1944.0, + "loss": 0.6797, + "loss/demonstration_loss": -4016.0, + "loss/preference_loss": -4016.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.451171875, + "rewards/margins": 0.022705078125, + "rewards/rejected": 0.4296875, + "step": 1340 + }, + { + "epoch": 0.386845521419299, + "grad_norm": 11.359686357459275, + "learning_rate": 3.848230178854359e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.96875, + "logps/chosen": -1864.0, + "logps/rejected": -1952.0, + "loss": 0.6365, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.333984375, + "rewards/margins": 0.140625, + "rewards/rejected": 0.193359375, + "step": 1341 + }, + { + "epoch": 0.3871339968267705, + "grad_norm": 10.556664842274381, + "learning_rate": 3.846108948475739e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.0, + "logps/chosen": -2240.0, + "logps/rejected": -1848.0, + "loss": 0.6451, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.2255859375, + "rewards/rejected": 0.28515625, + "step": 1342 + }, + { + "epoch": 0.38742247223424203, + "grad_norm": 9.614234889079466, + "learning_rate": 3.843986352416424e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0625, + "logps/chosen": -1680.0, + "logps/rejected": -1560.0, + "loss": 0.6378, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.279296875, + "rewards/margins": 0.1337890625, + "rewards/rejected": 0.146484375, + "step": 1343 + }, + { + "epoch": 0.38771094764171354, + "grad_norm": 13.07384355354088, + "learning_rate": 3.841862392829871e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.03125, + "logps/chosen": -1776.0, + "logps/rejected": -1376.0, + "loss": 0.6483, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.283203125, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.193359375, + "step": 1344 + }, + { + "epoch": 0.38799942304918505, + "grad_norm": 10.496379282287489, + "learning_rate": 3.839737071870922e-07, + "logits/chosen": 3.109375, + "logits/rejected": 2.953125, + "logps/chosen": -2272.0, + "logps/rejected": -2224.0, + "loss": 0.6858, + "loss/demonstration_loss": -4544.0, + "loss/preference_loss": -4544.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.07421875, + "rewards/rejected": 0.35546875, + "step": 1345 + }, + { + "epoch": 0.38828789845665657, + "grad_norm": 11.039847914322003, + "learning_rate": 3.837610391695797e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1504.0, + "logps/rejected": -1464.0, + "loss": 0.6662, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.34765625, + "rewards/margins": 0.125, + "rewards/rejected": 0.22265625, + "step": 1346 + }, + { + "epoch": 0.3885763738641281, + "grad_norm": 10.76695953507155, + "learning_rate": 3.835482354462098e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0, + "logps/chosen": -1672.0, + "logps/rejected": -1544.0, + "loss": 0.6761, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37109375, + "rewards/margins": 0.130859375, + "rewards/rejected": 0.2412109375, + "step": 1347 + }, + { + "epoch": 0.3888648492715996, + "grad_norm": 10.868246063809314, + "learning_rate": 3.8333529623288035e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.21875, + "logps/chosen": -1728.0, + "logps/rejected": -1680.0, + "loss": 0.6672, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.46875, + "rewards/margins": 0.07958984375, + "rewards/rejected": 0.390625, + "step": 1348 + }, + { + "epoch": 0.3891533246790711, + "grad_norm": 10.080346986521903, + "learning_rate": 3.8312222174562655e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1632.0, + "logps/rejected": -1568.0, + "loss": 0.6699, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1943359375, + "rewards/margins": -0.006439208984375, + "rewards/rejected": 0.201171875, + "step": 1349 + }, + { + "epoch": 0.3894418000865426, + "grad_norm": 12.20409589874611, + "learning_rate": 3.8290901220062086e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.09375, + "logps/chosen": -1960.0, + "logps/rejected": -1752.0, + "loss": 0.697, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.318359375, + "rewards/margins": 0.0556640625, + "rewards/rejected": 0.263671875, + "step": 1350 + }, + { + "epoch": 0.3897302754940141, + "grad_norm": 10.73859605566082, + "learning_rate": 3.8269566781417274e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.0625, + "logps/chosen": -1768.0, + "logps/rejected": -1624.0, + "loss": 0.6434, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3203125, + "rewards/margins": 0.1103515625, + "rewards/rejected": 0.2099609375, + "step": 1351 + }, + { + "epoch": 0.39001875090148563, + "grad_norm": 11.550203952385035, + "learning_rate": 3.8248218880272864e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.984375, + "logps/chosen": -1968.0, + "logps/rejected": -2096.0, + "loss": 0.679, + "loss/demonstration_loss": -4096.0, + "loss/preference_loss": -4080.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.296875, + "rewards/margins": 0.06884765625, + "rewards/rejected": 0.228515625, + "step": 1352 + }, + { + "epoch": 0.39030722630895714, + "grad_norm": 11.126824280258022, + "learning_rate": 3.822685753828714e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.0, + "logps/chosen": -1576.0, + "logps/rejected": -1736.0, + "loss": 0.6789, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.271484375, + "rewards/margins": 0.0859375, + "rewards/rejected": 0.185546875, + "step": 1353 + }, + { + "epoch": 0.39059570171642866, + "grad_norm": 10.933375145729544, + "learning_rate": 3.8205482777132016e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.171875, + "logps/chosen": -1760.0, + "logps/rejected": -2008.0, + "loss": 0.6794, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.365234375, + "rewards/margins": 0.06884765625, + "rewards/rejected": 0.296875, + "step": 1354 + }, + { + "epoch": 0.39088417712390017, + "grad_norm": 12.505520595950992, + "learning_rate": 3.8184094618493035e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1864.0, + "logps/rejected": -1800.0, + "loss": 0.6827, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.10986328125, + "rewards/rejected": 0.30078125, + "step": 1355 + }, + { + "epoch": 0.3911726525313717, + "grad_norm": 9.65454026520546, + "learning_rate": 3.816269308406934e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1648.0, + "logps/rejected": -1536.0, + "loss": 0.6626, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.0888671875, + "rewards/rejected": 0.263671875, + "step": 1356 + }, + { + "epoch": 0.3914611279388432, + "grad_norm": 12.229723173659552, + "learning_rate": 3.8141278195573623e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1464.0, + "logps/rejected": -1536.0, + "loss": 0.6883, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.341796875, + "rewards/margins": 0.0732421875, + "rewards/rejected": 0.267578125, + "step": 1357 + }, + { + "epoch": 0.3917496033463147, + "grad_norm": 12.040253231296099, + "learning_rate": 3.8119849974732145e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.0625, + "logps/chosen": -1768.0, + "logps/rejected": -1736.0, + "loss": 0.6794, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.244140625, + "rewards/margins": 0.0006866455078125, + "rewards/rejected": 0.2431640625, + "step": 1358 + }, + { + "epoch": 0.3920380787537862, + "grad_norm": 10.318467411874689, + "learning_rate": 3.809840844328466e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.984375, + "logps/chosen": -1712.0, + "logps/rejected": -1480.0, + "loss": 0.6581, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.09619140625, + "rewards/rejected": 0.353515625, + "step": 1359 + }, + { + "epoch": 0.3923265541612578, + "grad_norm": 12.243879111749385, + "learning_rate": 3.8076953622984467e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.171875, + "logps/chosen": -1608.0, + "logps/rejected": -1416.0, + "loss": 0.7198, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2001953125, + "rewards/margins": -0.08447265625, + "rewards/rejected": 0.28515625, + "step": 1360 + }, + { + "epoch": 0.3926150295687293, + "grad_norm": 11.781847816637676, + "learning_rate": 3.805548553559833e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.03125, + "logps/chosen": -1600.0, + "logps/rejected": -1904.0, + "loss": 0.7139, + "loss/demonstration_loss": -3536.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3203125, + "rewards/margins": 0.059814453125, + "rewards/rejected": 0.259765625, + "step": 1361 + }, + { + "epoch": 0.3929035049762008, + "grad_norm": 11.480501214551937, + "learning_rate": 3.8034004202906464e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.015625, + "logps/chosen": -1632.0, + "logps/rejected": -1544.0, + "loss": 0.6816, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2373046875, + "rewards/margins": 0.037109375, + "rewards/rejected": 0.2001953125, + "step": 1362 + }, + { + "epoch": 0.3931919803836723, + "grad_norm": 10.470357408802219, + "learning_rate": 3.801250964670253e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1440.0, + "logps/rejected": -1336.0, + "loss": 0.6275, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.216796875, + "rewards/rejected": 0.1787109375, + "step": 1363 + }, + { + "epoch": 0.3934804557911438, + "grad_norm": 12.325714808891378, + "learning_rate": 3.7991001888793604e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.984375, + "logps/chosen": -1456.0, + "logps/rejected": -1864.0, + "loss": 0.7493, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.248046875, + "rewards/margins": -0.185546875, + "rewards/rejected": 0.43359375, + "step": 1364 + }, + { + "epoch": 0.39376893119861534, + "grad_norm": 12.986212830845139, + "learning_rate": 3.796948095100016e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.03125, + "logps/chosen": -1600.0, + "logps/rejected": -1592.0, + "loss": 0.7638, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.220703125, + "rewards/margins": -0.1552734375, + "rewards/rejected": 0.376953125, + "step": 1365 + }, + { + "epoch": 0.39405740660608685, + "grad_norm": 12.470036061226049, + "learning_rate": 3.794794685515604e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.046875, + "logps/chosen": -1568.0, + "logps/rejected": -1280.0, + "loss": 0.6902, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.03271484375, + "rewards/rejected": 0.33203125, + "step": 1366 + }, + { + "epoch": 0.39434588201355836, + "grad_norm": 10.99052175249744, + "learning_rate": 3.792639962310843e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1656.0, + "logps/rejected": -1632.0, + "loss": 0.6445, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.31640625, + "rewards/margins": 0.08447265625, + "rewards/rejected": 0.2314453125, + "step": 1367 + }, + { + "epoch": 0.39463435742102987, + "grad_norm": 11.268494598420116, + "learning_rate": 3.790483927671785e-07, + "logits/chosen": 2.875, + "logits/rejected": 2.96875, + "logps/chosen": -1672.0, + "logps/rejected": -1200.0, + "loss": 0.6528, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.21484375, + "rewards/rejected": 0.09033203125, + "step": 1368 + }, + { + "epoch": 0.3949228328285014, + "grad_norm": 11.0056605286496, + "learning_rate": 3.7883265837858113e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.125, + "logps/chosen": -1512.0, + "logps/rejected": -1432.0, + "loss": 0.6931, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2138671875, + "rewards/margins": 0.0286865234375, + "rewards/rejected": 0.185546875, + "step": 1369 + }, + { + "epoch": 0.3952113082359729, + "grad_norm": 11.905407093673032, + "learning_rate": 3.786167932841634e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.15625, + "logps/chosen": -2304.0, + "logps/rejected": -2080.0, + "loss": 0.6664, + "loss/demonstration_loss": -4448.0, + "loss/preference_loss": -4416.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.208984375, + "rewards/rejected": 0.310546875, + "step": 1370 + }, + { + "epoch": 0.3954997836434444, + "grad_norm": 11.906840026211704, + "learning_rate": 3.78400797702929e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.03125, + "logps/chosen": -1520.0, + "logps/rejected": -1744.0, + "loss": 0.6989, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2275390625, + "rewards/margins": -0.04638671875, + "rewards/rejected": 0.2734375, + "step": 1371 + }, + { + "epoch": 0.3957882590509159, + "grad_norm": 12.102952826717768, + "learning_rate": 3.7818467185401395e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.109375, + "logps/chosen": -1664.0, + "logps/rejected": -1640.0, + "loss": 0.7003, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.380859375, + "rewards/margins": 0.083984375, + "rewards/rejected": 0.296875, + "step": 1372 + }, + { + "epoch": 0.3960767344583874, + "grad_norm": 11.203280671650758, + "learning_rate": 3.7796841595668614e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1504.0, + "logps/rejected": -1472.0, + "loss": 0.6673, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.208984375, + "step": 1373 + }, + { + "epoch": 0.39636520986585894, + "grad_norm": 11.336706634656963, + "learning_rate": 3.7775203023034617e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.09375, + "logps/chosen": -1488.0, + "logps/rejected": -1656.0, + "loss": 0.7113, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2197265625, + "rewards/margins": 0.0048828125, + "rewards/rejected": 0.21484375, + "step": 1374 + }, + { + "epoch": 0.39665368527333045, + "grad_norm": 10.744365767136983, + "learning_rate": 3.775355148945257e-07, + "logits/chosen": 2.984375, + "logits/rejected": 2.984375, + "logps/chosen": -1608.0, + "logps/rejected": -1512.0, + "loss": 0.6526, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.267578125, + "rewards/margins": -0.0093994140625, + "rewards/rejected": 0.27734375, + "step": 1375 + }, + { + "epoch": 0.39694216068080196, + "grad_norm": 9.158056439949062, + "learning_rate": 3.773188701688881e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.96875, + "logps/chosen": -1480.0, + "logps/rejected": -1480.0, + "loss": 0.6863, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2734375, + "rewards/margins": 0.03271484375, + "rewards/rejected": 0.240234375, + "step": 1376 + }, + { + "epoch": 0.39723063608827347, + "grad_norm": 11.515527318046763, + "learning_rate": 3.771020962732281e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.140625, + "logps/chosen": -1768.0, + "logps/rejected": -1648.0, + "loss": 0.6592, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.345703125, + "rewards/margins": 0.01007080078125, + "rewards/rejected": 0.3359375, + "step": 1377 + }, + { + "epoch": 0.397519111495745, + "grad_norm": 11.236653091790737, + "learning_rate": 3.768851934274712e-07, + "logits/chosen": 2.96875, + "logits/rejected": 2.953125, + "logps/chosen": -2160.0, + "logps/rejected": -2128.0, + "loss": 0.7079, + "loss/demonstration_loss": -4320.0, + "loss/preference_loss": -4320.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.0260009765625, + "rewards/rejected": 0.43359375, + "step": 1378 + }, + { + "epoch": 0.3978075869032165, + "grad_norm": 12.145344469991288, + "learning_rate": 3.76668161851674e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.03125, + "logps/chosen": -1664.0, + "logps/rejected": -1456.0, + "loss": 0.6705, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.21875, + "rewards/margins": -0.001495361328125, + "rewards/rejected": 0.2197265625, + "step": 1379 + }, + { + "epoch": 0.398096062310688, + "grad_norm": 10.980631119278355, + "learning_rate": 3.764510017660236e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1664.0, + "logps/rejected": -1496.0, + "loss": 0.6542, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.259765625, + "rewards/margins": 0.21875, + "rewards/rejected": 0.04052734375, + "step": 1380 + }, + { + "epoch": 0.3983845377181595, + "grad_norm": 11.120170658415976, + "learning_rate": 3.762337133908375e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.15625, + "logps/chosen": -1832.0, + "logps/rejected": -1944.0, + "loss": 0.6989, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2265625, + "rewards/margins": -0.021240234375, + "rewards/rejected": 0.248046875, + "step": 1381 + }, + { + "epoch": 0.398673013125631, + "grad_norm": 13.71060202869591, + "learning_rate": 3.7601629694656335e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.078125, + "logps/chosen": -1544.0, + "logps/rejected": -1488.0, + "loss": 0.7083, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13671875, + "rewards/margins": -0.044921875, + "rewards/rejected": 0.181640625, + "step": 1382 + }, + { + "epoch": 0.39896148853310254, + "grad_norm": 9.966469146257252, + "learning_rate": 3.757987526537787e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.15625, + "logps/chosen": -1320.0, + "logps/rejected": -1544.0, + "loss": 0.704, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.26953125, + "rewards/margins": -0.07470703125, + "rewards/rejected": 0.345703125, + "step": 1383 + }, + { + "epoch": 0.39924996394057405, + "grad_norm": 12.274157913828146, + "learning_rate": 3.7558108073319075e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.09375, + "logps/chosen": -1672.0, + "logps/rejected": -1368.0, + "loss": 0.6885, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.04833984375, + "rewards/rejected": 0.31640625, + "step": 1384 + }, + { + "epoch": 0.39953843934804556, + "grad_norm": 12.134508144994225, + "learning_rate": 3.7536328140563644e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -1712.0, + "logps/rejected": -1608.0, + "loss": 0.6463, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.251953125, + "rewards/rejected": 0.1416015625, + "step": 1385 + }, + { + "epoch": 0.39982691475551707, + "grad_norm": 12.630642875861069, + "learning_rate": 3.7514535489208155e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -1624.0, + "logps/rejected": -1656.0, + "loss": 0.6865, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.30859375, + "rewards/margins": -0.058349609375, + "rewards/rejected": 0.3671875, + "step": 1386 + }, + { + "epoch": 0.4001153901629886, + "grad_norm": 15.527601778733017, + "learning_rate": 3.749273014136213e-07, + "logits/chosen": 3.015625, + "logits/rejected": 2.96875, + "logps/chosen": -2240.0, + "logps/rejected": -2128.0, + "loss": 0.6904, + "loss/demonstration_loss": -4384.0, + "loss/preference_loss": -4384.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.228515625, + "rewards/margins": 0.0791015625, + "rewards/rejected": 0.150390625, + "step": 1387 + }, + { + "epoch": 0.4004038655704601, + "grad_norm": 11.928344845671907, + "learning_rate": 3.747091211914796e-07, + "logits/chosen": 2.859375, + "logits/rejected": 2.890625, + "logps/chosen": -1592.0, + "logps/rejected": -1544.0, + "loss": 0.6718, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2333984375, + "rewards/margins": 0.1591796875, + "rewards/rejected": 0.0751953125, + "step": 1388 + }, + { + "epoch": 0.4006923409779316, + "grad_norm": 9.831527296931709, + "learning_rate": 3.7449081444700883e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -1488.0, + "logps/rejected": -1624.0, + "loss": 0.626, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.361328125, + "rewards/margins": 0.1484375, + "rewards/rejected": 0.212890625, + "step": 1389 + }, + { + "epoch": 0.40098081638540317, + "grad_norm": 20.268959916633097, + "learning_rate": 3.7427238140168974e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1928.0, + "logps/rejected": -1904.0, + "loss": 0.6844, + "loss/demonstration_loss": -3872.0, + "loss/preference_loss": -3872.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.390625, + "rewards/margins": 0.056640625, + "rewards/rejected": 0.333984375, + "step": 1390 + }, + { + "epoch": 0.4012692917928747, + "grad_norm": 12.538307808945405, + "learning_rate": 3.740538222771314e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.953125, + "logps/chosen": -1488.0, + "logps/rejected": -1336.0, + "loss": 0.6573, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.1796875, + "rewards/rejected": 0.1728515625, + "step": 1391 + }, + { + "epoch": 0.4015577672003462, + "grad_norm": 11.925766712077214, + "learning_rate": 3.7383513729507055e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1304.0, + "logps/rejected": -1488.0, + "loss": 0.6857, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.051025390625, + "rewards/rejected": 0.279296875, + "step": 1392 + }, + { + "epoch": 0.4018462426078177, + "grad_norm": 10.756370347506724, + "learning_rate": 3.7361632667737187e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1368.0, + "logps/rejected": -1216.0, + "loss": 0.6603, + "loss/demonstration_loss": -2608.0, + "loss/preference_loss": -2592.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2578125, + "rewards/margins": 0.13671875, + "rewards/rejected": 0.1201171875, + "step": 1393 + }, + { + "epoch": 0.4021347180152892, + "grad_norm": 11.80577392427376, + "learning_rate": 3.733973906460273e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1560.0, + "logps/rejected": -1480.0, + "loss": 0.686, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2255859375, + "rewards/margins": -0.037353515625, + "rewards/rejected": 0.263671875, + "step": 1394 + }, + { + "epoch": 0.4024231934227607, + "grad_norm": 10.541930387057253, + "learning_rate": 3.731783294231561e-07, + "logits/chosen": 2.984375, + "logits/rejected": 3.03125, + "logps/chosen": -1816.0, + "logps/rejected": -1800.0, + "loss": 0.6609, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.0927734375, + "rewards/rejected": 0.26171875, + "step": 1395 + }, + { + "epoch": 0.40271166883023224, + "grad_norm": 10.062561509224166, + "learning_rate": 3.729591432310045e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.015625, + "logps/chosen": -1728.0, + "logps/rejected": -1472.0, + "loss": 0.6734, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2578125, + "rewards/margins": 0.1298828125, + "rewards/rejected": 0.12890625, + "step": 1396 + }, + { + "epoch": 0.40300014423770375, + "grad_norm": 10.555801747508836, + "learning_rate": 3.7273983229194564e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1504.0, + "logps/rejected": -1888.0, + "loss": 0.6598, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.369140625, + "rewards/margins": 0.107421875, + "rewards/rejected": 0.26171875, + "step": 1397 + }, + { + "epoch": 0.40328861964517526, + "grad_norm": 10.839586544957013, + "learning_rate": 3.7252039682847907e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.0625, + "logps/chosen": -1736.0, + "logps/rejected": -1712.0, + "loss": 0.6826, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.43359375, + "rewards/margins": 0.005462646484375, + "rewards/rejected": 0.4296875, + "step": 1398 + }, + { + "epoch": 0.4035770950526468, + "grad_norm": 12.326347718252821, + "learning_rate": 3.723008370632308e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.125, + "logps/chosen": -1608.0, + "logps/rejected": -1664.0, + "loss": 0.7119, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.263671875, + "rewards/margins": -0.02490234375, + "rewards/rejected": 0.287109375, + "step": 1399 + }, + { + "epoch": 0.4038655704601183, + "grad_norm": 11.204368014144857, + "learning_rate": 3.7208115321895265e-07, + "logits/chosen": 2.890625, + "logits/rejected": 2.96875, + "logps/chosen": -1776.0, + "logps/rejected": -1568.0, + "loss": 0.6639, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.134765625, + "rewards/rejected": 0.1708984375, + "step": 1400 + }, + { + "epoch": 0.4041540458675898, + "grad_norm": 10.756107706064277, + "learning_rate": 3.7186134551852287e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.15625, + "logps/chosen": -1704.0, + "logps/rejected": -1568.0, + "loss": 0.6777, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.232421875, + "rewards/margins": 0.032958984375, + "rewards/rejected": 0.19921875, + "step": 1401 + }, + { + "epoch": 0.4044425212750613, + "grad_norm": 11.12544501702627, + "learning_rate": 3.7164141418494494e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.09375, + "logps/chosen": -1768.0, + "logps/rejected": -1832.0, + "loss": 0.6691, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2197265625, + "rewards/margins": 0.0625, + "rewards/rejected": 0.1572265625, + "step": 1402 + }, + { + "epoch": 0.4047309966825328, + "grad_norm": 11.38833971702098, + "learning_rate": 3.7142135944134777e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1808.0, + "logps/rejected": -1888.0, + "loss": 0.6982, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1787109375, + "rewards/margins": -0.048828125, + "rewards/rejected": 0.2275390625, + "step": 1403 + }, + { + "epoch": 0.40501947209000433, + "grad_norm": 10.95284458710104, + "learning_rate": 3.7120118151098574e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.09375, + "logps/chosen": -1744.0, + "logps/rejected": -1608.0, + "loss": 0.6341, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.484375, + "rewards/margins": 0.22265625, + "rewards/rejected": 0.26171875, + "step": 1404 + }, + { + "epoch": 0.40530794749747584, + "grad_norm": 10.5961858911717, + "learning_rate": 3.7098088061723796e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.171875, + "logps/chosen": -1440.0, + "logps/rejected": -1312.0, + "loss": 0.6956, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1943359375, + "rewards/margins": 0.0419921875, + "rewards/rejected": 0.1513671875, + "step": 1405 + }, + { + "epoch": 0.40559642290494735, + "grad_norm": 12.869756717025295, + "learning_rate": 3.7076045698360847e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1848.0, + "logps/rejected": -1768.0, + "loss": 0.7134, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.296875, + "rewards/margins": 0.03173828125, + "rewards/rejected": 0.263671875, + "step": 1406 + }, + { + "epoch": 0.40588489831241886, + "grad_norm": 10.251453879356593, + "learning_rate": 3.705399108337257e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1712.0, + "logps/rejected": -1512.0, + "loss": 0.6443, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2490234375, + "rewards/margins": 0.1025390625, + "rewards/rejected": 0.146484375, + "step": 1407 + }, + { + "epoch": 0.4061733737198904, + "grad_norm": 10.534555354754037, + "learning_rate": 3.703192423913424e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.15625, + "logps/chosen": -1784.0, + "logps/rejected": -1728.0, + "loss": 0.6663, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.484375, + "rewards/margins": 0.1435546875, + "rewards/rejected": 0.33984375, + "step": 1408 + }, + { + "epoch": 0.4064618491273619, + "grad_norm": 12.155557117387959, + "learning_rate": 3.7009845188033543e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.984375, + "logps/chosen": -1680.0, + "logps/rejected": -1464.0, + "loss": 0.7018, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.310546875, + "rewards/margins": 0.024658203125, + "rewards/rejected": 0.28515625, + "step": 1409 + }, + { + "epoch": 0.4067503245348334, + "grad_norm": 11.364593550007232, + "learning_rate": 3.698775395247056e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -1672.0, + "logps/rejected": -1624.0, + "loss": 0.7057, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.197265625, + "rewards/margins": 0.01043701171875, + "rewards/rejected": 0.1875, + "step": 1410 + }, + { + "epoch": 0.4070387999423049, + "grad_norm": 12.565424634853716, + "learning_rate": 3.696565055485771e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.0, + "logps/chosen": -1648.0, + "logps/rejected": -1624.0, + "loss": 0.6498, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.357421875, + "rewards/margins": 0.08251953125, + "rewards/rejected": 0.275390625, + "step": 1411 + }, + { + "epoch": 0.4073272753497764, + "grad_norm": 10.262384716323979, + "learning_rate": 3.6943535017619765e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.140625, + "logps/chosen": -1984.0, + "logps/rejected": -1816.0, + "loss": 0.6331, + "loss/demonstration_loss": -3840.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.0859375, + "rewards/rejected": 0.220703125, + "step": 1412 + }, + { + "epoch": 0.40761575075724793, + "grad_norm": 13.0962401317729, + "learning_rate": 3.692140736319381e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -1864.0, + "logps/rejected": -1456.0, + "loss": 0.701, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.328125, + "rewards/margins": 0.038818359375, + "rewards/rejected": 0.291015625, + "step": 1413 + }, + { + "epoch": 0.40790422616471944, + "grad_norm": 11.69446479732212, + "learning_rate": 3.6899267614029226e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.046875, + "logps/chosen": -2128.0, + "logps/rejected": -2000.0, + "loss": 0.6784, + "loss/demonstration_loss": -4160.0, + "loss/preference_loss": -4160.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.294921875, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.22265625, + "step": 1414 + }, + { + "epoch": 0.40819270157219095, + "grad_norm": 11.692005369738197, + "learning_rate": 3.6877115792587673e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.109375, + "logps/chosen": -1632.0, + "logps/rejected": -1424.0, + "loss": 0.6797, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.1123046875, + "rewards/rejected": 0.208984375, + "step": 1415 + }, + { + "epoch": 0.40848117697966246, + "grad_norm": 10.727445434001131, + "learning_rate": 3.685495192134303e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1376.0, + "logps/rejected": -1456.0, + "loss": 0.7109, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2255859375, + "rewards/margins": 0.031982421875, + "rewards/rejected": 0.193359375, + "step": 1416 + }, + { + "epoch": 0.408769652387134, + "grad_norm": 11.302926833031483, + "learning_rate": 3.683277602278143e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.078125, + "logps/chosen": -1808.0, + "logps/rejected": -1800.0, + "loss": 0.6863, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.400390625, + "rewards/margins": 0.00433349609375, + "rewards/rejected": 0.396484375, + "step": 1417 + }, + { + "epoch": 0.4090581277946055, + "grad_norm": 10.977197944094765, + "learning_rate": 3.6810588119401196e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.203125, + "logps/chosen": -1552.0, + "logps/rejected": -1280.0, + "loss": 0.6544, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1630859375, + "rewards/margins": 0.09716796875, + "rewards/rejected": 0.0654296875, + "step": 1418 + }, + { + "epoch": 0.409346603202077, + "grad_norm": 11.012789089180847, + "learning_rate": 3.678838823371283e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1568.0, + "logps/rejected": -1368.0, + "loss": 0.6827, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2158203125, + "rewards/margins": -0.01123046875, + "rewards/rejected": 0.2275390625, + "step": 1419 + }, + { + "epoch": 0.4096350786095485, + "grad_norm": 12.1094565590526, + "learning_rate": 3.6766176388239005e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.28125, + "logps/chosen": -1456.0, + "logps/rejected": -1648.0, + "loss": 0.7296, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.234375, + "rewards/margins": -0.0361328125, + "rewards/rejected": 0.26953125, + "step": 1420 + }, + { + "epoch": 0.4099235540170201, + "grad_norm": 12.696097130619057, + "learning_rate": 3.6743952605514506e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.328125, + "logps/chosen": -1888.0, + "logps/rejected": -1632.0, + "loss": 0.6933, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3125, + "rewards/margins": 0.08740234375, + "rewards/rejected": 0.2255859375, + "step": 1421 + }, + { + "epoch": 0.4102120294244916, + "grad_norm": 12.206874522742618, + "learning_rate": 3.672171690808623e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1832.0, + "logps/rejected": -1776.0, + "loss": 0.6679, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2216796875, + "rewards/margins": 0.1513671875, + "rewards/rejected": 0.0712890625, + "step": 1422 + }, + { + "epoch": 0.4105005048319631, + "grad_norm": 12.191887095644232, + "learning_rate": 3.6699469318513187e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -1720.0, + "logps/rejected": -1832.0, + "loss": 0.7068, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.263671875, + "rewards/margins": -0.072265625, + "rewards/rejected": 0.3359375, + "step": 1423 + }, + { + "epoch": 0.4107889802394346, + "grad_norm": 10.268853974892556, + "learning_rate": 3.667720985936643e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.140625, + "logps/chosen": -1680.0, + "logps/rejected": -1384.0, + "loss": 0.6804, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.09228515625, + "rewards/rejected": 0.28515625, + "step": 1424 + }, + { + "epoch": 0.4110774556469061, + "grad_norm": 11.687883109081131, + "learning_rate": 3.6654938553229054e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.3125, + "logps/chosen": -1808.0, + "logps/rejected": -1608.0, + "loss": 0.6547, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.265625, + "step": 1425 + }, + { + "epoch": 0.41136593105437763, + "grad_norm": 13.259898993857163, + "learning_rate": 3.663265542269618e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.09375, + "logps/chosen": -1728.0, + "logps/rejected": -1840.0, + "loss": 0.6532, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.06494140625, + "rewards/rejected": 0.423828125, + "step": 1426 + }, + { + "epoch": 0.41165440646184914, + "grad_norm": 11.391535438646725, + "learning_rate": 3.6610360490374924e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1904.0, + "logps/rejected": -1696.0, + "loss": 0.6628, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.29296875, + "rewards/margins": 0.0986328125, + "rewards/rejected": 0.1943359375, + "step": 1427 + }, + { + "epoch": 0.41194288186932065, + "grad_norm": 11.820450681871057, + "learning_rate": 3.6588053778884383e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.296875, + "logps/chosen": -1656.0, + "logps/rejected": -1504.0, + "loss": 0.6381, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.474609375, + "rewards/margins": 0.1806640625, + "rewards/rejected": 0.29296875, + "step": 1428 + }, + { + "epoch": 0.41223135727679217, + "grad_norm": 11.738650351186301, + "learning_rate": 3.656573531085559e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.1875, + "logps/chosen": -1632.0, + "logps/rejected": -1632.0, + "loss": 0.6722, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.267578125, + "rewards/margins": 0.050048828125, + "rewards/rejected": 0.2177734375, + "step": 1429 + }, + { + "epoch": 0.4125198326842637, + "grad_norm": 11.15405910299464, + "learning_rate": 3.654340510893151e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.15625, + "logps/chosen": -1656.0, + "logps/rejected": -1656.0, + "loss": 0.7001, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.07958984375, + "rewards/rejected": 0.330078125, + "step": 1430 + }, + { + "epoch": 0.4128083080917352, + "grad_norm": 10.026177807685471, + "learning_rate": 3.652106319576702e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.078125, + "logps/chosen": -1472.0, + "logps/rejected": -1448.0, + "loss": 0.6566, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.27734375, + "rewards/margins": 0.11376953125, + "rewards/rejected": 0.1640625, + "step": 1431 + }, + { + "epoch": 0.4130967834992067, + "grad_norm": 12.622292901106505, + "learning_rate": 3.6498709594028877e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.203125, + "logps/chosen": -1696.0, + "logps/rejected": -1880.0, + "loss": 0.7127, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.0174560546875, + "rewards/rejected": 0.3359375, + "step": 1432 + }, + { + "epoch": 0.4133852589066782, + "grad_norm": 11.683668149785372, + "learning_rate": 3.6476344326395674e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1552.0, + "logps/rejected": -1520.0, + "loss": 0.6762, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3088.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.279296875, + "rewards/margins": 0.11962890625, + "rewards/rejected": 0.16015625, + "step": 1433 + }, + { + "epoch": 0.4136737343141497, + "grad_norm": 10.325876383907781, + "learning_rate": 3.6453967415557887e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.125, + "logps/chosen": -1392.0, + "logps/rejected": -1368.0, + "loss": 0.6577, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1796875, + "rewards/margins": 0.1162109375, + "rewards/rejected": 0.06396484375, + "step": 1434 + }, + { + "epoch": 0.41396220972162123, + "grad_norm": 11.919956287974754, + "learning_rate": 3.6431578884217753e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.078125, + "logps/chosen": -2016.0, + "logps/rejected": -1800.0, + "loss": 0.6448, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.244140625, + "rewards/rejected": 0.232421875, + "step": 1435 + }, + { + "epoch": 0.41425068512909274, + "grad_norm": 10.815510973650063, + "learning_rate": 3.640917875508933e-07, + "logits/chosen": 2.9375, + "logits/rejected": 2.953125, + "logps/chosen": -1568.0, + "logps/rejected": -1448.0, + "loss": 0.6537, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2236328125, + "rewards/margins": 0.10546875, + "rewards/rejected": 0.11767578125, + "step": 1436 + }, + { + "epoch": 0.41453916053656426, + "grad_norm": 10.214991346460826, + "learning_rate": 3.6386767050898433e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.09375, + "logps/chosen": -1440.0, + "logps/rejected": -1304.0, + "loss": 0.6505, + "loss/demonstration_loss": -2768.0, + "loss/preference_loss": -2752.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.21484375, + "rewards/margins": 0.0693359375, + "rewards/rejected": 0.1455078125, + "step": 1437 + }, + { + "epoch": 0.41482763594403577, + "grad_norm": 11.540583460797453, + "learning_rate": 3.636434379438262e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.15625, + "logps/chosen": -1928.0, + "logps/rejected": -1880.0, + "loss": 0.6906, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.134765625, + "rewards/margins": -0.05029296875, + "rewards/rejected": 0.1845703125, + "step": 1438 + }, + { + "epoch": 0.4151161113515073, + "grad_norm": 11.028301760614935, + "learning_rate": 3.634190900829117e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.015625, + "logps/chosen": -1728.0, + "logps/rejected": -1456.0, + "loss": 0.6435, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.33984375, + "rewards/margins": 0.146484375, + "rewards/rejected": 0.1923828125, + "step": 1439 + }, + { + "epoch": 0.4154045867589788, + "grad_norm": 10.491212609001602, + "learning_rate": 3.6319462715385063e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -1512.0, + "logps/rejected": -1408.0, + "loss": 0.6603, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.35546875, + "rewards/margins": 0.056640625, + "rewards/rejected": 0.298828125, + "step": 1440 + }, + { + "epoch": 0.4156930621664503, + "grad_norm": 11.017226258611302, + "learning_rate": 3.6297004938436946e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1088.0, + "logps/rejected": -1464.0, + "loss": 0.7047, + "loss/demonstration_loss": -2560.0, + "loss/preference_loss": -2560.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1904296875, + "rewards/margins": 0.044189453125, + "rewards/rejected": 0.146484375, + "step": 1441 + }, + { + "epoch": 0.4159815375739218, + "grad_norm": 13.660578113998382, + "learning_rate": 3.6274535700231127e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1976.0, + "logps/rejected": -1896.0, + "loss": 0.6631, + "loss/demonstration_loss": -3920.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.146484375, + "rewards/rejected": 0.248046875, + "step": 1442 + }, + { + "epoch": 0.4162700129813933, + "grad_norm": 11.35993497872218, + "learning_rate": 3.6252055023563533e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1552.0, + "logps/rejected": -1616.0, + "loss": 0.677, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.259765625, + "rewards/margins": 0.0277099609375, + "rewards/rejected": 0.232421875, + "step": 1443 + }, + { + "epoch": 0.41655848838886483, + "grad_norm": 12.111398884681277, + "learning_rate": 3.622956293124168e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.296875, + "logps/chosen": -1992.0, + "logps/rejected": -2112.0, + "loss": 0.7147, + "loss/demonstration_loss": -4160.0, + "loss/preference_loss": -4160.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.46875, + "rewards/margins": -0.07470703125, + "rewards/rejected": 0.54296875, + "step": 1444 + }, + { + "epoch": 0.41684696379633634, + "grad_norm": 10.721734781623606, + "learning_rate": 3.620705944608472e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.078125, + "logps/chosen": -1840.0, + "logps/rejected": -1392.0, + "loss": 0.6317, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.287109375, + "rewards/rejected": 0.1904296875, + "step": 1445 + }, + { + "epoch": 0.41713543920380786, + "grad_norm": 10.6842336521273, + "learning_rate": 3.6184544590923293e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1512.0, + "logps/rejected": -1384.0, + "loss": 0.6302, + "loss/demonstration_loss": -2928.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.341796875, + "rewards/margins": 0.1748046875, + "rewards/rejected": 0.16796875, + "step": 1446 + }, + { + "epoch": 0.41742391461127937, + "grad_norm": 11.750520518640997, + "learning_rate": 3.6162018388599636e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1920.0, + "logps/rejected": -1424.0, + "loss": 0.6511, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.439453125, + "rewards/margins": 0.1640625, + "rewards/rejected": 0.275390625, + "step": 1447 + }, + { + "epoch": 0.4177123900187509, + "grad_norm": 13.74312256116365, + "learning_rate": 3.613948086196745e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1752.0, + "logps/rejected": -1672.0, + "loss": 0.6517, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.16015625, + "rewards/rejected": 0.2177734375, + "step": 1448 + }, + { + "epoch": 0.4180008654262224, + "grad_norm": 10.662334200385676, + "learning_rate": 3.6116932033891955e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.046875, + "logps/chosen": -1584.0, + "logps/rejected": -1568.0, + "loss": 0.698, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4375, + "rewards/margins": 0.0888671875, + "rewards/rejected": 0.34765625, + "step": 1449 + }, + { + "epoch": 0.4182893408336939, + "grad_norm": 9.351814372817023, + "learning_rate": 3.6094371927249833e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1504.0, + "logps/rejected": -1264.0, + "loss": 0.6325, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32421875, + "rewards/margins": 0.244140625, + "rewards/rejected": 0.0791015625, + "step": 1450 + }, + { + "epoch": 0.41857781624116547, + "grad_norm": 11.388130442381131, + "learning_rate": 3.6071800564929203e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1496.0, + "logps/rejected": -1440.0, + "loss": 0.6949, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.2412109375, + "step": 1451 + }, + { + "epoch": 0.418866291648637, + "grad_norm": 11.556995170180452, + "learning_rate": 3.604921796982958e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.09375, + "logps/chosen": -1744.0, + "logps/rejected": -1544.0, + "loss": 0.6627, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.484375, + "rewards/margins": 0.1630859375, + "rewards/rejected": 0.322265625, + "step": 1452 + }, + { + "epoch": 0.4191547670561085, + "grad_norm": 11.034114543354196, + "learning_rate": 3.6026624164861924e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.125, + "logps/chosen": -1624.0, + "logps/rejected": -1520.0, + "loss": 0.6439, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.08349609375, + "rewards/rejected": 0.279296875, + "step": 1453 + }, + { + "epoch": 0.41944324246358, + "grad_norm": 10.50495607951258, + "learning_rate": 3.6004019172948536e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.15625, + "logps/chosen": -1584.0, + "logps/rejected": -1400.0, + "loss": 0.6702, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.1591796875, + "rewards/rejected": 0.345703125, + "step": 1454 + }, + { + "epoch": 0.4197317178710515, + "grad_norm": 11.274932405920156, + "learning_rate": 3.5981403017023075e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.171875, + "logps/chosen": -1656.0, + "logps/rejected": -1736.0, + "loss": 0.7018, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.365234375, + "rewards/margins": 0.060546875, + "rewards/rejected": 0.3046875, + "step": 1455 + }, + { + "epoch": 0.420020193278523, + "grad_norm": 10.984688850160628, + "learning_rate": 3.5958775720030526e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1440.0, + "logps/rejected": -1552.0, + "loss": 0.6998, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.302734375, + "rewards/margins": -0.01177978515625, + "rewards/rejected": 0.314453125, + "step": 1456 + }, + { + "epoch": 0.42030866868599454, + "grad_norm": 10.754272265175567, + "learning_rate": 3.5936137304927166e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.125, + "logps/chosen": -1632.0, + "logps/rejected": -1928.0, + "loss": 0.6976, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.345703125, + "rewards/margins": -0.0390625, + "rewards/rejected": 0.384765625, + "step": 1457 + }, + { + "epoch": 0.42059714409346605, + "grad_norm": 10.658615351240256, + "learning_rate": 3.591348779468056e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.21875, + "logps/chosen": -1808.0, + "logps/rejected": -1904.0, + "loss": 0.6738, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.1025390625, + "rewards/rejected": 0.259765625, + "step": 1458 + }, + { + "epoch": 0.42088561950093756, + "grad_norm": 12.096013865912218, + "learning_rate": 3.5890827212269554e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.296875, + "logps/chosen": -1640.0, + "logps/rejected": -1560.0, + "loss": 0.6888, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.29296875, + "rewards/margins": 0.051513671875, + "rewards/rejected": 0.2421875, + "step": 1459 + }, + { + "epoch": 0.42117409490840907, + "grad_norm": 10.719007931907088, + "learning_rate": 3.586815558068417e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.03125, + "logps/chosen": -1760.0, + "logps/rejected": -1520.0, + "loss": 0.6136, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.546875, + "rewards/margins": 0.259765625, + "rewards/rejected": 0.2890625, + "step": 1460 + }, + { + "epoch": 0.4214625703158806, + "grad_norm": 9.338823539807724, + "learning_rate": 3.584547292292571e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.03125, + "logps/chosen": -1632.0, + "logps/rejected": -1408.0, + "loss": 0.6274, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.1416015625, + "rewards/rejected": 0.322265625, + "step": 1461 + }, + { + "epoch": 0.4217510457233521, + "grad_norm": 11.475211810643314, + "learning_rate": 3.5822779262006585e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.203125, + "logps/chosen": -2160.0, + "logps/rejected": -1864.0, + "loss": 0.7166, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.318359375, + "rewards/margins": -0.006378173828125, + "rewards/rejected": 0.32421875, + "step": 1462 + }, + { + "epoch": 0.4220395211308236, + "grad_norm": 10.599393354314644, + "learning_rate": 3.5800074620950445e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1952.0, + "logps/rejected": -1792.0, + "loss": 0.6044, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.2080078125, + "rewards/rejected": 0.26171875, + "step": 1463 + }, + { + "epoch": 0.4223279965382951, + "grad_norm": 11.24534671107371, + "learning_rate": 3.577735902279203e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -1368.0, + "logps/rejected": -1416.0, + "loss": 0.6747, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2109375, + "rewards/margins": 0.048095703125, + "rewards/rejected": 0.162109375, + "step": 1464 + }, + { + "epoch": 0.4226164719457666, + "grad_norm": 11.524128506135431, + "learning_rate": 3.5754632490577217e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.078125, + "logps/chosen": -1568.0, + "logps/rejected": -1552.0, + "loss": 0.6863, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.392578125, + "rewards/margins": -0.006256103515625, + "rewards/rejected": 0.3984375, + "step": 1465 + }, + { + "epoch": 0.42290494735323814, + "grad_norm": 12.028380128721462, + "learning_rate": 3.573189504736296e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1848.0, + "logps/rejected": -1712.0, + "loss": 0.6814, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.0263671875, + "rewards/rejected": 0.3046875, + "step": 1466 + }, + { + "epoch": 0.42319342276070965, + "grad_norm": 11.490971015061676, + "learning_rate": 3.5709146716217314e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.125, + "logps/chosen": -1744.0, + "logps/rejected": -1560.0, + "loss": 0.7112, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.357421875, + "rewards/margins": -0.00909423828125, + "rewards/rejected": 0.3671875, + "step": 1467 + }, + { + "epoch": 0.42348189816818116, + "grad_norm": 10.591383614157886, + "learning_rate": 3.5686387520219334e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.328125, + "logps/chosen": -2048.0, + "logps/rejected": -1968.0, + "loss": 0.7042, + "loss/demonstration_loss": -4064.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.03857421875, + "rewards/rejected": 0.41015625, + "step": 1468 + }, + { + "epoch": 0.42377037357565267, + "grad_norm": 18.18874131949807, + "learning_rate": 3.566361748245915e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.1875, + "logps/chosen": -1912.0, + "logps/rejected": -1928.0, + "loss": 0.6893, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.0791015625, + "rewards/rejected": 0.408203125, + "step": 1469 + }, + { + "epoch": 0.4240588489831242, + "grad_norm": 10.069922765865112, + "learning_rate": 3.5640836626037835e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1464.0, + "logps/rejected": -1392.0, + "loss": 0.6451, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2864.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2001953125, + "rewards/margins": 0.07568359375, + "rewards/rejected": 0.12451171875, + "step": 1470 + }, + { + "epoch": 0.4243473243905957, + "grad_norm": 12.104243013174909, + "learning_rate": 3.561804497406748e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1520.0, + "logps/rejected": -1480.0, + "loss": 0.7076, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34375, + "rewards/margins": 0.06005859375, + "rewards/rejected": 0.28515625, + "step": 1471 + }, + { + "epoch": 0.4246357997980672, + "grad_norm": 8.933401577253049, + "learning_rate": 3.559524254967114e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.171875, + "logps/chosen": -1568.0, + "logps/rejected": -1312.0, + "loss": 0.6716, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.0025177001953125, + "rewards/rejected": 0.302734375, + "step": 1472 + }, + { + "epoch": 0.4249242752055387, + "grad_norm": 11.636055138453058, + "learning_rate": 3.557242937598274e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1640.0, + "logps/rejected": -1728.0, + "loss": 0.7321, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.255859375, + "rewards/margins": 0.02099609375, + "rewards/rejected": 0.234375, + "step": 1473 + }, + { + "epoch": 0.4252127506130102, + "grad_norm": 10.857286000385852, + "learning_rate": 3.554960547614716e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -2024.0, + "logps/rejected": -2000.0, + "loss": 0.6852, + "loss/demonstration_loss": -4080.0, + "loss/preference_loss": -4064.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.072265625, + "rewards/rejected": 0.416015625, + "step": 1474 + }, + { + "epoch": 0.42550122602048174, + "grad_norm": 13.022104543379063, + "learning_rate": 3.552677087332015e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.296875, + "logps/chosen": -1544.0, + "logps/rejected": -1872.0, + "loss": 0.7245, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.23046875, + "rewards/margins": -0.10546875, + "rewards/rejected": 0.3359375, + "step": 1475 + }, + { + "epoch": 0.42578970142795325, + "grad_norm": 11.401791001816868, + "learning_rate": 3.550392559066831e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.28125, + "logps/chosen": -1544.0, + "logps/rejected": -1456.0, + "loss": 0.6519, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.375, + "rewards/margins": 0.1416015625, + "rewards/rejected": 0.234375, + "step": 1476 + }, + { + "epoch": 0.42607817683542476, + "grad_norm": 10.92018613656943, + "learning_rate": 3.5481069651369094e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.21875, + "logps/chosen": -1376.0, + "logps/rejected": -1288.0, + "loss": 0.7062, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2688.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2001953125, + "rewards/margins": -0.007781982421875, + "rewards/rejected": 0.2080078125, + "step": 1477 + }, + { + "epoch": 0.42636665224289627, + "grad_norm": 11.29848873932859, + "learning_rate": 3.545820307861075e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.046875, + "logps/chosen": -1968.0, + "logps/rejected": -1760.0, + "loss": 0.7017, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.111328125, + "rewards/rejected": 0.251953125, + "step": 1478 + }, + { + "epoch": 0.4266551276503678, + "grad_norm": 12.351038068854823, + "learning_rate": 3.5435325895592306e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.171875, + "logps/chosen": -1856.0, + "logps/rejected": -1928.0, + "loss": 0.6968, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3824.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.0032958984375, + "rewards/rejected": 0.359375, + "step": 1479 + }, + { + "epoch": 0.4269436030578393, + "grad_norm": 9.760906515927875, + "learning_rate": 3.5412438125523576e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1784.0, + "logps/rejected": -1752.0, + "loss": 0.6622, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.375, + "rewards/margins": 0.1318359375, + "rewards/rejected": 0.2431640625, + "step": 1480 + }, + { + "epoch": 0.4272320784653108, + "grad_norm": 10.97592202940016, + "learning_rate": 3.5389539791625115e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.03125, + "logps/chosen": -1824.0, + "logps/rejected": -1792.0, + "loss": 0.6846, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.0133056640625, + "rewards/rejected": 0.32421875, + "step": 1481 + }, + { + "epoch": 0.42752055387278237, + "grad_norm": 11.415929299624795, + "learning_rate": 3.5366630917128184e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -2144.0, + "logps/rejected": -1752.0, + "loss": 0.6274, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3920.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.21875, + "rewards/rejected": 0.24609375, + "step": 1482 + }, + { + "epoch": 0.4278090292802539, + "grad_norm": 10.07627410815915, + "learning_rate": 3.534371152527473e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.28125, + "logps/chosen": -1344.0, + "logps/rejected": -1352.0, + "loss": 0.6995, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2736.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2734375, + "rewards/margins": -0.0654296875, + "rewards/rejected": 0.33984375, + "step": 1483 + }, + { + "epoch": 0.4280975046877254, + "grad_norm": 11.408048765193723, + "learning_rate": 3.532078163931739e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.09375, + "logps/chosen": -1544.0, + "logps/rejected": -1264.0, + "loss": 0.6546, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3203125, + "rewards/margins": 0.1826171875, + "rewards/rejected": 0.1376953125, + "step": 1484 + }, + { + "epoch": 0.4283859800951969, + "grad_norm": 10.541827352240697, + "learning_rate": 3.5297841282519436e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.03125, + "logps/chosen": -1880.0, + "logps/rejected": -1760.0, + "loss": 0.691, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.453125, + "rewards/margins": 0.126953125, + "rewards/rejected": 0.326171875, + "step": 1485 + }, + { + "epoch": 0.4286744555026684, + "grad_norm": 12.497894178831125, + "learning_rate": 3.527489047815478e-07, + "logits/chosen": 3.0, + "logits/rejected": 2.96875, + "logps/chosen": -1592.0, + "logps/rejected": -1704.0, + "loss": 0.6682, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.546875, + "rewards/margins": 0.10205078125, + "rewards/rejected": 0.4453125, + "step": 1486 + }, + { + "epoch": 0.42896293091013993, + "grad_norm": 10.651432605591802, + "learning_rate": 3.5251929249507896e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.15625, + "logps/chosen": -1648.0, + "logps/rejected": -1536.0, + "loss": 0.676, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.0089111328125, + "rewards/rejected": 0.40234375, + "step": 1487 + }, + { + "epoch": 0.42925140631761144, + "grad_norm": 13.718201424486463, + "learning_rate": 3.5228957619873874e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.171875, + "logps/chosen": -1728.0, + "logps/rejected": -1640.0, + "loss": 0.693, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.28125, + "rewards/margins": 0.07177734375, + "rewards/rejected": 0.2099609375, + "step": 1488 + }, + { + "epoch": 0.42953988172508295, + "grad_norm": 10.250272815428048, + "learning_rate": 3.520597561255834e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1496.0, + "logps/rejected": -1520.0, + "loss": 0.6668, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.42578125, + "rewards/margins": 0.051513671875, + "rewards/rejected": 0.375, + "step": 1489 + }, + { + "epoch": 0.42982835713255446, + "grad_norm": 10.34411047323811, + "learning_rate": 3.518298325087743e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.1875, + "logps/chosen": -1496.0, + "logps/rejected": -1680.0, + "loss": 0.6757, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.205078125, + "rewards/margins": 0.043701171875, + "rewards/rejected": 0.1611328125, + "step": 1490 + }, + { + "epoch": 0.430116832540026, + "grad_norm": 10.58101444119307, + "learning_rate": 3.515998055815782e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1656.0, + "logps/rejected": -1672.0, + "loss": 0.7004, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.390625, + "rewards/margins": 0.0106201171875, + "rewards/rejected": 0.37890625, + "step": 1491 + }, + { + "epoch": 0.4304053079474975, + "grad_norm": 11.99524345002724, + "learning_rate": 3.513696755773665e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.3125, + "logps/chosen": -1640.0, + "logps/rejected": -1592.0, + "loss": 0.7647, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.29296875, + "rewards/margins": -0.06396484375, + "rewards/rejected": 0.357421875, + "step": 1492 + }, + { + "epoch": 0.430693783354969, + "grad_norm": 12.074562094676685, + "learning_rate": 3.511394427296151e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.109375, + "logps/chosen": -1760.0, + "logps/rejected": -1664.0, + "loss": 0.676, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.306640625, + "rewards/margins": 0.025390625, + "rewards/rejected": 0.28125, + "step": 1493 + }, + { + "epoch": 0.4309822587624405, + "grad_norm": 10.291934724167026, + "learning_rate": 3.5090910727190435e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.125, + "logps/chosen": -1672.0, + "logps/rejected": -1656.0, + "loss": 0.6695, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.484375, + "rewards/margins": 0.1337890625, + "rewards/rejected": 0.349609375, + "step": 1494 + }, + { + "epoch": 0.431270734169912, + "grad_norm": 12.18476995600669, + "learning_rate": 3.5067866943791874e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.28125, + "logps/chosen": -1496.0, + "logps/rejected": -1392.0, + "loss": 0.6832, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.25390625, + "rewards/margins": -0.0458984375, + "rewards/rejected": 0.30078125, + "step": 1495 + }, + { + "epoch": 0.43155920957738353, + "grad_norm": 11.29213144970456, + "learning_rate": 3.5044812946144646e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.078125, + "logps/chosen": -1616.0, + "logps/rejected": -1704.0, + "loss": 0.6919, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.31640625, + "rewards/margins": 0.1337890625, + "rewards/rejected": 0.1826171875, + "step": 1496 + }, + { + "epoch": 0.43184768498485504, + "grad_norm": 12.035781256974701, + "learning_rate": 3.502174875763794e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.203125, + "logps/chosen": -2112.0, + "logps/rejected": -1984.0, + "loss": 0.6764, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.0400390625, + "rewards/rejected": 0.390625, + "step": 1497 + }, + { + "epoch": 0.43213616039232655, + "grad_norm": 11.115259547939155, + "learning_rate": 3.49986744016713e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.21875, + "logps/chosen": -1704.0, + "logps/rejected": -1800.0, + "loss": 0.6672, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.06689453125, + "rewards/rejected": 0.412109375, + "step": 1498 + }, + { + "epoch": 0.43242463579979806, + "grad_norm": 9.1792430654426, + "learning_rate": 3.4975589901654555e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.1875, + "logps/chosen": -1336.0, + "logps/rejected": -1336.0, + "loss": 0.6729, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2688.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1318359375, + "rewards/margins": 0.0111083984375, + "rewards/rejected": 0.12060546875, + "step": 1499 + }, + { + "epoch": 0.4327131112072696, + "grad_norm": 11.16928600759935, + "learning_rate": 3.495249528100786e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -2048.0, + "logps/rejected": -1992.0, + "loss": 0.6832, + "loss/demonstration_loss": -4096.0, + "loss/preference_loss": -4080.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5546875, + "rewards/margins": 0.19140625, + "rewards/rejected": 0.361328125, + "step": 1500 + }, + { + "epoch": 0.4330015866147411, + "grad_norm": 11.179808598153329, + "learning_rate": 3.4929390563161606e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1600.0, + "logps/rejected": -1368.0, + "loss": 0.6486, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.349609375, + "rewards/margins": 0.1533203125, + "rewards/rejected": 0.1962890625, + "step": 1501 + }, + { + "epoch": 0.4332900620222126, + "grad_norm": 11.084748611247903, + "learning_rate": 3.4906275771556435e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1496.0, + "logps/rejected": -1568.0, + "loss": 0.6989, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.287109375, + "rewards/margins": 0.02587890625, + "rewards/rejected": 0.26171875, + "step": 1502 + }, + { + "epoch": 0.4335785374296841, + "grad_norm": 10.5528167332915, + "learning_rate": 3.4883150929643236e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.125, + "logps/chosen": -1360.0, + "logps/rejected": -1136.0, + "loss": 0.6586, + "loss/demonstration_loss": -2512.0, + "loss/preference_loss": -2496.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.138671875, + "rewards/margins": 0.055419921875, + "rewards/rejected": 0.08349609375, + "step": 1503 + }, + { + "epoch": 0.4338670128371556, + "grad_norm": 10.384056551323711, + "learning_rate": 3.486001606088307e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.09375, + "logps/chosen": -1440.0, + "logps/rejected": -1648.0, + "loss": 0.6469, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2734375, + "rewards/margins": 0.142578125, + "rewards/rejected": 0.1318359375, + "step": 1504 + }, + { + "epoch": 0.43415548824462713, + "grad_norm": 10.566809235732396, + "learning_rate": 3.4836871188747165e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1592.0, + "logps/rejected": -1600.0, + "loss": 0.6807, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.265625, + "rewards/margins": -0.0108642578125, + "rewards/rejected": 0.27734375, + "step": 1505 + }, + { + "epoch": 0.43444396365209864, + "grad_norm": 11.197892260093418, + "learning_rate": 3.48137163367169e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1544.0, + "logps/rejected": -1480.0, + "loss": 0.6611, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1533203125, + "rewards/margins": -0.061767578125, + "rewards/rejected": 0.2158203125, + "step": 1506 + }, + { + "epoch": 0.43473243905957015, + "grad_norm": 11.455916758922642, + "learning_rate": 3.479055152828382e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.203125, + "logps/chosen": -1600.0, + "logps/rejected": -1776.0, + "loss": 0.6713, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30859375, + "rewards/margins": 0.051513671875, + "rewards/rejected": 0.2578125, + "step": 1507 + }, + { + "epoch": 0.43502091446704166, + "grad_norm": 13.166538113964373, + "learning_rate": 3.476737678694951e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.1875, + "logps/chosen": -1808.0, + "logps/rejected": -1632.0, + "loss": 0.6819, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.291015625, + "rewards/margins": 0.1171875, + "rewards/rejected": 0.1748046875, + "step": 1508 + }, + { + "epoch": 0.4353093898745132, + "grad_norm": 13.407125853963413, + "learning_rate": 3.474419213622567e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.125, + "logps/chosen": -1872.0, + "logps/rejected": -1984.0, + "loss": 0.6786, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.0235595703125, + "rewards/rejected": 0.296875, + "step": 1509 + }, + { + "epoch": 0.4355978652819847, + "grad_norm": 10.465238306107826, + "learning_rate": 3.472099759963404e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.125, + "logps/chosen": -1592.0, + "logps/rejected": -1792.0, + "loss": 0.6251, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2265625, + "rewards/margins": 0.0301513671875, + "rewards/rejected": 0.1962890625, + "step": 1510 + }, + { + "epoch": 0.4358863406894562, + "grad_norm": 10.30714250308569, + "learning_rate": 3.4697793200706395e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.25, + "logps/chosen": -1424.0, + "logps/rejected": -1384.0, + "loss": 0.6339, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2832.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3125, + "rewards/margins": 0.1123046875, + "rewards/rejected": 0.201171875, + "step": 1511 + }, + { + "epoch": 0.43617481609692776, + "grad_norm": 10.594098814439167, + "learning_rate": 3.467457896298452e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1584.0, + "logps/rejected": -1728.0, + "loss": 0.6526, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28125, + "rewards/margins": 0.095703125, + "rewards/rejected": 0.185546875, + "step": 1512 + }, + { + "epoch": 0.4364632915043993, + "grad_norm": 11.18977581108728, + "learning_rate": 3.465135491002017e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -1848.0, + "logps/rejected": -1824.0, + "loss": 0.6675, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.421875, + "rewards/margins": 0.0634765625, + "rewards/rejected": 0.357421875, + "step": 1513 + }, + { + "epoch": 0.4367517669118708, + "grad_norm": 11.67466983083644, + "learning_rate": 3.462812106537506e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.03125, + "logps/chosen": -1800.0, + "logps/rejected": -1800.0, + "loss": 0.6767, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.1083984375, + "rewards/rejected": 0.212890625, + "step": 1514 + }, + { + "epoch": 0.4370402423193423, + "grad_norm": 22.602851819045313, + "learning_rate": 3.4604877452620853e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1768.0, + "logps/rejected": -1688.0, + "loss": 0.6974, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.353515625, + "rewards/margins": -0.0089111328125, + "rewards/rejected": 0.361328125, + "step": 1515 + }, + { + "epoch": 0.4373287177268138, + "grad_norm": 11.26426053911973, + "learning_rate": 3.4581624095339114e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.28125, + "logps/chosen": -1960.0, + "logps/rejected": -1944.0, + "loss": 0.6805, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.34375, + "rewards/margins": 0.04296875, + "rewards/rejected": 0.30078125, + "step": 1516 + }, + { + "epoch": 0.4376171931342853, + "grad_norm": 11.020470055369165, + "learning_rate": 3.4558361017121275e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.296875, + "logps/chosen": -1864.0, + "logps/rejected": -1864.0, + "loss": 0.6659, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40625, + "rewards/margins": 0.08642578125, + "rewards/rejected": 0.3203125, + "step": 1517 + }, + { + "epoch": 0.43790566854175683, + "grad_norm": 11.02489383490104, + "learning_rate": 3.453508824156866e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1712.0, + "logps/rejected": -1600.0, + "loss": 0.6729, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.078125, + "rewards/rejected": 0.4140625, + "step": 1518 + }, + { + "epoch": 0.43819414394922834, + "grad_norm": 11.641776554706913, + "learning_rate": 3.451180579229242e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.1875, + "logps/chosen": -1656.0, + "logps/rejected": -1640.0, + "loss": 0.6934, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3828125, + "rewards/margins": 0.03173828125, + "rewards/rejected": 0.3515625, + "step": 1519 + }, + { + "epoch": 0.43848261935669985, + "grad_norm": 10.702828371236077, + "learning_rate": 3.448851369291351e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.03125, + "logps/chosen": -1720.0, + "logps/rejected": -1568.0, + "loss": 0.6484, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.1923828125, + "rewards/rejected": 0.2734375, + "step": 1520 + }, + { + "epoch": 0.43877109476417137, + "grad_norm": 12.82441133727652, + "learning_rate": 3.446521196706271e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1776.0, + "logps/rejected": -1896.0, + "loss": 0.7014, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.388671875, + "step": 1521 + }, + { + "epoch": 0.4390595701716429, + "grad_norm": 12.00846089074656, + "learning_rate": 3.4441900638380503e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1808.0, + "logps/rejected": -1824.0, + "loss": 0.6988, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.48046875, + "rewards/margins": -0.013671875, + "rewards/rejected": 0.494140625, + "step": 1522 + }, + { + "epoch": 0.4393480455791144, + "grad_norm": 12.468991620479162, + "learning_rate": 3.4418579730517185e-07, + "logits/chosen": 3.03125, + "logits/rejected": 3.09375, + "logps/chosen": -1992.0, + "logps/rejected": -1784.0, + "loss": 0.6887, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.349609375, + "rewards/margins": 0.115234375, + "rewards/rejected": 0.234375, + "step": 1523 + }, + { + "epoch": 0.4396365209865859, + "grad_norm": 9.023467586083413, + "learning_rate": 3.439524926713272e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -1672.0, + "logps/rejected": -1704.0, + "loss": 0.6411, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.076171875, + "rewards/rejected": 0.255859375, + "step": 1524 + }, + { + "epoch": 0.4399249963940574, + "grad_norm": 11.096406358094628, + "learning_rate": 3.4371909271896786e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.15625, + "logps/chosen": -1800.0, + "logps/rejected": -1864.0, + "loss": 0.6544, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.1357421875, + "rewards/rejected": 0.373046875, + "step": 1525 + }, + { + "epoch": 0.4402134718015289, + "grad_norm": 10.846299160475898, + "learning_rate": 3.4348559768488747e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.25, + "logps/chosen": -1576.0, + "logps/rejected": -1112.0, + "loss": 0.6783, + "loss/demonstration_loss": -2704.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2001953125, + "rewards/margins": 0.08154296875, + "rewards/rejected": 0.119140625, + "step": 1526 + }, + { + "epoch": 0.44050194720900043, + "grad_norm": 12.50277006010662, + "learning_rate": 3.432520078059758e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1560.0, + "logps/rejected": -1784.0, + "loss": 0.7116, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.380859375, + "rewards/margins": 0.0281982421875, + "rewards/rejected": 0.3515625, + "step": 1527 + }, + { + "epoch": 0.44079042261647194, + "grad_norm": 10.438881187852047, + "learning_rate": 3.4301832331921894e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1304.0, + "logps/rejected": -1256.0, + "loss": 0.682, + "loss/demonstration_loss": -2576.0, + "loss/preference_loss": -2576.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.134765625, + "rewards/margins": 0.0269775390625, + "rewards/rejected": 0.10791015625, + "step": 1528 + }, + { + "epoch": 0.44107889802394346, + "grad_norm": 12.043717891923857, + "learning_rate": 3.4278454446169926e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.125, + "logps/chosen": -1512.0, + "logps/rejected": -1656.0, + "loss": 0.7047, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.267578125, + "rewards/margins": 0.0205078125, + "rewards/rejected": 0.24609375, + "step": 1529 + }, + { + "epoch": 0.44136737343141497, + "grad_norm": 10.909474402337759, + "learning_rate": 3.4255067147059446e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.15625, + "logps/chosen": -1720.0, + "logps/rejected": -1384.0, + "loss": 0.688, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.14453125, + "rewards/margins": 0.0361328125, + "rewards/rejected": 0.10888671875, + "step": 1530 + }, + { + "epoch": 0.4416558488388865, + "grad_norm": 11.163215450835171, + "learning_rate": 3.42316704583178e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.203125, + "logps/chosen": -1840.0, + "logps/rejected": -1864.0, + "loss": 0.6673, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.330078125, + "rewards/margins": -0.007415771484375, + "rewards/rejected": 0.337890625, + "step": 1531 + }, + { + "epoch": 0.441944324246358, + "grad_norm": 11.428731644688176, + "learning_rate": 3.420826440368185e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.140625, + "logps/chosen": -1928.0, + "logps/rejected": -2048.0, + "loss": 0.7112, + "loss/demonstration_loss": -4016.0, + "loss/preference_loss": -4016.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.005889892578125, + "rewards/rejected": 0.33203125, + "step": 1532 + }, + { + "epoch": 0.4422327996538295, + "grad_norm": 13.242785426143541, + "learning_rate": 3.4184849006897965e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.046875, + "logps/chosen": -1592.0, + "logps/rejected": -1616.0, + "loss": 0.7047, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.34375, + "rewards/margins": 0.12060546875, + "rewards/rejected": 0.2236328125, + "step": 1533 + }, + { + "epoch": 0.442521275061301, + "grad_norm": 10.695306665147804, + "learning_rate": 3.4161424291722e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.1875, + "logps/chosen": -1432.0, + "logps/rejected": -1528.0, + "loss": 0.6439, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28125, + "rewards/margins": 0.056884765625, + "rewards/rejected": 0.224609375, + "step": 1534 + }, + { + "epoch": 0.4428097504687725, + "grad_norm": 11.559363719089523, + "learning_rate": 3.413799028191923e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -1528.0, + "logps/rejected": -1776.0, + "loss": 0.6787, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.2236328125, + "rewards/margins": -0.003021240234375, + "rewards/rejected": 0.2265625, + "step": 1535 + }, + { + "epoch": 0.44309822587624403, + "grad_norm": 11.80060863439122, + "learning_rate": 3.41145470012644e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1680.0, + "logps/rejected": -1560.0, + "loss": 0.6838, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.40625, + "rewards/margins": 0.05810546875, + "rewards/rejected": 0.34765625, + "step": 1536 + }, + { + "epoch": 0.44338670128371555, + "grad_norm": 11.28455687625612, + "learning_rate": 3.4091094473541643e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1360.0, + "logps/rejected": -1400.0, + "loss": 0.7161, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.025390625, + "rewards/rejected": 0.369140625, + "step": 1537 + }, + { + "epoch": 0.44367517669118706, + "grad_norm": 12.514969802051278, + "learning_rate": 3.406763272254447e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.03125, + "logps/chosen": -1640.0, + "logps/rejected": -1496.0, + "loss": 0.6927, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4609375, + "rewards/margins": 0.193359375, + "rewards/rejected": 0.267578125, + "step": 1538 + }, + { + "epoch": 0.44396365209865857, + "grad_norm": 11.270246395427481, + "learning_rate": 3.404416177207576e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.328125, + "logps/chosen": -1560.0, + "logps/rejected": -1232.0, + "loss": 0.6985, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.173828125, + "rewards/margins": 0.05322265625, + "rewards/rejected": 0.12060546875, + "step": 1539 + }, + { + "epoch": 0.4442521275061301, + "grad_norm": 12.486459281296215, + "learning_rate": 3.4020681645947714e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.34375, + "logps/chosen": -1792.0, + "logps/rejected": -1552.0, + "loss": 0.6809, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28515625, + "rewards/margins": 0.1728515625, + "rewards/rejected": 0.11279296875, + "step": 1540 + }, + { + "epoch": 0.4445406029136016, + "grad_norm": 11.513183998244546, + "learning_rate": 3.3997192367981846e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.1875, + "logps/chosen": -1568.0, + "logps/rejected": -1696.0, + "loss": 0.6741, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.234375, + "rewards/margins": 0.0849609375, + "rewards/rejected": 0.1494140625, + "step": 1541 + }, + { + "epoch": 0.4448290783210731, + "grad_norm": 12.169513875846395, + "learning_rate": 3.3973693962008964e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.203125, + "logps/chosen": -1616.0, + "logps/rejected": -1576.0, + "loss": 0.7222, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.296875, + "rewards/margins": -0.04833984375, + "rewards/rejected": 0.345703125, + "step": 1542 + }, + { + "epoch": 0.44511755372854467, + "grad_norm": 11.185228664591719, + "learning_rate": 3.395018645186913e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.109375, + "logps/chosen": -1888.0, + "logps/rejected": -1752.0, + "loss": 0.637, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.546875, + "rewards/margins": 0.11376953125, + "rewards/rejected": 0.43359375, + "step": 1543 + }, + { + "epoch": 0.4454060291360162, + "grad_norm": 11.48268926144227, + "learning_rate": 3.3926669861411623e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.28125, + "logps/chosen": -1560.0, + "logps/rejected": -1520.0, + "loss": 0.6781, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1826171875, + "rewards/margins": 0.0216064453125, + "rewards/rejected": 0.1611328125, + "step": 1544 + }, + { + "epoch": 0.4456945045434877, + "grad_norm": 10.466847863688741, + "learning_rate": 3.3903144214494976e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1672.0, + "logps/rejected": -1536.0, + "loss": 0.654, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.30859375, + "rewards/margins": 0.031494140625, + "rewards/rejected": 0.27734375, + "step": 1545 + }, + { + "epoch": 0.4459829799509592, + "grad_norm": 12.004655549239814, + "learning_rate": 3.387960953498687e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.296875, + "logps/chosen": -1472.0, + "logps/rejected": -1232.0, + "loss": 0.6689, + "loss/demonstration_loss": -2720.0, + "loss/preference_loss": -2704.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1953125, + "rewards/margins": 0.1259765625, + "rewards/rejected": 0.06982421875, + "step": 1546 + }, + { + "epoch": 0.4462714553584307, + "grad_norm": 13.128230455047806, + "learning_rate": 3.3856065846764174e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.1875, + "logps/chosen": -1688.0, + "logps/rejected": -1568.0, + "loss": 0.6219, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.205078125, + "rewards/margins": 0.09375, + "rewards/rejected": 0.111328125, + "step": 1547 + }, + { + "epoch": 0.4465599307659022, + "grad_norm": 14.52973155880915, + "learning_rate": 3.3832513173712895e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.0625, + "logps/chosen": -1352.0, + "logps/rejected": -1360.0, + "loss": 0.7164, + "loss/demonstration_loss": -2736.0, + "loss/preference_loss": -2736.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.333984375, + "rewards/margins": 0.013916015625, + "rewards/rejected": 0.3203125, + "step": 1548 + }, + { + "epoch": 0.44684840617337374, + "grad_norm": 12.76137699543709, + "learning_rate": 3.3808951539728145e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1840.0, + "logps/rejected": -1728.0, + "loss": 0.6718, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.310546875, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.2373046875, + "step": 1549 + }, + { + "epoch": 0.44713688158084525, + "grad_norm": 11.938625019606807, + "learning_rate": 3.378538096871412e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.25, + "logps/chosen": -1912.0, + "logps/rejected": -1832.0, + "loss": 0.7285, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.341796875, + "rewards/margins": -0.0712890625, + "rewards/rejected": 0.412109375, + "step": 1550 + }, + { + "epoch": 0.44742535698831676, + "grad_norm": 11.123998625204196, + "learning_rate": 3.376180148458412e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.328125, + "logps/chosen": -1408.0, + "logps/rejected": -1568.0, + "loss": 0.6926, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.328125, + "rewards/margins": -0.002777099609375, + "rewards/rejected": 0.330078125, + "step": 1551 + }, + { + "epoch": 0.44771383239578827, + "grad_norm": 12.054509010670344, + "learning_rate": 3.373821311126044e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1512.0, + "logps/rejected": -1656.0, + "loss": 0.697, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2265625, + "rewards/margins": -0.025146484375, + "rewards/rejected": 0.251953125, + "step": 1552 + }, + { + "epoch": 0.4480023078032598, + "grad_norm": 10.209600410213321, + "learning_rate": 3.371461587267444e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.296875, + "logps/chosen": -1560.0, + "logps/rejected": -1360.0, + "loss": 0.6876, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.31640625, + "rewards/margins": 0.00555419921875, + "rewards/rejected": 0.310546875, + "step": 1553 + }, + { + "epoch": 0.4482907832107313, + "grad_norm": 10.701349061954332, + "learning_rate": 3.3691009792766424e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1760.0, + "logps/rejected": -1720.0, + "loss": 0.6697, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.294921875, + "rewards/margins": 0.032470703125, + "rewards/rejected": 0.263671875, + "step": 1554 + }, + { + "epoch": 0.4485792586182028, + "grad_norm": 10.878644934593197, + "learning_rate": 3.3667394895485705e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1472.0, + "logps/rejected": -1344.0, + "loss": 0.6806, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1953125, + "rewards/margins": 0.1025390625, + "rewards/rejected": 0.09228515625, + "step": 1555 + }, + { + "epoch": 0.4488677340256743, + "grad_norm": 12.740973489305533, + "learning_rate": 3.364377120479054e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.296875, + "logps/chosen": -1640.0, + "logps/rejected": -1616.0, + "loss": 0.6921, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.1220703125, + "rewards/margins": -0.0732421875, + "rewards/rejected": 0.1953125, + "step": 1556 + }, + { + "epoch": 0.4491562094331458, + "grad_norm": 12.020944587129142, + "learning_rate": 3.3620138744648076e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1488.0, + "logps/rejected": -1544.0, + "loss": 0.7266, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.02001953125, + "rewards/rejected": 0.318359375, + "step": 1557 + }, + { + "epoch": 0.44944468484061734, + "grad_norm": 12.29827772532482, + "learning_rate": 3.3596497539034396e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.125, + "logps/chosen": -1648.0, + "logps/rejected": -1792.0, + "loss": 0.7012, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.3359375, + "rewards/margins": -0.0174560546875, + "rewards/rejected": 0.353515625, + "step": 1558 + }, + { + "epoch": 0.44973316024808885, + "grad_norm": 11.45808879883158, + "learning_rate": 3.3572847611934417e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1600.0, + "logps/rejected": -1520.0, + "loss": 0.6841, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2255859375, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.15234375, + "step": 1559 + }, + { + "epoch": 0.45002163565556036, + "grad_norm": 10.428768429102382, + "learning_rate": 3.354918898734194e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.03125, + "logps/chosen": -1768.0, + "logps/rejected": -1544.0, + "loss": 0.6547, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2373046875, + "rewards/margins": 0.1982421875, + "rewards/rejected": 0.038330078125, + "step": 1560 + }, + { + "epoch": 0.45031011106303187, + "grad_norm": 10.80151748863185, + "learning_rate": 3.352552168925957e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1944.0, + "logps/rejected": -1880.0, + "loss": 0.6698, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.314453125, + "rewards/margins": 0.056396484375, + "rewards/rejected": 0.2578125, + "step": 1561 + }, + { + "epoch": 0.4505985864705034, + "grad_norm": 11.242299107912803, + "learning_rate": 3.350184574169872e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.15625, + "logps/chosen": -1384.0, + "logps/rejected": -1488.0, + "loss": 0.6974, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2912.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3203125, + "rewards/margins": -0.054931640625, + "rewards/rejected": 0.375, + "step": 1562 + }, + { + "epoch": 0.4508870618779749, + "grad_norm": 10.54453354195842, + "learning_rate": 3.347816116867956e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1920.0, + "logps/rejected": -1584.0, + "loss": 0.6585, + "loss/demonstration_loss": -3552.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.1689453125, + "rewards/rejected": 0.2734375, + "step": 1563 + }, + { + "epoch": 0.4511755372854464, + "grad_norm": 9.739913471552216, + "learning_rate": 3.345446799423103e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.25, + "logps/chosen": -1232.0, + "logps/rejected": -1496.0, + "loss": 0.688, + "loss/demonstration_loss": -2752.0, + "loss/preference_loss": -2752.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1591796875, + "rewards/margins": 0.01177978515625, + "rewards/rejected": 0.1474609375, + "step": 1564 + }, + { + "epoch": 0.4514640126929179, + "grad_norm": 10.979060066881837, + "learning_rate": 3.343076624239081e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1712.0, + "logps/rejected": -1432.0, + "loss": 0.6838, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.201171875, + "rewards/margins": 0.056640625, + "rewards/rejected": 0.14453125, + "step": 1565 + }, + { + "epoch": 0.4517524881003894, + "grad_norm": 10.107511809314284, + "learning_rate": 3.3407055937205233e-07, + "logits/chosen": 3.015625, + "logits/rejected": 3.078125, + "logps/chosen": -1600.0, + "logps/rejected": -1440.0, + "loss": 0.6306, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26953125, + "rewards/margins": 0.15234375, + "rewards/rejected": 0.11767578125, + "step": 1566 + }, + { + "epoch": 0.45204096350786094, + "grad_norm": 11.976752168481113, + "learning_rate": 3.338333710272936e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -1944.0, + "logps/rejected": -1984.0, + "loss": 0.6985, + "loss/demonstration_loss": -3968.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.06689453125, + "rewards/rejected": 0.26953125, + "step": 1567 + }, + { + "epoch": 0.45232943891533245, + "grad_norm": 9.589232610036449, + "learning_rate": 3.3359609763026875e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.203125, + "logps/chosen": -1592.0, + "logps/rejected": -1672.0, + "loss": 0.6697, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.041259765625, + "rewards/rejected": 0.263671875, + "step": 1568 + }, + { + "epoch": 0.45261791432280396, + "grad_norm": 10.016207608726702, + "learning_rate": 3.333587394217011e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.28125, + "logps/chosen": -2016.0, + "logps/rejected": -1920.0, + "loss": 0.6058, + "loss/demonstration_loss": -3984.0, + "loss/preference_loss": -3952.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.29296875, + "rewards/rejected": 0.251953125, + "step": 1569 + }, + { + "epoch": 0.45290638973027547, + "grad_norm": 11.002271865363085, + "learning_rate": 3.3312129664239995e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1376.0, + "logps/rejected": -1400.0, + "loss": 0.6397, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.421875, + "rewards/margins": 0.2333984375, + "rewards/rejected": 0.189453125, + "step": 1570 + }, + { + "epoch": 0.453194865137747, + "grad_norm": 11.563546539471192, + "learning_rate": 3.328837695332603e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.265625, + "logps/chosen": -1184.0, + "logps/rejected": -1200.0, + "loss": 0.7157, + "loss/demonstration_loss": -2400.0, + "loss/preference_loss": -2400.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.205078125, + "rewards/margins": -0.0235595703125, + "rewards/rejected": 0.228515625, + "step": 1571 + }, + { + "epoch": 0.4534833405452185, + "grad_norm": 11.417359228722788, + "learning_rate": 3.326461583352628e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.3125, + "logps/chosen": -1624.0, + "logps/rejected": -1400.0, + "loss": 0.6878, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.35546875, + "rewards/margins": -0.000152587890625, + "rewards/rejected": 0.357421875, + "step": 1572 + }, + { + "epoch": 0.45377181595269006, + "grad_norm": 10.926266677011677, + "learning_rate": 3.3240846328947344e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.1875, + "logps/chosen": -1536.0, + "logps/rejected": -1512.0, + "loss": 0.6918, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06640625, + "rewards/margins": -0.0172119140625, + "rewards/rejected": 0.083984375, + "step": 1573 + }, + { + "epoch": 0.45406029136016157, + "grad_norm": 12.408531519132385, + "learning_rate": 3.3217068463704314e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.296875, + "logps/chosen": -1792.0, + "logps/rejected": -1888.0, + "loss": 0.6757, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.09228515625, + "rewards/rejected": 0.41015625, + "step": 1574 + }, + { + "epoch": 0.4543487667676331, + "grad_norm": 11.68798859693865, + "learning_rate": 3.319328226192078e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.34375, + "logps/chosen": -1808.0, + "logps/rejected": -1816.0, + "loss": 0.6769, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.271484375, + "rewards/margins": 0.050048828125, + "rewards/rejected": 0.2216796875, + "step": 1575 + }, + { + "epoch": 0.4546372421751046, + "grad_norm": 11.959906105865793, + "learning_rate": 3.316948774772878e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.28125, + "logps/chosen": -2160.0, + "logps/rejected": -1968.0, + "loss": 0.7026, + "loss/demonstration_loss": -4160.0, + "loss/preference_loss": -4160.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.255859375, + "rewards/margins": 0.014404296875, + "rewards/rejected": 0.2412109375, + "step": 1576 + }, + { + "epoch": 0.4549257175825761, + "grad_norm": 11.432737253268336, + "learning_rate": 3.314568494526879e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.203125, + "logps/chosen": -1488.0, + "logps/rejected": -1656.0, + "loss": 0.6793, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.357421875, + "rewards/margins": 0.007110595703125, + "rewards/rejected": 0.349609375, + "step": 1577 + }, + { + "epoch": 0.4552141929900476, + "grad_norm": 9.636465383121207, + "learning_rate": 3.31218738786897e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1528.0, + "logps/rejected": -1456.0, + "loss": 0.6393, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3984375, + "rewards/margins": 0.130859375, + "rewards/rejected": 0.267578125, + "step": 1578 + }, + { + "epoch": 0.45550266839751913, + "grad_norm": 10.285673423955991, + "learning_rate": 3.309805457214877e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.34375, + "logps/chosen": -1824.0, + "logps/rejected": -1752.0, + "loss": 0.6726, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.047607421875, + "rewards/rejected": 0.287109375, + "step": 1579 + }, + { + "epoch": 0.45579114380499064, + "grad_norm": 10.672935570587352, + "learning_rate": 3.3074227049811624e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.328125, + "logps/chosen": -1336.0, + "logps/rejected": -1464.0, + "loss": 0.636, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2800.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.201171875, + "rewards/margins": 0.1201171875, + "rewards/rejected": 0.0810546875, + "step": 1580 + }, + { + "epoch": 0.45607961921246215, + "grad_norm": 12.585785571947707, + "learning_rate": 3.305039133585223e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.3125, + "logps/chosen": -1464.0, + "logps/rejected": -1400.0, + "loss": 0.7053, + "loss/demonstration_loss": -2896.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.353515625, + "rewards/margins": -0.0263671875, + "rewards/rejected": 0.37890625, + "step": 1581 + }, + { + "epoch": 0.45636809461993366, + "grad_norm": 12.024795240532654, + "learning_rate": 3.3026547454452863e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.296875, + "logps/chosen": -1520.0, + "logps/rejected": -1640.0, + "loss": 0.6483, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.171875, + "rewards/margins": 0.04150390625, + "rewards/rejected": 0.130859375, + "step": 1582 + }, + { + "epoch": 0.4566565700274052, + "grad_norm": 10.264610433567263, + "learning_rate": 3.3002695429804084e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.296875, + "logps/chosen": -1280.0, + "logps/rejected": -1504.0, + "loss": 0.6642, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2138671875, + "rewards/margins": -0.0118408203125, + "rewards/rejected": 0.2255859375, + "step": 1583 + }, + { + "epoch": 0.4569450454348767, + "grad_norm": 11.29637641607261, + "learning_rate": 3.2978835286104705e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.296875, + "logps/chosen": -1840.0, + "logps/rejected": -1832.0, + "loss": 0.7186, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.255859375, + "rewards/margins": -0.08056640625, + "rewards/rejected": 0.337890625, + "step": 1584 + }, + { + "epoch": 0.4572335208423482, + "grad_norm": 9.280602330638628, + "learning_rate": 3.295496704756179e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -1552.0, + "logps/rejected": -1384.0, + "loss": 0.6304, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2960.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.18359375, + "rewards/rejected": 0.2109375, + "step": 1585 + }, + { + "epoch": 0.4575219962498197, + "grad_norm": 9.715913642587658, + "learning_rate": 3.2931090738390597e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.3125, + "logps/chosen": -1792.0, + "logps/rejected": -1936.0, + "loss": 0.715, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.291015625, + "rewards/margins": -0.0167236328125, + "rewards/rejected": 0.30859375, + "step": 1586 + }, + { + "epoch": 0.4578104716572912, + "grad_norm": 11.433723127124694, + "learning_rate": 3.2907206382814606e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.203125, + "logps/chosen": -1608.0, + "logps/rejected": -1656.0, + "loss": 0.6518, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.40625, + "rewards/margins": 0.189453125, + "rewards/rejected": 0.2177734375, + "step": 1587 + }, + { + "epoch": 0.45809894706476273, + "grad_norm": 11.419950109332657, + "learning_rate": 3.2883314005065434e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1840.0, + "logps/rejected": -1680.0, + "loss": 0.6088, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.45703125, + "rewards/margins": 0.2451171875, + "rewards/rejected": 0.2109375, + "step": 1588 + }, + { + "epoch": 0.45838742247223424, + "grad_norm": 11.81995880819628, + "learning_rate": 3.285941362938284e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.265625, + "logps/chosen": -1992.0, + "logps/rejected": -2096.0, + "loss": 0.7213, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.482421875, + "rewards/margins": -0.01904296875, + "rewards/rejected": 0.5, + "step": 1589 + }, + { + "epoch": 0.45867589787970575, + "grad_norm": 12.531602892650216, + "learning_rate": 3.283550528001469e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.109375, + "logps/chosen": -1928.0, + "logps/rejected": -2048.0, + "loss": 0.7113, + "loss/demonstration_loss": -4016.0, + "loss/preference_loss": -4000.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40234375, + "rewards/margins": 0.07421875, + "rewards/rejected": 0.328125, + "step": 1590 + }, + { + "epoch": 0.45896437328717726, + "grad_norm": 12.219974697480648, + "learning_rate": 3.2811588981216946e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.296875, + "logps/chosen": -1880.0, + "logps/rejected": -1680.0, + "loss": 0.6763, + "loss/demonstration_loss": -3600.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.353515625, + "rewards/margins": 0.11572265625, + "rewards/rejected": 0.23828125, + "step": 1591 + }, + { + "epoch": 0.4592528486946488, + "grad_norm": 10.91595781896608, + "learning_rate": 3.2787664757253663e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.25, + "logps/chosen": -1864.0, + "logps/rejected": -1856.0, + "loss": 0.7209, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.275390625, + "rewards/margins": -0.05322265625, + "rewards/rejected": 0.328125, + "step": 1592 + }, + { + "epoch": 0.4595413241021203, + "grad_norm": 11.285074798067944, + "learning_rate": 3.2763732632396885e-07, + "logits/chosen": 3.0, + "logits/rejected": 3.21875, + "logps/chosen": -1416.0, + "logps/rejected": -1400.0, + "loss": 0.6818, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3515625, + "rewards/margins": 0.0849609375, + "rewards/rejected": 0.267578125, + "step": 1593 + }, + { + "epoch": 0.4598297995095918, + "grad_norm": 10.475028053202847, + "learning_rate": 3.273979263092671e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.375, + "logps/chosen": -2064.0, + "logps/rejected": -2040.0, + "loss": 0.6253, + "loss/demonstration_loss": -4160.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.53125, + "rewards/margins": 0.26171875, + "rewards/rejected": 0.26953125, + "step": 1594 + }, + { + "epoch": 0.4601182749170633, + "grad_norm": 10.283963465561667, + "learning_rate": 3.271584477713121e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1768.0, + "logps/rejected": -1712.0, + "loss": 0.6874, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26953125, + "rewards/margins": 0.025634765625, + "rewards/rejected": 0.244140625, + "step": 1595 + }, + { + "epoch": 0.4604067503245348, + "grad_norm": 9.422505031334358, + "learning_rate": 3.269188909530644e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.125, + "logps/chosen": -1608.0, + "logps/rejected": -1656.0, + "loss": 0.6855, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.3515625, + "rewards/margins": -0.0216064453125, + "rewards/rejected": 0.373046875, + "step": 1596 + }, + { + "epoch": 0.46069522573200633, + "grad_norm": 16.83808828034605, + "learning_rate": 3.266792560975638e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -1760.0, + "logps/rejected": -1888.0, + "loss": 0.7069, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40625, + "rewards/margins": 0.07763671875, + "rewards/rejected": 0.328125, + "step": 1597 + }, + { + "epoch": 0.46098370113947784, + "grad_norm": 14.722103705466585, + "learning_rate": 3.264395434479292e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.34375, + "logps/chosen": -1512.0, + "logps/rejected": -1600.0, + "loss": 0.6779, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.302734375, + "rewards/margins": 0.11865234375, + "rewards/rejected": 0.18359375, + "step": 1598 + }, + { + "epoch": 0.46127217654694935, + "grad_norm": 10.824814018495962, + "learning_rate": 3.2619975324735866e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -1680.0, + "logps/rejected": -1536.0, + "loss": 0.7293, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2412109375, + "rewards/margins": -0.0400390625, + "rewards/rejected": 0.28125, + "step": 1599 + }, + { + "epoch": 0.46156065195442086, + "grad_norm": 10.191871572579466, + "learning_rate": 3.259598857391289e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.3125, + "logps/chosen": -1552.0, + "logps/rejected": -1568.0, + "loss": 0.6488, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.279296875, + "rewards/margins": 0.023681640625, + "rewards/rejected": 0.255859375, + "step": 1600 + }, + { + "epoch": 0.4618491273618924, + "grad_norm": 12.877389668105883, + "learning_rate": 3.2571994116659474e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.1875, + "logps/chosen": -1808.0, + "logps/rejected": -1664.0, + "loss": 0.6634, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.228515625, + "rewards/margins": 0.076171875, + "rewards/rejected": 0.15234375, + "step": 1601 + }, + { + "epoch": 0.4621376027693639, + "grad_norm": 10.438705605147105, + "learning_rate": 3.254799197731896e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1648.0, + "logps/rejected": -1960.0, + "loss": 0.6796, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.08740234375, + "rewards/rejected": 0.388671875, + "step": 1602 + }, + { + "epoch": 0.4624260781768354, + "grad_norm": 10.56010210180927, + "learning_rate": 3.2523982180242465e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.125, + "logps/chosen": -1528.0, + "logps/rejected": -1568.0, + "loss": 0.6579, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.263671875, + "rewards/margins": 0.1494140625, + "rewards/rejected": 0.11328125, + "step": 1603 + }, + { + "epoch": 0.46271455358430696, + "grad_norm": 10.218838138049144, + "learning_rate": 3.249996474978887e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.171875, + "logps/chosen": -1784.0, + "logps/rejected": -1552.0, + "loss": 0.651, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.41796875, + "rewards/margins": 0.1201171875, + "rewards/rejected": 0.298828125, + "step": 1604 + }, + { + "epoch": 0.4630030289917785, + "grad_norm": 11.209920137305854, + "learning_rate": 3.2475939710324817e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1728.0, + "logps/rejected": -1568.0, + "loss": 0.7146, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1708984375, + "rewards/margins": -0.150390625, + "rewards/rejected": 0.3203125, + "step": 1605 + }, + { + "epoch": 0.46329150439925, + "grad_norm": 11.03767058470553, + "learning_rate": 3.245190708622465e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.28125, + "logps/chosen": -1920.0, + "logps/rejected": -1728.0, + "loss": 0.705, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.291015625, + "rewards/margins": 0.03369140625, + "rewards/rejected": 0.2578125, + "step": 1606 + }, + { + "epoch": 0.4635799798067215, + "grad_norm": 10.86809960140214, + "learning_rate": 3.242786690187042e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.234375, + "logps/chosen": -1784.0, + "logps/rejected": -1824.0, + "loss": 0.667, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.16796875, + "rewards/rejected": 0.1953125, + "step": 1607 + }, + { + "epoch": 0.463868455214193, + "grad_norm": 12.081922729437892, + "learning_rate": 3.2403819181651836e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.125, + "logps/chosen": -1464.0, + "logps/rejected": -1552.0, + "loss": 0.6693, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.08544921875, + "rewards/rejected": 0.21875, + "step": 1608 + }, + { + "epoch": 0.4641569306216645, + "grad_norm": 13.26160229986464, + "learning_rate": 3.237976394996626e-07, + "logits/chosen": 3.40625, + "logits/rejected": 3.390625, + "logps/chosen": -1976.0, + "logps/rejected": -2112.0, + "loss": 0.7524, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4128.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.26953125, + "rewards/margins": -0.2431640625, + "rewards/rejected": 0.51171875, + "step": 1609 + }, + { + "epoch": 0.46444540602913603, + "grad_norm": 9.047943325793094, + "learning_rate": 3.235570123121869e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.265625, + "logps/chosen": -1568.0, + "logps/rejected": -1344.0, + "loss": 0.6666, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.11865234375, + "rewards/rejected": 0.291015625, + "step": 1610 + }, + { + "epoch": 0.46473388143660754, + "grad_norm": 10.39249120187221, + "learning_rate": 3.233163104982169e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.046875, + "logps/chosen": -1720.0, + "logps/rejected": -1592.0, + "loss": 0.686, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.259765625, + "rewards/margins": 0.00299072265625, + "rewards/rejected": 0.255859375, + "step": 1611 + }, + { + "epoch": 0.46502235684407905, + "grad_norm": 9.829661040315031, + "learning_rate": 3.2307553430195407e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.296875, + "logps/chosen": -1760.0, + "logps/rejected": -1824.0, + "loss": 0.6541, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.455078125, + "rewards/margins": 0.11376953125, + "rewards/rejected": 0.341796875, + "step": 1612 + }, + { + "epoch": 0.46531083225155057, + "grad_norm": 11.087936566447548, + "learning_rate": 3.2283468396767546e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.09375, + "logps/chosen": -2064.0, + "logps/rejected": -2096.0, + "loss": 0.6747, + "loss/demonstration_loss": -4224.0, + "loss/preference_loss": -4224.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.61328125, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.546875, + "step": 1613 + }, + { + "epoch": 0.4655993076590221, + "grad_norm": 11.602640017393142, + "learning_rate": 3.225937597397332e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.25, + "logps/chosen": -1568.0, + "logps/rejected": -1464.0, + "loss": 0.6688, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.361328125, + "rewards/margins": 0.05615234375, + "rewards/rejected": 0.3046875, + "step": 1614 + }, + { + "epoch": 0.4658877830664936, + "grad_norm": 12.708516687313352, + "learning_rate": 3.223527618625545e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1680.0, + "logps/rejected": -1880.0, + "loss": 0.7216, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3584.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.263671875, + "rewards/margins": -0.027099609375, + "rewards/rejected": 0.2890625, + "step": 1615 + }, + { + "epoch": 0.4661762584739651, + "grad_norm": 11.10808862709176, + "learning_rate": 3.221116905806412e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.21875, + "logps/chosen": -1680.0, + "logps/rejected": -1552.0, + "loss": 0.7016, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.376953125, + "rewards/margins": -0.07275390625, + "rewards/rejected": 0.44921875, + "step": 1616 + }, + { + "epoch": 0.4664647338814366, + "grad_norm": 10.951397953440221, + "learning_rate": 3.218705461385695e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.0625, + "logps/chosen": -1736.0, + "logps/rejected": -1672.0, + "loss": 0.6367, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.30078125, + "rewards/margins": 0.09814453125, + "rewards/rejected": 0.203125, + "step": 1617 + }, + { + "epoch": 0.4667532092889081, + "grad_norm": 10.638182654596397, + "learning_rate": 3.2162932878099026e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.0625, + "logps/chosen": -1320.0, + "logps/rejected": -1288.0, + "loss": 0.7032, + "loss/demonstration_loss": -2624.0, + "loss/preference_loss": -2624.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2421875, + "rewards/margins": 0.031982421875, + "rewards/rejected": 0.2099609375, + "step": 1618 + }, + { + "epoch": 0.46704168469637963, + "grad_norm": 11.251332516602643, + "learning_rate": 3.213880387526277e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.140625, + "logps/chosen": -1704.0, + "logps/rejected": -1560.0, + "loss": 0.6891, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1875, + "rewards/margins": -0.01220703125, + "rewards/rejected": 0.2001953125, + "step": 1619 + }, + { + "epoch": 0.46733016010385114, + "grad_norm": 10.402063040743373, + "learning_rate": 3.2114667629828027e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.09375, + "logps/chosen": -1712.0, + "logps/rejected": -1680.0, + "loss": 0.6674, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.369140625, + "rewards/margins": 0.11328125, + "rewards/rejected": 0.255859375, + "step": 1620 + }, + { + "epoch": 0.46761863551132266, + "grad_norm": 10.309833023481726, + "learning_rate": 3.209052416628196e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.25, + "logps/chosen": -1784.0, + "logps/rejected": -1520.0, + "loss": 0.6745, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.390625, + "rewards/margins": 0.0064697265625, + "rewards/rejected": 0.3828125, + "step": 1621 + }, + { + "epoch": 0.46790711091879417, + "grad_norm": 11.626064321456356, + "learning_rate": 3.206637350911908e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.21875, + "logps/chosen": -1240.0, + "logps/rejected": -1224.0, + "loss": 0.6689, + "loss/demonstration_loss": -2496.0, + "loss/preference_loss": -2480.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.0810546875, + "rewards/rejected": 0.2578125, + "step": 1622 + }, + { + "epoch": 0.4681955863262657, + "grad_norm": 12.740876088796279, + "learning_rate": 3.204221568284117e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.21875, + "logps/chosen": -1896.0, + "logps/rejected": -1832.0, + "loss": 0.6684, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40625, + "rewards/margins": 0.04248046875, + "rewards/rejected": 0.36328125, + "step": 1623 + }, + { + "epoch": 0.4684840617337372, + "grad_norm": 11.254569496823319, + "learning_rate": 3.2018050711957314e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.390625, + "logps/chosen": -1776.0, + "logps/rejected": -1600.0, + "loss": 0.6892, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.287109375, + "rewards/margins": -0.0244140625, + "rewards/rejected": 0.310546875, + "step": 1624 + }, + { + "epoch": 0.4687725371412087, + "grad_norm": 9.583885925615416, + "learning_rate": 3.199387862098381e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.15625, + "logps/chosen": -1352.0, + "logps/rejected": -1288.0, + "loss": 0.6734, + "loss/demonstration_loss": -2672.0, + "loss/preference_loss": -2672.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.275390625, + "rewards/margins": 0.017578125, + "rewards/rejected": 0.2578125, + "step": 1625 + }, + { + "epoch": 0.4690610125486802, + "grad_norm": 11.396783106626067, + "learning_rate": 3.1969699434444207e-07, + "logits/chosen": 3.421875, + "logits/rejected": 3.46875, + "logps/chosen": -1776.0, + "logps/rejected": -1600.0, + "loss": 0.7299, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.345703125, + "rewards/margins": -0.046875, + "rewards/rejected": 0.392578125, + "step": 1626 + }, + { + "epoch": 0.4693494879561517, + "grad_norm": 10.785681936126883, + "learning_rate": 3.1945513176869256e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.203125, + "logps/chosen": -1664.0, + "logps/rejected": -1800.0, + "loss": 0.6849, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.1875, + "rewards/chosen": 0.279296875, + "rewards/margins": -0.0238037109375, + "rewards/rejected": 0.302734375, + "step": 1627 + }, + { + "epoch": 0.46963796336362323, + "grad_norm": 11.481099538840834, + "learning_rate": 3.1921319872796856e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1920.0, + "logps/rejected": -1856.0, + "loss": 0.7278, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.373046875, + "rewards/margins": -0.020263671875, + "rewards/rejected": 0.392578125, + "step": 1628 + }, + { + "epoch": 0.46992643877109475, + "grad_norm": 12.93254296511721, + "learning_rate": 3.189711954677208e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.15625, + "logps/chosen": -1408.0, + "logps/rejected": -1336.0, + "loss": 0.744, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.380859375, + "rewards/margins": -0.03369140625, + "rewards/rejected": 0.416015625, + "step": 1629 + }, + { + "epoch": 0.47021491417856626, + "grad_norm": 11.876540826926718, + "learning_rate": 3.187291222334709e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.265625, + "logps/chosen": -1640.0, + "logps/rejected": -1504.0, + "loss": 0.7214, + "loss/demonstration_loss": -3168.0, + "loss/preference_loss": -3168.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.298828125, + "rewards/margins": 0.0057373046875, + "rewards/rejected": 0.29296875, + "step": 1630 + }, + { + "epoch": 0.47050338958603777, + "grad_norm": 9.881219982676935, + "learning_rate": 3.184869792708121e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -2040.0, + "logps/rejected": -1808.0, + "loss": 0.6335, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3888.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.484375, + "rewards/margins": 0.095703125, + "rewards/rejected": 0.388671875, + "step": 1631 + }, + { + "epoch": 0.4707918649935093, + "grad_norm": 11.306274851210203, + "learning_rate": 3.182447668254077e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -1968.0, + "logps/rejected": -1920.0, + "loss": 0.7064, + "loss/demonstration_loss": -3936.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.498046875, + "rewards/margins": -0.05712890625, + "rewards/rejected": 0.5546875, + "step": 1632 + }, + { + "epoch": 0.4710803404009808, + "grad_norm": 12.687967811554584, + "learning_rate": 3.1800248514299195e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.34375, + "logps/chosen": -1816.0, + "logps/rejected": -1768.0, + "loss": 0.693, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.0322265625, + "rewards/rejected": 0.376953125, + "step": 1633 + }, + { + "epoch": 0.47136881580845236, + "grad_norm": 11.49439956266737, + "learning_rate": 3.177601344693692e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1672.0, + "logps/rejected": -1696.0, + "loss": 0.6761, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.37890625, + "rewards/margins": 0.010986328125, + "rewards/rejected": 0.3671875, + "step": 1634 + }, + { + "epoch": 0.47165729121592387, + "grad_norm": 10.277180864267585, + "learning_rate": 3.1751771505041357e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.171875, + "logps/chosen": -1696.0, + "logps/rejected": -1640.0, + "loss": 0.6911, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.283203125, + "rewards/margins": 0.027587890625, + "rewards/rejected": 0.255859375, + "step": 1635 + }, + { + "epoch": 0.4719457666233954, + "grad_norm": 12.082963955240855, + "learning_rate": 3.172752271320693e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.28125, + "logps/chosen": -1768.0, + "logps/rejected": -1552.0, + "loss": 0.685, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44140625, + "rewards/margins": 0.1689453125, + "rewards/rejected": 0.2734375, + "step": 1636 + }, + { + "epoch": 0.4722342420308669, + "grad_norm": 10.910851654540808, + "learning_rate": 3.170326709603501e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1240.0, + "logps/rejected": -1352.0, + "loss": 0.6743, + "loss/demonstration_loss": -2624.0, + "loss/preference_loss": -2624.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.36328125, + "rewards/margins": -0.00250244140625, + "rewards/rejected": 0.365234375, + "step": 1637 + }, + { + "epoch": 0.4725227174383384, + "grad_norm": 10.133684060220697, + "learning_rate": 3.1679004678133853e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.390625, + "logps/chosen": -1440.0, + "logps/rejected": -1576.0, + "loss": 0.6431, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.431640625, + "rewards/margins": 0.107421875, + "rewards/rejected": 0.32421875, + "step": 1638 + }, + { + "epoch": 0.4728111928458099, + "grad_norm": 11.754098116177238, + "learning_rate": 3.165473548411864e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1720.0, + "logps/rejected": -1640.0, + "loss": 0.6565, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.173828125, + "rewards/rejected": 0.33984375, + "step": 1639 + }, + { + "epoch": 0.4730996682532814, + "grad_norm": 10.141994952867801, + "learning_rate": 3.163045953861145e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.28125, + "logps/chosen": -1744.0, + "logps/rejected": -1680.0, + "loss": 0.6511, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4453125, + "rewards/margins": 0.07275390625, + "rewards/rejected": 0.373046875, + "step": 1640 + }, + { + "epoch": 0.47338814366075294, + "grad_norm": 13.657853005883569, + "learning_rate": 3.160617686624117e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1928.0, + "logps/rejected": -1880.0, + "loss": 0.7189, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.416015625, + "rewards/margins": -0.05712890625, + "rewards/rejected": 0.47265625, + "step": 1641 + }, + { + "epoch": 0.47367661906822445, + "grad_norm": 11.197384594235942, + "learning_rate": 3.158188749164354e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.359375, + "logps/chosen": -1856.0, + "logps/rejected": -1584.0, + "loss": 0.629, + "loss/demonstration_loss": -3472.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3515625, + "rewards/margins": 0.177734375, + "rewards/rejected": 0.173828125, + "step": 1642 + }, + { + "epoch": 0.47396509447569596, + "grad_norm": 9.925114677252418, + "learning_rate": 3.155759143946108e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.1875, + "logps/chosen": -1224.0, + "logps/rejected": -1280.0, + "loss": 0.6697, + "loss/demonstration_loss": -2544.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.291015625, + "rewards/margins": 0.025390625, + "rewards/rejected": 0.265625, + "step": 1643 + }, + { + "epoch": 0.47425356988316747, + "grad_norm": 9.946484403005293, + "learning_rate": 3.15332887343431e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.28125, + "logps/chosen": -2192.0, + "logps/rejected": -1872.0, + "loss": 0.6608, + "loss/demonstration_loss": -4128.0, + "loss/preference_loss": -4096.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5234375, + "rewards/margins": 0.1201171875, + "rewards/rejected": 0.40234375, + "step": 1644 + }, + { + "epoch": 0.474542045290639, + "grad_norm": 10.887422199369505, + "learning_rate": 3.1508979400945664e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.265625, + "logps/chosen": -2192.0, + "logps/rejected": -1872.0, + "loss": 0.654, + "loss/demonstration_loss": -4096.0, + "loss/preference_loss": -4096.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.435546875, + "rewards/margins": 0.002044677734375, + "rewards/rejected": 0.43359375, + "step": 1645 + }, + { + "epoch": 0.4748305206981105, + "grad_norm": 10.964583662633919, + "learning_rate": 3.148466346393154e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.234375, + "logps/chosen": -1928.0, + "logps/rejected": -1896.0, + "loss": 0.6478, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36328125, + "rewards/margins": 0.1025390625, + "rewards/rejected": 0.259765625, + "step": 1646 + }, + { + "epoch": 0.475118996105582, + "grad_norm": 11.193614909120155, + "learning_rate": 3.1460340947970197e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.3125, + "logps/chosen": -1752.0, + "logps/rejected": -1632.0, + "loss": 0.7308, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.162109375, + "rewards/margins": -0.08056640625, + "rewards/rejected": 0.2431640625, + "step": 1647 + }, + { + "epoch": 0.4754074715130535, + "grad_norm": 18.495126359423217, + "learning_rate": 3.14360118777378e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.203125, + "logps/chosen": -1720.0, + "logps/rejected": -1464.0, + "loss": 0.6885, + "loss/demonstration_loss": -3216.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.439453125, + "rewards/margins": 0.0247802734375, + "rewards/rejected": 0.4140625, + "step": 1648 + }, + { + "epoch": 0.475695946920525, + "grad_norm": 9.766135893823739, + "learning_rate": 3.141167627791716e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1680.0, + "logps/rejected": -1512.0, + "loss": 0.6561, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.365234375, + "rewards/margins": 0.1435546875, + "rewards/rejected": 0.220703125, + "step": 1649 + }, + { + "epoch": 0.47598442232799654, + "grad_norm": 10.691923087368378, + "learning_rate": 3.138733417319769e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.25, + "logps/chosen": -1784.0, + "logps/rejected": -1600.0, + "loss": 0.6689, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38671875, + "rewards/margins": 0.10302734375, + "rewards/rejected": 0.28515625, + "step": 1650 + }, + { + "epoch": 0.47627289773546805, + "grad_norm": 10.911471294544677, + "learning_rate": 3.1362985588275427e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.265625, + "logps/chosen": -1584.0, + "logps/rejected": -1520.0, + "loss": 0.6914, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2431640625, + "rewards/margins": 0.027099609375, + "rewards/rejected": 0.2158203125, + "step": 1651 + }, + { + "epoch": 0.47656137314293956, + "grad_norm": 10.623893711203577, + "learning_rate": 3.1338630547852954e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.203125, + "logps/chosen": -1712.0, + "logps/rejected": -1544.0, + "loss": 0.6503, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.33984375, + "rewards/margins": 0.11083984375, + "rewards/rejected": 0.228515625, + "step": 1652 + }, + { + "epoch": 0.47684984855041107, + "grad_norm": 11.739311315205153, + "learning_rate": 3.131426907663944e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.390625, + "logps/chosen": -1840.0, + "logps/rejected": -1440.0, + "loss": 0.7048, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.263671875, + "rewards/margins": 0.005126953125, + "rewards/rejected": 0.259765625, + "step": 1653 + }, + { + "epoch": 0.4771383239578826, + "grad_norm": 11.126984397384586, + "learning_rate": 3.1289901199350555e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.34375, + "logps/chosen": -1648.0, + "logps/rejected": -1632.0, + "loss": 0.6675, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33203125, + "rewards/margins": 0.004241943359375, + "rewards/rejected": 0.328125, + "step": 1654 + }, + { + "epoch": 0.4774267993653541, + "grad_norm": 13.13897902520666, + "learning_rate": 3.126552694070847e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -1944.0, + "logps/rejected": -1712.0, + "loss": 0.6359, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.1318359375, + "rewards/rejected": 0.376953125, + "step": 1655 + }, + { + "epoch": 0.4777152747728256, + "grad_norm": 10.450774644529604, + "learning_rate": 3.1241146325441835e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.34375, + "logps/chosen": -1920.0, + "logps/rejected": -1680.0, + "loss": 0.633, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3648.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.65625, + "rewards/margins": 0.1796875, + "rewards/rejected": 0.474609375, + "step": 1656 + }, + { + "epoch": 0.4780037501802971, + "grad_norm": 11.431990694580966, + "learning_rate": 3.121675937828575e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1152.0, + "logps/rejected": -1248.0, + "loss": 0.7245, + "loss/demonstration_loss": -2432.0, + "loss/preference_loss": -2432.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2392578125, + "rewards/margins": -0.08447265625, + "rewards/rejected": 0.32421875, + "step": 1657 + }, + { + "epoch": 0.4782922255877686, + "grad_norm": 10.545083225769096, + "learning_rate": 3.1192366123981726e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1568.0, + "logps/rejected": -1472.0, + "loss": 0.7066, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.25, + "rewards/margins": 0.041259765625, + "rewards/rejected": 0.208984375, + "step": 1658 + }, + { + "epoch": 0.47858070099524014, + "grad_norm": 10.097535202465012, + "learning_rate": 3.11679665872777e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.265625, + "logps/chosen": -1200.0, + "logps/rejected": -1208.0, + "loss": 0.6983, + "loss/demonstration_loss": -2432.0, + "loss/preference_loss": -2432.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2890625, + "rewards/margins": 0.07666015625, + "rewards/rejected": 0.2109375, + "step": 1659 + }, + { + "epoch": 0.47886917640271165, + "grad_norm": 12.37592120192126, + "learning_rate": 3.1143560792927946e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.328125, + "logps/chosen": -1968.0, + "logps/rejected": -1696.0, + "loss": 0.6381, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3203125, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.2314453125, + "step": 1660 + }, + { + "epoch": 0.47915765181018316, + "grad_norm": 11.474424649725128, + "learning_rate": 3.111914876569312e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.21875, + "logps/chosen": -1584.0, + "logps/rejected": -1632.0, + "loss": 0.6705, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.09765625, + "rewards/rejected": 0.400390625, + "step": 1661 + }, + { + "epoch": 0.47944612721765467, + "grad_norm": 11.838169836819477, + "learning_rate": 3.1094730530340183e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.296875, + "logps/chosen": -1712.0, + "logps/rejected": -1656.0, + "loss": 0.7224, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.036376953125, + "rewards/rejected": 0.392578125, + "step": 1662 + }, + { + "epoch": 0.4797346026251262, + "grad_norm": 11.965439163826261, + "learning_rate": 3.10703061116424e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.28125, + "logps/chosen": -1584.0, + "logps/rejected": -1520.0, + "loss": 0.6555, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2451171875, + "rewards/margins": 0.16015625, + "rewards/rejected": 0.0849609375, + "step": 1663 + }, + { + "epoch": 0.4800230780325977, + "grad_norm": 11.668302841820907, + "learning_rate": 3.104587553437932e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.015625, + "logps/chosen": -1896.0, + "logps/rejected": -1840.0, + "loss": 0.6543, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.375, + "rewards/margins": 0.07470703125, + "rewards/rejected": 0.30078125, + "step": 1664 + }, + { + "epoch": 0.48031155344006926, + "grad_norm": 9.225169460851358, + "learning_rate": 3.10214388233367e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.375, + "logps/chosen": -1576.0, + "logps/rejected": -1456.0, + "loss": 0.6348, + "loss/demonstration_loss": -3072.0, + "loss/preference_loss": -3056.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4765625, + "rewards/margins": 0.1875, + "rewards/rejected": 0.2890625, + "step": 1665 + }, + { + "epoch": 0.48060002884754077, + "grad_norm": 9.79909332658279, + "learning_rate": 3.0996996003306576e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1464.0, + "logps/rejected": -1552.0, + "loss": 0.6735, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.27734375, + "rewards/margins": 0.11328125, + "rewards/rejected": 0.1640625, + "step": 1666 + }, + { + "epoch": 0.4808885042550123, + "grad_norm": 11.849989859238182, + "learning_rate": 3.0972547099087136e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -1368.0, + "logps/rejected": -1400.0, + "loss": 0.7144, + "loss/demonstration_loss": -2800.0, + "loss/preference_loss": -2784.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.326171875, + "rewards/margins": 0.06640625, + "rewards/rejected": 0.259765625, + "step": 1667 + }, + { + "epoch": 0.4811769796624838, + "grad_norm": 10.39264686463552, + "learning_rate": 3.0948092135482776e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.296875, + "logps/chosen": -1904.0, + "logps/rejected": -1728.0, + "loss": 0.6535, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3680.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.46875, + "rewards/margins": 0.0810546875, + "rewards/rejected": 0.388671875, + "step": 1668 + }, + { + "epoch": 0.4814654550699553, + "grad_norm": 11.520226026464945, + "learning_rate": 3.092363113730401e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.203125, + "logps/chosen": -1408.0, + "logps/rejected": -1408.0, + "loss": 0.6998, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.02001953125, + "rewards/rejected": 0.318359375, + "step": 1669 + }, + { + "epoch": 0.4817539304774268, + "grad_norm": 10.93738670374896, + "learning_rate": 3.0899164129367483e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.34375, + "logps/chosen": -1528.0, + "logps/rejected": -1568.0, + "loss": 0.6617, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.37890625, + "rewards/margins": 0.03369140625, + "rewards/rejected": 0.345703125, + "step": 1670 + }, + { + "epoch": 0.48204240588489833, + "grad_norm": 13.380987602000694, + "learning_rate": 3.087469113649596e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.296875, + "logps/chosen": -1864.0, + "logps/rejected": -1792.0, + "loss": 0.675, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.44140625, + "rewards/margins": -0.00689697265625, + "rewards/rejected": 0.447265625, + "step": 1671 + }, + { + "epoch": 0.48233088129236984, + "grad_norm": 10.06951658831439, + "learning_rate": 3.085021218351824e-07, + "logits/chosen": 3.40625, + "logits/rejected": 3.359375, + "logps/chosen": -1736.0, + "logps/rejected": -1840.0, + "loss": 0.639, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3828125, + "rewards/margins": 0.04248046875, + "rewards/rejected": 0.341796875, + "step": 1672 + }, + { + "epoch": 0.48261935669984135, + "grad_norm": 13.251292234310187, + "learning_rate": 3.08257272952692e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.21875, + "logps/chosen": -1600.0, + "logps/rejected": -1224.0, + "loss": 0.6576, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2490234375, + "rewards/margins": 0.003875732421875, + "rewards/rejected": 0.2451171875, + "step": 1673 + }, + { + "epoch": 0.48290783210731286, + "grad_norm": 12.711856301089107, + "learning_rate": 3.080123649658971e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.328125, + "logps/chosen": -2064.0, + "logps/rejected": -1992.0, + "loss": 0.6868, + "loss/demonstration_loss": -4096.0, + "loss/preference_loss": -4096.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.53125, + "rewards/margins": 0.01043701171875, + "rewards/rejected": 0.51953125, + "step": 1674 + }, + { + "epoch": 0.4831963075147844, + "grad_norm": 10.121013474621607, + "learning_rate": 3.077673981232667e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.375, + "logps/chosen": -1344.0, + "logps/rejected": -1464.0, + "loss": 0.6498, + "loss/demonstration_loss": -2848.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.318359375, + "rewards/margins": 0.00750732421875, + "rewards/rejected": 0.310546875, + "step": 1675 + }, + { + "epoch": 0.4834847829222559, + "grad_norm": 11.519979402165841, + "learning_rate": 3.0752237267332927e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.203125, + "logps/chosen": -1424.0, + "logps/rejected": -1656.0, + "loss": 0.7018, + "loss/demonstration_loss": -3104.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.298828125, + "rewards/margins": -0.043212890625, + "rewards/rejected": 0.34375, + "step": 1676 + }, + { + "epoch": 0.4837732583297274, + "grad_norm": 10.042373408926226, + "learning_rate": 3.072772888646728e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.171875, + "logps/chosen": -1600.0, + "logps/rejected": -1584.0, + "loss": 0.6439, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.08837890625, + "rewards/rejected": 0.380859375, + "step": 1677 + }, + { + "epoch": 0.4840617337371989, + "grad_norm": 10.796593422601202, + "learning_rate": 3.0703214694594455e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.359375, + "logps/chosen": -1424.0, + "logps/rejected": -1768.0, + "loss": 0.6744, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.447265625, + "rewards/margins": 0.09033203125, + "rewards/rejected": 0.357421875, + "step": 1678 + }, + { + "epoch": 0.4843502091446704, + "grad_norm": 12.583742560916544, + "learning_rate": 3.0678694716585053e-07, + "logits/chosen": 3.40625, + "logits/rejected": 3.453125, + "logps/chosen": -1448.0, + "logps/rejected": -1472.0, + "loss": 0.6856, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.09228515625, + "rewards/rejected": 0.2451171875, + "step": 1679 + }, + { + "epoch": 0.48463868455214193, + "grad_norm": 9.818383899862901, + "learning_rate": 3.0654168977315577e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.28125, + "logps/chosen": -1224.0, + "logps/rejected": -1328.0, + "loss": 0.665, + "loss/demonstration_loss": -2592.0, + "loss/preference_loss": -2592.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.345703125, + "rewards/margins": 0.02880859375, + "rewards/rejected": 0.31640625, + "step": 1680 + }, + { + "epoch": 0.48492715995961344, + "grad_norm": 10.785864429127946, + "learning_rate": 3.062963750166835e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.21875, + "logps/chosen": -1680.0, + "logps/rejected": -1600.0, + "loss": 0.6201, + "loss/demonstration_loss": -3328.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.154296875, + "rewards/rejected": 0.396484375, + "step": 1681 + }, + { + "epoch": 0.48521563536708495, + "grad_norm": 9.874442072546318, + "learning_rate": 3.0605100314531523e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -2080.0, + "logps/rejected": -1808.0, + "loss": 0.647, + "loss/demonstration_loss": -3952.0, + "loss/preference_loss": -3936.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.16796875, + "rewards/rejected": 0.3828125, + "step": 1682 + }, + { + "epoch": 0.48550411077455646, + "grad_norm": 12.190108625830177, + "learning_rate": 3.058055744079904e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.328125, + "logps/chosen": -1728.0, + "logps/rejected": -1656.0, + "loss": 0.6505, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.384765625, + "rewards/margins": 0.09912109375, + "rewards/rejected": 0.287109375, + "step": 1683 + }, + { + "epoch": 0.485792586182028, + "grad_norm": 13.855554880694768, + "learning_rate": 3.055600890537063e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.0625, + "logps/chosen": -1656.0, + "logps/rejected": -1816.0, + "loss": 0.7065, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.03759765625, + "rewards/rejected": 0.298828125, + "step": 1684 + }, + { + "epoch": 0.4860810615894995, + "grad_norm": 10.471673671181692, + "learning_rate": 3.053145473315173e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.25, + "logps/chosen": -1664.0, + "logps/rejected": -1632.0, + "loss": 0.6221, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3344.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5859375, + "rewards/margins": 0.2001953125, + "rewards/rejected": 0.384765625, + "step": 1685 + }, + { + "epoch": 0.486369536996971, + "grad_norm": 12.613366625243815, + "learning_rate": 3.050689494905354e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.171875, + "logps/chosen": -1384.0, + "logps/rejected": -1184.0, + "loss": 0.6603, + "loss/demonstration_loss": -2608.0, + "loss/preference_loss": -2592.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.271484375, + "rewards/margins": 0.1435546875, + "rewards/rejected": 0.1279296875, + "step": 1686 + }, + { + "epoch": 0.4866580124044425, + "grad_norm": 10.522916954187854, + "learning_rate": 3.04823295779929e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.203125, + "logps/chosen": -1936.0, + "logps/rejected": -1896.0, + "loss": 0.6184, + "loss/demonstration_loss": -3888.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5859375, + "rewards/margins": 0.271484375, + "rewards/rejected": 0.314453125, + "step": 1687 + }, + { + "epoch": 0.486946487811914, + "grad_norm": 10.62874471888353, + "learning_rate": 3.045775864489238e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.234375, + "logps/chosen": -1616.0, + "logps/rejected": -1720.0, + "loss": 0.6863, + "loss/demonstration_loss": -3376.0, + "loss/preference_loss": -3376.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.421875, + "rewards/margins": 0.019287109375, + "rewards/rejected": 0.40234375, + "step": 1688 + }, + { + "epoch": 0.48723496321938553, + "grad_norm": 11.97141886346288, + "learning_rate": 3.043318217468015e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.34375, + "logps/chosen": -2064.0, + "logps/rejected": -1904.0, + "loss": 0.6973, + "loss/demonstration_loss": -4016.0, + "loss/preference_loss": -4000.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.62109375, + "rewards/margins": 0.146484375, + "rewards/rejected": 0.474609375, + "step": 1689 + }, + { + "epoch": 0.48752343862685704, + "grad_norm": 10.095586672460136, + "learning_rate": 3.0408600192290006e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.171875, + "logps/chosen": -1776.0, + "logps/rejected": -1360.0, + "loss": 0.6269, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.310546875, + "rewards/rejected": 0.22265625, + "step": 1690 + }, + { + "epoch": 0.48781191403432855, + "grad_norm": 10.3762084448092, + "learning_rate": 3.038401272266135e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.21875, + "logps/chosen": -1176.0, + "logps/rejected": -936.0, + "loss": 0.6903, + "loss/demonstration_loss": -2144.0, + "loss/preference_loss": -2128.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.251953125, + "rewards/margins": 0.061767578125, + "rewards/rejected": 0.189453125, + "step": 1691 + }, + { + "epoch": 0.48810038944180006, + "grad_norm": 10.97064109696163, + "learning_rate": 3.035941979073913e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.203125, + "logps/chosen": -1736.0, + "logps/rejected": -1728.0, + "loss": 0.6725, + "loss/demonstration_loss": -3520.0, + "loss/preference_loss": -3504.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.0791015625, + "rewards/rejected": 0.390625, + "step": 1692 + }, + { + "epoch": 0.4883888648492716, + "grad_norm": 11.350142017783533, + "learning_rate": 3.0334821421473853e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.359375, + "logps/chosen": -1880.0, + "logps/rejected": -1928.0, + "loss": 0.6491, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3856.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.490234375, + "rewards/margins": 0.0284423828125, + "rewards/rejected": 0.4609375, + "step": 1693 + }, + { + "epoch": 0.4886773402567431, + "grad_norm": 20.307940854580874, + "learning_rate": 3.031021763982154e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.1875, + "logps/chosen": -1496.0, + "logps/rejected": -1496.0, + "loss": 0.6575, + "loss/demonstration_loss": -3040.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.453125, + "rewards/margins": 0.0849609375, + "rewards/rejected": 0.3671875, + "step": 1694 + }, + { + "epoch": 0.4889658156642146, + "grad_norm": 9.481432164936566, + "learning_rate": 3.028560847074369e-07, + "logits/chosen": 3.390625, + "logits/rejected": 3.40625, + "logps/chosen": -1640.0, + "logps/rejected": -1512.0, + "loss": 0.6657, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3184.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.330078125, + "rewards/margins": 0.0142822265625, + "rewards/rejected": 0.31640625, + "step": 1695 + }, + { + "epoch": 0.48925429107168616, + "grad_norm": 12.702576381337801, + "learning_rate": 3.026099393920728e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.15625, + "logps/chosen": -1632.0, + "logps/rejected": -1328.0, + "loss": 0.665, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.349609375, + "rewards/margins": 0.1015625, + "rewards/rejected": 0.2490234375, + "step": 1696 + }, + { + "epoch": 0.4895427664791577, + "grad_norm": 11.805345781642751, + "learning_rate": 3.023637407018473e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.203125, + "logps/chosen": -1720.0, + "logps/rejected": -1520.0, + "loss": 0.6561, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.1826171875, + "rewards/rejected": 0.322265625, + "step": 1697 + }, + { + "epoch": 0.4898312418866292, + "grad_norm": 10.213244821480894, + "learning_rate": 3.0211748888653857e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.296875, + "logps/chosen": -1544.0, + "logps/rejected": -1464.0, + "loss": 0.6718, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3024.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.431640625, + "rewards/margins": 0.1591796875, + "rewards/rejected": 0.2734375, + "step": 1698 + }, + { + "epoch": 0.4901197172941007, + "grad_norm": 9.940280439380725, + "learning_rate": 3.0187118419597896e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.203125, + "logps/chosen": -1992.0, + "logps/rejected": -1752.0, + "loss": 0.6473, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.10498046875, + "rewards/rejected": 0.4453125, + "step": 1699 + }, + { + "epoch": 0.4904081927015722, + "grad_norm": 10.083668236499864, + "learning_rate": 3.0162482688005427e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1432.0, + "logps/rejected": -1240.0, + "loss": 0.6586, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2688.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1689453125, + "rewards/margins": -0.0150146484375, + "rewards/rejected": 0.18359375, + "step": 1700 + }, + { + "epoch": 0.4906966681090437, + "grad_norm": 11.415420947992045, + "learning_rate": 3.0137841718870347e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.296875, + "logps/chosen": -1680.0, + "logps/rejected": -1688.0, + "loss": 0.6451, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.10546875, + "rewards/rejected": 0.4296875, + "step": 1701 + }, + { + "epoch": 0.49098514351651523, + "grad_norm": 11.657889584744469, + "learning_rate": 3.0113195537191935e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.328125, + "logps/chosen": -1256.0, + "logps/rejected": -1256.0, + "loss": 0.7116, + "loss/demonstration_loss": -2528.0, + "loss/preference_loss": -2528.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2177734375, + "rewards/margins": -0.056640625, + "rewards/rejected": 0.2734375, + "step": 1702 + }, + { + "epoch": 0.49127361892398674, + "grad_norm": 12.261114573786756, + "learning_rate": 3.00885441679747e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -2080.0, + "logps/rejected": -2160.0, + "loss": 0.7263, + "loss/demonstration_loss": -4256.0, + "loss/preference_loss": -4288.0, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.40625, + "rewards/margins": -0.1064453125, + "rewards/rejected": 0.51171875, + "step": 1703 + }, + { + "epoch": 0.49156209433145825, + "grad_norm": 11.410358699500748, + "learning_rate": 3.006388763622841e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.3125, + "logps/chosen": -1296.0, + "logps/rejected": -1328.0, + "loss": 0.6862, + "loss/demonstration_loss": -2656.0, + "loss/preference_loss": -2656.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.322265625, + "rewards/margins": 0.09814453125, + "rewards/rejected": 0.224609375, + "step": 1704 + }, + { + "epoch": 0.49185056973892977, + "grad_norm": 10.83339660277847, + "learning_rate": 3.003922596696811e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.203125, + "logps/chosen": -1320.0, + "logps/rejected": -1344.0, + "loss": 0.7155, + "loss/demonstration_loss": -2688.0, + "loss/preference_loss": -2688.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2578125, + "rewards/margins": -0.0274658203125, + "rewards/rejected": 0.28515625, + "step": 1705 + }, + { + "epoch": 0.4921390451464013, + "grad_norm": 12.484747357776277, + "learning_rate": 3.001455918521403e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.234375, + "logps/chosen": -1328.0, + "logps/rejected": -1288.0, + "loss": 0.7074, + "loss/demonstration_loss": -2640.0, + "loss/preference_loss": -2640.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2890625, + "rewards/margins": 0.047607421875, + "rewards/rejected": 0.240234375, + "step": 1706 + }, + { + "epoch": 0.4924275205538728, + "grad_norm": 9.695931056236464, + "learning_rate": 2.9989887315991603e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.28125, + "logps/chosen": -1768.0, + "logps/rejected": -1480.0, + "loss": 0.6388, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.470703125, + "rewards/margins": 0.1806640625, + "rewards/rejected": 0.2890625, + "step": 1707 + }, + { + "epoch": 0.4927159959613443, + "grad_norm": 9.86618962519078, + "learning_rate": 2.996521038433141e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.234375, + "logps/chosen": -1856.0, + "logps/rejected": -1856.0, + "loss": 0.6492, + "loss/demonstration_loss": -3760.0, + "loss/preference_loss": -3760.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.435546875, + "rewards/margins": 0.0927734375, + "rewards/rejected": 0.341796875, + "step": 1708 + }, + { + "epoch": 0.4930044713688158, + "grad_norm": 12.423198378005038, + "learning_rate": 2.9940528415269166e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1816.0, + "logps/rejected": -1896.0, + "loss": 0.6849, + "loss/demonstration_loss": -3776.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6171875, + "rewards/margins": 0.0908203125, + "rewards/rejected": 0.52734375, + "step": 1709 + }, + { + "epoch": 0.4932929467762873, + "grad_norm": 11.756586119945142, + "learning_rate": 2.991584143384571e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.328125, + "logps/chosen": -1448.0, + "logps/rejected": -1512.0, + "loss": 0.6779, + "loss/demonstration_loss": -2992.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.291015625, + "rewards/margins": 0.062255859375, + "rewards/rejected": 0.2294921875, + "step": 1710 + }, + { + "epoch": 0.49358142218375883, + "grad_norm": 10.252668862959137, + "learning_rate": 2.9891149465106964e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.25, + "logps/chosen": -1488.0, + "logps/rejected": -1512.0, + "loss": 0.6693, + "loss/demonstration_loss": -3056.0, + "loss/preference_loss": -3040.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.484375, + "rewards/margins": 0.0224609375, + "rewards/rejected": 0.4609375, + "step": 1711 + }, + { + "epoch": 0.49386989759123034, + "grad_norm": 8.84332308906608, + "learning_rate": 2.986645253410389e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.34375, + "logps/chosen": -1736.0, + "logps/rejected": -1672.0, + "loss": 0.6516, + "loss/demonstration_loss": -3440.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.353515625, + "rewards/margins": -0.033447265625, + "rewards/rejected": 0.38671875, + "step": 1712 + }, + { + "epoch": 0.49415837299870186, + "grad_norm": 11.644937668776777, + "learning_rate": 2.9841750665892525e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.21875, + "logps/chosen": -1952.0, + "logps/rejected": -1864.0, + "loss": 0.6519, + "loss/demonstration_loss": -3856.0, + "loss/preference_loss": -3840.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.1962890625, + "rewards/rejected": 0.3125, + "step": 1713 + }, + { + "epoch": 0.49444684840617337, + "grad_norm": 11.404532680943579, + "learning_rate": 2.9817043885533866e-07, + "logits/chosen": 3.421875, + "logits/rejected": 3.4375, + "logps/chosen": -1656.0, + "logps/rejected": -1704.0, + "loss": 0.6696, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.021484375, + "rewards/rejected": 0.390625, + "step": 1714 + }, + { + "epoch": 0.4947353238136449, + "grad_norm": 10.293634058233197, + "learning_rate": 2.9792332218093925e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.203125, + "logps/chosen": -1608.0, + "logps/rejected": -1464.0, + "loss": 0.666, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3104.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.359375, + "rewards/margins": 0.05810546875, + "rewards/rejected": 0.30078125, + "step": 1715 + }, + { + "epoch": 0.4950237992211164, + "grad_norm": 12.789946193674773, + "learning_rate": 2.976761568864367e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.265625, + "logps/chosen": -1824.0, + "logps/rejected": -1784.0, + "loss": 0.7097, + "loss/demonstration_loss": -3664.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5234375, + "rewards/margins": -0.038818359375, + "rewards/rejected": 0.5625, + "step": 1716 + }, + { + "epoch": 0.4953122746285879, + "grad_norm": 9.59985322023126, + "learning_rate": 2.9742894322258995e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1416.0, + "logps/rejected": -1336.0, + "loss": 0.6609, + "loss/demonstration_loss": -2784.0, + "loss/preference_loss": -2768.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.29296875, + "rewards/margins": 0.0869140625, + "rewards/rejected": 0.2060546875, + "step": 1717 + }, + { + "epoch": 0.4956007500360594, + "grad_norm": 10.900744999863498, + "learning_rate": 2.9718168144020697e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -1720.0, + "logps/rejected": -1512.0, + "loss": 0.642, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.43359375, + "rewards/margins": 0.08740234375, + "rewards/rejected": 0.345703125, + "step": 1718 + }, + { + "epoch": 0.4958892254435309, + "grad_norm": 11.799864498094603, + "learning_rate": 2.9693437179014465e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.265625, + "logps/chosen": -1752.0, + "logps/rejected": -1656.0, + "loss": 0.6484, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3440.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5546875, + "rewards/margins": 0.11767578125, + "rewards/rejected": 0.435546875, + "step": 1719 + }, + { + "epoch": 0.49617770085100243, + "grad_norm": 10.742482625477354, + "learning_rate": 2.9668701452330835e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.15625, + "logps/chosen": -1720.0, + "logps/rejected": -1736.0, + "loss": 0.6896, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42578125, + "rewards/margins": 0.08935546875, + "rewards/rejected": 0.3359375, + "step": 1720 + }, + { + "epoch": 0.49646617625847395, + "grad_norm": 11.530089102220035, + "learning_rate": 2.9643960989065185e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.25, + "logps/chosen": -1728.0, + "logps/rejected": -1688.0, + "loss": 0.67, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3456.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.1279296875, + "rewards/rejected": 0.30078125, + "step": 1721 + }, + { + "epoch": 0.49675465166594546, + "grad_norm": 12.19537685761578, + "learning_rate": 2.96192158143177e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.265625, + "logps/chosen": -1592.0, + "logps/rejected": -1312.0, + "loss": 0.672, + "loss/demonstration_loss": -2944.0, + "loss/preference_loss": -2928.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3984375, + "rewards/margins": 0.15625, + "rewards/rejected": 0.2412109375, + "step": 1722 + }, + { + "epoch": 0.49704312707341697, + "grad_norm": 11.424301235682192, + "learning_rate": 2.9594465953193304e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1632.0, + "logps/rejected": -1584.0, + "loss": 0.6848, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.423828125, + "rewards/margins": 0.0140380859375, + "rewards/rejected": 0.41015625, + "step": 1723 + }, + { + "epoch": 0.4973316024808885, + "grad_norm": 12.854738111464284, + "learning_rate": 2.956971143080175e-07, + "logits/chosen": 3.046875, + "logits/rejected": 3.109375, + "logps/chosen": -1696.0, + "logps/rejected": -1536.0, + "loss": 0.6635, + "loss/demonstration_loss": -3296.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.15234375, + "rewards/rejected": 0.359375, + "step": 1724 + }, + { + "epoch": 0.49762007788836, + "grad_norm": 9.387828390750895, + "learning_rate": 2.954495227225745e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.234375, + "logps/chosen": -1864.0, + "logps/rejected": -1832.0, + "loss": 0.6413, + "loss/demonstration_loss": -3728.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.359375, + "rewards/margins": 0.0537109375, + "rewards/rejected": 0.3046875, + "step": 1725 + }, + { + "epoch": 0.49790855329583156, + "grad_norm": 10.28469694324359, + "learning_rate": 2.952018850267957e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.28125, + "logps/chosen": -1904.0, + "logps/rejected": -1728.0, + "loss": 0.6509, + "loss/demonstration_loss": -3696.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.62109375, + "rewards/margins": 0.298828125, + "rewards/rejected": 0.3203125, + "step": 1726 + }, + { + "epoch": 0.49819702870330307, + "grad_norm": 10.355016485892893, + "learning_rate": 2.949542014719191e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.234375, + "logps/chosen": -1384.0, + "logps/rejected": -1136.0, + "loss": 0.6547, + "loss/demonstration_loss": -2560.0, + "loss/preference_loss": -2544.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.328125, + "rewards/margins": 0.10791015625, + "rewards/rejected": 0.2197265625, + "step": 1727 + }, + { + "epoch": 0.4984855041107746, + "grad_norm": 10.940245488522729, + "learning_rate": 2.947064723092296e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.390625, + "logps/chosen": -1480.0, + "logps/rejected": -1688.0, + "loss": 0.6548, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.3515625, + "rewards/margins": -0.07080078125, + "rewards/rejected": 0.421875, + "step": 1728 + }, + { + "epoch": 0.4987739795182461, + "grad_norm": 10.422880718733358, + "learning_rate": 2.9445869779005817e-07, + "logits/chosen": 3.125, + "logits/rejected": 3.109375, + "logps/chosen": -1576.0, + "logps/rejected": -1296.0, + "loss": 0.6312, + "loss/demonstration_loss": -2912.0, + "loss/preference_loss": -2896.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.390625, + "rewards/margins": 0.1884765625, + "rewards/rejected": 0.2041015625, + "step": 1729 + }, + { + "epoch": 0.4990624549257176, + "grad_norm": 9.65906454017761, + "learning_rate": 2.9421087816578186e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.296875, + "logps/chosen": -1616.0, + "logps/rejected": -1352.0, + "loss": 0.6443, + "loss/demonstration_loss": -3008.0, + "loss/preference_loss": -2992.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.404296875, + "rewards/margins": 0.1591796875, + "rewards/rejected": 0.244140625, + "step": 1730 + }, + { + "epoch": 0.4993509303331891, + "grad_norm": 11.671909107525298, + "learning_rate": 2.9396301368782346e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.203125, + "logps/chosen": -1408.0, + "logps/rejected": -1424.0, + "loss": 0.6738, + "loss/demonstration_loss": -2864.0, + "loss/preference_loss": -2848.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.251953125, + "rewards/margins": 0.0263671875, + "rewards/rejected": 0.224609375, + "step": 1731 + }, + { + "epoch": 0.4996394057406606, + "grad_norm": 10.127893448593674, + "learning_rate": 2.937151046076512e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.3125, + "logps/chosen": -1512.0, + "logps/rejected": -1584.0, + "loss": 0.6533, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.375, + "rewards/margins": 0.1328125, + "rewards/rejected": 0.2421875, + "step": 1732 + }, + { + "epoch": 0.49992788114813214, + "grad_norm": 10.544998510943747, + "learning_rate": 2.934671511767788e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.328125, + "logps/chosen": -1840.0, + "logps/rejected": -1512.0, + "loss": 0.6683, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.359375, + "rewards/margins": 0.01104736328125, + "rewards/rejected": 0.34765625, + "step": 1733 + }, + { + "epoch": 0.5002163565556036, + "grad_norm": 11.867311749742239, + "learning_rate": 2.9321915364676463e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -1832.0, + "logps/rejected": -1616.0, + "loss": 0.6476, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.435546875, + "rewards/margins": 0.1201171875, + "rewards/rejected": 0.31640625, + "step": 1734 + }, + { + "epoch": 0.5005048319630752, + "grad_norm": 11.009077027857101, + "learning_rate": 2.929711122692122e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.109375, + "logps/chosen": -2096.0, + "logps/rejected": -1888.0, + "loss": 0.6307, + "loss/demonstration_loss": -4032.0, + "loss/preference_loss": -4016.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.466796875, + "rewards/margins": 0.125, + "rewards/rejected": 0.341796875, + "step": 1735 + }, + { + "epoch": 0.5007933073705466, + "grad_norm": 10.418677852444347, + "learning_rate": 2.92723027295769e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -2064.0, + "logps/rejected": -2096.0, + "loss": 0.6851, + "loss/demonstration_loss": -4224.0, + "loss/preference_loss": -4192.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.53125, + "rewards/margins": 0.1650390625, + "rewards/rejected": 0.3671875, + "step": 1736 + }, + { + "epoch": 0.5010817827780182, + "grad_norm": 11.831636208493132, + "learning_rate": 2.9247489897812723e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.265625, + "logps/chosen": -1496.0, + "logps/rejected": -1624.0, + "loss": 0.67, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3136.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.255859375, + "rewards/margins": 0.0517578125, + "rewards/rejected": 0.2041015625, + "step": 1737 + }, + { + "epoch": 0.5013702581854896, + "grad_norm": 10.127416356434798, + "learning_rate": 2.922267275680228e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.28125, + "logps/chosen": -1576.0, + "logps/rejected": -1680.0, + "loss": 0.6618, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3312.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.53125, + "rewards/margins": 0.0240478515625, + "rewards/rejected": 0.5078125, + "step": 1738 + }, + { + "epoch": 0.5016587335929612, + "grad_norm": 9.891721305961726, + "learning_rate": 2.9197851331723544e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.3125, + "logps/chosen": -1736.0, + "logps/rejected": -1800.0, + "loss": 0.6852, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3568.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2421875, + "rewards/margins": -0.0159912109375, + "rewards/rejected": 0.2578125, + "step": 1739 + }, + { + "epoch": 0.5019472090004327, + "grad_norm": 10.342976397999225, + "learning_rate": 2.9173025647758836e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.21875, + "logps/chosen": -1696.0, + "logps/rejected": -1896.0, + "loss": 0.6598, + "loss/demonstration_loss": -3632.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.451171875, + "rewards/margins": 0.146484375, + "rewards/rejected": 0.3046875, + "step": 1740 + }, + { + "epoch": 0.5022356844079042, + "grad_norm": 10.974733975622495, + "learning_rate": 2.914819573009478e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.1875, + "logps/chosen": -1616.0, + "logps/rejected": -1528.0, + "loss": 0.6645, + "loss/demonstration_loss": -3184.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.33984375, + "rewards/margins": 0.1845703125, + "rewards/rejected": 0.1552734375, + "step": 1741 + }, + { + "epoch": 0.5025241598153758, + "grad_norm": 10.179767107236293, + "learning_rate": 2.912336160392231e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.328125, + "logps/chosen": -1448.0, + "logps/rejected": -1352.0, + "loss": 0.6683, + "loss/demonstration_loss": -2816.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2451171875, + "rewards/margins": 0.11083984375, + "rewards/rejected": 0.1337890625, + "step": 1742 + }, + { + "epoch": 0.5028126352228472, + "grad_norm": 11.017607509733278, + "learning_rate": 2.909852329443665e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.28125, + "logps/chosen": -1656.0, + "logps/rejected": -1672.0, + "loss": 0.733, + "loss/demonstration_loss": -3360.0, + "loss/preference_loss": -3360.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.341796875, + "rewards/margins": -0.031005859375, + "rewards/rejected": 0.373046875, + "step": 1743 + }, + { + "epoch": 0.5031011106303188, + "grad_norm": 9.580508703581222, + "learning_rate": 2.9073680826837216e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.265625, + "logps/chosen": -1920.0, + "logps/rejected": -1608.0, + "loss": 0.6513, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3552.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.388671875, + "rewards/margins": 0.1708984375, + "rewards/rejected": 0.21875, + "step": 1744 + }, + { + "epoch": 0.5033895860377903, + "grad_norm": 10.27492287968011, + "learning_rate": 2.9048834226327687e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.3125, + "logps/chosen": -1496.0, + "logps/rejected": -1744.0, + "loss": 0.6995, + "loss/demonstration_loss": -3280.0, + "loss/preference_loss": -3280.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.390625, + "rewards/margins": 0.030517578125, + "rewards/rejected": 0.359375, + "step": 1745 + }, + { + "epoch": 0.5036780614452618, + "grad_norm": 10.737938929214332, + "learning_rate": 2.902398351811592e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -2368.0, + "logps/rejected": -2048.0, + "loss": 0.6541, + "loss/demonstration_loss": -4448.0, + "loss/preference_loss": -4448.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.5390625, + "rewards/margins": 0.1181640625, + "rewards/rejected": 0.41796875, + "step": 1746 + }, + { + "epoch": 0.5039665368527333, + "grad_norm": 10.369215253405205, + "learning_rate": 2.8999128727413933e-07, + "logits/chosen": 3.046875, + "logits/rejected": 2.96875, + "logps/chosen": -1368.0, + "logps/rejected": -1344.0, + "loss": 0.6983, + "loss/demonstration_loss": -2752.0, + "loss/preference_loss": -2752.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3203125, + "rewards/margins": 0.01422119140625, + "rewards/rejected": 0.306640625, + "step": 1747 + }, + { + "epoch": 0.5042550122602049, + "grad_norm": 10.498277552345048, + "learning_rate": 2.8974269879437915e-07, + "logits/chosen": 3.34375, + "logits/rejected": 3.34375, + "logps/chosen": -1872.0, + "logps/rejected": -1880.0, + "loss": 0.6699, + "loss/demonstration_loss": -3792.0, + "loss/preference_loss": -3792.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.462890625, + "rewards/margins": 0.041015625, + "rewards/rejected": 0.421875, + "step": 1748 + }, + { + "epoch": 0.5045434876676763, + "grad_norm": 12.688366685296309, + "learning_rate": 2.8949406999408117e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.109375, + "logps/chosen": -1720.0, + "logps/rejected": -1456.0, + "loss": 0.7061, + "loss/demonstration_loss": -3200.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.341796875, + "rewards/margins": 0.10791015625, + "rewards/rejected": 0.234375, + "step": 1749 + }, + { + "epoch": 0.5048319630751479, + "grad_norm": 11.348897766516595, + "learning_rate": 2.8924540112548933e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.28125, + "logps/chosen": -1888.0, + "logps/rejected": -1744.0, + "loss": 0.6845, + "loss/demonstration_loss": -3680.0, + "loss/preference_loss": -3664.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40234375, + "rewards/margins": -0.0240478515625, + "rewards/rejected": 0.42578125, + "step": 1750 + }, + { + "epoch": 0.5051204384826193, + "grad_norm": 11.201366737806685, + "learning_rate": 2.8899669244088803e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1640.0, + "logps/rejected": -1560.0, + "loss": 0.6846, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.02099609375, + "rewards/rejected": 0.38671875, + "step": 1751 + }, + { + "epoch": 0.5054089138900909, + "grad_norm": 9.549822380818073, + "learning_rate": 2.88747944192602e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1824.0, + "logps/rejected": -1680.0, + "loss": 0.642, + "loss/demonstration_loss": -3568.0, + "loss/preference_loss": -3536.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.2177734375, + "rewards/rejected": 0.37109375, + "step": 1752 + }, + { + "epoch": 0.5056973892975624, + "grad_norm": 13.444438510722783, + "learning_rate": 2.8849915663299606e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.203125, + "logps/chosen": -1648.0, + "logps/rejected": -1584.0, + "loss": 0.6857, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34765625, + "rewards/margins": 0.162109375, + "rewards/rejected": 0.1865234375, + "step": 1753 + }, + { + "epoch": 0.5059858647050339, + "grad_norm": 12.888674000666649, + "learning_rate": 2.882503300144752e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1736.0, + "logps/rejected": -1848.0, + "loss": 0.7273, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.322265625, + "rewards/margins": -0.0634765625, + "rewards/rejected": 0.384765625, + "step": 1754 + }, + { + "epoch": 0.5062743401125054, + "grad_norm": 12.400425308277164, + "learning_rate": 2.880014645894837e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.1875, + "logps/chosen": -1904.0, + "logps/rejected": -1776.0, + "loss": 0.6421, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3696.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3046875, + "rewards/margins": 0.123046875, + "rewards/rejected": 0.181640625, + "step": 1755 + }, + { + "epoch": 0.506562815519977, + "grad_norm": 9.901752448052402, + "learning_rate": 2.8775256061050555e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.40625, + "logps/chosen": -1480.0, + "logps/rejected": -1304.0, + "loss": 0.6498, + "loss/demonstration_loss": -2832.0, + "loss/preference_loss": -2816.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.13671875, + "rewards/rejected": 0.271484375, + "step": 1756 + }, + { + "epoch": 0.5068512909274484, + "grad_norm": 11.889209988164124, + "learning_rate": 2.8750361833006354e-07, + "logits/chosen": 3.1875, + "logits/rejected": 3.140625, + "logps/chosen": -1544.0, + "logps/rejected": -1384.0, + "loss": 0.6734, + "loss/demonstration_loss": -2976.0, + "loss/preference_loss": -2976.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.00347900390625, + "rewards/rejected": 0.408203125, + "step": 1757 + }, + { + "epoch": 0.50713976633492, + "grad_norm": 10.833362685525985, + "learning_rate": 2.8725463800071937e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.1875, + "logps/chosen": -1544.0, + "logps/rejected": -1656.0, + "loss": 0.678, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31640625, + "rewards/margins": 0.00830078125, + "rewards/rejected": 0.30859375, + "step": 1758 + }, + { + "epoch": 0.5074282417423914, + "grad_norm": 11.889963462855883, + "learning_rate": 2.8700561987507357e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.25, + "logps/chosen": -1832.0, + "logps/rejected": -1272.0, + "loss": 0.6644, + "loss/demonstration_loss": -3136.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.310546875, + "rewards/margins": 0.0703125, + "rewards/rejected": 0.240234375, + "step": 1759 + }, + { + "epoch": 0.507716717149863, + "grad_norm": 12.335351247616256, + "learning_rate": 2.867565642057648e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1976.0, + "logps/rejected": -1896.0, + "loss": 0.6631, + "loss/demonstration_loss": -3904.0, + "loss/preference_loss": -3904.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.404296875, + "rewards/margins": 0.060302734375, + "rewards/rejected": 0.345703125, + "step": 1760 + }, + { + "epoch": 0.5080051925573345, + "grad_norm": 12.615211501644168, + "learning_rate": 2.865074712454698e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.265625, + "logps/chosen": -1752.0, + "logps/rejected": -1624.0, + "loss": 0.7446, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3408.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.333984375, + "rewards/margins": -0.0771484375, + "rewards/rejected": 0.41015625, + "step": 1761 + }, + { + "epoch": 0.508293667964806, + "grad_norm": 11.030303696586278, + "learning_rate": 2.8625834124690337e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.25, + "logps/chosen": -1600.0, + "logps/rejected": -1744.0, + "loss": 0.6954, + "loss/demonstration_loss": -3392.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.33203125, + "rewards/margins": -0.0306396484375, + "rewards/rejected": 0.361328125, + "step": 1762 + }, + { + "epoch": 0.5085821433722775, + "grad_norm": 10.830900033664998, + "learning_rate": 2.860091744628175e-07, + "logits/chosen": 3.09375, + "logits/rejected": 3.046875, + "logps/chosen": -1080.0, + "logps/rejected": -1368.0, + "loss": 0.6865, + "loss/demonstration_loss": -2480.0, + "loss/preference_loss": -2464.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.267578125, + "rewards/margins": 0.0289306640625, + "rewards/rejected": 0.23828125, + "step": 1763 + }, + { + "epoch": 0.508870618779749, + "grad_norm": 11.75584625699594, + "learning_rate": 2.857599711460021e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.21875, + "logps/chosen": -1752.0, + "logps/rejected": -1824.0, + "loss": 0.68, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3616.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.375, + "rewards/margins": 0.078125, + "rewards/rejected": 0.296875, + "step": 1764 + }, + { + "epoch": 0.5091590941872205, + "grad_norm": 11.037678554057901, + "learning_rate": 2.8551073154928353e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.125, + "logps/chosen": -1968.0, + "logps/rejected": -1680.0, + "loss": 0.7048, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.57421875, + "rewards/margins": -0.01708984375, + "rewards/rejected": 0.59375, + "step": 1765 + }, + { + "epoch": 0.5094475695946921, + "grad_norm": 13.128162958752643, + "learning_rate": 2.852614559255251e-07, + "logits/chosen": 3.078125, + "logits/rejected": 3.078125, + "logps/chosen": -1432.0, + "logps/rejected": -1440.0, + "loss": 0.6617, + "loss/demonstration_loss": -2880.0, + "loss/preference_loss": -2880.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2265625, + "rewards/margins": 0.08251953125, + "rewards/rejected": 0.1435546875, + "step": 1766 + }, + { + "epoch": 0.5097360450021635, + "grad_norm": 12.149654655111377, + "learning_rate": 2.85012144527627e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.234375, + "logps/chosen": -1096.0, + "logps/rejected": -1040.0, + "loss": 0.6613, + "loss/demonstration_loss": -2160.0, + "loss/preference_loss": -2160.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.185546875, + "rewards/margins": 0.03173828125, + "rewards/rejected": 0.154296875, + "step": 1767 + }, + { + "epoch": 0.5100245204096351, + "grad_norm": 11.01578755937033, + "learning_rate": 2.847627976085254e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.265625, + "logps/chosen": -1784.0, + "logps/rejected": -1392.0, + "loss": 0.6149, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3200.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.30859375, + "rewards/rejected": 0.1787109375, + "step": 1768 + }, + { + "epoch": 0.5103129958171065, + "grad_norm": 12.07039681947874, + "learning_rate": 2.8451341542119264e-07, + "logits/chosen": 3.203125, + "logits/rejected": 3.171875, + "logps/chosen": -1824.0, + "logps/rejected": -1736.0, + "loss": 0.6987, + "loss/demonstration_loss": -3584.0, + "loss/preference_loss": -3600.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.267578125, + "rewards/margins": -0.07861328125, + "rewards/rejected": 0.345703125, + "step": 1769 + }, + { + "epoch": 0.5106014712245781, + "grad_norm": 9.261033049972546, + "learning_rate": 2.842639982186367e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.3125, + "logps/chosen": -1800.0, + "logps/rejected": -1568.0, + "loss": 0.6302, + "loss/demonstration_loss": -3408.0, + "loss/preference_loss": -3392.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.408203125, + "rewards/margins": 0.166015625, + "rewards/rejected": 0.2421875, + "step": 1770 + }, + { + "epoch": 0.5108899466320496, + "grad_norm": 11.028187169667907, + "learning_rate": 2.840145462539013e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.25, + "logps/chosen": -1632.0, + "logps/rejected": -1584.0, + "loss": 0.6897, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2734375, + "rewards/margins": -0.0025634765625, + "rewards/rejected": 0.27734375, + "step": 1771 + }, + { + "epoch": 0.5111784220395211, + "grad_norm": 10.958856507788164, + "learning_rate": 2.8376505978006523e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1760.0, + "logps/rejected": -1440.0, + "loss": 0.6868, + "loss/demonstration_loss": -3248.0, + "loss/preference_loss": -3232.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.06982421875, + "rewards/rejected": 0.326171875, + "step": 1772 + }, + { + "epoch": 0.5114668974469927, + "grad_norm": 10.271647073437943, + "learning_rate": 2.835155390502424e-07, + "logits/chosen": 3.265625, + "logits/rejected": 3.28125, + "logps/chosen": -1888.0, + "logps/rejected": -1544.0, + "loss": 0.6309, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.6484375, + "rewards/margins": 0.169921875, + "rewards/rejected": 0.4765625, + "step": 1773 + }, + { + "epoch": 0.5117553728544642, + "grad_norm": 10.07837943509352, + "learning_rate": 2.832659843175814e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.171875, + "logps/chosen": -1624.0, + "logps/rejected": -1600.0, + "loss": 0.6793, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3264.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4296875, + "rewards/margins": 0.042236328125, + "rewards/rejected": 0.38671875, + "step": 1774 + }, + { + "epoch": 0.5120438482619357, + "grad_norm": 10.2426390943358, + "learning_rate": 2.830163958352655e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.203125, + "logps/chosen": -1760.0, + "logps/rejected": -1696.0, + "loss": 0.6788, + "loss/demonstration_loss": -3488.0, + "loss/preference_loss": -3488.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.345703125, + "rewards/margins": 0.009765625, + "rewards/rejected": 0.3359375, + "step": 1775 + }, + { + "epoch": 0.5123323236694072, + "grad_norm": 12.045472262208655, + "learning_rate": 2.827667738565119e-07, + "logits/chosen": 3.171875, + "logits/rejected": 3.109375, + "logps/chosen": -1256.0, + "logps/rejected": -1320.0, + "loss": 0.6523, + "loss/demonstration_loss": -2592.0, + "loss/preference_loss": -2592.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.142578125, + "rewards/margins": 0.042724609375, + "rewards/rejected": 0.09912109375, + "step": 1776 + }, + { + "epoch": 0.5126207990768787, + "grad_norm": 9.512392921154305, + "learning_rate": 2.8251711863457204e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.15625, + "logps/chosen": -1608.0, + "logps/rejected": -1664.0, + "loss": 0.6118, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.2109375, + "rewards/rejected": 0.27734375, + "step": 1777 + }, + { + "epoch": 0.5129092744843502, + "grad_norm": 12.276800496727052, + "learning_rate": 2.8226743042273106e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.359375, + "logps/chosen": -2064.0, + "logps/rejected": -1680.0, + "loss": 0.6464, + "loss/demonstration_loss": -3808.0, + "loss/preference_loss": -3776.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.212890625, + "rewards/rejected": 0.337890625, + "step": 1778 + }, + { + "epoch": 0.5131977498918218, + "grad_norm": 11.193187245544724, + "learning_rate": 2.8201770947430746e-07, + "logits/chosen": 3.15625, + "logits/rejected": 3.078125, + "logps/chosen": -1496.0, + "logps/rejected": -1616.0, + "loss": 0.7102, + "loss/demonstration_loss": -3152.0, + "loss/preference_loss": -3152.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.361328125, + "rewards/margins": -0.022705078125, + "rewards/rejected": 0.384765625, + "step": 1779 + }, + { + "epoch": 0.5134862252992932, + "grad_norm": 11.263666515842987, + "learning_rate": 2.817679560426529e-07, + "logits/chosen": 3.359375, + "logits/rejected": 3.359375, + "logps/chosen": -1824.0, + "logps/rejected": -1760.0, + "loss": 0.7045, + "loss/demonstration_loss": -3616.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2890625, + "rewards/margins": -0.109375, + "rewards/rejected": 0.3984375, + "step": 1780 + }, + { + "epoch": 0.5137747007067648, + "grad_norm": 12.047409658844018, + "learning_rate": 2.8151817038115225e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.421875, + "logps/chosen": -1832.0, + "logps/rejected": -1600.0, + "loss": 0.7298, + "loss/demonstration_loss": -3456.0, + "loss/preference_loss": -3472.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.322265625, + "rewards/margins": -0.09033203125, + "rewards/rejected": 0.412109375, + "step": 1781 + }, + { + "epoch": 0.5140631761142362, + "grad_norm": 12.049594595867552, + "learning_rate": 2.8126835274322285e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1536.0, + "logps/rejected": -1552.0, + "loss": 0.7147, + "loss/demonstration_loss": -3120.0, + "loss/preference_loss": -3120.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3125, + "rewards/margins": -0.06103515625, + "rewards/rejected": 0.373046875, + "step": 1782 + }, + { + "epoch": 0.5143516515217078, + "grad_norm": 10.544502936629046, + "learning_rate": 2.810185033823147e-07, + "logits/chosen": 3.3125, + "logits/rejected": 3.28125, + "logps/chosen": -2272.0, + "logps/rejected": -2064.0, + "loss": 0.5775, + "loss/demonstration_loss": -4384.0, + "loss/preference_loss": -4352.0, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.60546875, + "rewards/margins": 0.26171875, + "rewards/rejected": 0.341796875, + "step": 1783 + }, + { + "epoch": 0.5146401269291793, + "grad_norm": 12.577301818836316, + "learning_rate": 2.807686225519097e-07, + "logits/chosen": 3.40625, + "logits/rejected": 3.265625, + "logps/chosen": -1528.0, + "logps/rejected": -1512.0, + "loss": 0.6682, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.388671875, + "rewards/margins": 0.12158203125, + "rewards/rejected": 0.267578125, + "step": 1784 + }, + { + "epoch": 0.5149286023366508, + "grad_norm": 10.639348773105537, + "learning_rate": 2.805187105055217e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.1875, + "logps/chosen": -1568.0, + "logps/rejected": -1616.0, + "loss": 0.6508, + "loss/demonstration_loss": -3232.0, + "loss/preference_loss": -3216.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.423828125, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.294921875, + "step": 1785 + }, + { + "epoch": 0.5152170777441223, + "grad_norm": 8.79555986209923, + "learning_rate": 2.8026876749669666e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.21875, + "logps/chosen": -1528.0, + "logps/rejected": -1440.0, + "loss": 0.6092, + "loss/demonstration_loss": -3024.0, + "loss/preference_loss": -3008.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5625, + "rewards/margins": 0.298828125, + "rewards/rejected": 0.263671875, + "step": 1786 + }, + { + "epoch": 0.5155055531515939, + "grad_norm": 12.385555017777516, + "learning_rate": 2.8001879377901144e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.1875, + "logps/chosen": -1880.0, + "logps/rejected": -1896.0, + "loss": 0.6561, + "loss/demonstration_loss": -3824.0, + "loss/preference_loss": -3808.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.053955078125, + "rewards/rejected": 0.42578125, + "step": 1787 + }, + { + "epoch": 0.5157940285590653, + "grad_norm": 10.924212942877688, + "learning_rate": 2.7976878960607423e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.21875, + "logps/chosen": -1664.0, + "logps/rejected": -2032.0, + "loss": 0.6226, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3728.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.09130859375, + "rewards/rejected": 0.3203125, + "step": 1788 + }, + { + "epoch": 0.5160825039665369, + "grad_norm": 10.78593814369221, + "learning_rate": 2.795187552315242e-07, + "logits/chosen": 3.375, + "logits/rejected": 3.375, + "logps/chosen": -1696.0, + "logps/rejected": -1592.0, + "loss": 0.7046, + "loss/demonstration_loss": -3344.0, + "loss/preference_loss": -3328.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.04296875, + "rewards/rejected": 0.5234375, + "step": 1789 + }, + { + "epoch": 0.5163709793740083, + "grad_norm": 11.279712481239342, + "learning_rate": 2.792686909090311e-07, + "logits/chosen": 3.21875, + "logits/rejected": 3.140625, + "logps/chosen": -1528.0, + "logps/rejected": -1872.0, + "loss": 0.707, + "loss/demonstration_loss": -3424.0, + "loss/preference_loss": -3424.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1650390625, + "rewards/margins": -0.0546875, + "rewards/rejected": 0.2197265625, + "step": 1790 + }, + { + "epoch": 0.5166594547814799, + "grad_norm": 11.547203408410557, + "learning_rate": 2.790185968922951e-07, + "logits/chosen": 3.328125, + "logits/rejected": 3.3125, + "logps/chosen": -1584.0, + "logps/rejected": -1688.0, + "loss": 0.658, + "loss/demonstration_loss": -3312.0, + "loss/preference_loss": -3296.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.310546875, + "rewards/margins": 0.1162109375, + "rewards/rejected": 0.1943359375, + "step": 1791 + }, + { + "epoch": 0.5169479301889514, + "grad_norm": 9.674800434602586, + "learning_rate": 2.787684734350464e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.25, + "logps/chosen": -1544.0, + "logps/rejected": -1512.0, + "loss": 0.6423, + "loss/demonstration_loss": -3088.0, + "loss/preference_loss": -3072.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.296875, + "rewards/margins": 0.12890625, + "rewards/rejected": 0.16796875, + "step": 1792 + }, + { + "epoch": 0.5172364055964229, + "grad_norm": 11.004983692149857, + "learning_rate": 2.785183207910451e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.28125, + "logps/chosen": -1856.0, + "logps/rejected": -1744.0, + "loss": 0.6205, + "loss/demonstration_loss": -3648.0, + "loss/preference_loss": -3632.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.49609375, + "rewards/margins": 0.109375, + "rewards/rejected": 0.38671875, + "step": 1793 + }, + { + "epoch": 0.5175248810038944, + "grad_norm": 11.036381043107324, + "learning_rate": 2.78268139214081e-07, + "logits/chosen": 3.140625, + "logits/rejected": 3.140625, + "logps/chosen": -1552.0, + "logps/rejected": -1656.0, + "loss": 0.6823, + "loss/demonstration_loss": -3264.0, + "loss/preference_loss": -3248.0, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.412109375, + "rewards/margins": 0.03857421875, + "rewards/rejected": 0.373046875, + "step": 1794 + }, + { + "epoch": 0.5178133564113659, + "grad_norm": 11.74127916604716, + "learning_rate": 2.7801792895797314e-07, + "logits/chosen": 3.25, + "logits/rejected": 3.234375, + "logps/chosen": -2144.0, + "logps/rejected": -2016.0, + "loss": 0.6805, + "loss/demonstration_loss": -4192.0, + "loss/preference_loss": -4192.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.087890625, + "rewards/rejected": 0.35546875, + "step": 1795 + }, + { + "epoch": 0.5181018318188374, + "grad_norm": 11.510896381212797, + "learning_rate": 2.777676902765697e-07, + "logits/chosen": 3.296875, + "logits/rejected": 3.296875, + "logps/chosen": -1984.0, + "logps/rejected": -1728.0, + "loss": 0.6583, + "loss/demonstration_loss": -3744.0, + "loss/preference_loss": -3744.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37109375, + "rewards/margins": 0.047119140625, + "rewards/rejected": 0.32421875, + "step": 1796 + }, + { + "epoch": 0.518390307226309, + "grad_norm": 11.714203265773671, + "learning_rate": 2.7751742342374785e-07, + "logits/chosen": 3.28125, + "logits/rejected": 3.296875, + "logps/chosen": -1688.0, + "logps/rejected": -1776.0, + "loss": 0.6896, + "loss/demonstration_loss": -3504.0, + "loss/preference_loss": -3520.0, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.48828125, + "rewards/margins": -0.01953125, + "rewards/rejected": 0.5078125, + "step": 1797 + }, + { + "epoch": 0.5186787826337804, + "grad_norm": 11.394969558183211, + "learning_rate": 2.7726712865341297e-07, + "logits/chosen": 3.0625, + "logits/rejected": 3.234375, + "logps/chosen": -1488.0, + "logps/rejected": -1448.0, + "loss": 0.6825, + "loss/demonstration_loss": -2960.0, + "loss/preference_loss": -2944.0, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.171875, + "rewards/margins": 0.025390625, + "rewards/rejected": 0.146484375, + "step": 1798 + }, + { + "epoch": 0.518967258041252, + "grad_norm": 10.172716302831192, + "learning_rate": 2.770168062194991e-07, + "logits/chosen": 3.234375, + "logits/rejected": 3.359375, + "logps/chosen": -1256.0, + "logps/rejected": -1144.0, + "loss": 0.6469, + "loss/demonstration_loss": -2416.0, + "loss/preference_loss": -2416.0, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1689453125, + "rewards/margins": 0.0234375, + "rewards/rejected": 0.1455078125, + "step": 1799 + }, + { + "epoch": 0.5192557334487234, + "grad_norm": 11.763509760059897, + "learning_rate": 2.767664563759683e-07, + "logits/chosen": 3.109375, + "logits/rejected": 3.09375, + "logps/chosen": -1848.0, + "logps/rejected": -1832.0, + "loss": 0.6771, + "loss/demonstration_loss": -3712.0, + "loss/preference_loss": -3712.0, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.390625, + "rewards/margins": 0.068359375, + "rewards/rejected": 0.322265625, + "step": 1800 + }, + { + "epoch": 0.5192557334487234, + "step": 1800, + "total_flos": 0.0, + "train_loss": 0.0, + "train_runtime": 1.1312, + "train_samples_per_second": 35360.967, + "train_steps_per_second": 1105.03 + } + ], + "logging_steps": 1, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}