{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5192557334487234, "eval_steps": 500, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002884754074715131, "grad_norm": 15.105622098355184, "learning_rate": 1.440922190201729e-09, "logits/chosen": 2.75, "logits/rejected": 2.828125, "logps/chosen": -1552.0, "logps/rejected": -1752.0, "loss": 0.6947, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1435546875, "rewards/margins": 0.012451171875, "rewards/rejected": -0.15625, "step": 1 }, { "epoch": 0.0005769508149430262, "grad_norm": 13.38233941885805, "learning_rate": 2.881844380403458e-09, "logits/chosen": 2.71875, "logits/rejected": 2.75, "logps/chosen": -1640.0, "logps/rejected": -1560.0, "loss": 0.6732, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.09521484375, "rewards/margins": 0.058837890625, "rewards/rejected": -0.154296875, "step": 2 }, { "epoch": 0.0008654262224145391, "grad_norm": 10.63226296734449, "learning_rate": 4.3227665706051874e-09, "logits/chosen": 2.875, "logits/rejected": 2.8125, "logps/chosen": -1632.0, "logps/rejected": -1672.0, "loss": 0.7098, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.09326171875, "rewards/margins": -0.0230712890625, "rewards/rejected": -0.0703125, "step": 3 }, { "epoch": 0.0011539016298860523, "grad_norm": 14.234756126774585, "learning_rate": 5.763688760806916e-09, "logits/chosen": 2.734375, "logits/rejected": 2.6875, "logps/chosen": -1656.0, "logps/rejected": -1824.0, "loss": 0.6915, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.1865234375, "rewards/margins": -0.0306396484375, "rewards/rejected": -0.15625, "step": 4 }, { "epoch": 0.0014423770373575653, "grad_norm": 14.47425314509406, "learning_rate": 7.204610951008645e-09, "logits/chosen": 2.875, "logits/rejected": 2.859375, "logps/chosen": -1816.0, "logps/rejected": -1472.0, "loss": 0.6982, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.1240234375, "rewards/margins": 0.041748046875, "rewards/rejected": -0.166015625, "step": 5 }, { "epoch": 0.0017308524448290783, "grad_norm": 12.392300151453513, "learning_rate": 8.645533141210375e-09, "logits/chosen": 2.765625, "logits/rejected": 2.859375, "logps/chosen": -1944.0, "logps/rejected": -1824.0, "loss": 0.7202, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1376953125, "rewards/margins": -0.02001953125, "rewards/rejected": -0.11767578125, "step": 6 }, { "epoch": 0.0020193278523005912, "grad_norm": 11.958961211656046, "learning_rate": 1.0086455331412104e-08, "logits/chosen": 2.78125, "logits/rejected": 2.8125, "logps/chosen": -1768.0, "logps/rejected": -1792.0, "loss": 0.6884, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1318359375, "rewards/margins": 0.0081787109375, "rewards/rejected": -0.140625, "step": 7 }, { "epoch": 0.0023078032597721046, "grad_norm": 12.540658855939016, "learning_rate": 1.1527377521613832e-08, "logits/chosen": 2.921875, "logits/rejected": 2.9375, "logps/chosen": -1864.0, "logps/rejected": -1928.0, "loss": 0.7207, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.12890625, "rewards/margins": -0.02880859375, "rewards/rejected": -0.10009765625, "step": 8 }, { "epoch": 0.0025962786672436176, "grad_norm": 13.090625634922002, "learning_rate": 1.2968299711815562e-08, "logits/chosen": 2.65625, "logits/rejected": 2.71875, "logps/chosen": -2208.0, "logps/rejected": -2144.0, "loss": 0.7173, "loss/demonstration_loss": -4352.0, "loss/preference_loss": -4352.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.240234375, "rewards/margins": -0.053955078125, "rewards/rejected": -0.1865234375, "step": 9 }, { "epoch": 0.0028847540747151306, "grad_norm": 15.235836453697397, "learning_rate": 1.440922190201729e-08, "logits/chosen": 2.953125, "logits/rejected": 3.015625, "logps/chosen": -1504.0, "logps/rejected": -1336.0, "loss": 0.7042, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2832.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1279296875, "rewards/margins": -0.0263671875, "rewards/rejected": -0.10107421875, "step": 10 }, { "epoch": 0.0031732294821866435, "grad_norm": 14.204409404027695, "learning_rate": 1.585014409221902e-08, "logits/chosen": 2.8125, "logits/rejected": 2.75, "logps/chosen": -1616.0, "logps/rejected": -1648.0, "loss": 0.6975, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.13671875, "rewards/margins": 0.00115966796875, "rewards/rejected": -0.1376953125, "step": 11 }, { "epoch": 0.0034617048896581565, "grad_norm": 18.844810544155305, "learning_rate": 1.729106628242075e-08, "logits/chosen": 2.796875, "logits/rejected": 2.8125, "logps/chosen": -1576.0, "logps/rejected": -1440.0, "loss": 0.7234, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.142578125, "rewards/margins": -0.0419921875, "rewards/rejected": -0.1005859375, "step": 12 }, { "epoch": 0.0037501802971296695, "grad_norm": 12.791797669534505, "learning_rate": 1.8731988472622476e-08, "logits/chosen": 2.859375, "logits/rejected": 2.859375, "logps/chosen": -1744.0, "logps/rejected": -1696.0, "loss": 0.6899, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.1259765625, "rewards/margins": 0.03466796875, "rewards/rejected": -0.16015625, "step": 13 }, { "epoch": 0.0040386557046011825, "grad_norm": 12.869716561025204, "learning_rate": 2.0172910662824208e-08, "logits/chosen": 2.71875, "logits/rejected": 2.6875, "logps/chosen": -1672.0, "logps/rejected": -1744.0, "loss": 0.6702, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.08740234375, "rewards/margins": 0.0125732421875, "rewards/rejected": -0.10009765625, "step": 14 }, { "epoch": 0.004327131112072695, "grad_norm": 12.93353721714942, "learning_rate": 2.1613832853025937e-08, "logits/chosen": 2.75, "logits/rejected": 2.71875, "logps/chosen": -1568.0, "logps/rejected": -1408.0, "loss": 0.6797, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.130859375, "rewards/margins": -0.00689697265625, "rewards/rejected": -0.1240234375, "step": 15 }, { "epoch": 0.004615606519544209, "grad_norm": 12.88234502217038, "learning_rate": 2.3054755043227663e-08, "logits/chosen": 2.828125, "logits/rejected": 2.859375, "logps/chosen": -1712.0, "logps/rejected": -1736.0, "loss": 0.7031, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.181640625, "rewards/margins": 0.00616455078125, "rewards/rejected": -0.1875, "step": 16 }, { "epoch": 0.004904081927015722, "grad_norm": 15.034716429557914, "learning_rate": 2.4495677233429392e-08, "logits/chosen": 2.65625, "logits/rejected": 2.5625, "logps/chosen": -1648.0, "logps/rejected": -1672.0, "loss": 0.7188, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1630859375, "rewards/margins": -0.06591796875, "rewards/rejected": -0.09716796875, "step": 17 }, { "epoch": 0.005192557334487235, "grad_norm": 17.502122918901282, "learning_rate": 2.5936599423631125e-08, "logits/chosen": 2.859375, "logits/rejected": 2.828125, "logps/chosen": -1888.0, "logps/rejected": -1984.0, "loss": 0.7198, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.181640625, "rewards/margins": -0.07861328125, "rewards/rejected": -0.1025390625, "step": 18 }, { "epoch": 0.005481032741958748, "grad_norm": 15.442527217563052, "learning_rate": 2.7377521613832854e-08, "logits/chosen": 2.921875, "logits/rejected": 3.015625, "logps/chosen": -1944.0, "logps/rejected": -1936.0, "loss": 0.7505, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.2216796875, "rewards/margins": -0.095703125, "rewards/rejected": -0.1259765625, "step": 19 }, { "epoch": 0.005769508149430261, "grad_norm": 15.794170115027935, "learning_rate": 2.881844380403458e-08, "logits/chosen": 2.90625, "logits/rejected": 2.921875, "logps/chosen": -1560.0, "logps/rejected": -1512.0, "loss": 0.7106, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.150390625, "rewards/margins": -0.04443359375, "rewards/rejected": -0.1064453125, "step": 20 }, { "epoch": 0.006057983556901774, "grad_norm": 12.564747763926832, "learning_rate": 3.025936599423631e-08, "logits/chosen": 2.84375, "logits/rejected": 2.890625, "logps/chosen": -1616.0, "logps/rejected": -1544.0, "loss": 0.7095, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1591796875, "rewards/margins": -0.048828125, "rewards/rejected": -0.1103515625, "step": 21 }, { "epoch": 0.006346458964373287, "grad_norm": 12.59940769824516, "learning_rate": 3.170028818443804e-08, "logits/chosen": 2.6875, "logits/rejected": 2.59375, "logps/chosen": -1960.0, "logps/rejected": -1720.0, "loss": 0.7194, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1875, "rewards/margins": -0.07373046875, "rewards/rejected": -0.11376953125, "step": 22 }, { "epoch": 0.0066349343718448, "grad_norm": 17.171978453205284, "learning_rate": 3.314121037463977e-08, "logits/chosen": 2.890625, "logits/rejected": 2.953125, "logps/chosen": -1960.0, "logps/rejected": -1752.0, "loss": 0.7145, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.154296875, "rewards/margins": -0.0576171875, "rewards/rejected": -0.09619140625, "step": 23 }, { "epoch": 0.006923409779316313, "grad_norm": 16.54562857083525, "learning_rate": 3.45821325648415e-08, "logits/chosen": 2.828125, "logits/rejected": 2.796875, "logps/chosen": -1272.0, "logps/rejected": -1336.0, "loss": 0.7218, "loss/demonstration_loss": -2592.0, "loss/preference_loss": -2608.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.16015625, "rewards/margins": -0.095703125, "rewards/rejected": -0.064453125, "step": 24 }, { "epoch": 0.007211885186787826, "grad_norm": 13.444888343207758, "learning_rate": 3.6023054755043225e-08, "logits/chosen": 2.609375, "logits/rejected": 2.703125, "logps/chosen": -1592.0, "logps/rejected": -1584.0, "loss": 0.6952, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.150390625, "rewards/margins": -0.0263671875, "rewards/rejected": -0.1240234375, "step": 25 }, { "epoch": 0.007500360594259339, "grad_norm": 12.995839736424188, "learning_rate": 3.746397694524495e-08, "logits/chosen": 2.890625, "logits/rejected": 2.8125, "logps/chosen": -1624.0, "logps/rejected": -1840.0, "loss": 0.6986, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1162109375, "rewards/margins": -0.01556396484375, "rewards/rejected": -0.1005859375, "step": 26 }, { "epoch": 0.007788836001730853, "grad_norm": 13.86653715582766, "learning_rate": 3.8904899135446684e-08, "logits/chosen": 2.765625, "logits/rejected": 2.75, "logps/chosen": -1600.0, "logps/rejected": -1624.0, "loss": 0.6911, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.134765625, "rewards/margins": -0.02001953125, "rewards/rejected": -0.115234375, "step": 27 }, { "epoch": 0.008077311409202365, "grad_norm": 13.030726174120343, "learning_rate": 4.0345821325648416e-08, "logits/chosen": 2.734375, "logits/rejected": 2.71875, "logps/chosen": -1536.0, "logps/rejected": -1464.0, "loss": 0.7053, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1796875, "rewards/margins": -0.0263671875, "rewards/rejected": -0.154296875, "step": 28 }, { "epoch": 0.008365786816673878, "grad_norm": 15.683952023128214, "learning_rate": 4.178674351585014e-08, "logits/chosen": 2.921875, "logits/rejected": 3.0, "logps/chosen": -1624.0, "logps/rejected": -1192.0, "loss": 0.6995, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.08642578125, "rewards/margins": 0.030517578125, "rewards/rejected": -0.11669921875, "step": 29 }, { "epoch": 0.00865426222414539, "grad_norm": 13.90180927179223, "learning_rate": 4.3227665706051874e-08, "logits/chosen": 2.875, "logits/rejected": 2.859375, "logps/chosen": -1832.0, "logps/rejected": -1872.0, "loss": 0.6894, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.625, "rewards/chosen": -0.080078125, "rewards/margins": 0.09521484375, "rewards/rejected": -0.1748046875, "step": 30 }, { "epoch": 0.008942737631616904, "grad_norm": 17.76007574919579, "learning_rate": 4.46685878962536e-08, "logits/chosen": 2.84375, "logits/rejected": 2.859375, "logps/chosen": -1880.0, "logps/rejected": -1816.0, "loss": 0.6858, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.15234375, "rewards/margins": 0.018798828125, "rewards/rejected": -0.171875, "step": 31 }, { "epoch": 0.009231213039088419, "grad_norm": 12.92750956578199, "learning_rate": 4.6109510086455326e-08, "logits/chosen": 2.890625, "logits/rejected": 2.84375, "logps/chosen": -1816.0, "logps/rejected": -1824.0, "loss": 0.7167, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.15625, "rewards/margins": -0.0224609375, "rewards/rejected": -0.1337890625, "step": 32 }, { "epoch": 0.009519688446559932, "grad_norm": 13.672123525697366, "learning_rate": 4.755043227665706e-08, "logits/chosen": 2.625, "logits/rejected": 2.6875, "logps/chosen": -1256.0, "logps/rejected": -1224.0, "loss": 0.7009, "loss/demonstration_loss": -2464.0, "loss/preference_loss": -2464.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.142578125, "rewards/margins": -0.003753662109375, "rewards/rejected": -0.138671875, "step": 33 }, { "epoch": 0.009808163854031444, "grad_norm": 11.021237673944645, "learning_rate": 4.8991354466858784e-08, "logits/chosen": 2.90625, "logits/rejected": 2.921875, "logps/chosen": -1384.0, "logps/rejected": -1600.0, "loss": 0.6967, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11279296875, "rewards/margins": -0.012451171875, "rewards/rejected": -0.10009765625, "step": 34 }, { "epoch": 0.010096639261502957, "grad_norm": 14.153400876179415, "learning_rate": 5.043227665706052e-08, "logits/chosen": 2.75, "logits/rejected": 2.8125, "logps/chosen": -1816.0, "logps/rejected": -1640.0, "loss": 0.7126, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.19921875, "rewards/margins": -0.043212890625, "rewards/rejected": -0.1552734375, "step": 35 }, { "epoch": 0.01038511466897447, "grad_norm": 13.06943765898696, "learning_rate": 5.187319884726225e-08, "logits/chosen": 2.90625, "logits/rejected": 2.984375, "logps/chosen": -1696.0, "logps/rejected": -1680.0, "loss": 0.6866, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.10107421875, "rewards/margins": -0.015380859375, "rewards/rejected": -0.08544921875, "step": 36 }, { "epoch": 0.010673590076445983, "grad_norm": 11.48175185532301, "learning_rate": 5.3314121037463975e-08, "logits/chosen": 2.875, "logits/rejected": 2.859375, "logps/chosen": -1992.0, "logps/rejected": -2112.0, "loss": 0.6775, "loss/demonstration_loss": -4080.0, "loss/preference_loss": -4080.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.158203125, "rewards/margins": 0.046875, "rewards/rejected": -0.205078125, "step": 37 }, { "epoch": 0.010962065483917496, "grad_norm": 12.88120116924046, "learning_rate": 5.475504322766571e-08, "logits/chosen": 2.9375, "logits/rejected": 2.9375, "logps/chosen": -1656.0, "logps/rejected": -1568.0, "loss": 0.6763, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1455078125, "rewards/margins": 0.034912109375, "rewards/rejected": -0.1796875, "step": 38 }, { "epoch": 0.01125054089138901, "grad_norm": 13.50696208674342, "learning_rate": 5.6195965417867433e-08, "logits/chosen": 2.921875, "logits/rejected": 2.890625, "logps/chosen": -1472.0, "logps/rejected": -1376.0, "loss": 0.7014, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0888671875, "rewards/margins": 0.002838134765625, "rewards/rejected": -0.091796875, "step": 39 }, { "epoch": 0.011539016298860522, "grad_norm": 12.5791808171264, "learning_rate": 5.763688760806916e-08, "logits/chosen": 2.828125, "logits/rejected": 2.796875, "logps/chosen": -1984.0, "logps/rejected": -1952.0, "loss": 0.6979, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3920.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0927734375, "rewards/margins": 0.0400390625, "rewards/rejected": -0.1328125, "step": 40 }, { "epoch": 0.011827491706332035, "grad_norm": 18.17073907429118, "learning_rate": 5.907780979827089e-08, "logits/chosen": 2.796875, "logits/rejected": 2.78125, "logps/chosen": -1768.0, "logps/rejected": -1632.0, "loss": 0.7208, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1875, "rewards/margins": -0.1025390625, "rewards/rejected": -0.0849609375, "step": 41 }, { "epoch": 0.012115967113803548, "grad_norm": 17.653131599336444, "learning_rate": 6.051873198847262e-08, "logits/chosen": 2.890625, "logits/rejected": 2.890625, "logps/chosen": -1680.0, "logps/rejected": -1488.0, "loss": 0.6991, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1416015625, "rewards/margins": -0.002838134765625, "rewards/rejected": -0.138671875, "step": 42 }, { "epoch": 0.012404442521275061, "grad_norm": 15.3987909231053, "learning_rate": 6.195965417867434e-08, "logits/chosen": 2.796875, "logits/rejected": 2.8125, "logps/chosen": -2384.0, "logps/rejected": -1936.0, "loss": 0.7133, "loss/demonstration_loss": -4320.0, "loss/preference_loss": -4320.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.22265625, "rewards/margins": 0.02490234375, "rewards/rejected": -0.248046875, "step": 43 }, { "epoch": 0.012692917928746574, "grad_norm": 13.991809287766788, "learning_rate": 6.340057636887608e-08, "logits/chosen": 2.78125, "logits/rejected": 2.71875, "logps/chosen": -1768.0, "logps/rejected": -1864.0, "loss": 0.7067, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.1611328125, "rewards/margins": -0.021240234375, "rewards/rejected": -0.1396484375, "step": 44 }, { "epoch": 0.012981393336218087, "grad_norm": 13.067207440719582, "learning_rate": 6.484149855907781e-08, "logits/chosen": 2.921875, "logits/rejected": 2.953125, "logps/chosen": -1952.0, "logps/rejected": -1720.0, "loss": 0.7087, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.1474609375, "rewards/margins": -0.0224609375, "rewards/rejected": -0.125, "step": 45 }, { "epoch": 0.0132698687436896, "grad_norm": 12.431510651646285, "learning_rate": 6.628242074927953e-08, "logits/chosen": 2.9375, "logits/rejected": 2.921875, "logps/chosen": -1896.0, "logps/rejected": -1864.0, "loss": 0.7286, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1953125, "rewards/margins": -0.0859375, "rewards/rejected": -0.109375, "step": 46 }, { "epoch": 0.013558344151161113, "grad_norm": 14.707946307588788, "learning_rate": 6.772334293948126e-08, "logits/chosen": 2.765625, "logits/rejected": 2.6875, "logps/chosen": -1992.0, "logps/rejected": -2032.0, "loss": 0.6945, "loss/demonstration_loss": -4016.0, "loss/preference_loss": -4016.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1728515625, "rewards/margins": 0.0, "rewards/rejected": -0.1728515625, "step": 47 }, { "epoch": 0.013846819558632626, "grad_norm": 13.897995519062215, "learning_rate": 6.9164265129683e-08, "logits/chosen": 2.8125, "logits/rejected": 2.828125, "logps/chosen": -1632.0, "logps/rejected": -1400.0, "loss": 0.7103, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10888671875, "rewards/margins": 0.0, "rewards/rejected": -0.10888671875, "step": 48 }, { "epoch": 0.014135294966104139, "grad_norm": 12.275750579827823, "learning_rate": 7.060518731988472e-08, "logits/chosen": 2.96875, "logits/rejected": 2.96875, "logps/chosen": -2192.0, "logps/rejected": -2144.0, "loss": 0.6973, "loss/demonstration_loss": -4320.0, "loss/preference_loss": -4320.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.146484375, "rewards/margins": 0.039306640625, "rewards/rejected": -0.185546875, "step": 49 }, { "epoch": 0.014423770373575652, "grad_norm": 12.468090358827032, "learning_rate": 7.204610951008645e-08, "logits/chosen": 2.921875, "logits/rejected": 2.859375, "logps/chosen": -2008.0, "logps/rejected": -2144.0, "loss": 0.6705, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.625, "rewards/chosen": -0.0732421875, "rewards/margins": 0.119140625, "rewards/rejected": -0.1923828125, "step": 50 }, { "epoch": 0.014712245781047165, "grad_norm": 12.444198551952201, "learning_rate": 7.348703170028818e-08, "logits/chosen": 2.796875, "logits/rejected": 2.75, "logps/chosen": -2000.0, "logps/rejected": -1784.0, "loss": 0.7191, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.1650390625, "rewards/margins": -0.08984375, "rewards/rejected": -0.0751953125, "step": 51 }, { "epoch": 0.015000721188518678, "grad_norm": 15.483526762563997, "learning_rate": 7.49279538904899e-08, "logits/chosen": 2.90625, "logits/rejected": 2.984375, "logps/chosen": -1696.0, "logps/rejected": -1568.0, "loss": 0.7098, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.09765625, "rewards/margins": 0.0050048828125, "rewards/rejected": -0.1025390625, "step": 52 }, { "epoch": 0.015289196595990193, "grad_norm": 13.08033369855885, "learning_rate": 7.636887608069163e-08, "logits/chosen": 2.828125, "logits/rejected": 2.859375, "logps/chosen": -1704.0, "logps/rejected": -1384.0, "loss": 0.6826, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.171875, "rewards/margins": -0.0224609375, "rewards/rejected": -0.1484375, "step": 53 }, { "epoch": 0.015577672003461706, "grad_norm": 13.451313196439699, "learning_rate": 7.780979827089337e-08, "logits/chosen": 2.8125, "logits/rejected": 2.78125, "logps/chosen": -1888.0, "logps/rejected": -1760.0, "loss": 0.7076, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.11767578125, "rewards/margins": -0.0025177001953125, "rewards/rejected": -0.115234375, "step": 54 }, { "epoch": 0.01586614741093322, "grad_norm": 16.85520055071422, "learning_rate": 7.925072046109509e-08, "logits/chosen": 2.75, "logits/rejected": 2.828125, "logps/chosen": -1632.0, "logps/rejected": -1560.0, "loss": 0.7167, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.2177734375, "rewards/margins": -0.039306640625, "rewards/rejected": -0.1787109375, "step": 55 }, { "epoch": 0.01615462281840473, "grad_norm": 12.865724187002614, "learning_rate": 8.069164265129683e-08, "logits/chosen": 2.828125, "logits/rejected": 2.90625, "logps/chosen": -2032.0, "logps/rejected": -1784.0, "loss": 0.71, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.259765625, "rewards/margins": -0.05078125, "rewards/rejected": -0.2099609375, "step": 56 }, { "epoch": 0.016443098225876245, "grad_norm": 11.59857459648346, "learning_rate": 8.213256484149856e-08, "logits/chosen": 2.765625, "logits/rejected": 2.6875, "logps/chosen": -1640.0, "logps/rejected": -1776.0, "loss": 0.7191, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09765625, "rewards/margins": 0.003753662109375, "rewards/rejected": -0.1015625, "step": 57 }, { "epoch": 0.016731573633347756, "grad_norm": 14.902926837818534, "learning_rate": 8.357348703170028e-08, "logits/chosen": 2.5625, "logits/rejected": 2.71875, "logps/chosen": -1800.0, "logps/rejected": -1488.0, "loss": 0.7085, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.146484375, "rewards/margins": -0.07568359375, "rewards/rejected": -0.0703125, "step": 58 }, { "epoch": 0.01702004904081927, "grad_norm": 13.728302979266594, "learning_rate": 8.501440922190202e-08, "logits/chosen": 2.8125, "logits/rejected": 2.828125, "logps/chosen": -1696.0, "logps/rejected": -1832.0, "loss": 0.6771, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1376953125, "rewards/margins": 0.0400390625, "rewards/rejected": -0.177734375, "step": 59 }, { "epoch": 0.01730852444829078, "grad_norm": 16.461096859082186, "learning_rate": 8.645533141210375e-08, "logits/chosen": 2.75, "logits/rejected": 2.71875, "logps/chosen": -1816.0, "logps/rejected": -1920.0, "loss": 0.7271, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1513671875, "rewards/margins": -0.068359375, "rewards/rejected": -0.0830078125, "step": 60 }, { "epoch": 0.017596999855762296, "grad_norm": 10.891807329172197, "learning_rate": 8.789625360230547e-08, "logits/chosen": 2.859375, "logits/rejected": 2.796875, "logps/chosen": -1184.0, "logps/rejected": -1152.0, "loss": 0.6869, "loss/demonstration_loss": -2320.0, "loss/preference_loss": -2320.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08642578125, "rewards/margins": 0.00750732421875, "rewards/rejected": -0.09423828125, "step": 61 }, { "epoch": 0.017885475263233808, "grad_norm": 11.177363888430538, "learning_rate": 8.93371757925072e-08, "logits/chosen": 2.9375, "logits/rejected": 2.984375, "logps/chosen": -1568.0, "logps/rejected": -1536.0, "loss": 0.6735, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.05810546875, "rewards/margins": 0.04443359375, "rewards/rejected": -0.1025390625, "step": 62 }, { "epoch": 0.018173950670705322, "grad_norm": 14.063135789692483, "learning_rate": 9.077809798270893e-08, "logits/chosen": 2.921875, "logits/rejected": 2.9375, "logps/chosen": -1976.0, "logps/rejected": -1896.0, "loss": 0.7209, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1435546875, "rewards/margins": -0.06396484375, "rewards/rejected": -0.080078125, "step": 63 }, { "epoch": 0.018462426078176837, "grad_norm": 15.930494726432237, "learning_rate": 9.221902017291065e-08, "logits/chosen": 2.6875, "logits/rejected": 2.65625, "logps/chosen": -1416.0, "logps/rejected": -1424.0, "loss": 0.7045, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2832.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.12060546875, "rewards/margins": -0.0230712890625, "rewards/rejected": -0.09765625, "step": 64 }, { "epoch": 0.01875090148564835, "grad_norm": 11.892934715925998, "learning_rate": 9.365994236311239e-08, "logits/chosen": 2.8125, "logits/rejected": 2.765625, "logps/chosen": -1736.0, "logps/rejected": -1664.0, "loss": 0.698, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1201171875, "rewards/margins": 0.04052734375, "rewards/rejected": -0.1611328125, "step": 65 }, { "epoch": 0.019039376893119863, "grad_norm": 13.55115315282989, "learning_rate": 9.510086455331412e-08, "logits/chosen": 2.921875, "logits/rejected": 2.9375, "logps/chosen": -1808.0, "logps/rejected": -1696.0, "loss": 0.7002, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08642578125, "rewards/margins": 0.06884765625, "rewards/rejected": -0.1552734375, "step": 66 }, { "epoch": 0.019327852300591374, "grad_norm": 13.20889923237034, "learning_rate": 9.654178674351584e-08, "logits/chosen": 2.8125, "logits/rejected": 2.859375, "logps/chosen": -1872.0, "logps/rejected": -1864.0, "loss": 0.7179, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.203125, "rewards/margins": -0.0301513671875, "rewards/rejected": -0.1728515625, "step": 67 }, { "epoch": 0.01961632770806289, "grad_norm": 13.858685182563192, "learning_rate": 9.798270893371757e-08, "logits/chosen": 2.796875, "logits/rejected": 2.765625, "logps/chosen": -1632.0, "logps/rejected": -1472.0, "loss": 0.7089, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.197265625, "rewards/margins": -0.07275390625, "rewards/rejected": -0.12451171875, "step": 68 }, { "epoch": 0.0199048031155344, "grad_norm": 13.189866812536431, "learning_rate": 9.94236311239193e-08, "logits/chosen": 2.875, "logits/rejected": 2.84375, "logps/chosen": -1808.0, "logps/rejected": -1776.0, "loss": 0.706, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1611328125, "rewards/margins": -0.01434326171875, "rewards/rejected": -0.146484375, "step": 69 }, { "epoch": 0.020193278523005915, "grad_norm": 12.815904493910594, "learning_rate": 1.0086455331412103e-07, "logits/chosen": 2.828125, "logits/rejected": 2.84375, "logps/chosen": -1816.0, "logps/rejected": -1816.0, "loss": 0.7035, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.125, "rewards/margins": -0.00872802734375, "rewards/rejected": -0.1162109375, "step": 70 }, { "epoch": 0.020481753930477426, "grad_norm": 16.66476038366138, "learning_rate": 1.0230547550432277e-07, "logits/chosen": 2.703125, "logits/rejected": 2.84375, "logps/chosen": -1368.0, "logps/rejected": -1088.0, "loss": 0.7234, "loss/demonstration_loss": -2432.0, "loss/preference_loss": -2448.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1806640625, "rewards/margins": -0.08056640625, "rewards/rejected": -0.10009765625, "step": 71 }, { "epoch": 0.02077022933794894, "grad_norm": 11.2773314220454, "learning_rate": 1.037463976945245e-07, "logits/chosen": 2.84375, "logits/rejected": 2.921875, "logps/chosen": -1440.0, "logps/rejected": -1216.0, "loss": 0.697, "loss/demonstration_loss": -2640.0, "loss/preference_loss": -2656.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.12255859375, "rewards/margins": -0.0162353515625, "rewards/rejected": -0.1064453125, "step": 72 }, { "epoch": 0.021058704745420452, "grad_norm": 12.97423541376043, "learning_rate": 1.0518731988472622e-07, "logits/chosen": 2.828125, "logits/rejected": 2.796875, "logps/chosen": -1528.0, "logps/rejected": -1440.0, "loss": 0.6967, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1611328125, "rewards/margins": 0.0054931640625, "rewards/rejected": -0.166015625, "step": 73 }, { "epoch": 0.021347180152891967, "grad_norm": 13.209840943982604, "learning_rate": 1.0662824207492795e-07, "logits/chosen": 2.75, "logits/rejected": 2.78125, "logps/chosen": -1552.0, "logps/rejected": -1336.0, "loss": 0.7178, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1064453125, "rewards/margins": -0.04638671875, "rewards/rejected": -0.06005859375, "step": 74 }, { "epoch": 0.021635655560363478, "grad_norm": 16.866049245330643, "learning_rate": 1.0806916426512968e-07, "logits/chosen": 2.828125, "logits/rejected": 2.96875, "logps/chosen": -1936.0, "logps/rejected": -1576.0, "loss": 0.7155, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.173828125, "rewards/margins": -0.0286865234375, "rewards/rejected": -0.1455078125, "step": 75 }, { "epoch": 0.021924130967834993, "grad_norm": 11.6298357116585, "learning_rate": 1.0951008645533142e-07, "logits/chosen": 2.8125, "logits/rejected": 2.765625, "logps/chosen": -1704.0, "logps/rejected": -1680.0, "loss": 0.6767, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0703125, "rewards/margins": 0.061279296875, "rewards/rejected": -0.1318359375, "step": 76 }, { "epoch": 0.022212606375306504, "grad_norm": 14.091723378559442, "learning_rate": 1.1095100864553314e-07, "logits/chosen": 2.859375, "logits/rejected": 2.84375, "logps/chosen": -2128.0, "logps/rejected": -2208.0, "loss": 0.707, "loss/demonstration_loss": -4320.0, "loss/preference_loss": -4320.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.1875, "rewards/margins": -0.09521484375, "rewards/rejected": -0.0927734375, "step": 77 }, { "epoch": 0.02250108178277802, "grad_norm": 13.86489071963396, "learning_rate": 1.1239193083573487e-07, "logits/chosen": 2.828125, "logits/rejected": 2.875, "logps/chosen": -1896.0, "logps/rejected": -1928.0, "loss": 0.6869, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.625, "rewards/chosen": -0.125, "rewards/margins": 0.0625, "rewards/rejected": -0.1875, "step": 78 }, { "epoch": 0.02278955719024953, "grad_norm": 12.316191181409046, "learning_rate": 1.1383285302593659e-07, "logits/chosen": 2.921875, "logits/rejected": 2.953125, "logps/chosen": -1984.0, "logps/rejected": -1768.0, "loss": 0.7059, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.1064453125, "rewards/margins": -0.03564453125, "rewards/rejected": -0.07080078125, "step": 79 }, { "epoch": 0.023078032597721045, "grad_norm": 15.10554682285765, "learning_rate": 1.1527377521613832e-07, "logits/chosen": 2.953125, "logits/rejected": 2.890625, "logps/chosen": -1768.0, "logps/rejected": -1808.0, "loss": 0.74, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.20703125, "rewards/margins": -0.056396484375, "rewards/rejected": -0.150390625, "step": 80 }, { "epoch": 0.023366508005192556, "grad_norm": 12.586347980858184, "learning_rate": 1.1671469740634004e-07, "logits/chosen": 2.828125, "logits/rejected": 2.828125, "logps/chosen": -1624.0, "logps/rejected": -1712.0, "loss": 0.6951, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.1171875, "rewards/margins": -0.034423828125, "rewards/rejected": -0.08251953125, "step": 81 }, { "epoch": 0.02365498341266407, "grad_norm": 14.22680500938693, "learning_rate": 1.1815561959654178e-07, "logits/chosen": 2.765625, "logits/rejected": 2.796875, "logps/chosen": -1416.0, "logps/rejected": -1440.0, "loss": 0.7041, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1201171875, "rewards/margins": -0.0113525390625, "rewards/rejected": -0.10888671875, "step": 82 }, { "epoch": 0.023943458820135582, "grad_norm": 11.682746980381589, "learning_rate": 1.195965417867435e-07, "logits/chosen": 2.984375, "logits/rejected": 2.953125, "logps/chosen": -2112.0, "logps/rejected": -2128.0, "loss": 0.6851, "loss/demonstration_loss": -4224.0, "loss/preference_loss": -4224.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.10498046875, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.10009765625, "step": 83 }, { "epoch": 0.024231934227607096, "grad_norm": 13.036826056110216, "learning_rate": 1.2103746397694524e-07, "logits/chosen": 2.71875, "logits/rejected": 2.78125, "logps/chosen": -1944.0, "logps/rejected": -1760.0, "loss": 0.6872, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.12255859375, "rewards/margins": -0.012451171875, "rewards/rejected": -0.1103515625, "step": 84 }, { "epoch": 0.02452040963507861, "grad_norm": 12.935984208490911, "learning_rate": 1.2247838616714696e-07, "logits/chosen": 2.703125, "logits/rejected": 2.625, "logps/chosen": -1848.0, "logps/rejected": -1904.0, "loss": 0.6606, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.134765625, "rewards/margins": 0.0673828125, "rewards/rejected": -0.203125, "step": 85 }, { "epoch": 0.024808885042550122, "grad_norm": 13.826537769831619, "learning_rate": 1.2391930835734869e-07, "logits/chosen": 2.9375, "logits/rejected": 3.0, "logps/chosen": -1816.0, "logps/rejected": -1768.0, "loss": 0.7135, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1953125, "rewards/margins": -0.06494140625, "rewards/rejected": -0.1298828125, "step": 86 }, { "epoch": 0.025097360450021637, "grad_norm": 12.225318828058995, "learning_rate": 1.2536023054755044e-07, "logits/chosen": 2.9375, "logits/rejected": 2.984375, "logps/chosen": -1640.0, "logps/rejected": -1608.0, "loss": 0.7056, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.1328125, "rewards/margins": -0.0150146484375, "rewards/rejected": -0.11767578125, "step": 87 }, { "epoch": 0.02538583585749315, "grad_norm": 14.57331483472006, "learning_rate": 1.2680115273775216e-07, "logits/chosen": 2.96875, "logits/rejected": 2.90625, "logps/chosen": -2048.0, "logps/rejected": -1872.0, "loss": 0.7023, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.09521484375, "rewards/margins": 0.0181884765625, "rewards/rejected": -0.11328125, "step": 88 }, { "epoch": 0.025674311264964663, "grad_norm": 14.106600156674954, "learning_rate": 1.282420749279539e-07, "logits/chosen": 2.96875, "logits/rejected": 2.984375, "logps/chosen": -1792.0, "logps/rejected": -1720.0, "loss": 0.7029, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1611328125, "rewards/margins": -0.0712890625, "rewards/rejected": -0.08984375, "step": 89 }, { "epoch": 0.025962786672436174, "grad_norm": 13.01103712770991, "learning_rate": 1.2968299711815562e-07, "logits/chosen": 2.78125, "logits/rejected": 2.828125, "logps/chosen": -1240.0, "logps/rejected": -1320.0, "loss": 0.7124, "loss/demonstration_loss": -2528.0, "loss/preference_loss": -2544.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.18359375, "rewards/margins": -0.068359375, "rewards/rejected": -0.11572265625, "step": 90 }, { "epoch": 0.02625126207990769, "grad_norm": 12.517008339381956, "learning_rate": 1.3112391930835734e-07, "logits/chosen": 2.90625, "logits/rejected": 2.796875, "logps/chosen": -1656.0, "logps/rejected": -1776.0, "loss": 0.6832, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.11962890625, "rewards/margins": 0.0615234375, "rewards/rejected": -0.1806640625, "step": 91 }, { "epoch": 0.0265397374873792, "grad_norm": 13.570496963689006, "learning_rate": 1.3256484149855907e-07, "logits/chosen": 2.890625, "logits/rejected": 2.90625, "logps/chosen": -1312.0, "logps/rejected": -1336.0, "loss": 0.6992, "loss/demonstration_loss": -2640.0, "loss/preference_loss": -2640.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1279296875, "rewards/margins": -0.0113525390625, "rewards/rejected": -0.1162109375, "step": 92 }, { "epoch": 0.026828212894850715, "grad_norm": 13.340900026542391, "learning_rate": 1.340057636887608e-07, "logits/chosen": 2.875, "logits/rejected": 2.875, "logps/chosen": -1696.0, "logps/rejected": -1648.0, "loss": 0.7139, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1455078125, "rewards/margins": -0.0830078125, "rewards/rejected": -0.0625, "step": 93 }, { "epoch": 0.027116688302322226, "grad_norm": 12.511035832484655, "learning_rate": 1.3544668587896252e-07, "logits/chosen": 2.8125, "logits/rejected": 2.84375, "logps/chosen": -2160.0, "logps/rejected": -1840.0, "loss": 0.6982, "loss/demonstration_loss": -4000.0, "loss/preference_loss": -3984.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.11279296875, "rewards/margins": 0.012451171875, "rewards/rejected": -0.125, "step": 94 }, { "epoch": 0.02740516370979374, "grad_norm": 12.497757623359346, "learning_rate": 1.3688760806916425e-07, "logits/chosen": 2.90625, "logits/rejected": 3.0, "logps/chosen": -2080.0, "logps/rejected": -1792.0, "loss": 0.698, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.12255859375, "rewards/margins": -0.023681640625, "rewards/rejected": -0.0986328125, "step": 95 }, { "epoch": 0.027693639117265252, "grad_norm": 12.601935370996427, "learning_rate": 1.38328530259366e-07, "logits/chosen": 2.828125, "logits/rejected": 2.828125, "logps/chosen": -1656.0, "logps/rejected": -1672.0, "loss": 0.6927, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07080078125, "rewards/margins": 0.052490234375, "rewards/rejected": -0.123046875, "step": 96 }, { "epoch": 0.027982114524736767, "grad_norm": 13.988997792118045, "learning_rate": 1.3976945244956772e-07, "logits/chosen": 2.8125, "logits/rejected": 2.84375, "logps/chosen": -1760.0, "logps/rejected": -1600.0, "loss": 0.6881, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09619140625, "rewards/margins": 0.0218505859375, "rewards/rejected": -0.1181640625, "step": 97 }, { "epoch": 0.028270589932208278, "grad_norm": 12.27595689606032, "learning_rate": 1.4121037463976945e-07, "logits/chosen": 2.859375, "logits/rejected": 2.875, "logps/chosen": -1744.0, "logps/rejected": -1616.0, "loss": 0.7092, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.107421875, "rewards/margins": -0.0400390625, "rewards/rejected": -0.0673828125, "step": 98 }, { "epoch": 0.028559065339679793, "grad_norm": 15.315086949174743, "learning_rate": 1.4265129682997118e-07, "logits/chosen": 2.8125, "logits/rejected": 2.828125, "logps/chosen": -1928.0, "logps/rejected": -1832.0, "loss": 0.7043, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.166015625, "rewards/margins": 0.0067138671875, "rewards/rejected": -0.1728515625, "step": 99 }, { "epoch": 0.028847540747151304, "grad_norm": 12.130639080352182, "learning_rate": 1.440922190201729e-07, "logits/chosen": 2.765625, "logits/rejected": 2.875, "logps/chosen": -1976.0, "logps/rejected": -1728.0, "loss": 0.69, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.052490234375, "rewards/margins": 0.043212890625, "rewards/rejected": -0.095703125, "step": 100 }, { "epoch": 0.02913601615462282, "grad_norm": 13.273837294627922, "learning_rate": 1.4553314121037463e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1800.0, "logps/rejected": -1480.0, "loss": 0.7045, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.15234375, "rewards/margins": -0.036865234375, "rewards/rejected": -0.11572265625, "step": 101 }, { "epoch": 0.02942449156209433, "grad_norm": 13.2645073150252, "learning_rate": 1.4697406340057635e-07, "logits/chosen": 2.859375, "logits/rejected": 2.796875, "logps/chosen": -1632.0, "logps/rejected": -1736.0, "loss": 0.6984, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.1298828125, "rewards/margins": 0.0274658203125, "rewards/rejected": -0.1572265625, "step": 102 }, { "epoch": 0.029712966969565845, "grad_norm": 14.798587107954155, "learning_rate": 1.4841498559077808e-07, "logits/chosen": 2.859375, "logits/rejected": 2.828125, "logps/chosen": -1504.0, "logps/rejected": -1632.0, "loss": 0.7145, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.09765625, "rewards/margins": -0.061279296875, "rewards/rejected": -0.036376953125, "step": 103 }, { "epoch": 0.030001442377037356, "grad_norm": 14.76828065189772, "learning_rate": 1.498559077809798e-07, "logits/chosen": 2.796875, "logits/rejected": 2.765625, "logps/chosen": -1624.0, "logps/rejected": -1800.0, "loss": 0.6865, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12255859375, "rewards/margins": 0.08740234375, "rewards/rejected": -0.2099609375, "step": 104 }, { "epoch": 0.03028991778450887, "grad_norm": 14.609936274691126, "learning_rate": 1.5129682997118153e-07, "logits/chosen": 2.984375, "logits/rejected": 2.953125, "logps/chosen": -1616.0, "logps/rejected": -1528.0, "loss": 0.6688, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.06884765625, "rewards/margins": 0.0118408203125, "rewards/rejected": -0.08056640625, "step": 105 }, { "epoch": 0.030578393191980385, "grad_norm": 12.412594246631425, "learning_rate": 1.5273775216138326e-07, "logits/chosen": 2.84375, "logits/rejected": 2.890625, "logps/chosen": -1784.0, "logps/rejected": -1656.0, "loss": 0.6895, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0849609375, "rewards/margins": 0.036376953125, "rewards/rejected": -0.12158203125, "step": 106 }, { "epoch": 0.030866868599451897, "grad_norm": 13.978415254779147, "learning_rate": 1.54178674351585e-07, "logits/chosen": 2.859375, "logits/rejected": 2.796875, "logps/chosen": -1728.0, "logps/rejected": -1608.0, "loss": 0.7108, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1474609375, "rewards/margins": -0.03125, "rewards/rejected": -0.1162109375, "step": 107 }, { "epoch": 0.03115534400692341, "grad_norm": 16.041591734644538, "learning_rate": 1.5561959654178673e-07, "logits/chosen": 2.875, "logits/rejected": 2.859375, "logps/chosen": -1744.0, "logps/rejected": -1688.0, "loss": 0.71, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.1533203125, "rewards/margins": -0.061767578125, "rewards/rejected": -0.091796875, "step": 108 }, { "epoch": 0.031443819414394926, "grad_norm": 14.176549820260293, "learning_rate": 1.5706051873198846e-07, "logits/chosen": 2.6875, "logits/rejected": 2.734375, "logps/chosen": -1472.0, "logps/rejected": -1528.0, "loss": 0.7073, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.125, "rewards/margins": -0.0169677734375, "rewards/rejected": -0.10791015625, "step": 109 }, { "epoch": 0.03173229482186644, "grad_norm": 14.018013413620691, "learning_rate": 1.5850144092219019e-07, "logits/chosen": 2.84375, "logits/rejected": 2.84375, "logps/chosen": -1512.0, "logps/rejected": -1488.0, "loss": 0.6989, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.123046875, "rewards/margins": -0.015625, "rewards/rejected": -0.107421875, "step": 110 }, { "epoch": 0.03202077022933795, "grad_norm": 14.354386488282962, "learning_rate": 1.5994236311239194e-07, "logits/chosen": 2.828125, "logits/rejected": 2.75, "logps/chosen": -2032.0, "logps/rejected": -2024.0, "loss": 0.7012, "loss/demonstration_loss": -4048.0, "loss/preference_loss": -4048.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.13671875, "rewards/margins": 0.001220703125, "rewards/rejected": -0.1376953125, "step": 111 }, { "epoch": 0.03230924563680946, "grad_norm": 11.234816353620845, "learning_rate": 1.6138328530259366e-07, "logits/chosen": 2.703125, "logits/rejected": 2.703125, "logps/chosen": -1392.0, "logps/rejected": -1520.0, "loss": 0.7076, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.138671875, "rewards/margins": -0.033935546875, "rewards/rejected": -0.10498046875, "step": 112 }, { "epoch": 0.03259772104428098, "grad_norm": 12.791484180303588, "learning_rate": 1.628242074927954e-07, "logits/chosen": 2.78125, "logits/rejected": 2.90625, "logps/chosen": -1640.0, "logps/rejected": -1440.0, "loss": 0.7328, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.15625, "rewards/margins": -0.05859375, "rewards/rejected": -0.09765625, "step": 113 }, { "epoch": 0.03288619645175249, "grad_norm": 13.20804432595701, "learning_rate": 1.6426512968299712e-07, "logits/chosen": 2.875, "logits/rejected": 2.9375, "logps/chosen": -1680.0, "logps/rejected": -1496.0, "loss": 0.719, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.111328125, "rewards/margins": -0.021240234375, "rewards/rejected": -0.08984375, "step": 114 }, { "epoch": 0.033174671859224, "grad_norm": 12.745479685925165, "learning_rate": 1.6570605187319884e-07, "logits/chosen": 2.703125, "logits/rejected": 2.71875, "logps/chosen": -1656.0, "logps/rejected": -1488.0, "loss": 0.6754, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.6875, "rewards/chosen": -0.050048828125, "rewards/margins": 0.0439453125, "rewards/rejected": -0.09375, "step": 115 }, { "epoch": 0.03346314726669551, "grad_norm": 13.451664393092186, "learning_rate": 1.6714697406340057e-07, "logits/chosen": 2.75, "logits/rejected": 2.71875, "logps/chosen": -1488.0, "logps/rejected": -1384.0, "loss": 0.722, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2864.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1923828125, "rewards/margins": -0.07373046875, "rewards/rejected": -0.11865234375, "step": 116 }, { "epoch": 0.03375162267416703, "grad_norm": 12.456018395696045, "learning_rate": 1.685878962536023e-07, "logits/chosen": 2.921875, "logits/rejected": 2.890625, "logps/chosen": -1848.0, "logps/rejected": -1824.0, "loss": 0.6857, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.076171875, "rewards/margins": 0.03369140625, "rewards/rejected": -0.1103515625, "step": 117 }, { "epoch": 0.03404009808163854, "grad_norm": 12.801575501642514, "learning_rate": 1.7002881844380405e-07, "logits/chosen": 2.84375, "logits/rejected": 2.859375, "logps/chosen": -2208.0, "logps/rejected": -2080.0, "loss": 0.6843, "loss/demonstration_loss": -4288.0, "loss/preference_loss": -4288.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.12158203125, "rewards/margins": -0.001251220703125, "rewards/rejected": -0.1201171875, "step": 118 }, { "epoch": 0.03432857348911005, "grad_norm": 17.31704302667602, "learning_rate": 1.7146974063400577e-07, "logits/chosen": 2.890625, "logits/rejected": 2.84375, "logps/chosen": -2048.0, "logps/rejected": -1608.0, "loss": 0.7007, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.111328125, "rewards/margins": -0.03125, "rewards/rejected": -0.080078125, "step": 119 }, { "epoch": 0.03461704889658156, "grad_norm": 14.117864982793359, "learning_rate": 1.729106628242075e-07, "logits/chosen": 2.921875, "logits/rejected": 2.859375, "logps/chosen": -1848.0, "logps/rejected": -1552.0, "loss": 0.7281, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.10498046875, "rewards/margins": -0.00738525390625, "rewards/rejected": -0.09765625, "step": 120 }, { "epoch": 0.03490552430405308, "grad_norm": 10.992825594000015, "learning_rate": 1.7435158501440922e-07, "logits/chosen": 2.84375, "logits/rejected": 2.828125, "logps/chosen": -1448.0, "logps/rejected": -1464.0, "loss": 0.6809, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08154296875, "rewards/margins": -0.0068359375, "rewards/rejected": -0.07421875, "step": 121 }, { "epoch": 0.03519399971152459, "grad_norm": 11.472419374590679, "learning_rate": 1.7579250720461095e-07, "logits/chosen": 2.90625, "logits/rejected": 2.90625, "logps/chosen": -1680.0, "logps/rejected": -2080.0, "loss": 0.667, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.6875, "rewards/chosen": -0.039306640625, "rewards/margins": 0.09423828125, "rewards/rejected": -0.1337890625, "step": 122 }, { "epoch": 0.035482475118996104, "grad_norm": 14.585823976713755, "learning_rate": 1.7723342939481268e-07, "logits/chosen": 2.75, "logits/rejected": 2.75, "logps/chosen": -1848.0, "logps/rejected": -1840.0, "loss": 0.7073, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.150390625, "rewards/margins": -0.030029296875, "rewards/rejected": -0.1201171875, "step": 123 }, { "epoch": 0.035770950526467615, "grad_norm": 13.530686368045174, "learning_rate": 1.786743515850144e-07, "logits/chosen": 3.0, "logits/rejected": 2.96875, "logps/chosen": -1512.0, "logps/rejected": -1536.0, "loss": 0.658, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.06884765625, "rewards/margins": 0.10009765625, "rewards/rejected": -0.1689453125, "step": 124 }, { "epoch": 0.036059425933939133, "grad_norm": 14.11384466587561, "learning_rate": 1.8011527377521613e-07, "logits/chosen": 2.921875, "logits/rejected": 2.90625, "logps/chosen": -1752.0, "logps/rejected": -1784.0, "loss": 0.7164, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1318359375, "rewards/margins": -0.04443359375, "rewards/rejected": -0.08740234375, "step": 125 }, { "epoch": 0.036347901341410645, "grad_norm": 12.345335830360996, "learning_rate": 1.8155619596541785e-07, "logits/chosen": 2.9375, "logits/rejected": 2.90625, "logps/chosen": -1968.0, "logps/rejected": -1872.0, "loss": 0.6993, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1201171875, "rewards/margins": -0.052001953125, "rewards/rejected": -0.068359375, "step": 126 }, { "epoch": 0.036636376748882156, "grad_norm": 15.304159764818719, "learning_rate": 1.8299711815561958e-07, "logits/chosen": 2.765625, "logits/rejected": 2.75, "logps/chosen": -1312.0, "logps/rejected": -1216.0, "loss": 0.7166, "loss/demonstration_loss": -2512.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.10205078125, "rewards/margins": -0.0247802734375, "rewards/rejected": -0.07763671875, "step": 127 }, { "epoch": 0.036924852156353674, "grad_norm": 11.856000837487837, "learning_rate": 1.844380403458213e-07, "logits/chosen": 2.828125, "logits/rejected": 2.875, "logps/chosen": -1264.0, "logps/rejected": -1152.0, "loss": 0.6893, "loss/demonstration_loss": -2400.0, "loss/preference_loss": -2416.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.08056640625, "rewards/margins": -0.0306396484375, "rewards/rejected": -0.050048828125, "step": 128 }, { "epoch": 0.037213327563825185, "grad_norm": 13.671704598170162, "learning_rate": 1.8587896253602306e-07, "logits/chosen": 2.875, "logits/rejected": 2.828125, "logps/chosen": -2144.0, "logps/rejected": -1928.0, "loss": 0.6885, "loss/demonstration_loss": -4048.0, "loss/preference_loss": -4048.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1162109375, "rewards/margins": 0.0498046875, "rewards/rejected": -0.166015625, "step": 129 }, { "epoch": 0.0375018029712967, "grad_norm": 14.264431962101927, "learning_rate": 1.8731988472622478e-07, "logits/chosen": 2.71875, "logits/rejected": 2.8125, "logps/chosen": -1792.0, "logps/rejected": -2040.0, "loss": 0.6912, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.0712890625, "rewards/margins": -0.0037841796875, "rewards/rejected": -0.0673828125, "step": 130 }, { "epoch": 0.03779027837876821, "grad_norm": 12.183805910868063, "learning_rate": 1.887608069164265e-07, "logits/chosen": 2.921875, "logits/rejected": 2.953125, "logps/chosen": -1632.0, "logps/rejected": -1560.0, "loss": 0.6959, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.11865234375, "rewards/margins": -0.0625, "rewards/rejected": -0.056396484375, "step": 131 }, { "epoch": 0.038078753786239726, "grad_norm": 12.949687293287298, "learning_rate": 1.9020172910662823e-07, "logits/chosen": 2.84375, "logits/rejected": 2.9375, "logps/chosen": -2000.0, "logps/rejected": -1808.0, "loss": 0.71, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.11865234375, "rewards/margins": -0.04443359375, "rewards/rejected": -0.07421875, "step": 132 }, { "epoch": 0.03836722919371124, "grad_norm": 10.92216606166572, "learning_rate": 1.9164265129682996e-07, "logits/chosen": 3.0625, "logits/rejected": 3.015625, "logps/chosen": -1384.0, "logps/rejected": -1424.0, "loss": 0.6804, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.058837890625, "rewards/margins": 0.0283203125, "rewards/rejected": -0.0869140625, "step": 133 }, { "epoch": 0.03865570460118275, "grad_norm": 14.268022353633459, "learning_rate": 1.9308357348703169e-07, "logits/chosen": 2.734375, "logits/rejected": 2.8125, "logps/chosen": -1576.0, "logps/rejected": -1424.0, "loss": 0.6696, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02880859375, "rewards/margins": 0.07373046875, "rewards/rejected": -0.1025390625, "step": 134 }, { "epoch": 0.03894418000865426, "grad_norm": 12.639083223763501, "learning_rate": 1.945244956772334e-07, "logits/chosen": 2.78125, "logits/rejected": 2.84375, "logps/chosen": -1920.0, "logps/rejected": -1712.0, "loss": 0.7136, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1357421875, "rewards/margins": -0.05078125, "rewards/rejected": -0.0849609375, "step": 135 }, { "epoch": 0.03923265541612578, "grad_norm": 11.257423131412404, "learning_rate": 1.9596541786743514e-07, "logits/chosen": 2.796875, "logits/rejected": 2.859375, "logps/chosen": -2080.0, "logps/rejected": -1984.0, "loss": 0.6952, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4048.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1640625, "rewards/margins": -0.021240234375, "rewards/rejected": -0.142578125, "step": 136 }, { "epoch": 0.03952113082359729, "grad_norm": 14.19130633312644, "learning_rate": 1.9740634005763686e-07, "logits/chosen": 2.6875, "logits/rejected": 2.796875, "logps/chosen": -1640.0, "logps/rejected": -1408.0, "loss": 0.7228, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.13671875, "rewards/margins": -0.10693359375, "rewards/rejected": -0.0303955078125, "step": 137 }, { "epoch": 0.0398096062310688, "grad_norm": 12.951420099355532, "learning_rate": 1.988472622478386e-07, "logits/chosen": 2.75, "logits/rejected": 2.796875, "logps/chosen": -1824.0, "logps/rejected": -1648.0, "loss": 0.7073, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.1259765625, "rewards/margins": -0.03369140625, "rewards/rejected": -0.0927734375, "step": 138 }, { "epoch": 0.04009808163854031, "grad_norm": 12.327022694242821, "learning_rate": 2.0028818443804031e-07, "logits/chosen": 3.0625, "logits/rejected": 3.046875, "logps/chosen": -1776.0, "logps/rejected": -1712.0, "loss": 0.7034, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08056640625, "rewards/margins": 0.0025177001953125, "rewards/rejected": -0.0830078125, "step": 139 }, { "epoch": 0.04038655704601183, "grad_norm": 14.121671421167523, "learning_rate": 2.0172910662824207e-07, "logits/chosen": 3.0, "logits/rejected": 2.953125, "logps/chosen": -1600.0, "logps/rejected": -1616.0, "loss": 0.7064, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.13671875, "rewards/margins": 0.02197265625, "rewards/rejected": -0.1591796875, "step": 140 }, { "epoch": 0.04067503245348334, "grad_norm": 18.92552737036175, "learning_rate": 2.031700288184438e-07, "logits/chosen": 2.71875, "logits/rejected": 2.65625, "logps/chosen": -1616.0, "logps/rejected": -1304.0, "loss": 0.7136, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1474609375, "rewards/margins": -0.1015625, "rewards/rejected": -0.04638671875, "step": 141 }, { "epoch": 0.04096350786095485, "grad_norm": 13.699282288775105, "learning_rate": 2.0461095100864555e-07, "logits/chosen": 2.921875, "logits/rejected": 3.0, "logps/chosen": -1936.0, "logps/rejected": -1912.0, "loss": 0.6901, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.09521484375, "rewards/margins": -0.00445556640625, "rewards/rejected": -0.0908203125, "step": 142 }, { "epoch": 0.041251983268426363, "grad_norm": 14.837494800798574, "learning_rate": 2.0605187319884727e-07, "logits/chosen": 2.96875, "logits/rejected": 2.875, "logps/chosen": -1936.0, "logps/rejected": -1928.0, "loss": 0.6838, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.146484375, "rewards/margins": 0.015869140625, "rewards/rejected": -0.162109375, "step": 143 }, { "epoch": 0.04154045867589788, "grad_norm": 13.555400417587519, "learning_rate": 2.07492795389049e-07, "logits/chosen": 2.96875, "logits/rejected": 2.9375, "logps/chosen": -1312.0, "logps/rejected": -1080.0, "loss": 0.6891, "loss/demonstration_loss": -2384.0, "loss/preference_loss": -2384.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.123046875, "rewards/margins": -0.0458984375, "rewards/rejected": -0.0771484375, "step": 144 }, { "epoch": 0.04182893408336939, "grad_norm": 14.267474595802195, "learning_rate": 2.0893371757925072e-07, "logits/chosen": 2.546875, "logits/rejected": 2.578125, "logps/chosen": -1408.0, "logps/rejected": -1592.0, "loss": 0.717, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1748046875, "rewards/margins": -0.032958984375, "rewards/rejected": -0.1416015625, "step": 145 }, { "epoch": 0.042117409490840904, "grad_norm": 15.505696769909273, "learning_rate": 2.1037463976945245e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1760.0, "logps/rejected": -1728.0, "loss": 0.7064, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.09375, "rewards/margins": -0.03369140625, "rewards/rejected": -0.06005859375, "step": 146 }, { "epoch": 0.04240588489831242, "grad_norm": 11.35862329173575, "learning_rate": 2.1181556195965417e-07, "logits/chosen": 2.75, "logits/rejected": 2.765625, "logps/chosen": -1680.0, "logps/rejected": -1672.0, "loss": 0.6884, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08203125, "rewards/margins": -0.018798828125, "rewards/rejected": -0.06298828125, "step": 147 }, { "epoch": 0.042694360305783934, "grad_norm": 13.796716005612092, "learning_rate": 2.132564841498559e-07, "logits/chosen": 2.96875, "logits/rejected": 2.84375, "logps/chosen": -1704.0, "logps/rejected": -1608.0, "loss": 0.7004, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.08056640625, "rewards/margins": 0.0087890625, "rewards/rejected": -0.08935546875, "step": 148 }, { "epoch": 0.042982835713255445, "grad_norm": 12.974192271382416, "learning_rate": 2.1469740634005763e-07, "logits/chosen": 2.6875, "logits/rejected": 2.703125, "logps/chosen": -1840.0, "logps/rejected": -1976.0, "loss": 0.71, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.095703125, "rewards/margins": 0.005645751953125, "rewards/rejected": -0.1015625, "step": 149 }, { "epoch": 0.043271311120726956, "grad_norm": 12.81111710173094, "learning_rate": 2.1613832853025935e-07, "logits/chosen": 2.796875, "logits/rejected": 2.703125, "logps/chosen": -1800.0, "logps/rejected": -1976.0, "loss": 0.7019, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.125, "rewards/margins": -0.0093994140625, "rewards/rejected": -0.11572265625, "step": 150 }, { "epoch": 0.043559786528198474, "grad_norm": 13.444000972153871, "learning_rate": 2.1757925072046108e-07, "logits/chosen": 2.84375, "logits/rejected": 2.796875, "logps/chosen": -1464.0, "logps/rejected": -1568.0, "loss": 0.6994, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.146484375, "rewards/margins": -0.0159912109375, "rewards/rejected": -0.130859375, "step": 151 }, { "epoch": 0.043848261935669985, "grad_norm": 12.898639566580528, "learning_rate": 2.1902017291066283e-07, "logits/chosen": 2.765625, "logits/rejected": 2.78125, "logps/chosen": -1520.0, "logps/rejected": -1432.0, "loss": 0.6987, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.162109375, "rewards/margins": 0.0272216796875, "rewards/rejected": -0.189453125, "step": 152 }, { "epoch": 0.0441367373431415, "grad_norm": 11.630726904366368, "learning_rate": 2.2046109510086456e-07, "logits/chosen": 2.875, "logits/rejected": 2.84375, "logps/chosen": -1720.0, "logps/rejected": -1800.0, "loss": 0.6931, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09326171875, "rewards/margins": 0.0419921875, "rewards/rejected": -0.134765625, "step": 153 }, { "epoch": 0.04442521275061301, "grad_norm": 11.169770201898757, "learning_rate": 2.2190201729106628e-07, "logits/chosen": 2.84375, "logits/rejected": 2.78125, "logps/chosen": -1704.0, "logps/rejected": -1856.0, "loss": 0.6864, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.125, "rewards/margins": -0.017578125, "rewards/rejected": -0.107421875, "step": 154 }, { "epoch": 0.044713688158084526, "grad_norm": 13.455924614723772, "learning_rate": 2.23342939481268e-07, "logits/chosen": 2.765625, "logits/rejected": 2.8125, "logps/chosen": -1568.0, "logps/rejected": -1440.0, "loss": 0.7084, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.12158203125, "rewards/margins": -0.032470703125, "rewards/rejected": -0.0888671875, "step": 155 }, { "epoch": 0.04500216356555604, "grad_norm": 12.553915789493537, "learning_rate": 2.2478386167146973e-07, "logits/chosen": 2.78125, "logits/rejected": 2.8125, "logps/chosen": -2024.0, "logps/rejected": -1904.0, "loss": 0.7038, "loss/demonstration_loss": -3920.0, "loss/preference_loss": -3920.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1630859375, "rewards/margins": -0.09375, "rewards/rejected": -0.06884765625, "step": 156 }, { "epoch": 0.04529063897302755, "grad_norm": 13.478313307695142, "learning_rate": 2.2622478386167146e-07, "logits/chosen": 2.8125, "logits/rejected": 2.8125, "logps/chosen": -2040.0, "logps/rejected": -1952.0, "loss": 0.6841, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3984.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1025390625, "rewards/margins": 0.03759765625, "rewards/rejected": -0.140625, "step": 157 }, { "epoch": 0.04557911438049906, "grad_norm": 16.691461931401555, "learning_rate": 2.2766570605187319e-07, "logits/chosen": 2.78125, "logits/rejected": 2.859375, "logps/chosen": -1960.0, "logps/rejected": -1752.0, "loss": 0.7239, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.083984375, "rewards/margins": -0.01611328125, "rewards/rejected": -0.0673828125, "step": 158 }, { "epoch": 0.04586758978797058, "grad_norm": 11.385581956074466, "learning_rate": 2.291066282420749e-07, "logits/chosen": 2.859375, "logits/rejected": 2.921875, "logps/chosen": -1792.0, "logps/rejected": -1704.0, "loss": 0.6858, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0576171875, "rewards/margins": 0.027099609375, "rewards/rejected": -0.08447265625, "step": 159 }, { "epoch": 0.04615606519544209, "grad_norm": 15.23127200272958, "learning_rate": 2.3054755043227664e-07, "logits/chosen": 2.859375, "logits/rejected": 2.953125, "logps/chosen": -2144.0, "logps/rejected": -1640.0, "loss": 0.7311, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1357421875, "rewards/margins": -0.0791015625, "rewards/rejected": -0.056640625, "step": 160 }, { "epoch": 0.0464445406029136, "grad_norm": 11.63659336058798, "learning_rate": 2.3198847262247836e-07, "logits/chosen": 2.953125, "logits/rejected": 2.921875, "logps/chosen": -1504.0, "logps/rejected": -1392.0, "loss": 0.7076, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.0888671875, "rewards/margins": -0.0262451171875, "rewards/rejected": -0.0625, "step": 161 }, { "epoch": 0.04673301601038511, "grad_norm": 11.850391571770666, "learning_rate": 2.334293948126801e-07, "logits/chosen": 2.78125, "logits/rejected": 2.71875, "logps/chosen": -1552.0, "logps/rejected": -1680.0, "loss": 0.6801, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.625, "rewards/chosen": -0.07177734375, "rewards/margins": 0.05322265625, "rewards/rejected": -0.125, "step": 162 }, { "epoch": 0.04702149141785663, "grad_norm": 22.090105508211398, "learning_rate": 2.3487031700288184e-07, "logits/chosen": 2.90625, "logits/rejected": 2.890625, "logps/chosen": -1920.0, "logps/rejected": -1832.0, "loss": 0.7281, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1376953125, "rewards/margins": 0.0019989013671875, "rewards/rejected": -0.1396484375, "step": 163 }, { "epoch": 0.04730996682532814, "grad_norm": 13.361887958756412, "learning_rate": 2.3631123919308357e-07, "logits/chosen": 2.890625, "logits/rejected": 2.859375, "logps/chosen": -1456.0, "logps/rejected": -1464.0, "loss": 0.6953, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.115234375, "rewards/margins": -0.0137939453125, "rewards/rejected": -0.1015625, "step": 164 }, { "epoch": 0.04759844223279965, "grad_norm": 12.742986548037846, "learning_rate": 2.377521613832853e-07, "logits/chosen": 2.9375, "logits/rejected": 2.953125, "logps/chosen": -1824.0, "logps/rejected": -1928.0, "loss": 0.7018, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1162109375, "rewards/margins": -0.03564453125, "rewards/rejected": -0.08056640625, "step": 165 }, { "epoch": 0.047886917640271164, "grad_norm": 14.605682008275203, "learning_rate": 2.39193083573487e-07, "logits/chosen": 2.75, "logits/rejected": 2.859375, "logps/chosen": -1848.0, "logps/rejected": -1568.0, "loss": 0.7106, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.12255859375, "rewards/margins": -0.0673828125, "rewards/rejected": -0.05517578125, "step": 166 }, { "epoch": 0.04817539304774268, "grad_norm": 12.533637459912104, "learning_rate": 2.4063400576368874e-07, "logits/chosen": 2.65625, "logits/rejected": 2.59375, "logps/chosen": -2008.0, "logps/rejected": -2048.0, "loss": 0.7023, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4048.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1064453125, "rewards/margins": -0.023193359375, "rewards/rejected": -0.0830078125, "step": 167 }, { "epoch": 0.04846386845521419, "grad_norm": 13.271104990641957, "learning_rate": 2.4207492795389047e-07, "logits/chosen": 2.9375, "logits/rejected": 2.84375, "logps/chosen": -1360.0, "logps/rejected": -1552.0, "loss": 0.6725, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08203125, "rewards/margins": 0.0281982421875, "rewards/rejected": -0.10986328125, "step": 168 }, { "epoch": 0.048752343862685704, "grad_norm": 15.575067536250037, "learning_rate": 2.435158501440922e-07, "logits/chosen": 2.875, "logits/rejected": 2.84375, "logps/chosen": -1760.0, "logps/rejected": -2000.0, "loss": 0.7154, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.1376953125, "rewards/margins": -0.1123046875, "rewards/rejected": -0.02490234375, "step": 169 }, { "epoch": 0.04904081927015722, "grad_norm": 12.850108851026656, "learning_rate": 2.449567723342939e-07, "logits/chosen": 2.828125, "logits/rejected": 2.796875, "logps/chosen": -1616.0, "logps/rejected": -1648.0, "loss": 0.6802, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12890625, "rewards/margins": 0.021240234375, "rewards/rejected": -0.150390625, "step": 170 }, { "epoch": 0.049329294677628734, "grad_norm": 12.890074709688625, "learning_rate": 2.4639769452449565e-07, "logits/chosen": 2.84375, "logits/rejected": 2.8125, "logps/chosen": -1632.0, "logps/rejected": -1664.0, "loss": 0.6982, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.056884765625, "rewards/margins": 0.0294189453125, "rewards/rejected": -0.08642578125, "step": 171 }, { "epoch": 0.049617770085100245, "grad_norm": 13.723740966557617, "learning_rate": 2.4783861671469737e-07, "logits/chosen": 2.765625, "logits/rejected": 2.859375, "logps/chosen": -1936.0, "logps/rejected": -1704.0, "loss": 0.6964, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.1064453125, "rewards/margins": -0.056396484375, "rewards/rejected": -0.050048828125, "step": 172 }, { "epoch": 0.049906245492571756, "grad_norm": 13.6595813240613, "learning_rate": 2.492795389048991e-07, "logits/chosen": 2.859375, "logits/rejected": 2.890625, "logps/chosen": -1768.0, "logps/rejected": -1696.0, "loss": 0.7081, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.173828125, "rewards/margins": -0.0380859375, "rewards/rejected": -0.1357421875, "step": 173 }, { "epoch": 0.050194720900043274, "grad_norm": 15.6352438826303, "learning_rate": 2.507204610951009e-07, "logits/chosen": 2.90625, "logits/rejected": 2.9375, "logps/chosen": -2048.0, "logps/rejected": -1792.0, "loss": 0.7213, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.1552734375, "rewards/margins": -0.10302734375, "rewards/rejected": -0.052001953125, "step": 174 }, { "epoch": 0.050483196307514785, "grad_norm": 15.61004338321909, "learning_rate": 2.5216138328530255e-07, "logits/chosen": 2.75, "logits/rejected": 2.71875, "logps/chosen": -2040.0, "logps/rejected": -2000.0, "loss": 0.7123, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4032.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.1201171875, "rewards/margins": -0.01385498046875, "rewards/rejected": -0.1064453125, "step": 175 }, { "epoch": 0.0507716717149863, "grad_norm": 11.551037242327421, "learning_rate": 2.5360230547550433e-07, "logits/chosen": 2.953125, "logits/rejected": 2.875, "logps/chosen": -1024.0, "logps/rejected": -1200.0, "loss": 0.6741, "loss/demonstration_loss": -2224.0, "loss/preference_loss": -2224.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.0018463134765625, "rewards/margins": 0.06201171875, "rewards/rejected": -0.06396484375, "step": 176 }, { "epoch": 0.05106014712245781, "grad_norm": 12.946875161977205, "learning_rate": 2.55043227665706e-07, "logits/chosen": 2.953125, "logits/rejected": 2.953125, "logps/chosen": -1592.0, "logps/rejected": -1520.0, "loss": 0.6958, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.5625, "rewards/chosen": -0.054931640625, "rewards/margins": -0.0037689208984375, "rewards/rejected": -0.05126953125, "step": 177 }, { "epoch": 0.051348622529929326, "grad_norm": 16.215660861493703, "learning_rate": 2.564841498559078e-07, "logits/chosen": 2.71875, "logits/rejected": 2.734375, "logps/chosen": -1920.0, "logps/rejected": -2024.0, "loss": 0.6995, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.064453125, "rewards/margins": 0.07958984375, "rewards/rejected": -0.1435546875, "step": 178 }, { "epoch": 0.05163709793740084, "grad_norm": 14.963728613493695, "learning_rate": 2.5792507204610945e-07, "logits/chosen": 2.875, "logits/rejected": 2.921875, "logps/chosen": -1632.0, "logps/rejected": -1624.0, "loss": 0.7036, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.0625, "rewards/chosen": -0.1435546875, "rewards/margins": -0.09521484375, "rewards/rejected": -0.048828125, "step": 179 }, { "epoch": 0.05192557334487235, "grad_norm": 16.03304682815614, "learning_rate": 2.5936599423631123e-07, "logits/chosen": 2.9375, "logits/rejected": 2.90625, "logps/chosen": -1720.0, "logps/rejected": -1696.0, "loss": 0.704, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.08642578125, "rewards/margins": -0.0006866455078125, "rewards/rejected": -0.0859375, "step": 180 }, { "epoch": 0.05221404875234386, "grad_norm": 12.80502164549832, "learning_rate": 2.6080691642651296e-07, "logits/chosen": 2.890625, "logits/rejected": 2.9375, "logps/chosen": -1904.0, "logps/rejected": -1680.0, "loss": 0.6893, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.041259765625, "rewards/margins": 0.002197265625, "rewards/rejected": -0.04345703125, "step": 181 }, { "epoch": 0.05250252415981538, "grad_norm": 14.76302232613939, "learning_rate": 2.622478386167147e-07, "logits/chosen": 2.875, "logits/rejected": 2.75, "logps/chosen": -1672.0, "logps/rejected": -1576.0, "loss": 0.6946, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0712890625, "rewards/margins": 0.004913330078125, "rewards/rejected": -0.076171875, "step": 182 }, { "epoch": 0.05279099956728689, "grad_norm": 17.701462545711838, "learning_rate": 2.636887608069164e-07, "logits/chosen": 2.8125, "logits/rejected": 2.90625, "logps/chosen": -1856.0, "logps/rejected": -1760.0, "loss": 0.7339, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.189453125, "rewards/margins": -0.11279296875, "rewards/rejected": -0.0771484375, "step": 183 }, { "epoch": 0.0530794749747584, "grad_norm": 14.62512152000554, "learning_rate": 2.6512968299711814e-07, "logits/chosen": 2.71875, "logits/rejected": 2.859375, "logps/chosen": -1904.0, "logps/rejected": -1560.0, "loss": 0.7172, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.09619140625, "rewards/margins": -0.06689453125, "rewards/rejected": -0.0294189453125, "step": 184 }, { "epoch": 0.05336795038222991, "grad_norm": 14.812223557858013, "learning_rate": 2.6657060518731986e-07, "logits/chosen": 2.765625, "logits/rejected": 2.828125, "logps/chosen": -1936.0, "logps/rejected": -1528.0, "loss": 0.7019, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.03125, "rewards/margins": 0.00439453125, "rewards/rejected": -0.03564453125, "step": 185 }, { "epoch": 0.05365642578970143, "grad_norm": 14.926754068736647, "learning_rate": 2.680115273775216e-07, "logits/chosen": 2.828125, "logits/rejected": 2.9375, "logps/chosen": -2008.0, "logps/rejected": -1864.0, "loss": 0.7031, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.11865234375, "rewards/margins": -0.0150146484375, "rewards/rejected": -0.10400390625, "step": 186 }, { "epoch": 0.05394490119717294, "grad_norm": 12.996290251553956, "learning_rate": 2.694524495677233e-07, "logits/chosen": 2.84375, "logits/rejected": 2.84375, "logps/chosen": -1960.0, "logps/rejected": -1848.0, "loss": 0.6931, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03759765625, "rewards/margins": 0.010009765625, "rewards/rejected": -0.047607421875, "step": 187 }, { "epoch": 0.05423337660464445, "grad_norm": 15.284209995890105, "learning_rate": 2.7089337175792504e-07, "logits/chosen": 2.671875, "logits/rejected": 2.75, "logps/chosen": -1840.0, "logps/rejected": -1576.0, "loss": 0.7025, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.07275390625, "rewards/margins": -0.0150146484375, "rewards/rejected": -0.0576171875, "step": 188 }, { "epoch": 0.05452185201211597, "grad_norm": 12.316054202172852, "learning_rate": 2.7233429394812677e-07, "logits/chosen": 2.875, "logits/rejected": 2.828125, "logps/chosen": -1416.0, "logps/rejected": -1472.0, "loss": 0.7148, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.091796875, "rewards/margins": -0.06884765625, "rewards/rejected": -0.023193359375, "step": 189 }, { "epoch": 0.05481032741958748, "grad_norm": 13.905673621606834, "learning_rate": 2.737752161383285e-07, "logits/chosen": 2.90625, "logits/rejected": 2.828125, "logps/chosen": -1904.0, "logps/rejected": -1784.0, "loss": 0.7106, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.076171875, "rewards/margins": -0.017578125, "rewards/rejected": -0.058837890625, "step": 190 }, { "epoch": 0.05509880282705899, "grad_norm": 14.504704138091942, "learning_rate": 2.7521613832853027e-07, "logits/chosen": 2.734375, "logits/rejected": 2.765625, "logps/chosen": -2024.0, "logps/rejected": -2008.0, "loss": 0.6959, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4032.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08251953125, "rewards/margins": 0.0400390625, "rewards/rejected": -0.12255859375, "step": 191 }, { "epoch": 0.055387278234530504, "grad_norm": 14.96693664872217, "learning_rate": 2.76657060518732e-07, "logits/chosen": 2.84375, "logits/rejected": 2.859375, "logps/chosen": -2192.0, "logps/rejected": -2008.0, "loss": 0.7009, "loss/demonstration_loss": -4192.0, "loss/preference_loss": -4192.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.0849609375, "rewards/margins": -0.0250244140625, "rewards/rejected": -0.06005859375, "step": 192 }, { "epoch": 0.05567575364200202, "grad_norm": 14.123079865036004, "learning_rate": 2.780979827089337e-07, "logits/chosen": 2.859375, "logits/rejected": 2.984375, "logps/chosen": -2224.0, "logps/rejected": -1808.0, "loss": 0.6887, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4032.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.030029296875, "rewards/margins": 0.023193359375, "rewards/rejected": -0.05322265625, "step": 193 }, { "epoch": 0.055964229049473534, "grad_norm": 13.516746625425505, "learning_rate": 2.7953890489913545e-07, "logits/chosen": 2.796875, "logits/rejected": 2.796875, "logps/chosen": -1976.0, "logps/rejected": -1712.0, "loss": 0.6777, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.04443359375, "rewards/margins": 0.00689697265625, "rewards/rejected": -0.05126953125, "step": 194 }, { "epoch": 0.056252704456945045, "grad_norm": 13.444320240932006, "learning_rate": 2.809798270893372e-07, "logits/chosen": 2.8125, "logits/rejected": 2.859375, "logps/chosen": -1896.0, "logps/rejected": -1872.0, "loss": 0.6724, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00750732421875, "rewards/margins": 0.06201171875, "rewards/rejected": -0.054443359375, "step": 195 }, { "epoch": 0.056541179864416556, "grad_norm": 12.252323492786209, "learning_rate": 2.824207492795389e-07, "logits/chosen": 2.953125, "logits/rejected": 2.984375, "logps/chosen": -1824.0, "logps/rejected": -1752.0, "loss": 0.6705, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.03369140625, "rewards/margins": 0.038818359375, "rewards/rejected": -0.07275390625, "step": 196 }, { "epoch": 0.056829655271888074, "grad_norm": 12.876626846589938, "learning_rate": 2.838616714697406e-07, "logits/chosen": 2.828125, "logits/rejected": 2.8125, "logps/chosen": -1952.0, "logps/rejected": -1864.0, "loss": 0.6958, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0576171875, "rewards/margins": -0.00689697265625, "rewards/rejected": -0.05078125, "step": 197 }, { "epoch": 0.057118130679359586, "grad_norm": 15.372487534461387, "learning_rate": 2.8530259365994235e-07, "logits/chosen": 2.890625, "logits/rejected": 2.890625, "logps/chosen": -2128.0, "logps/rejected": -2032.0, "loss": 0.6931, "loss/demonstration_loss": -4160.0, "loss/preference_loss": -4160.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.0211181640625, "rewards/margins": 0.12353515625, "rewards/rejected": -0.1025390625, "step": 198 }, { "epoch": 0.0574066060868311, "grad_norm": 13.89838825564844, "learning_rate": 2.867435158501441e-07, "logits/chosen": 2.984375, "logits/rejected": 3.03125, "logps/chosen": -1600.0, "logps/rejected": -1432.0, "loss": 0.694, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.3125, "rewards/chosen": -0.061279296875, "rewards/margins": -0.02001953125, "rewards/rejected": -0.041259765625, "step": 199 }, { "epoch": 0.05769508149430261, "grad_norm": 12.2390300861769, "learning_rate": 2.881844380403458e-07, "logits/chosen": 2.96875, "logits/rejected": 2.984375, "logps/chosen": -1824.0, "logps/rejected": -1768.0, "loss": 0.7135, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.04931640625, "rewards/margins": -0.046875, "rewards/rejected": -0.00250244140625, "step": 200 }, { "epoch": 0.057983556901774126, "grad_norm": 14.418346101592789, "learning_rate": 2.8962536023054753e-07, "logits/chosen": 2.953125, "logits/rejected": 3.015625, "logps/chosen": -1800.0, "logps/rejected": -1712.0, "loss": 0.6849, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.030029296875, "rewards/margins": 0.027587890625, "rewards/rejected": -0.0576171875, "step": 201 }, { "epoch": 0.05827203230924564, "grad_norm": 13.16540704902075, "learning_rate": 2.9106628242074925e-07, "logits/chosen": 2.8125, "logits/rejected": 2.78125, "logps/chosen": -1688.0, "logps/rejected": -1840.0, "loss": 0.6862, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.023193359375, "rewards/margins": 0.0556640625, "rewards/rejected": -0.07861328125, "step": 202 }, { "epoch": 0.05856050771671715, "grad_norm": 11.983946095637638, "learning_rate": 2.9250720461095103e-07, "logits/chosen": 3.046875, "logits/rejected": 3.046875, "logps/chosen": -1888.0, "logps/rejected": -1776.0, "loss": 0.6957, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.03125, "rewards/margins": -0.010009765625, "rewards/rejected": -0.021240234375, "step": 203 }, { "epoch": 0.05884898312418866, "grad_norm": 11.227781823086811, "learning_rate": 2.939481268011527e-07, "logits/chosen": 2.859375, "logits/rejected": 2.84375, "logps/chosen": -1648.0, "logps/rejected": -1640.0, "loss": 0.7103, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.0162353515625, "rewards/margins": -0.01251220703125, "rewards/rejected": 0.02880859375, "step": 204 }, { "epoch": 0.05913745853166018, "grad_norm": 14.438945820821774, "learning_rate": 2.953890489913545e-07, "logits/chosen": 2.6875, "logits/rejected": 2.671875, "logps/chosen": -1472.0, "logps/rejected": -1632.0, "loss": 0.6932, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.023193359375, "rewards/margins": 0.027587890625, "rewards/rejected": -0.05078125, "step": 205 }, { "epoch": 0.05942593393913169, "grad_norm": 11.6561126649981, "learning_rate": 2.9682997118155616e-07, "logits/chosen": 2.96875, "logits/rejected": 3.0, "logps/chosen": -1640.0, "logps/rejected": -1680.0, "loss": 0.7043, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.0087890625, "rewards/margins": 0.01123046875, "rewards/rejected": -0.02001953125, "step": 206 }, { "epoch": 0.0597144093466032, "grad_norm": 13.802370085354822, "learning_rate": 2.9827089337175794e-07, "logits/chosen": 2.890625, "logits/rejected": 2.890625, "logps/chosen": -1896.0, "logps/rejected": -2000.0, "loss": 0.6804, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0224609375, "rewards/margins": 0.04248046875, "rewards/rejected": -0.02001953125, "step": 207 }, { "epoch": 0.06000288475407471, "grad_norm": 13.72589295749893, "learning_rate": 2.997118155619596e-07, "logits/chosen": 2.9375, "logits/rejected": 2.984375, "logps/chosen": -1872.0, "logps/rejected": -1752.0, "loss": 0.7146, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0250244140625, "rewards/margins": -0.050048828125, "rewards/rejected": 0.0250244140625, "step": 208 }, { "epoch": 0.06029136016154623, "grad_norm": 12.338817262211686, "learning_rate": 3.011527377521614e-07, "logits/chosen": 2.828125, "logits/rejected": 2.828125, "logps/chosen": -2208.0, "logps/rejected": -2256.0, "loss": 0.6925, "loss/demonstration_loss": -4480.0, "loss/preference_loss": -4480.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.003753662109375, "rewards/margins": -0.0224609375, "rewards/rejected": 0.0262451171875, "step": 209 }, { "epoch": 0.06057983556901774, "grad_norm": 11.614647514316792, "learning_rate": 3.0259365994236306e-07, "logits/chosen": 2.875, "logits/rejected": 2.96875, "logps/chosen": -1656.0, "logps/rejected": -1640.0, "loss": 0.6911, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.01251220703125, "rewards/margins": -0.0137939453125, "rewards/rejected": 0.001251220703125, "step": 210 }, { "epoch": 0.06086831097648925, "grad_norm": 11.407490945392553, "learning_rate": 3.0403458213256484e-07, "logits/chosen": 2.984375, "logits/rejected": 2.90625, "logps/chosen": -1512.0, "logps/rejected": -1600.0, "loss": 0.6812, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.0238037109375, "rewards/margins": 0.0174560546875, "rewards/rejected": 0.006256103515625, "step": 211 }, { "epoch": 0.06115678638396077, "grad_norm": 12.904669304061635, "learning_rate": 3.054755043227665e-07, "logits/chosen": 3.0625, "logits/rejected": 2.984375, "logps/chosen": -1704.0, "logps/rejected": -1736.0, "loss": 0.701, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.05517578125, "rewards/margins": 0.017578125, "rewards/rejected": 0.03759765625, "step": 212 }, { "epoch": 0.06144526179143228, "grad_norm": 11.831078067632724, "learning_rate": 3.069164265129683e-07, "logits/chosen": 2.9375, "logits/rejected": 2.828125, "logps/chosen": -2096.0, "logps/rejected": -2112.0, "loss": 0.688, "loss/demonstration_loss": -4224.0, "loss/preference_loss": -4224.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.044921875, "rewards/margins": 0.01123046875, "rewards/rejected": 0.03369140625, "step": 213 }, { "epoch": 0.06173373719890379, "grad_norm": 12.606126784777697, "learning_rate": 3.0835734870317e-07, "logits/chosen": 3.015625, "logits/rejected": 3.0, "logps/chosen": -1488.0, "logps/rejected": -1352.0, "loss": 0.7183, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.0224609375, "rewards/margins": 0.016845703125, "rewards/rejected": 0.005615234375, "step": 214 }, { "epoch": 0.062022212606375304, "grad_norm": 10.772112954532906, "learning_rate": 3.0979827089337174e-07, "logits/chosen": 2.96875, "logits/rejected": 3.0, "logps/chosen": -1520.0, "logps/rejected": -1600.0, "loss": 0.7125, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.1875, "rewards/chosen": -0.03564453125, "rewards/margins": -0.0830078125, "rewards/rejected": 0.047607421875, "step": 215 }, { "epoch": 0.06231068801384682, "grad_norm": 11.465071264758075, "learning_rate": 3.1123919308357347e-07, "logits/chosen": 2.9375, "logits/rejected": 2.90625, "logps/chosen": -1776.0, "logps/rejected": -1872.0, "loss": 0.6883, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.05078125, "rewards/margins": 0.0050048828125, "rewards/rejected": 0.045654296875, "step": 216 }, { "epoch": 0.06259916342131833, "grad_norm": 14.407193027959861, "learning_rate": 3.126801152737752e-07, "logits/chosen": 2.96875, "logits/rejected": 2.953125, "logps/chosen": -1824.0, "logps/rejected": -1768.0, "loss": 0.6899, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.04248046875, "rewards/margins": -0.0087890625, "rewards/rejected": -0.03369140625, "step": 217 }, { "epoch": 0.06288763882878985, "grad_norm": 13.18961999723925, "learning_rate": 3.141210374639769e-07, "logits/chosen": 2.890625, "logits/rejected": 2.90625, "logps/chosen": -1840.0, "logps/rejected": -1384.0, "loss": 0.6797, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0576171875, "rewards/margins": 0.080078125, "rewards/rejected": -0.0224609375, "step": 218 }, { "epoch": 0.06317611423626136, "grad_norm": 13.126494834514967, "learning_rate": 3.1556195965417865e-07, "logits/chosen": 2.875, "logits/rejected": 2.984375, "logps/chosen": -1664.0, "logps/rejected": -1376.0, "loss": 0.7163, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.0093994140625, "rewards/margins": 0.0037384033203125, "rewards/rejected": 0.005615234375, "step": 219 }, { "epoch": 0.06346458964373287, "grad_norm": 14.081995596872781, "learning_rate": 3.1700288184438037e-07, "logits/chosen": 2.78125, "logits/rejected": 2.828125, "logps/chosen": -1888.0, "logps/rejected": -1512.0, "loss": 0.7113, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.038818359375, "rewards/margins": -0.0030975341796875, "rewards/rejected": -0.03564453125, "step": 220 }, { "epoch": 0.06375306505120439, "grad_norm": 12.043954587426272, "learning_rate": 3.184438040345821e-07, "logits/chosen": 2.921875, "logits/rejected": 2.9375, "logps/chosen": -1728.0, "logps/rejected": -1792.0, "loss": 0.6764, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.080078125, "rewards/margins": 0.04931640625, "rewards/rejected": 0.0306396484375, "step": 221 }, { "epoch": 0.0640415404586759, "grad_norm": 13.958981536855855, "learning_rate": 3.198847262247839e-07, "logits/chosen": 2.984375, "logits/rejected": 2.9375, "logps/chosen": -1456.0, "logps/rejected": -1496.0, "loss": 0.693, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.083984375, "rewards/margins": 0.044921875, "rewards/rejected": 0.038818359375, "step": 222 }, { "epoch": 0.06433001586614741, "grad_norm": 12.952665203623601, "learning_rate": 3.2132564841498555e-07, "logits/chosen": 2.859375, "logits/rejected": 2.84375, "logps/chosen": -1432.0, "logps/rejected": -1664.0, "loss": 0.7032, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.0125732421875, "rewards/margins": -0.0106201171875, "rewards/rejected": 0.023193359375, "step": 223 }, { "epoch": 0.06461849127361892, "grad_norm": 10.659089739422535, "learning_rate": 3.2276657060518733e-07, "logits/chosen": 3.046875, "logits/rejected": 2.953125, "logps/chosen": -1432.0, "logps/rejected": -1480.0, "loss": 0.6884, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.05517578125, "rewards/margins": 0.025634765625, "rewards/rejected": 0.0294189453125, "step": 224 }, { "epoch": 0.06490696668109043, "grad_norm": 14.239858913396796, "learning_rate": 3.2420749279538905e-07, "logits/chosen": 2.9375, "logits/rejected": 2.953125, "logps/chosen": -1840.0, "logps/rejected": -1784.0, "loss": 0.72, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0238037109375, "rewards/margins": -0.018798828125, "rewards/rejected": 0.04248046875, "step": 225 }, { "epoch": 0.06519544208856196, "grad_norm": 12.56030848447778, "learning_rate": 3.256484149855908e-07, "logits/chosen": 2.90625, "logits/rejected": 2.96875, "logps/chosen": -1272.0, "logps/rejected": -1328.0, "loss": 0.6979, "loss/demonstration_loss": -2608.0, "loss/preference_loss": -2608.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.034423828125, "rewards/margins": -0.023193359375, "rewards/rejected": 0.0576171875, "step": 226 }, { "epoch": 0.06548391749603347, "grad_norm": 13.329347590768073, "learning_rate": 3.270893371757925e-07, "logits/chosen": 3.046875, "logits/rejected": 3.171875, "logps/chosen": -1888.0, "logps/rejected": -1368.0, "loss": 0.6821, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.056396484375, "rewards/margins": 0.0419921875, "rewards/rejected": 0.01434326171875, "step": 227 }, { "epoch": 0.06577239290350498, "grad_norm": 13.282746824623697, "learning_rate": 3.2853025936599423e-07, "logits/chosen": 2.71875, "logits/rejected": 2.84375, "logps/chosen": -1928.0, "logps/rejected": -1664.0, "loss": 0.7008, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0537109375, "rewards/margins": 0.02490234375, "rewards/rejected": 0.02880859375, "step": 228 }, { "epoch": 0.06606086831097649, "grad_norm": 12.940826991838383, "learning_rate": 3.2997118155619596e-07, "logits/chosen": 2.875, "logits/rejected": 2.8125, "logps/chosen": -1576.0, "logps/rejected": -1696.0, "loss": 0.6915, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.083984375, "rewards/margins": 0.0081787109375, "rewards/rejected": 0.07568359375, "step": 229 }, { "epoch": 0.066349343718448, "grad_norm": 13.69041000016369, "learning_rate": 3.314121037463977e-07, "logits/chosen": 2.78125, "logits/rejected": 2.859375, "logps/chosen": -1584.0, "logps/rejected": -1456.0, "loss": 0.682, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.00750732421875, "rewards/margins": 0.0247802734375, "rewards/rejected": -0.0172119140625, "step": 230 }, { "epoch": 0.06663781912591951, "grad_norm": 11.549184347200583, "learning_rate": 3.328530259365994e-07, "logits/chosen": 3.0625, "logits/rejected": 3.015625, "logps/chosen": -1528.0, "logps/rejected": -1552.0, "loss": 0.6798, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.050048828125, "rewards/margins": 0.017578125, "rewards/rejected": 0.032470703125, "step": 231 }, { "epoch": 0.06692629453339102, "grad_norm": 11.509496305810329, "learning_rate": 3.3429394812680114e-07, "logits/chosen": 2.90625, "logits/rejected": 2.9375, "logps/chosen": -1456.0, "logps/rejected": -1424.0, "loss": 0.6863, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.08154296875, "rewards/margins": 0.033203125, "rewards/rejected": 0.048095703125, "step": 232 }, { "epoch": 0.06721476994086255, "grad_norm": 17.10045964597626, "learning_rate": 3.3573487031700286e-07, "logits/chosen": 2.921875, "logits/rejected": 2.796875, "logps/chosen": -1608.0, "logps/rejected": -1656.0, "loss": 0.7194, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.02001953125, "rewards/margins": -0.01055908203125, "rewards/rejected": 0.0306396484375, "step": 233 }, { "epoch": 0.06750324534833406, "grad_norm": 11.782122908947224, "learning_rate": 3.371757925072046e-07, "logits/chosen": 2.96875, "logits/rejected": 2.890625, "logps/chosen": -1904.0, "logps/rejected": -1608.0, "loss": 0.6781, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0673828125, "rewards/margins": 0.040771484375, "rewards/rejected": 0.02685546875, "step": 234 }, { "epoch": 0.06779172075580557, "grad_norm": 13.939692328911423, "learning_rate": 3.386167146974063e-07, "logits/chosen": 2.96875, "logits/rejected": 3.0625, "logps/chosen": -1864.0, "logps/rejected": -1568.0, "loss": 0.7227, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.25, "rewards/chosen": -0.044921875, "rewards/margins": -0.0537109375, "rewards/rejected": 0.0087890625, "step": 235 }, { "epoch": 0.06808019616327708, "grad_norm": 12.844284359940602, "learning_rate": 3.400576368876081e-07, "logits/chosen": 2.734375, "logits/rejected": 2.796875, "logps/chosen": -1904.0, "logps/rejected": -1624.0, "loss": 0.7188, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.125, "rewards/chosen": -0.0137939453125, "rewards/margins": -0.08154296875, "rewards/rejected": 0.0673828125, "step": 236 }, { "epoch": 0.0683686715707486, "grad_norm": 15.095519333688204, "learning_rate": 3.4149855907780976e-07, "logits/chosen": 2.796875, "logits/rejected": 2.828125, "logps/chosen": -1624.0, "logps/rejected": -1560.0, "loss": 0.6948, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.06396484375, "rewards/margins": -0.00750732421875, "rewards/rejected": 0.0712890625, "step": 237 }, { "epoch": 0.0686571469782201, "grad_norm": 15.687937466811052, "learning_rate": 3.4293948126801154e-07, "logits/chosen": 2.984375, "logits/rejected": 2.96875, "logps/chosen": -2032.0, "logps/rejected": -1824.0, "loss": 0.6926, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.08740234375, "rewards/margins": 0.016357421875, "rewards/rejected": 0.0712890625, "step": 238 }, { "epoch": 0.06894562238569162, "grad_norm": 13.634207736710202, "learning_rate": 3.443804034582132e-07, "logits/chosen": 2.96875, "logits/rejected": 2.96875, "logps/chosen": -1632.0, "logps/rejected": -1520.0, "loss": 0.7124, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.09130859375, "rewards/margins": -0.00439453125, "rewards/rejected": 0.095703125, "step": 239 }, { "epoch": 0.06923409779316313, "grad_norm": 12.199715076807127, "learning_rate": 3.45821325648415e-07, "logits/chosen": 2.921875, "logits/rejected": 2.921875, "logps/chosen": -1704.0, "logps/rejected": -1680.0, "loss": 0.6906, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.0859375, "rewards/margins": 0.045654296875, "rewards/rejected": 0.0400390625, "step": 240 }, { "epoch": 0.06952257320063465, "grad_norm": 14.82092716001698, "learning_rate": 3.4726224783861667e-07, "logits/chosen": 2.78125, "logits/rejected": 2.765625, "logps/chosen": -1632.0, "logps/rejected": -1632.0, "loss": 0.6959, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0908203125, "rewards/margins": 0.00689697265625, "rewards/rejected": 0.083984375, "step": 241 }, { "epoch": 0.06981104860810616, "grad_norm": 12.667383672709043, "learning_rate": 3.4870317002881845e-07, "logits/chosen": 2.9375, "logits/rejected": 2.953125, "logps/chosen": -1416.0, "logps/rejected": -1472.0, "loss": 0.7061, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.005645751953125, "rewards/margins": -0.0380859375, "rewards/rejected": 0.043701171875, "step": 242 }, { "epoch": 0.07009952401557767, "grad_norm": 12.382361001615099, "learning_rate": 3.501440922190201e-07, "logits/chosen": 2.859375, "logits/rejected": 2.8125, "logps/chosen": -1816.0, "logps/rejected": -1760.0, "loss": 0.6903, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.08642578125, "rewards/margins": 0.021240234375, "rewards/rejected": 0.06494140625, "step": 243 }, { "epoch": 0.07038799942304919, "grad_norm": 15.859188391017698, "learning_rate": 3.515850144092219e-07, "logits/chosen": 2.90625, "logits/rejected": 2.984375, "logps/chosen": -1568.0, "logps/rejected": -1416.0, "loss": 0.6932, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.08642578125, "rewards/margins": 0.032470703125, "rewards/rejected": 0.0537109375, "step": 244 }, { "epoch": 0.0706764748305207, "grad_norm": 13.088822358150253, "learning_rate": 3.5302593659942357e-07, "logits/chosen": 3.015625, "logits/rejected": 3.046875, "logps/chosen": -1616.0, "logps/rejected": -1736.0, "loss": 0.6625, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.10205078125, "rewards/margins": 0.028076171875, "rewards/rejected": 0.07373046875, "step": 245 }, { "epoch": 0.07096495023799221, "grad_norm": 12.277061788936658, "learning_rate": 3.5446685878962535e-07, "logits/chosen": 3.140625, "logits/rejected": 3.046875, "logps/chosen": -1408.0, "logps/rejected": -1560.0, "loss": 0.672, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.076171875, "rewards/margins": 0.0849609375, "rewards/rejected": -0.0087890625, "step": 246 }, { "epoch": 0.07125342564546372, "grad_norm": 12.994224051688281, "learning_rate": 3.559077809798271e-07, "logits/chosen": 2.890625, "logits/rejected": 2.9375, "logps/chosen": -1544.0, "logps/rejected": -1360.0, "loss": 0.6937, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0771484375, "rewards/margins": 0.0181884765625, "rewards/rejected": 0.058837890625, "step": 247 }, { "epoch": 0.07154190105293523, "grad_norm": 11.58354785804806, "learning_rate": 3.573487031700288e-07, "logits/chosen": 2.921875, "logits/rejected": 3.015625, "logps/chosen": -1616.0, "logps/rejected": -1304.0, "loss": 0.6918, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.091796875, "rewards/margins": 0.001251220703125, "rewards/rejected": 0.0908203125, "step": 248 }, { "epoch": 0.07183037646040676, "grad_norm": 12.754971398553955, "learning_rate": 3.5878962536023053e-07, "logits/chosen": 2.703125, "logits/rejected": 2.703125, "logps/chosen": -1512.0, "logps/rejected": -1568.0, "loss": 0.7312, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.017578125, "rewards/margins": -0.09130859375, "rewards/rejected": 0.10888671875, "step": 249 }, { "epoch": 0.07211885186787827, "grad_norm": 13.448078587745409, "learning_rate": 3.6023054755043225e-07, "logits/chosen": 2.90625, "logits/rejected": 2.9375, "logps/chosen": -1728.0, "logps/rejected": -1736.0, "loss": 0.6868, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.08203125, "rewards/margins": 0.04833984375, "rewards/rejected": 0.03369140625, "step": 250 }, { "epoch": 0.07240732727534978, "grad_norm": 14.303317136857709, "learning_rate": 3.61671469740634e-07, "logits/chosen": 2.890625, "logits/rejected": 3.015625, "logps/chosen": -1648.0, "logps/rejected": -1360.0, "loss": 0.6739, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.044921875, "rewards/margins": 0.048583984375, "rewards/rejected": -0.00372314453125, "step": 251 }, { "epoch": 0.07269580268282129, "grad_norm": 11.871220919952881, "learning_rate": 3.631123919308357e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0, "logps/chosen": -1376.0, "logps/rejected": -1352.0, "loss": 0.7146, "loss/demonstration_loss": -2736.0, "loss/preference_loss": -2736.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.038818359375, "rewards/margins": -0.072265625, "rewards/rejected": 0.111328125, "step": 252 }, { "epoch": 0.0729842780902928, "grad_norm": 12.712874921887503, "learning_rate": 3.645533141210375e-07, "logits/chosen": 3.015625, "logits/rejected": 3.0, "logps/chosen": -1672.0, "logps/rejected": -1584.0, "loss": 0.6986, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.09912109375, "rewards/margins": -0.004058837890625, "rewards/rejected": 0.10302734375, "step": 253 }, { "epoch": 0.07327275349776431, "grad_norm": 11.50468218868833, "learning_rate": 3.6599423631123916e-07, "logits/chosen": 3.015625, "logits/rejected": 2.984375, "logps/chosen": -1288.0, "logps/rejected": -1536.0, "loss": 0.6884, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2832.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.08447265625, "rewards/margins": 0.032470703125, "rewards/rejected": 0.052001953125, "step": 254 }, { "epoch": 0.07356122890523582, "grad_norm": 11.121474221622007, "learning_rate": 3.6743515850144094e-07, "logits/chosen": 2.828125, "logits/rejected": 2.84375, "logps/chosen": -1552.0, "logps/rejected": -1648.0, "loss": 0.703, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1064453125, "rewards/margins": 0.0137939453125, "rewards/rejected": 0.0927734375, "step": 255 }, { "epoch": 0.07384970431270735, "grad_norm": 10.93597365996318, "learning_rate": 3.688760806916426e-07, "logits/chosen": 3.0, "logits/rejected": 2.96875, "logps/chosen": -1872.0, "logps/rejected": -1600.0, "loss": 0.6947, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.10400390625, "rewards/margins": 0.0162353515625, "rewards/rejected": 0.08740234375, "step": 256 }, { "epoch": 0.07413817972017886, "grad_norm": 13.389610221343382, "learning_rate": 3.703170028818444e-07, "logits/chosen": 2.890625, "logits/rejected": 2.984375, "logps/chosen": -1760.0, "logps/rejected": -1504.0, "loss": 0.6831, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.0830078125, "rewards/margins": 0.0194091796875, "rewards/rejected": 0.0634765625, "step": 257 }, { "epoch": 0.07442665512765037, "grad_norm": 12.54312204571574, "learning_rate": 3.717579250720461e-07, "logits/chosen": 2.96875, "logits/rejected": 2.953125, "logps/chosen": -1616.0, "logps/rejected": -1472.0, "loss": 0.6851, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.076171875, "rewards/margins": 0.0216064453125, "rewards/rejected": 0.054443359375, "step": 258 }, { "epoch": 0.07471513053512188, "grad_norm": 12.143904901542992, "learning_rate": 3.7319884726224784e-07, "logits/chosen": 2.953125, "logits/rejected": 2.90625, "logps/chosen": -1904.0, "logps/rejected": -1744.0, "loss": 0.7185, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.10009765625, "rewards/margins": 0.0, "rewards/rejected": 0.10009765625, "step": 259 }, { "epoch": 0.0750036059425934, "grad_norm": 11.113536745358273, "learning_rate": 3.7463976945244956e-07, "logits/chosen": 2.96875, "logits/rejected": 3.03125, "logps/chosen": -2040.0, "logps/rejected": -1736.0, "loss": 0.708, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.0927734375, "rewards/margins": -0.01318359375, "rewards/rejected": 0.10595703125, "step": 260 }, { "epoch": 0.0752920813500649, "grad_norm": 12.038149058643892, "learning_rate": 3.760806916426513e-07, "logits/chosen": 2.796875, "logits/rejected": 2.734375, "logps/chosen": -1696.0, "logps/rejected": -1544.0, "loss": 0.6874, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.1201171875, "rewards/margins": 0.05810546875, "rewards/rejected": 0.06201171875, "step": 261 }, { "epoch": 0.07558055675753642, "grad_norm": 12.973911622755267, "learning_rate": 3.77521613832853e-07, "logits/chosen": 3.015625, "logits/rejected": 2.921875, "logps/chosen": -1592.0, "logps/rejected": -1568.0, "loss": 0.6886, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.10888671875, "rewards/margins": 0.0238037109375, "rewards/rejected": 0.0849609375, "step": 262 }, { "epoch": 0.07586903216500793, "grad_norm": 12.160771283866747, "learning_rate": 3.7896253602305474e-07, "logits/chosen": 3.0, "logits/rejected": 3.0, "logps/chosen": -1760.0, "logps/rejected": -1696.0, "loss": 0.6807, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.115234375, "rewards/margins": 0.032470703125, "rewards/rejected": 0.08251953125, "step": 263 }, { "epoch": 0.07615750757247945, "grad_norm": 15.099553792009583, "learning_rate": 3.8040345821325647e-07, "logits/chosen": 2.921875, "logits/rejected": 2.953125, "logps/chosen": -1896.0, "logps/rejected": -1792.0, "loss": 0.6838, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1640625, "rewards/margins": 0.0244140625, "rewards/rejected": 0.1396484375, "step": 264 }, { "epoch": 0.07644598297995096, "grad_norm": 13.234494901110835, "learning_rate": 3.818443804034582e-07, "logits/chosen": 2.96875, "logits/rejected": 3.015625, "logps/chosen": -1600.0, "logps/rejected": -1488.0, "loss": 0.6965, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.109375, "rewards/margins": 0.0194091796875, "rewards/rejected": 0.08984375, "step": 265 }, { "epoch": 0.07673445838742247, "grad_norm": 12.919941450531079, "learning_rate": 3.832853025936599e-07, "logits/chosen": 2.9375, "logits/rejected": 2.859375, "logps/chosen": -1920.0, "logps/rejected": -1912.0, "loss": 0.7354, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.04248046875, "rewards/margins": -0.0927734375, "rewards/rejected": 0.134765625, "step": 266 }, { "epoch": 0.07702293379489399, "grad_norm": 10.836261820124863, "learning_rate": 3.8472622478386165e-07, "logits/chosen": 3.015625, "logits/rejected": 2.984375, "logps/chosen": -1592.0, "logps/rejected": -1848.0, "loss": 0.6782, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1318359375, "rewards/margins": 0.091796875, "rewards/rejected": 0.0400390625, "step": 267 }, { "epoch": 0.0773114092023655, "grad_norm": 11.856712605255007, "learning_rate": 3.8616714697406337e-07, "logits/chosen": 2.9375, "logits/rejected": 3.03125, "logps/chosen": -1824.0, "logps/rejected": -1600.0, "loss": 0.6873, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.115234375, "rewards/margins": 0.01312255859375, "rewards/rejected": 0.10205078125, "step": 268 }, { "epoch": 0.07759988460983701, "grad_norm": 12.86560996101754, "learning_rate": 3.8760806916426515e-07, "logits/chosen": 2.859375, "logits/rejected": 2.796875, "logps/chosen": -1536.0, "logps/rejected": -1656.0, "loss": 0.7104, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.06494140625, "rewards/margins": -0.044921875, "rewards/rejected": 0.1103515625, "step": 269 }, { "epoch": 0.07788836001730852, "grad_norm": 13.029873728402967, "learning_rate": 3.890489913544668e-07, "logits/chosen": 3.03125, "logits/rejected": 3.125, "logps/chosen": -1968.0, "logps/rejected": -1616.0, "loss": 0.7231, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.07763671875, "rewards/margins": -0.052490234375, "rewards/rejected": 0.1298828125, "step": 270 }, { "epoch": 0.07817683542478003, "grad_norm": 13.810212926425034, "learning_rate": 3.904899135446686e-07, "logits/chosen": 2.890625, "logits/rejected": 2.953125, "logps/chosen": -1928.0, "logps/rejected": -1816.0, "loss": 0.6897, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.140625, "rewards/margins": 0.00311279296875, "rewards/rejected": 0.13671875, "step": 271 }, { "epoch": 0.07846531083225156, "grad_norm": 12.45664192992779, "learning_rate": 3.919308357348703e-07, "logits/chosen": 2.953125, "logits/rejected": 2.96875, "logps/chosen": -1808.0, "logps/rejected": -1616.0, "loss": 0.689, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1201171875, "rewards/margins": 0.014404296875, "rewards/rejected": 0.10595703125, "step": 272 }, { "epoch": 0.07875378623972307, "grad_norm": 13.677171557938841, "learning_rate": 3.9337175792507205e-07, "logits/chosen": 2.859375, "logits/rejected": 2.84375, "logps/chosen": -2368.0, "logps/rejected": -2272.0, "loss": 0.71, "loss/demonstration_loss": -4640.0, "loss/preference_loss": -4672.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.1484375, "rewards/margins": -0.04150390625, "rewards/rejected": 0.1904296875, "step": 273 }, { "epoch": 0.07904226164719458, "grad_norm": 13.26257309492729, "learning_rate": 3.9481268011527373e-07, "logits/chosen": 2.8125, "logits/rejected": 2.796875, "logps/chosen": -1712.0, "logps/rejected": -1768.0, "loss": 0.724, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.080078125, "rewards/margins": -0.095703125, "rewards/rejected": 0.17578125, "step": 274 }, { "epoch": 0.07933073705466609, "grad_norm": 15.789491585011902, "learning_rate": 3.962536023054755e-07, "logits/chosen": 2.984375, "logits/rejected": 2.9375, "logps/chosen": -1688.0, "logps/rejected": -1856.0, "loss": 0.6714, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.16015625, "rewards/margins": 0.01123046875, "rewards/rejected": 0.1484375, "step": 275 }, { "epoch": 0.0796192124621376, "grad_norm": 13.224673959063026, "learning_rate": 3.976945244956772e-07, "logits/chosen": 2.953125, "logits/rejected": 2.90625, "logps/chosen": -1776.0, "logps/rejected": -1984.0, "loss": 0.6925, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.154296875, "rewards/margins": 0.0244140625, "rewards/rejected": 0.1298828125, "step": 276 }, { "epoch": 0.07990768786960911, "grad_norm": 14.864306427447907, "learning_rate": 3.9913544668587896e-07, "logits/chosen": 2.90625, "logits/rejected": 2.96875, "logps/chosen": -1472.0, "logps/rejected": -1512.0, "loss": 0.7036, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.11962890625, "rewards/margins": -0.0106201171875, "rewards/rejected": 0.1298828125, "step": 277 }, { "epoch": 0.08019616327708062, "grad_norm": 11.757720526653452, "learning_rate": 4.0057636887608063e-07, "logits/chosen": 2.9375, "logits/rejected": 2.9375, "logps/chosen": -1560.0, "logps/rejected": -1704.0, "loss": 0.7144, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1015625, "rewards/margins": -0.02880859375, "rewards/rejected": 0.1298828125, "step": 278 }, { "epoch": 0.08048463868455215, "grad_norm": 11.903795875944434, "learning_rate": 4.020172910662824e-07, "logits/chosen": 2.921875, "logits/rejected": 2.9375, "logps/chosen": -1680.0, "logps/rejected": -1576.0, "loss": 0.6685, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.08251953125, "rewards/margins": 0.0262451171875, "rewards/rejected": 0.056396484375, "step": 279 }, { "epoch": 0.08077311409202366, "grad_norm": 13.240292839327235, "learning_rate": 4.0345821325648413e-07, "logits/chosen": 2.953125, "logits/rejected": 2.96875, "logps/chosen": -2256.0, "logps/rejected": -2024.0, "loss": 0.6991, "loss/demonstration_loss": -4288.0, "loss/preference_loss": -4288.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.10009765625, "rewards/margins": -0.06884765625, "rewards/rejected": 0.1689453125, "step": 280 }, { "epoch": 0.08106158949949517, "grad_norm": 13.952479681228839, "learning_rate": 4.0489913544668586e-07, "logits/chosen": 2.859375, "logits/rejected": 2.875, "logps/chosen": -2000.0, "logps/rejected": -2000.0, "loss": 0.6967, "loss/demonstration_loss": -4016.0, "loss/preference_loss": -4016.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.212890625, "rewards/margins": 0.046875, "rewards/rejected": 0.166015625, "step": 281 }, { "epoch": 0.08135006490696668, "grad_norm": 12.462553393885862, "learning_rate": 4.063400576368876e-07, "logits/chosen": 2.875, "logits/rejected": 2.921875, "logps/chosen": -1328.0, "logps/rejected": -1312.0, "loss": 0.688, "loss/demonstration_loss": -2656.0, "loss/preference_loss": -2640.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.08984375, "rewards/margins": 0.0074462890625, "rewards/rejected": 0.08251953125, "step": 282 }, { "epoch": 0.0816385403144382, "grad_norm": 16.478243742118174, "learning_rate": 4.077809798270893e-07, "logits/chosen": 2.921875, "logits/rejected": 2.96875, "logps/chosen": -1680.0, "logps/rejected": -1656.0, "loss": 0.7126, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.08203125, "rewards/margins": -0.060791015625, "rewards/rejected": 0.142578125, "step": 283 }, { "epoch": 0.0819270157219097, "grad_norm": 10.096768480393164, "learning_rate": 4.092219020172911e-07, "logits/chosen": 3.015625, "logits/rejected": 2.96875, "logps/chosen": -1760.0, "logps/rejected": -1576.0, "loss": 0.7027, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.1552734375, "rewards/margins": 0.0174560546875, "rewards/rejected": 0.1376953125, "step": 284 }, { "epoch": 0.08221549112938122, "grad_norm": 13.419073157123501, "learning_rate": 4.1066282420749276e-07, "logits/chosen": 2.90625, "logits/rejected": 2.875, "logps/chosen": -1872.0, "logps/rejected": -1856.0, "loss": 0.7064, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.15234375, "rewards/margins": -0.03515625, "rewards/rejected": 0.1875, "step": 285 }, { "epoch": 0.08250396653685273, "grad_norm": 14.724515764537559, "learning_rate": 4.1210374639769454e-07, "logits/chosen": 2.828125, "logits/rejected": 2.734375, "logps/chosen": -1792.0, "logps/rejected": -1824.0, "loss": 0.6962, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.15234375, "rewards/margins": 0.0074462890625, "rewards/rejected": 0.1455078125, "step": 286 }, { "epoch": 0.08279244194432425, "grad_norm": 11.764717302995162, "learning_rate": 4.135446685878962e-07, "logits/chosen": 3.03125, "logits/rejected": 2.921875, "logps/chosen": -1312.0, "logps/rejected": -1592.0, "loss": 0.6841, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1533203125, "rewards/margins": -0.010009765625, "rewards/rejected": 0.1630859375, "step": 287 }, { "epoch": 0.08308091735179576, "grad_norm": 11.526199113507058, "learning_rate": 4.14985590778098e-07, "logits/chosen": 3.0625, "logits/rejected": 3.046875, "logps/chosen": -1920.0, "logps/rejected": -1808.0, "loss": 0.705, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.22265625, "rewards/margins": 0.043701171875, "rewards/rejected": 0.1787109375, "step": 288 }, { "epoch": 0.08336939275926727, "grad_norm": 12.810133124144434, "learning_rate": 4.1642651296829967e-07, "logits/chosen": 3.0625, "logits/rejected": 3.09375, "logps/chosen": -1848.0, "logps/rejected": -1824.0, "loss": 0.6912, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.10546875, "rewards/margins": -0.0791015625, "rewards/rejected": 0.1845703125, "step": 289 }, { "epoch": 0.08365786816673879, "grad_norm": 13.345197064838015, "learning_rate": 4.1786743515850145e-07, "logits/chosen": 2.96875, "logits/rejected": 2.90625, "logps/chosen": -1712.0, "logps/rejected": -1896.0, "loss": 0.7095, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.181640625, "rewards/margins": 0.031982421875, "rewards/rejected": 0.150390625, "step": 290 }, { "epoch": 0.0839463435742103, "grad_norm": 10.072933630704417, "learning_rate": 4.193083573487031e-07, "logits/chosen": 2.96875, "logits/rejected": 2.828125, "logps/chosen": -1784.0, "logps/rejected": -1600.0, "loss": 0.6717, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1982421875, "rewards/margins": 0.07763671875, "rewards/rejected": 0.12060546875, "step": 291 }, { "epoch": 0.08423481898168181, "grad_norm": 13.744718323467263, "learning_rate": 4.207492795389049e-07, "logits/chosen": 2.90625, "logits/rejected": 2.859375, "logps/chosen": -1808.0, "logps/rejected": -1744.0, "loss": 0.6766, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.140625, "rewards/margins": 0.0224609375, "rewards/rejected": 0.11767578125, "step": 292 }, { "epoch": 0.08452329438915332, "grad_norm": 11.779287359089318, "learning_rate": 4.221902017291066e-07, "logits/chosen": 3.03125, "logits/rejected": 3.046875, "logps/chosen": -1608.0, "logps/rejected": -1400.0, "loss": 0.6865, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.1982421875, "rewards/margins": 0.01434326171875, "rewards/rejected": 0.18359375, "step": 293 }, { "epoch": 0.08481176979662484, "grad_norm": 10.613632492923646, "learning_rate": 4.2363112391930835e-07, "logits/chosen": 2.890625, "logits/rejected": 2.90625, "logps/chosen": -1472.0, "logps/rejected": -1520.0, "loss": 0.6968, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.177734375, "rewards/margins": -0.00628662109375, "rewards/rejected": 0.18359375, "step": 294 }, { "epoch": 0.08510024520409636, "grad_norm": 12.029297966064473, "learning_rate": 4.250720461095101e-07, "logits/chosen": 2.96875, "logits/rejected": 3.09375, "logps/chosen": -1768.0, "logps/rejected": -1616.0, "loss": 0.6914, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2294921875, "rewards/margins": 0.00592041015625, "rewards/rejected": 0.2236328125, "step": 295 }, { "epoch": 0.08538872061156787, "grad_norm": 11.102052930471627, "learning_rate": 4.265129682997118e-07, "logits/chosen": 3.078125, "logits/rejected": 3.109375, "logps/chosen": -1704.0, "logps/rejected": -1704.0, "loss": 0.712, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1240234375, "rewards/margins": -0.046875, "rewards/rejected": 0.1708984375, "step": 296 }, { "epoch": 0.08567719601903938, "grad_norm": 12.503377623561018, "learning_rate": 4.2795389048991353e-07, "logits/chosen": 3.078125, "logits/rejected": 3.0, "logps/chosen": -1992.0, "logps/rejected": -1632.0, "loss": 0.684, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.2490234375, "rewards/margins": 0.08251953125, "rewards/rejected": 0.166015625, "step": 297 }, { "epoch": 0.08596567142651089, "grad_norm": 10.381876486993336, "learning_rate": 4.2939481268011525e-07, "logits/chosen": 2.953125, "logits/rejected": 2.9375, "logps/chosen": -1648.0, "logps/rejected": -1608.0, "loss": 0.6806, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.158203125, "rewards/margins": -0.00030517578125, "rewards/rejected": 0.1591796875, "step": 298 }, { "epoch": 0.0862541468339824, "grad_norm": 11.902101328113877, "learning_rate": 4.30835734870317e-07, "logits/chosen": 2.921875, "logits/rejected": 2.921875, "logps/chosen": -1408.0, "logps/rejected": -1488.0, "loss": 0.705, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.19921875, "rewards/margins": -0.041259765625, "rewards/rejected": 0.240234375, "step": 299 }, { "epoch": 0.08654262224145391, "grad_norm": 11.307581926775873, "learning_rate": 4.322766570605187e-07, "logits/chosen": 3.140625, "logits/rejected": 3.109375, "logps/chosen": -1280.0, "logps/rejected": -1552.0, "loss": 0.7014, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.12060546875, "rewards/margins": -0.050048828125, "rewards/rejected": 0.1708984375, "step": 300 }, { "epoch": 0.08683109764892542, "grad_norm": 11.45869248213672, "learning_rate": 4.3371757925072043e-07, "logits/chosen": 3.046875, "logits/rejected": 2.953125, "logps/chosen": -1672.0, "logps/rejected": -1856.0, "loss": 0.6917, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1982421875, "rewards/margins": 0.00567626953125, "rewards/rejected": 0.1923828125, "step": 301 }, { "epoch": 0.08711957305639695, "grad_norm": 17.207787172513388, "learning_rate": 4.3515850144092216e-07, "logits/chosen": 3.0, "logits/rejected": 3.03125, "logps/chosen": -2048.0, "logps/rejected": -1728.0, "loss": 0.6835, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.263671875, "rewards/margins": 0.01806640625, "rewards/rejected": 0.24609375, "step": 302 }, { "epoch": 0.08740804846386846, "grad_norm": 11.709457249898819, "learning_rate": 4.365994236311239e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1728.0, "logps/rejected": -1768.0, "loss": 0.7097, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.251953125, "rewards/margins": -0.003814697265625, "rewards/rejected": 0.255859375, "step": 303 }, { "epoch": 0.08769652387133997, "grad_norm": 12.69806809330215, "learning_rate": 4.3804034582132566e-07, "logits/chosen": 2.9375, "logits/rejected": 2.9375, "logps/chosen": -1648.0, "logps/rejected": -1688.0, "loss": 0.688, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.236328125, "rewards/margins": 0.0274658203125, "rewards/rejected": 0.208984375, "step": 304 }, { "epoch": 0.08798499927881148, "grad_norm": 12.446924126675057, "learning_rate": 4.3948126801152733e-07, "logits/chosen": 3.0, "logits/rejected": 3.046875, "logps/chosen": -1600.0, "logps/rejected": -1552.0, "loss": 0.6891, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.185546875, "rewards/margins": -0.0150146484375, "rewards/rejected": 0.201171875, "step": 305 }, { "epoch": 0.088273474686283, "grad_norm": 13.187359169399343, "learning_rate": 4.409221902017291e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1616.0, "logps/rejected": -1640.0, "loss": 0.7092, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1875, "rewards/margins": -0.017578125, "rewards/rejected": 0.205078125, "step": 306 }, { "epoch": 0.0885619500937545, "grad_norm": 13.477804732617766, "learning_rate": 4.423631123919308e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1728.0, "logps/rejected": -1824.0, "loss": 0.689, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.1943359375, "rewards/margins": -0.033203125, "rewards/rejected": 0.2275390625, "step": 307 }, { "epoch": 0.08885042550122602, "grad_norm": 15.056873499408436, "learning_rate": 4.4380403458213256e-07, "logits/chosen": 2.875, "logits/rejected": 2.859375, "logps/chosen": -1672.0, "logps/rejected": -1408.0, "loss": 0.7184, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.19140625, "rewards/margins": -0.01123046875, "rewards/rejected": 0.2021484375, "step": 308 }, { "epoch": 0.08913890090869753, "grad_norm": 13.019298531625479, "learning_rate": 4.4524495677233424e-07, "logits/chosen": 2.953125, "logits/rejected": 3.0, "logps/chosen": -1744.0, "logps/rejected": -1376.0, "loss": 0.6713, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.263671875, "rewards/margins": 0.068359375, "rewards/rejected": 0.1943359375, "step": 309 }, { "epoch": 0.08942737631616905, "grad_norm": 15.217809319721413, "learning_rate": 4.46685878962536e-07, "logits/chosen": 2.859375, "logits/rejected": 2.921875, "logps/chosen": -1488.0, "logps/rejected": -1464.0, "loss": 0.6623, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2041015625, "rewards/margins": 0.0576171875, "rewards/rejected": 0.146484375, "step": 310 }, { "epoch": 0.08971585172364056, "grad_norm": 11.224707026465401, "learning_rate": 4.481268011527377e-07, "logits/chosen": 3.015625, "logits/rejected": 2.890625, "logps/chosen": -1632.0, "logps/rejected": -1768.0, "loss": 0.6852, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.1611328125, "rewards/margins": -0.0198974609375, "rewards/rejected": 0.181640625, "step": 311 }, { "epoch": 0.09000432713111207, "grad_norm": 13.575381385862697, "learning_rate": 4.4956772334293947e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1960.0, "logps/rejected": -1520.0, "loss": 0.7255, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.1826171875, "rewards/margins": -0.012451171875, "rewards/rejected": 0.1953125, "step": 312 }, { "epoch": 0.09029280253858359, "grad_norm": 11.452838570371567, "learning_rate": 4.5100864553314114e-07, "logits/chosen": 2.984375, "logits/rejected": 2.953125, "logps/chosen": -1752.0, "logps/rejected": -1688.0, "loss": 0.6887, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.267578125, "rewards/margins": 0.043212890625, "rewards/rejected": 0.224609375, "step": 313 }, { "epoch": 0.0905812779460551, "grad_norm": 12.768219472102764, "learning_rate": 4.524495677233429e-07, "logits/chosen": 3.046875, "logits/rejected": 3.109375, "logps/chosen": -2096.0, "logps/rejected": -1792.0, "loss": 0.6848, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.248046875, "rewards/margins": 0.056884765625, "rewards/rejected": 0.1904296875, "step": 314 }, { "epoch": 0.09086975335352661, "grad_norm": 11.908153175706465, "learning_rate": 4.538904899135447e-07, "logits/chosen": 2.953125, "logits/rejected": 2.796875, "logps/chosen": -1656.0, "logps/rejected": -1456.0, "loss": 0.6865, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2421875, "rewards/margins": 0.01263427734375, "rewards/rejected": 0.23046875, "step": 315 }, { "epoch": 0.09115822876099812, "grad_norm": 11.926939631481423, "learning_rate": 4.5533141210374637e-07, "logits/chosen": 2.984375, "logits/rejected": 3.0, "logps/chosen": -1808.0, "logps/rejected": -1704.0, "loss": 0.7203, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2275390625, "rewards/margins": -0.00616455078125, "rewards/rejected": 0.234375, "step": 316 }, { "epoch": 0.09144670416846964, "grad_norm": 13.691197642707056, "learning_rate": 4.5677233429394815e-07, "logits/chosen": 2.796875, "logits/rejected": 2.859375, "logps/chosen": -1840.0, "logps/rejected": -1752.0, "loss": 0.6754, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.251953125, "rewards/margins": 0.06396484375, "rewards/rejected": 0.1884765625, "step": 317 }, { "epoch": 0.09173517957594116, "grad_norm": 12.286636519748805, "learning_rate": 4.582132564841498e-07, "logits/chosen": 2.9375, "logits/rejected": 2.984375, "logps/chosen": -1792.0, "logps/rejected": -1744.0, "loss": 0.6899, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2001953125, "rewards/margins": 0.019287109375, "rewards/rejected": 0.1806640625, "step": 318 }, { "epoch": 0.09202365498341267, "grad_norm": 11.496348608058788, "learning_rate": 4.596541786743516e-07, "logits/chosen": 2.984375, "logits/rejected": 3.078125, "logps/chosen": -1560.0, "logps/rejected": -1392.0, "loss": 0.6841, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2236328125, "rewards/margins": 0.0012969970703125, "rewards/rejected": 0.22265625, "step": 319 }, { "epoch": 0.09231213039088418, "grad_norm": 10.56894773571941, "learning_rate": 4.610951008645533e-07, "logits/chosen": 2.921875, "logits/rejected": 2.859375, "logps/chosen": -1536.0, "logps/rejected": -1304.0, "loss": 0.6736, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2353515625, "rewards/margins": 0.06884765625, "rewards/rejected": 0.166015625, "step": 320 }, { "epoch": 0.09260060579835569, "grad_norm": 9.70495369325244, "learning_rate": 4.6253602305475505e-07, "logits/chosen": 3.109375, "logits/rejected": 3.078125, "logps/chosen": -1816.0, "logps/rejected": -1752.0, "loss": 0.676, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.259765625, "rewards/margins": 0.00189208984375, "rewards/rejected": 0.2578125, "step": 321 }, { "epoch": 0.0928890812058272, "grad_norm": 11.947230054535686, "learning_rate": 4.639769452449567e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1600.0, "logps/rejected": -1680.0, "loss": 0.7113, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.2373046875, "rewards/margins": -0.052734375, "rewards/rejected": 0.291015625, "step": 322 }, { "epoch": 0.09317755661329871, "grad_norm": 12.323761341626186, "learning_rate": 4.654178674351585e-07, "logits/chosen": 2.875, "logits/rejected": 3.015625, "logps/chosen": -1760.0, "logps/rejected": -1696.0, "loss": 0.6863, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2197265625, "rewards/margins": 0.0115966796875, "rewards/rejected": 0.2080078125, "step": 323 }, { "epoch": 0.09346603202077022, "grad_norm": 12.918962222490473, "learning_rate": 4.668587896253602e-07, "logits/chosen": 2.9375, "logits/rejected": 2.96875, "logps/chosen": -1576.0, "logps/rejected": -1664.0, "loss": 0.7018, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.1728515625, "rewards/margins": -0.1103515625, "rewards/rejected": 0.283203125, "step": 324 }, { "epoch": 0.09375450742824175, "grad_norm": 12.673405909506833, "learning_rate": 4.6829971181556196e-07, "logits/chosen": 2.875, "logits/rejected": 2.84375, "logps/chosen": -2080.0, "logps/rejected": -2176.0, "loss": 0.6989, "loss/demonstration_loss": -4288.0, "loss/preference_loss": -4288.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.27734375, "rewards/margins": -0.027587890625, "rewards/rejected": 0.3046875, "step": 325 }, { "epoch": 0.09404298283571326, "grad_norm": 12.277683224572902, "learning_rate": 4.697406340057637e-07, "logits/chosen": 2.78125, "logits/rejected": 2.859375, "logps/chosen": -1752.0, "logps/rejected": -1520.0, "loss": 0.6759, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.28515625, "rewards/margins": 0.0859375, "rewards/rejected": 0.19921875, "step": 326 }, { "epoch": 0.09433145824318477, "grad_norm": 12.256988184227385, "learning_rate": 4.711815561959654e-07, "logits/chosen": 2.953125, "logits/rejected": 3.0, "logps/chosen": -1464.0, "logps/rejected": -1440.0, "loss": 0.7084, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.20703125, "rewards/margins": -0.0081787109375, "rewards/rejected": 0.21484375, "step": 327 }, { "epoch": 0.09461993365065628, "grad_norm": 14.865328761434416, "learning_rate": 4.7262247838616713e-07, "logits/chosen": 2.859375, "logits/rejected": 2.90625, "logps/chosen": -1392.0, "logps/rejected": -1368.0, "loss": 0.6826, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.21875, "rewards/margins": 0.0712890625, "rewards/rejected": 0.146484375, "step": 328 }, { "epoch": 0.0949084090581278, "grad_norm": 10.962210847782162, "learning_rate": 4.7406340057636886e-07, "logits/chosen": 2.921875, "logits/rejected": 2.953125, "logps/chosen": -1872.0, "logps/rejected": -1688.0, "loss": 0.6869, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.29296875, "rewards/margins": 0.036376953125, "rewards/rejected": 0.255859375, "step": 329 }, { "epoch": 0.0951968844655993, "grad_norm": 11.93833949929247, "learning_rate": 4.755043227665706e-07, "logits/chosen": 3.03125, "logits/rejected": 2.96875, "logps/chosen": -1872.0, "logps/rejected": -1840.0, "loss": 0.6961, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.259765625, "rewards/margins": -0.0224609375, "rewards/rejected": 0.283203125, "step": 330 }, { "epoch": 0.09548535987307082, "grad_norm": 11.348894472522236, "learning_rate": 4.769452449567723e-07, "logits/chosen": 3.046875, "logits/rejected": 3.109375, "logps/chosen": -1928.0, "logps/rejected": -1512.0, "loss": 0.6837, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.328125, "rewards/margins": 0.0625, "rewards/rejected": 0.265625, "step": 331 }, { "epoch": 0.09577383528054233, "grad_norm": 12.799129514136016, "learning_rate": 4.78386167146974e-07, "logits/chosen": 2.890625, "logits/rejected": 3.0, "logps/chosen": -1592.0, "logps/rejected": -1656.0, "loss": 0.6778, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.248046875, "rewards/margins": -0.00592041015625, "rewards/rejected": 0.25390625, "step": 332 }, { "epoch": 0.09606231068801385, "grad_norm": 11.369320647741647, "learning_rate": 4.798270893371757e-07, "logits/chosen": 3.0, "logits/rejected": 2.953125, "logps/chosen": -1880.0, "logps/rejected": -1624.0, "loss": 0.6763, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.31640625, "rewards/margins": 0.043701171875, "rewards/rejected": 0.2734375, "step": 333 }, { "epoch": 0.09635078609548536, "grad_norm": 10.54250252226706, "learning_rate": 4.812680115273775e-07, "logits/chosen": 2.84375, "logits/rejected": 2.875, "logps/chosen": -1592.0, "logps/rejected": -1664.0, "loss": 0.6991, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.306640625, "rewards/margins": -0.0294189453125, "rewards/rejected": 0.3359375, "step": 334 }, { "epoch": 0.09663926150295687, "grad_norm": 11.463685060515383, "learning_rate": 4.827089337175792e-07, "logits/chosen": 2.953125, "logits/rejected": 2.96875, "logps/chosen": -1592.0, "logps/rejected": -1520.0, "loss": 0.6895, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.310546875, "rewards/margins": 0.013916015625, "rewards/rejected": 0.296875, "step": 335 }, { "epoch": 0.09692773691042839, "grad_norm": 9.96117351998309, "learning_rate": 4.841498559077809e-07, "logits/chosen": 2.890625, "logits/rejected": 2.8125, "logps/chosen": -1176.0, "logps/rejected": -1144.0, "loss": 0.6863, "loss/demonstration_loss": -2352.0, "loss/preference_loss": -2336.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2578125, "rewards/margins": 0.0269775390625, "rewards/rejected": 0.2314453125, "step": 336 }, { "epoch": 0.0972162123178999, "grad_norm": 12.18025141847497, "learning_rate": 4.855907780979827e-07, "logits/chosen": 3.015625, "logits/rejected": 2.96875, "logps/chosen": -1760.0, "logps/rejected": -1872.0, "loss": 0.7124, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.296875, "rewards/margins": -0.06884765625, "rewards/rejected": 0.3671875, "step": 337 }, { "epoch": 0.09750468772537141, "grad_norm": 12.448832794886972, "learning_rate": 4.870317002881844e-07, "logits/chosen": 2.984375, "logits/rejected": 2.96875, "logps/chosen": -1864.0, "logps/rejected": -1864.0, "loss": 0.7123, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.25, "rewards/margins": -0.0751953125, "rewards/rejected": 0.32421875, "step": 338 }, { "epoch": 0.09779316313284292, "grad_norm": 13.043572555308508, "learning_rate": 4.884726224783862e-07, "logits/chosen": 3.0, "logits/rejected": 2.984375, "logps/chosen": -1880.0, "logps/rejected": -1960.0, "loss": 0.675, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.283203125, "rewards/margins": -0.0224609375, "rewards/rejected": 0.3046875, "step": 339 }, { "epoch": 0.09808163854031444, "grad_norm": 11.082715012938753, "learning_rate": 4.899135446685878e-07, "logits/chosen": 3.125, "logits/rejected": 3.03125, "logps/chosen": -1712.0, "logps/rejected": -1840.0, "loss": 0.6736, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.322265625, "rewards/margins": 0.02490234375, "rewards/rejected": 0.296875, "step": 340 }, { "epoch": 0.09837011394778596, "grad_norm": 13.193880086049354, "learning_rate": 4.913544668587896e-07, "logits/chosen": 3.125, "logits/rejected": 3.03125, "logps/chosen": -1536.0, "logps/rejected": -1752.0, "loss": 0.719, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.291015625, "rewards/margins": -0.064453125, "rewards/rejected": 0.35546875, "step": 341 }, { "epoch": 0.09865858935525747, "grad_norm": 10.814716863646092, "learning_rate": 4.927953890489913e-07, "logits/chosen": 3.0, "logits/rejected": 2.984375, "logps/chosen": -1760.0, "logps/rejected": -1576.0, "loss": 0.6821, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.322265625, "rewards/margins": 0.0181884765625, "rewards/rejected": 0.302734375, "step": 342 }, { "epoch": 0.09894706476272898, "grad_norm": 11.309814746059253, "learning_rate": 4.942363112391931e-07, "logits/chosen": 2.890625, "logits/rejected": 2.90625, "logps/chosen": -1680.0, "logps/rejected": -1504.0, "loss": 0.7376, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.22265625, "rewards/margins": -0.10400390625, "rewards/rejected": 0.326171875, "step": 343 }, { "epoch": 0.09923554017020049, "grad_norm": 12.896102464020627, "learning_rate": 4.956772334293947e-07, "logits/chosen": 3.0625, "logits/rejected": 2.921875, "logps/chosen": -1584.0, "logps/rejected": -1896.0, "loss": 0.7078, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.296875, "rewards/margins": -0.041259765625, "rewards/rejected": 0.337890625, "step": 344 }, { "epoch": 0.099524015577672, "grad_norm": 11.65631367181338, "learning_rate": 4.971181556195965e-07, "logits/chosen": 2.921875, "logits/rejected": 2.890625, "logps/chosen": -1640.0, "logps/rejected": -1784.0, "loss": 0.6967, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.298828125, "rewards/margins": -0.059326171875, "rewards/rejected": 0.357421875, "step": 345 }, { "epoch": 0.09981249098514351, "grad_norm": 13.187069779527267, "learning_rate": 4.985590778097982e-07, "logits/chosen": 2.90625, "logits/rejected": 2.90625, "logps/chosen": -2096.0, "logps/rejected": -1696.0, "loss": 0.6722, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.384765625, "rewards/margins": 0.06787109375, "rewards/rejected": 0.31640625, "step": 346 }, { "epoch": 0.10010096639261502, "grad_norm": 13.327265365798548, "learning_rate": 5e-07, "logits/chosen": 3.0, "logits/rejected": 3.0, "logps/chosen": -1920.0, "logps/rejected": -1656.0, "loss": 0.7143, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.337890625, "rewards/margins": -0.00494384765625, "rewards/rejected": 0.341796875, "step": 347 }, { "epoch": 0.10038944180008655, "grad_norm": 10.045276255832176, "learning_rate": 4.999998731825629e-07, "logits/chosen": 2.921875, "logits/rejected": 2.9375, "logps/chosen": -1416.0, "logps/rejected": -1264.0, "loss": 0.6618, "loss/demonstration_loss": -2704.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.251953125, "rewards/margins": 0.03955078125, "rewards/rejected": 0.2119140625, "step": 348 }, { "epoch": 0.10067791720755806, "grad_norm": 14.434478135022456, "learning_rate": 4.999994927303802e-07, "logits/chosen": 2.984375, "logits/rejected": 2.921875, "logps/chosen": -1760.0, "logps/rejected": -1808.0, "loss": 0.6913, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.318359375, "rewards/margins": 0.044921875, "rewards/rejected": 0.2734375, "step": 349 }, { "epoch": 0.10096639261502957, "grad_norm": 13.967602744275279, "learning_rate": 4.99998858643838e-07, "logits/chosen": 2.96875, "logits/rejected": 2.9375, "logps/chosen": -1864.0, "logps/rejected": -1696.0, "loss": 0.6844, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.283203125, "rewards/margins": 0.03515625, "rewards/rejected": 0.2490234375, "step": 350 }, { "epoch": 0.10125486802250108, "grad_norm": 12.506510735692627, "learning_rate": 4.999979709235794e-07, "logits/chosen": 3.015625, "logits/rejected": 2.90625, "logps/chosen": -1552.0, "logps/rejected": -1440.0, "loss": 0.6941, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3359375, "rewards/margins": 0.04296875, "rewards/rejected": 0.29296875, "step": 351 }, { "epoch": 0.1015433434299726, "grad_norm": 13.892066219848537, "learning_rate": 4.999968295705053e-07, "logits/chosen": 2.9375, "logits/rejected": 2.921875, "logps/chosen": -1712.0, "logps/rejected": -1688.0, "loss": 0.7126, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.341796875, "rewards/margins": 0.0400390625, "rewards/rejected": 0.302734375, "step": 352 }, { "epoch": 0.1018318188374441, "grad_norm": 13.165475242303698, "learning_rate": 4.999954345857734e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -2128.0, "logps/rejected": -1912.0, "loss": 0.6938, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.341796875, "rewards/margins": 0.02880859375, "rewards/rejected": 0.3125, "step": 353 }, { "epoch": 0.10212029424491562, "grad_norm": 11.760053768935132, "learning_rate": 4.999937859707991e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1752.0, "logps/rejected": -1760.0, "loss": 0.6834, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2734375, "rewards/margins": -4.57763671875e-05, "rewards/rejected": 0.2734375, "step": 354 }, { "epoch": 0.10240876965238713, "grad_norm": 12.114893389034815, "learning_rate": 4.999918837272549e-07, "logits/chosen": 2.859375, "logits/rejected": 2.90625, "logps/chosen": -1912.0, "logps/rejected": -1896.0, "loss": 0.7156, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.330078125, "rewards/margins": -0.0537109375, "rewards/rejected": 0.384765625, "step": 355 }, { "epoch": 0.10269724505985865, "grad_norm": 11.264317900552692, "learning_rate": 4.999897278570708e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0625, "logps/chosen": -1760.0, "logps/rejected": -1800.0, "loss": 0.6838, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.314453125, "rewards/margins": 0.0274658203125, "rewards/rejected": 0.287109375, "step": 356 }, { "epoch": 0.10298572046733016, "grad_norm": 11.892960347103365, "learning_rate": 4.99987318362434e-07, "logits/chosen": 2.953125, "logits/rejected": 3.0, "logps/chosen": -1648.0, "logps/rejected": -1704.0, "loss": 0.6799, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.318359375, "rewards/margins": 0.025634765625, "rewards/rejected": 0.29296875, "step": 357 }, { "epoch": 0.10327419587480167, "grad_norm": 11.593657010577436, "learning_rate": 4.99984655245789e-07, "logits/chosen": 3.046875, "logits/rejected": 2.96875, "logps/chosen": -1416.0, "logps/rejected": -1560.0, "loss": 0.6794, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.26171875, "rewards/margins": -0.00439453125, "rewards/rejected": 0.265625, "step": 358 }, { "epoch": 0.10356267128227319, "grad_norm": 12.17415164738371, "learning_rate": 4.999817385098376e-07, "logits/chosen": 3.03125, "logits/rejected": 2.9375, "logps/chosen": -1688.0, "logps/rejected": -1736.0, "loss": 0.6893, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.353515625, "rewards/margins": -0.01513671875, "rewards/rejected": 0.3671875, "step": 359 }, { "epoch": 0.1038511466897447, "grad_norm": 11.024073265841533, "learning_rate": 4.99978568157539e-07, "logits/chosen": 3.0625, "logits/rejected": 3.046875, "logps/chosen": -1800.0, "logps/rejected": -1664.0, "loss": 0.7, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.27734375, "rewards/margins": -0.0263671875, "rewards/rejected": 0.3046875, "step": 360 }, { "epoch": 0.10413962209721621, "grad_norm": 11.535957195902807, "learning_rate": 4.999751441921096e-07, "logits/chosen": 2.96875, "logits/rejected": 2.890625, "logps/chosen": -1688.0, "logps/rejected": -1648.0, "loss": 0.6801, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.267578125, "rewards/margins": 0.06640625, "rewards/rejected": 0.201171875, "step": 361 }, { "epoch": 0.10442809750468772, "grad_norm": 11.570431364911059, "learning_rate": 4.999714666170232e-07, "logits/chosen": 2.8125, "logits/rejected": 2.875, "logps/chosen": -1344.0, "logps/rejected": -1264.0, "loss": 0.6957, "loss/demonstration_loss": -2640.0, "loss/preference_loss": -2640.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.25390625, "rewards/margins": 0.0093994140625, "rewards/rejected": 0.2451171875, "step": 362 }, { "epoch": 0.10471657291215924, "grad_norm": 11.775547233530617, "learning_rate": 4.999675354360108e-07, "logits/chosen": 3.078125, "logits/rejected": 3.109375, "logps/chosen": -1784.0, "logps/rejected": -1680.0, "loss": 0.6796, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.353515625, "rewards/margins": 0.0712890625, "rewards/rejected": 0.28125, "step": 363 }, { "epoch": 0.10500504831963076, "grad_norm": 13.033084161210898, "learning_rate": 4.999633506530608e-07, "logits/chosen": 2.953125, "logits/rejected": 2.96875, "logps/chosen": -1680.0, "logps/rejected": -1688.0, "loss": 0.6883, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.271484375, "rewards/margins": -0.07470703125, "rewards/rejected": 0.345703125, "step": 364 }, { "epoch": 0.10529352372710227, "grad_norm": 12.278058508261, "learning_rate": 4.999589122724187e-07, "logits/chosen": 2.921875, "logits/rejected": 2.953125, "logps/chosen": -1472.0, "logps/rejected": -1448.0, "loss": 0.6927, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2421875, "rewards/margins": -0.03369140625, "rewards/rejected": 0.27734375, "step": 365 }, { "epoch": 0.10558199913457378, "grad_norm": 11.086831133173982, "learning_rate": 4.999542202985876e-07, "logits/chosen": 2.953125, "logits/rejected": 2.859375, "logps/chosen": -1584.0, "logps/rejected": -1488.0, "loss": 0.7029, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.279296875, "rewards/margins": -0.00811767578125, "rewards/rejected": 0.287109375, "step": 366 }, { "epoch": 0.10587047454204529, "grad_norm": 11.293695231653452, "learning_rate": 4.999492747363275e-07, "logits/chosen": 3.078125, "logits/rejected": 2.984375, "logps/chosen": -1736.0, "logps/rejected": -1816.0, "loss": 0.7023, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.283203125, "rewards/margins": -0.048828125, "rewards/rejected": 0.33203125, "step": 367 }, { "epoch": 0.1061589499495168, "grad_norm": 11.796173413400084, "learning_rate": 4.999440755906561e-07, "logits/chosen": 2.953125, "logits/rejected": 3.046875, "logps/chosen": -1552.0, "logps/rejected": -1376.0, "loss": 0.6865, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.267578125, "rewards/margins": 0.0198974609375, "rewards/rejected": 0.248046875, "step": 368 }, { "epoch": 0.10644742535698831, "grad_norm": 10.951040422904933, "learning_rate": 4.999386228668479e-07, "logits/chosen": 2.96875, "logits/rejected": 2.953125, "logps/chosen": -1408.0, "logps/rejected": -1280.0, "loss": 0.6767, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2720.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.314453125, "rewards/margins": 0.0162353515625, "rewards/rejected": 0.296875, "step": 369 }, { "epoch": 0.10673590076445982, "grad_norm": 12.84640075683829, "learning_rate": 4.999329165704349e-07, "logits/chosen": 2.9375, "logits/rejected": 2.90625, "logps/chosen": -1448.0, "logps/rejected": -1520.0, "loss": 0.7158, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2412109375, "rewards/margins": -0.034912109375, "rewards/rejected": 0.275390625, "step": 370 }, { "epoch": 0.10702437617193135, "grad_norm": 12.77330159672173, "learning_rate": 4.999269567072067e-07, "logits/chosen": 2.9375, "logits/rejected": 2.921875, "logps/chosen": -1824.0, "logps/rejected": -1792.0, "loss": 0.6525, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.376953125, "rewards/margins": 0.0361328125, "rewards/rejected": 0.33984375, "step": 371 }, { "epoch": 0.10731285157940286, "grad_norm": 11.783193519252135, "learning_rate": 4.999207432832094e-07, "logits/chosen": 2.984375, "logits/rejected": 3.0, "logps/chosen": -1488.0, "logps/rejected": -1544.0, "loss": 0.6927, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.28125, "rewards/margins": 0.00994873046875, "rewards/rejected": 0.271484375, "step": 372 }, { "epoch": 0.10760132698687437, "grad_norm": 11.110919628422778, "learning_rate": 4.99914276304747e-07, "logits/chosen": 3.0, "logits/rejected": 2.84375, "logps/chosen": -1544.0, "logps/rejected": -1640.0, "loss": 0.7225, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.255859375, "rewards/margins": -0.03369140625, "rewards/rejected": 0.291015625, "step": 373 }, { "epoch": 0.10788980239434588, "grad_norm": 13.277910096141774, "learning_rate": 4.999075557783804e-07, "logits/chosen": 2.984375, "logits/rejected": 2.96875, "logps/chosen": -1584.0, "logps/rejected": -1520.0, "loss": 0.6896, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.27734375, "rewards/margins": -0.03564453125, "rewards/rejected": 0.314453125, "step": 374 }, { "epoch": 0.1081782778018174, "grad_norm": 12.917317085902917, "learning_rate": 4.99900581710928e-07, "logits/chosen": 2.953125, "logits/rejected": 2.96875, "logps/chosen": -2464.0, "logps/rejected": -2256.0, "loss": 0.694, "loss/demonstration_loss": -4768.0, "loss/preference_loss": -4768.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.498046875, "rewards/margins": -0.01239013671875, "rewards/rejected": 0.51171875, "step": 375 }, { "epoch": 0.1084667532092889, "grad_norm": 12.461358320488536, "learning_rate": 4.99893354109465e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1568.0, "logps/rejected": -1616.0, "loss": 0.6903, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.34375, "rewards/margins": 0.023193359375, "rewards/rejected": 0.3203125, "step": 376 }, { "epoch": 0.10875522861676042, "grad_norm": 11.716811830286172, "learning_rate": 4.998858729813244e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1552.0, "logps/rejected": -1440.0, "loss": 0.6836, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3046875, "rewards/margins": 0.0274658203125, "rewards/rejected": 0.27734375, "step": 377 }, { "epoch": 0.10904370402423194, "grad_norm": 17.707198057038767, "learning_rate": 4.998781383340959e-07, "logits/chosen": 2.9375, "logits/rejected": 3.03125, "logps/chosen": -1776.0, "logps/rejected": -1536.0, "loss": 0.7072, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.32421875, "rewards/margins": 0.018310546875, "rewards/rejected": 0.3046875, "step": 378 }, { "epoch": 0.10933217943170345, "grad_norm": 11.023741104642982, "learning_rate": 4.998701501756266e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1616.0, "logps/rejected": -1520.0, "loss": 0.6949, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.310546875, "rewards/margins": 0.0172119140625, "rewards/rejected": 0.294921875, "step": 379 }, { "epoch": 0.10962065483917496, "grad_norm": 12.485350390493423, "learning_rate": 4.998619085140208e-07, "logits/chosen": 3.0625, "logits/rejected": 3.140625, "logps/chosen": -1664.0, "logps/rejected": -1536.0, "loss": 0.6795, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.34375, "rewards/margins": 0.0498046875, "rewards/rejected": 0.294921875, "step": 380 }, { "epoch": 0.10990913024664647, "grad_norm": 11.193723079995971, "learning_rate": 4.998534133576402e-07, "logits/chosen": 2.828125, "logits/rejected": 2.796875, "logps/chosen": -1840.0, "logps/rejected": -1776.0, "loss": 0.6899, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.33203125, "rewards/margins": 0.0274658203125, "rewards/rejected": 0.3046875, "step": 381 }, { "epoch": 0.11019760565411799, "grad_norm": 11.72892734729227, "learning_rate": 4.998446647151032e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0, "logps/chosen": -1256.0, "logps/rejected": -1568.0, "loss": 0.7074, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.1962890625, "rewards/margins": -0.047607421875, "rewards/rejected": 0.244140625, "step": 382 }, { "epoch": 0.1104860810615895, "grad_norm": 15.162140841900214, "learning_rate": 4.998356625952859e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0625, "logps/chosen": -1552.0, "logps/rejected": -1432.0, "loss": 0.6693, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.271484375, "rewards/margins": 0.021240234375, "rewards/rejected": 0.25, "step": 383 }, { "epoch": 0.11077455646906101, "grad_norm": 14.227548130386323, "learning_rate": 4.99826407007321e-07, "logits/chosen": 2.875, "logits/rejected": 2.921875, "logps/chosen": -1280.0, "logps/rejected": -1480.0, "loss": 0.7054, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2138671875, "rewards/margins": -0.0576171875, "rewards/rejected": 0.271484375, "step": 384 }, { "epoch": 0.11106303187653252, "grad_norm": 11.090272476920955, "learning_rate": 4.998168979605988e-07, "logits/chosen": 2.8125, "logits/rejected": 2.9375, "logps/chosen": -1704.0, "logps/rejected": -1488.0, "loss": 0.6775, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.26171875, "rewards/margins": 0.037353515625, "rewards/rejected": 0.2236328125, "step": 385 }, { "epoch": 0.11135150728400404, "grad_norm": 13.807018243839257, "learning_rate": 4.998071354647668e-07, "logits/chosen": 2.6875, "logits/rejected": 2.765625, "logps/chosen": -1824.0, "logps/rejected": -1720.0, "loss": 0.713, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3046875, "rewards/margins": 0.0013427734375, "rewards/rejected": 0.302734375, "step": 386 }, { "epoch": 0.11163998269147556, "grad_norm": 13.193719820615023, "learning_rate": 4.997971195297292e-07, "logits/chosen": 3.171875, "logits/rejected": 3.125, "logps/chosen": -1784.0, "logps/rejected": -1424.0, "loss": 0.6612, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.33203125, "rewards/margins": 0.130859375, "rewards/rejected": 0.201171875, "step": 387 }, { "epoch": 0.11192845809894707, "grad_norm": 10.788098958449911, "learning_rate": 4.997868501656476e-07, "logits/chosen": 3.125, "logits/rejected": 3.0, "logps/chosen": -1472.0, "logps/rejected": -1552.0, "loss": 0.6841, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.2392578125, "rewards/margins": -0.045166015625, "rewards/rejected": 0.28515625, "step": 388 }, { "epoch": 0.11221693350641858, "grad_norm": 11.05793384682053, "learning_rate": 4.997763273829407e-07, "logits/chosen": 3.21875, "logits/rejected": 3.265625, "logps/chosen": -1336.0, "logps/rejected": -1232.0, "loss": 0.6741, "loss/demonstration_loss": -2592.0, "loss/preference_loss": -2592.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.23046875, "rewards/margins": 0.017578125, "rewards/rejected": 0.212890625, "step": 389 }, { "epoch": 0.11250540891389009, "grad_norm": 10.788131766783017, "learning_rate": 4.997655511922843e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -2032.0, "logps/rejected": -1968.0, "loss": 0.6812, "loss/demonstration_loss": -4048.0, "loss/preference_loss": -4032.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.453125, "rewards/margins": 0.027587890625, "rewards/rejected": 0.42578125, "step": 390 }, { "epoch": 0.1127938843213616, "grad_norm": 12.261534202909708, "learning_rate": 4.997545216046112e-07, "logits/chosen": 2.984375, "logits/rejected": 3.0, "logps/chosen": -1648.0, "logps/rejected": -1768.0, "loss": 0.6989, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.310546875, "rewards/margins": -0.02734375, "rewards/rejected": 0.337890625, "step": 391 }, { "epoch": 0.11308235972883311, "grad_norm": 10.75043528463301, "learning_rate": 4.997432386311114e-07, "logits/chosen": 3.0625, "logits/rejected": 3.03125, "logps/chosen": -1392.0, "logps/rejected": -1496.0, "loss": 0.7007, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.298828125, "rewards/margins": 0.05126953125, "rewards/rejected": 0.248046875, "step": 392 }, { "epoch": 0.11337083513630462, "grad_norm": 13.459322288398441, "learning_rate": 4.99731702283232e-07, "logits/chosen": 3.09375, "logits/rejected": 3.078125, "logps/chosen": -1816.0, "logps/rejected": -1856.0, "loss": 0.6843, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.388671875, "rewards/margins": -0.001251220703125, "rewards/rejected": 0.388671875, "step": 393 }, { "epoch": 0.11365931054377615, "grad_norm": 11.69505172687698, "learning_rate": 4.997199125726769e-07, "logits/chosen": 2.984375, "logits/rejected": 2.953125, "logps/chosen": -1824.0, "logps/rejected": -1728.0, "loss": 0.7077, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2197265625, "rewards/margins": -0.0224609375, "rewards/rejected": 0.2421875, "step": 394 }, { "epoch": 0.11394778595124766, "grad_norm": 12.290693170480603, "learning_rate": 4.997078695114075e-07, "logits/chosen": 3.0625, "logits/rejected": 2.96875, "logps/chosen": -1376.0, "logps/rejected": -1424.0, "loss": 0.7065, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2832.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.3125, "rewards/margins": -0.028076171875, "rewards/rejected": 0.341796875, "step": 395 }, { "epoch": 0.11423626135871917, "grad_norm": 11.138301783679744, "learning_rate": 4.996955731116417e-07, "logits/chosen": 3.015625, "logits/rejected": 3.03125, "logps/chosen": -2144.0, "logps/rejected": -2096.0, "loss": 0.7006, "loss/demonstration_loss": -4256.0, "loss/preference_loss": -4288.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.34765625, "rewards/margins": -0.03955078125, "rewards/rejected": 0.38671875, "step": 396 }, { "epoch": 0.11452473676619068, "grad_norm": 11.69278126430225, "learning_rate": 4.996830233858547e-07, "logits/chosen": 2.890625, "logits/rejected": 2.921875, "logps/chosen": -1832.0, "logps/rejected": -1936.0, "loss": 0.6902, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.27734375, "rewards/margins": 0.044189453125, "rewards/rejected": 0.232421875, "step": 397 }, { "epoch": 0.1148132121736622, "grad_norm": 11.737666952590862, "learning_rate": 4.996702203467789e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1168.0, "logps/rejected": -1336.0, "loss": 0.6924, "loss/demonstration_loss": -2528.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.26171875, "rewards/margins": -0.005523681640625, "rewards/rejected": 0.267578125, "step": 398 }, { "epoch": 0.1151016875811337, "grad_norm": 11.380407152789367, "learning_rate": 4.996571640074033e-07, "logits/chosen": 2.984375, "logits/rejected": 2.953125, "logps/chosen": -1824.0, "logps/rejected": -1512.0, "loss": 0.6678, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41015625, "rewards/margins": 0.10791015625, "rewards/rejected": 0.30078125, "step": 399 }, { "epoch": 0.11539016298860522, "grad_norm": 12.966311830687642, "learning_rate": 4.996438543809742e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1760.0, "logps/rejected": -1648.0, "loss": 0.6995, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.337890625, "rewards/margins": 0.036376953125, "rewards/rejected": 0.30078125, "step": 400 }, { "epoch": 0.11567863839607674, "grad_norm": 11.138684980739974, "learning_rate": 4.996302914809946e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1416.0, "logps/rejected": -1408.0, "loss": 0.6884, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3125, "rewards/margins": -0.01458740234375, "rewards/rejected": 0.326171875, "step": 401 }, { "epoch": 0.11596711380354825, "grad_norm": 11.039411503538636, "learning_rate": 4.996164753212247e-07, "logits/chosen": 2.890625, "logits/rejected": 2.859375, "logps/chosen": -1736.0, "logps/rejected": -1784.0, "loss": 0.678, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.392578125, "rewards/margins": 0.027587890625, "rewards/rejected": 0.365234375, "step": 402 }, { "epoch": 0.11625558921101976, "grad_norm": 12.650995680888709, "learning_rate": 4.996024059156815e-07, "logits/chosen": 2.84375, "logits/rejected": 2.90625, "logps/chosen": -1512.0, "logps/rejected": -1840.0, "loss": 0.6964, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.359375, "rewards/margins": 0.033203125, "rewards/rejected": 0.32421875, "step": 403 }, { "epoch": 0.11654406461849127, "grad_norm": 10.769179424923426, "learning_rate": 4.99588083278639e-07, "logits/chosen": 3.140625, "logits/rejected": 3.09375, "logps/chosen": -2032.0, "logps/rejected": -1736.0, "loss": 0.6774, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.376953125, "rewards/margins": 0.07763671875, "rewards/rejected": 0.30078125, "step": 404 }, { "epoch": 0.11683254002596279, "grad_norm": 11.517230629593275, "learning_rate": 4.99573507424628e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1600.0, "logps/rejected": -1672.0, "loss": 0.7019, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.294921875, "rewards/margins": -0.0908203125, "rewards/rejected": 0.384765625, "step": 405 }, { "epoch": 0.1171210154334343, "grad_norm": 11.802766660909494, "learning_rate": 4.995586783684363e-07, "logits/chosen": 3.0, "logits/rejected": 2.984375, "logps/chosen": -1496.0, "logps/rejected": -1496.0, "loss": 0.7174, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.296875, "rewards/margins": 0.002410888671875, "rewards/rejected": 0.294921875, "step": 406 }, { "epoch": 0.11740949084090581, "grad_norm": 11.169480853542668, "learning_rate": 4.995435961251088e-07, "logits/chosen": 2.9375, "logits/rejected": 2.984375, "logps/chosen": -1824.0, "logps/rejected": -1616.0, "loss": 0.6655, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.373046875, "rewards/margins": -0.0107421875, "rewards/rejected": 0.3828125, "step": 407 }, { "epoch": 0.11769796624837732, "grad_norm": 10.27843734823317, "learning_rate": 4.995282607099467e-07, "logits/chosen": 3.125, "logits/rejected": 3.046875, "logps/chosen": -1576.0, "logps/rejected": -1640.0, "loss": 0.6769, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.41796875, "rewards/margins": 0.068359375, "rewards/rejected": 0.349609375, "step": 408 }, { "epoch": 0.11798644165584884, "grad_norm": 12.346267179870837, "learning_rate": 4.995126721385085e-07, "logits/chosen": 2.75, "logits/rejected": 2.796875, "logps/chosen": -1432.0, "logps/rejected": -1408.0, "loss": 0.7251, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.275390625, "rewards/margins": -0.09912109375, "rewards/rejected": 0.375, "step": 409 }, { "epoch": 0.11827491706332036, "grad_norm": 12.674708500126071, "learning_rate": 4.994968304266095e-07, "logits/chosen": 3.03125, "logits/rejected": 3.046875, "logps/chosen": -1448.0, "logps/rejected": -1560.0, "loss": 0.7049, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.24609375, "rewards/margins": -0.0125732421875, "rewards/rejected": 0.259765625, "step": 410 }, { "epoch": 0.11856339247079187, "grad_norm": 12.675925233095406, "learning_rate": 4.994807355903217e-07, "logits/chosen": 2.984375, "logits/rejected": 2.9375, "logps/chosen": -1624.0, "logps/rejected": -1496.0, "loss": 0.6935, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.298828125, "rewards/margins": 0.029296875, "rewards/rejected": 0.26953125, "step": 411 }, { "epoch": 0.11885186787826338, "grad_norm": 11.25341318054663, "learning_rate": 4.994643876459737e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1384.0, "logps/rejected": -1288.0, "loss": 0.6818, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.42578125, "rewards/margins": 0.10302734375, "rewards/rejected": 0.322265625, "step": 412 }, { "epoch": 0.11914034328573489, "grad_norm": 11.017715728761027, "learning_rate": 4.994477866101517e-07, "logits/chosen": 3.046875, "logits/rejected": 2.96875, "logps/chosen": -1456.0, "logps/rejected": -1384.0, "loss": 0.6552, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2864.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.314453125, "rewards/margins": 0.06201171875, "rewards/rejected": 0.251953125, "step": 413 }, { "epoch": 0.1194288186932064, "grad_norm": 11.84260987175539, "learning_rate": 4.994309324996976e-07, "logits/chosen": 3.046875, "logits/rejected": 3.140625, "logps/chosen": -1584.0, "logps/rejected": -1448.0, "loss": 0.6791, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.30859375, "rewards/margins": 0.04345703125, "rewards/rejected": 0.265625, "step": 414 }, { "epoch": 0.11971729410067791, "grad_norm": 12.32663216952759, "learning_rate": 4.994138253317107e-07, "logits/chosen": 2.75, "logits/rejected": 2.84375, "logps/chosen": -1784.0, "logps/rejected": -1728.0, "loss": 0.6805, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.369140625, "rewards/margins": 0.046142578125, "rewards/rejected": 0.322265625, "step": 415 }, { "epoch": 0.12000576950814942, "grad_norm": 14.08264896084647, "learning_rate": 4.99396465123547e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1608.0, "logps/rejected": -1464.0, "loss": 0.7081, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.32421875, "rewards/margins": -0.0072021484375, "rewards/rejected": 0.33203125, "step": 416 }, { "epoch": 0.12029424491562095, "grad_norm": 11.865145517516, "learning_rate": 4.99378851892819e-07, "logits/chosen": 3.125, "logits/rejected": 3.046875, "logps/chosen": -1728.0, "logps/rejected": -1600.0, "loss": 0.7057, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3828125, "rewards/margins": 0.040771484375, "rewards/rejected": 0.341796875, "step": 417 }, { "epoch": 0.12058272032309246, "grad_norm": 10.569651738583094, "learning_rate": 4.99360985657396e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1904.0, "logps/rejected": -1880.0, "loss": 0.6597, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41015625, "rewards/margins": 0.0908203125, "rewards/rejected": 0.318359375, "step": 418 }, { "epoch": 0.12087119573056397, "grad_norm": 10.749124921358813, "learning_rate": 4.993428664354041e-07, "logits/chosen": 3.078125, "logits/rejected": 3.109375, "logps/chosen": -1864.0, "logps/rejected": -1784.0, "loss": 0.7182, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.306640625, "rewards/margins": -0.0218505859375, "rewards/rejected": 0.328125, "step": 419 }, { "epoch": 0.12115967113803548, "grad_norm": 12.045921528662834, "learning_rate": 4.99324494245226e-07, "logits/chosen": 3.0, "logits/rejected": 2.921875, "logps/chosen": -1992.0, "logps/rejected": -2048.0, "loss": 0.662, "loss/demonstration_loss": -4096.0, "loss/preference_loss": -4080.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.51953125, "rewards/margins": 0.1103515625, "rewards/rejected": 0.41015625, "step": 420 }, { "epoch": 0.121448146545507, "grad_norm": 10.904436298811431, "learning_rate": 4.99305869105501e-07, "logits/chosen": 2.875, "logits/rejected": 2.8125, "logps/chosen": -1368.0, "logps/rejected": -1432.0, "loss": 0.6655, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2832.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.318359375, "rewards/margins": 0.001861572265625, "rewards/rejected": 0.31640625, "step": 421 }, { "epoch": 0.1217366219529785, "grad_norm": 12.071667651710214, "learning_rate": 4.992869910351249e-07, "logits/chosen": 3.125, "logits/rejected": 3.09375, "logps/chosen": -1288.0, "logps/rejected": -1400.0, "loss": 0.7188, "loss/demonstration_loss": -2704.0, "loss/preference_loss": -2720.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.2333984375, "rewards/margins": -0.0791015625, "rewards/rejected": 0.3125, "step": 422 }, { "epoch": 0.12202509736045002, "grad_norm": 11.780346960823579, "learning_rate": 4.992678600532503e-07, "logits/chosen": 3.015625, "logits/rejected": 3.078125, "logps/chosen": -1608.0, "logps/rejected": -1520.0, "loss": 0.7131, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.314453125, "rewards/margins": -0.06005859375, "rewards/rejected": 0.375, "step": 423 }, { "epoch": 0.12231357276792154, "grad_norm": 12.224839945194843, "learning_rate": 4.992484761792865e-07, "logits/chosen": 3.09375, "logits/rejected": 3.03125, "logps/chosen": -1552.0, "logps/rejected": -1736.0, "loss": 0.6599, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41015625, "rewards/margins": 0.0654296875, "rewards/rejected": 0.345703125, "step": 424 }, { "epoch": 0.12260204817539305, "grad_norm": 10.88755251542413, "learning_rate": 4.99228839432899e-07, "logits/chosen": 3.125, "logits/rejected": 2.984375, "logps/chosen": -1608.0, "logps/rejected": -1760.0, "loss": 0.6818, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.384765625, "rewards/margins": 0.04736328125, "rewards/rejected": 0.337890625, "step": 425 }, { "epoch": 0.12289052358286456, "grad_norm": 10.867749018766673, "learning_rate": 4.992089498340101e-07, "logits/chosen": 3.109375, "logits/rejected": 3.09375, "logps/chosen": -1352.0, "logps/rejected": -1656.0, "loss": 0.7012, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2890625, "rewards/margins": -0.022705078125, "rewards/rejected": 0.310546875, "step": 426 }, { "epoch": 0.12317899899033607, "grad_norm": 10.196699378594237, "learning_rate": 4.991888074027985e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1856.0, "logps/rejected": -1808.0, "loss": 0.6611, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.359375, "rewards/margins": 0.0771484375, "rewards/rejected": 0.28125, "step": 427 }, { "epoch": 0.12346747439780759, "grad_norm": 10.87709180925879, "learning_rate": 4.991684121596998e-07, "logits/chosen": 3.0, "logits/rejected": 3.0, "logps/chosen": -1640.0, "logps/rejected": -1728.0, "loss": 0.6871, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.322265625, "rewards/margins": 0.05126953125, "rewards/rejected": 0.271484375, "step": 428 }, { "epoch": 0.1237559498052791, "grad_norm": 11.19434474732812, "learning_rate": 4.991477641254055e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0, "logps/chosen": -1456.0, "logps/rejected": -1528.0, "loss": 0.6641, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.28125, "rewards/margins": 0.06982421875, "rewards/rejected": 0.2109375, "step": 429 }, { "epoch": 0.12404442521275061, "grad_norm": 11.238345351868436, "learning_rate": 4.99126863320864e-07, "logits/chosen": 3.109375, "logits/rejected": 3.09375, "logps/chosen": -1984.0, "logps/rejected": -1984.0, "loss": 0.719, "loss/demonstration_loss": -4000.0, "loss/preference_loss": -4016.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.333984375, "rewards/margins": -0.0556640625, "rewards/rejected": 0.390625, "step": 430 }, { "epoch": 0.12433290062022212, "grad_norm": 11.458387677602374, "learning_rate": 4.991057097672798e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0625, "logps/chosen": -1368.0, "logps/rejected": -1376.0, "loss": 0.7018, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.263671875, "rewards/margins": -0.029296875, "rewards/rejected": 0.29296875, "step": 431 }, { "epoch": 0.12462137602769364, "grad_norm": 11.817716504736982, "learning_rate": 4.990843034861143e-07, "logits/chosen": 3.03125, "logits/rejected": 3.046875, "logps/chosen": -1992.0, "logps/rejected": -1800.0, "loss": 0.66, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.37890625, "rewards/margins": 0.06640625, "rewards/rejected": 0.3125, "step": 432 }, { "epoch": 0.12490985143516516, "grad_norm": 11.657289397930844, "learning_rate": 4.990626444990848e-07, "logits/chosen": 2.9375, "logits/rejected": 2.90625, "logps/chosen": -1520.0, "logps/rejected": -1496.0, "loss": 0.6896, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.29296875, "rewards/margins": 0.0223388671875, "rewards/rejected": 0.26953125, "step": 433 }, { "epoch": 0.12519832684263665, "grad_norm": 12.308708717277893, "learning_rate": 4.990407328281651e-07, "logits/chosen": 2.921875, "logits/rejected": 2.96875, "logps/chosen": -1808.0, "logps/rejected": -1616.0, "loss": 0.6609, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41015625, "rewards/margins": 0.1103515625, "rewards/rejected": 0.298828125, "step": 434 }, { "epoch": 0.12548680225010816, "grad_norm": 12.31859895211819, "learning_rate": 4.990185684955858e-07, "logits/chosen": 2.875, "logits/rejected": 2.953125, "logps/chosen": -1672.0, "logps/rejected": -1608.0, "loss": 0.7099, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.287109375, "rewards/margins": -0.000518798828125, "rewards/rejected": 0.287109375, "step": 435 }, { "epoch": 0.1257752776575797, "grad_norm": 10.812363768756235, "learning_rate": 4.989961515238333e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -2336.0, "logps/rejected": -2208.0, "loss": 0.6683, "loss/demonstration_loss": -4576.0, "loss/preference_loss": -4576.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.46484375, "rewards/margins": 0.030029296875, "rewards/rejected": 0.435546875, "step": 436 }, { "epoch": 0.12606375306505122, "grad_norm": 12.895646729122682, "learning_rate": 4.989734819356503e-07, "logits/chosen": 2.9375, "logits/rejected": 2.921875, "logps/chosen": -1864.0, "logps/rejected": -1712.0, "loss": 0.6873, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.39453125, "rewards/margins": 0.00567626953125, "rewards/rejected": 0.388671875, "step": 437 }, { "epoch": 0.12635222847252273, "grad_norm": 12.384810735149756, "learning_rate": 4.989505597540365e-07, "logits/chosen": 2.921875, "logits/rejected": 2.90625, "logps/chosen": -1736.0, "logps/rejected": -1728.0, "loss": 0.6877, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.318359375, "rewards/margins": -0.02099609375, "rewards/rejected": 0.337890625, "step": 438 }, { "epoch": 0.12664070387999424, "grad_norm": 11.04595161539796, "learning_rate": 4.989273850022468e-07, "logits/chosen": 3.0625, "logits/rejected": 3.09375, "logps/chosen": -1280.0, "logps/rejected": -1392.0, "loss": 0.6914, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.24609375, "rewards/margins": -0.018798828125, "rewards/rejected": 0.265625, "step": 439 }, { "epoch": 0.12692917928746575, "grad_norm": 12.535500951366224, "learning_rate": 4.989039577037933e-07, "logits/chosen": 3.0, "logits/rejected": 2.96875, "logps/chosen": -1952.0, "logps/rejected": -1952.0, "loss": 0.6918, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.38671875, "rewards/margins": -0.0081787109375, "rewards/rejected": 0.39453125, "step": 440 }, { "epoch": 0.12721765469493726, "grad_norm": 12.037641912147905, "learning_rate": 4.988802778824437e-07, "logits/chosen": 3.0625, "logits/rejected": 3.03125, "logps/chosen": -1328.0, "logps/rejected": -1648.0, "loss": 0.6973, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3359375, "rewards/margins": 0.0419921875, "rewards/rejected": 0.29296875, "step": 441 }, { "epoch": 0.12750613010240877, "grad_norm": 13.59476767942337, "learning_rate": 4.988563455622222e-07, "logits/chosen": 2.984375, "logits/rejected": 3.0, "logps/chosen": -1752.0, "logps/rejected": -1520.0, "loss": 0.6719, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.349609375, "rewards/margins": -0.00360107421875, "rewards/rejected": 0.353515625, "step": 442 }, { "epoch": 0.12779460550988028, "grad_norm": 12.482000024972319, "learning_rate": 4.988321607674091e-07, "logits/chosen": 2.890625, "logits/rejected": 2.9375, "logps/chosen": -1768.0, "logps/rejected": -1712.0, "loss": 0.6641, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.392578125, "rewards/margins": 0.07958984375, "rewards/rejected": 0.314453125, "step": 443 }, { "epoch": 0.1280830809173518, "grad_norm": 11.943922095909839, "learning_rate": 4.988077235225407e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0, "logps/chosen": -1912.0, "logps/rejected": -1912.0, "loss": 0.6762, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4921875, "rewards/margins": 0.09130859375, "rewards/rejected": 0.40234375, "step": 444 }, { "epoch": 0.1283715563248233, "grad_norm": 11.076323161978614, "learning_rate": 4.987830338524098e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -1784.0, "logps/rejected": -1560.0, "loss": 0.6735, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.34375, "rewards/margins": 0.01312255859375, "rewards/rejected": 0.330078125, "step": 445 }, { "epoch": 0.12866003173229482, "grad_norm": 11.050690941040932, "learning_rate": 4.987580917820649e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1632.0, "logps/rejected": -1600.0, "loss": 0.6868, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.36328125, "rewards/margins": -0.005615234375, "rewards/rejected": 0.369140625, "step": 446 }, { "epoch": 0.12894850713976633, "grad_norm": 11.232999306414936, "learning_rate": 4.987328973368106e-07, "logits/chosen": 3.03125, "logits/rejected": 2.921875, "logps/chosen": -1048.0, "logps/rejected": -1128.0, "loss": 0.6916, "loss/demonstration_loss": -2208.0, "loss/preference_loss": -2208.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.2236328125, "rewards/margins": -0.0186767578125, "rewards/rejected": 0.2421875, "step": 447 }, { "epoch": 0.12923698254723784, "grad_norm": 11.837612777477373, "learning_rate": 4.987074505422078e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1920.0, "logps/rejected": -2064.0, "loss": 0.6935, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4032.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.40625, "rewards/margins": -0.039794921875, "rewards/rejected": 0.4453125, "step": 448 }, { "epoch": 0.12952545795470935, "grad_norm": 11.471188644583533, "learning_rate": 4.986817514240734e-07, "logits/chosen": 2.890625, "logits/rejected": 2.90625, "logps/chosen": -1920.0, "logps/rejected": -1808.0, "loss": 0.6791, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3359375, "rewards/margins": 0.038330078125, "rewards/rejected": 0.296875, "step": 449 }, { "epoch": 0.12981393336218086, "grad_norm": 12.968585608845181, "learning_rate": 4.986558000084798e-07, "logits/chosen": 3.046875, "logits/rejected": 2.953125, "logps/chosen": -1408.0, "logps/rejected": -1568.0, "loss": 0.6886, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.279296875, "rewards/margins": -0.003387451171875, "rewards/rejected": 0.283203125, "step": 450 }, { "epoch": 0.1301024087696524, "grad_norm": 12.577737137438792, "learning_rate": 4.98629596321756e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0625, "logps/chosen": -1776.0, "logps/rejected": -1568.0, "loss": 0.6766, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.400390625, "rewards/margins": 0.00836181640625, "rewards/rejected": 0.390625, "step": 451 }, { "epoch": 0.1303908841771239, "grad_norm": 12.646107856174552, "learning_rate": 4.986031403904868e-07, "logits/chosen": 2.984375, "logits/rejected": 3.03125, "logps/chosen": -1928.0, "logps/rejected": -1808.0, "loss": 0.6656, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4375, "rewards/margins": 0.0751953125, "rewards/rejected": 0.36328125, "step": 452 }, { "epoch": 0.13067935958459542, "grad_norm": 10.8715534249111, "learning_rate": 4.985764322415124e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1872.0, "logps/rejected": -2032.0, "loss": 0.7211, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.30859375, "rewards/margins": -0.053955078125, "rewards/rejected": 0.361328125, "step": 453 }, { "epoch": 0.13096783499206693, "grad_norm": 10.66878191952447, "learning_rate": 4.985494719019297e-07, "logits/chosen": 3.09375, "logits/rejected": 3.09375, "logps/chosen": -1848.0, "logps/rejected": -1904.0, "loss": 0.6908, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.3828125, "rewards/margins": -0.045654296875, "rewards/rejected": 0.427734375, "step": 454 }, { "epoch": 0.13125631039953845, "grad_norm": 11.664312218691633, "learning_rate": 4.985222593990907e-07, "logits/chosen": 2.96875, "logits/rejected": 2.96875, "logps/chosen": -1584.0, "logps/rejected": -1560.0, "loss": 0.6768, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.263671875, "rewards/margins": 0.02587890625, "rewards/rejected": 0.2373046875, "step": 455 }, { "epoch": 0.13154478580700996, "grad_norm": 11.827868633518412, "learning_rate": 4.984947947606038e-07, "logits/chosen": 3.015625, "logits/rejected": 2.984375, "logps/chosen": -1632.0, "logps/rejected": -1768.0, "loss": 0.6743, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.412109375, "rewards/margins": 0.1171875, "rewards/rejected": 0.294921875, "step": 456 }, { "epoch": 0.13183326121448147, "grad_norm": 13.64045911269315, "learning_rate": 4.984670780143327e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1640.0, "logps/rejected": -1536.0, "loss": 0.6859, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.326171875, "rewards/margins": -0.02001953125, "rewards/rejected": 0.345703125, "step": 457 }, { "epoch": 0.13212173662195298, "grad_norm": 10.830075394101764, "learning_rate": 4.984391091883973e-07, "logits/chosen": 3.03125, "logits/rejected": 2.96875, "logps/chosen": -1600.0, "logps/rejected": -1576.0, "loss": 0.6599, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.365234375, "rewards/margins": 0.053955078125, "rewards/rejected": 0.310546875, "step": 458 }, { "epoch": 0.1324102120294245, "grad_norm": 11.566196448986432, "learning_rate": 4.984108883111732e-07, "logits/chosen": 2.984375, "logits/rejected": 2.96875, "logps/chosen": -1856.0, "logps/rejected": -1960.0, "loss": 0.7008, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.3203125, "rewards/margins": -0.099609375, "rewards/rejected": 0.419921875, "step": 459 }, { "epoch": 0.132698687436896, "grad_norm": 12.89274973770771, "learning_rate": 4.983824154112913e-07, "logits/chosen": 3.0, "logits/rejected": 3.0, "logps/chosen": -1728.0, "logps/rejected": -1536.0, "loss": 0.7102, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.27734375, "rewards/margins": -0.034912109375, "rewards/rejected": 0.3125, "step": 460 }, { "epoch": 0.1329871628443675, "grad_norm": 12.851351655799954, "learning_rate": 4.983536905176387e-07, "logits/chosen": 3.0625, "logits/rejected": 3.046875, "logps/chosen": -1488.0, "logps/rejected": -1424.0, "loss": 0.6927, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.302734375, "rewards/margins": -0.009521484375, "rewards/rejected": 0.3125, "step": 461 }, { "epoch": 0.13327563825183902, "grad_norm": 12.24378500561336, "learning_rate": 4.983247136593578e-07, "logits/chosen": 2.96875, "logits/rejected": 3.046875, "logps/chosen": -1728.0, "logps/rejected": -1656.0, "loss": 0.6857, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.408203125, "rewards/margins": 0.1142578125, "rewards/rejected": 0.29296875, "step": 462 }, { "epoch": 0.13356411365931053, "grad_norm": 10.466269810760314, "learning_rate": 4.982954848658469e-07, "logits/chosen": 2.859375, "logits/rejected": 2.921875, "logps/chosen": -1960.0, "logps/rejected": -1768.0, "loss": 0.6609, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.34375, "rewards/margins": 0.043212890625, "rewards/rejected": 0.298828125, "step": 463 }, { "epoch": 0.13385258906678205, "grad_norm": 11.302917504764295, "learning_rate": 4.982660041667597e-07, "logits/chosen": 2.9375, "logits/rejected": 2.921875, "logps/chosen": -1616.0, "logps/rejected": -1448.0, "loss": 0.6852, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.390625, "rewards/margins": 0.06396484375, "rewards/rejected": 0.326171875, "step": 464 }, { "epoch": 0.13414106447425356, "grad_norm": 10.878424035843983, "learning_rate": 4.982362715920054e-07, "logits/chosen": 3.015625, "logits/rejected": 3.125, "logps/chosen": -1616.0, "logps/rejected": -1136.0, "loss": 0.6487, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.345703125, "rewards/margins": 0.162109375, "rewards/rejected": 0.1826171875, "step": 465 }, { "epoch": 0.1344295398817251, "grad_norm": 11.994104282298917, "learning_rate": 4.982062871717492e-07, "logits/chosen": 3.015625, "logits/rejected": 3.078125, "logps/chosen": -1432.0, "logps/rejected": -1176.0, "loss": 0.6815, "loss/demonstration_loss": -2640.0, "loss/preference_loss": -2640.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.314453125, "rewards/margins": 0.036865234375, "rewards/rejected": 0.27734375, "step": 466 }, { "epoch": 0.1347180152891966, "grad_norm": 11.114639824143396, "learning_rate": 4.981760509364112e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1856.0, "logps/rejected": -1952.0, "loss": 0.674, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.412109375, "rewards/margins": 0.006256103515625, "rewards/rejected": 0.40625, "step": 467 }, { "epoch": 0.13500649069666812, "grad_norm": 10.894157065756591, "learning_rate": 4.981455629166674e-07, "logits/chosen": 3.03125, "logits/rejected": 2.921875, "logps/chosen": -1592.0, "logps/rejected": -1632.0, "loss": 0.6718, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.37109375, "rewards/margins": 0.07470703125, "rewards/rejected": 0.296875, "step": 468 }, { "epoch": 0.13529496610413963, "grad_norm": 15.32373775505376, "learning_rate": 4.98114823143449e-07, "logits/chosen": 2.953125, "logits/rejected": 2.9375, "logps/chosen": -1712.0, "logps/rejected": -1616.0, "loss": 0.7105, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.423828125, "rewards/margins": 0.076171875, "rewards/rejected": 0.34765625, "step": 469 }, { "epoch": 0.13558344151161114, "grad_norm": 12.684554010628778, "learning_rate": 4.980838316479427e-07, "logits/chosen": 3.078125, "logits/rejected": 3.078125, "logps/chosen": -1568.0, "logps/rejected": -1376.0, "loss": 0.6589, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.33203125, "rewards/margins": 0.038818359375, "rewards/rejected": 0.29296875, "step": 470 }, { "epoch": 0.13587191691908265, "grad_norm": 10.479653630459104, "learning_rate": 4.980525884615907e-07, "logits/chosen": 3.296875, "logits/rejected": 3.21875, "logps/chosen": -1568.0, "logps/rejected": -1664.0, "loss": 0.7075, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.345703125, "rewards/margins": -0.0113525390625, "rewards/rejected": 0.357421875, "step": 471 }, { "epoch": 0.13616039232655416, "grad_norm": 12.171377176430834, "learning_rate": 4.980210936160904e-07, "logits/chosen": 3.015625, "logits/rejected": 3.015625, "logps/chosen": -1784.0, "logps/rejected": -1600.0, "loss": 0.6942, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3828125, "rewards/margins": 0.016357421875, "rewards/rejected": 0.3671875, "step": 472 }, { "epoch": 0.13644886773402568, "grad_norm": 13.86944091475449, "learning_rate": 4.979893471433946e-07, "logits/chosen": 2.96875, "logits/rejected": 3.046875, "logps/chosen": -1584.0, "logps/rejected": -1792.0, "loss": 0.7258, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3671875, "rewards/margins": -0.00384521484375, "rewards/rejected": 0.37109375, "step": 473 }, { "epoch": 0.1367373431414972, "grad_norm": 11.511931924315167, "learning_rate": 4.979573490757112e-07, "logits/chosen": 3.046875, "logits/rejected": 3.125, "logps/chosen": -1760.0, "logps/rejected": -1720.0, "loss": 0.7013, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.373046875, "rewards/margins": -0.0126953125, "rewards/rejected": 0.38671875, "step": 474 }, { "epoch": 0.1370258185489687, "grad_norm": 11.802226915711545, "learning_rate": 4.979250994455038e-07, "logits/chosen": 3.015625, "logits/rejected": 3.03125, "logps/chosen": -1824.0, "logps/rejected": -1712.0, "loss": 0.7106, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.33203125, "rewards/margins": -0.08154296875, "rewards/rejected": 0.412109375, "step": 475 }, { "epoch": 0.1373142939564402, "grad_norm": 12.279123869774102, "learning_rate": 4.978925982854906e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1872.0, "logps/rejected": -2048.0, "loss": 0.7271, "loss/demonstration_loss": -3968.0, "loss/preference_loss": -3968.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.40625, "rewards/margins": -0.058837890625, "rewards/rejected": 0.46484375, "step": 476 }, { "epoch": 0.13760276936391172, "grad_norm": 11.293577935133893, "learning_rate": 4.978598456286455e-07, "logits/chosen": 2.90625, "logits/rejected": 2.875, "logps/chosen": -1672.0, "logps/rejected": -1648.0, "loss": 0.6981, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.26953125, "rewards/margins": -0.04541015625, "rewards/rejected": 0.314453125, "step": 477 }, { "epoch": 0.13789124477138323, "grad_norm": 10.82962450929594, "learning_rate": 4.978268415081973e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0625, "logps/chosen": -1752.0, "logps/rejected": -1720.0, "loss": 0.6935, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.400390625, "rewards/margins": 0.01708984375, "rewards/rejected": 0.3828125, "step": 478 }, { "epoch": 0.13817972017885474, "grad_norm": 13.905992733274704, "learning_rate": 4.9779358595763e-07, "logits/chosen": 3.09375, "logits/rejected": 3.171875, "logps/chosen": -1528.0, "logps/rejected": -1432.0, "loss": 0.7242, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.3125, "rewards/margins": -0.0186767578125, "rewards/rejected": 0.33203125, "step": 479 }, { "epoch": 0.13846819558632625, "grad_norm": 10.162473885017894, "learning_rate": 4.977600790106826e-07, "logits/chosen": 3.015625, "logits/rejected": 3.046875, "logps/chosen": -1600.0, "logps/rejected": -1552.0, "loss": 0.693, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.34765625, "rewards/margins": -0.034423828125, "rewards/rejected": 0.3828125, "step": 480 }, { "epoch": 0.13875667099379776, "grad_norm": 13.908433660172133, "learning_rate": 4.977263207013493e-07, "logits/chosen": 3.265625, "logits/rejected": 3.25, "logps/chosen": -1880.0, "logps/rejected": -1944.0, "loss": 0.7183, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.443359375, "rewards/margins": 0.00360107421875, "rewards/rejected": 0.44140625, "step": 481 }, { "epoch": 0.1390451464012693, "grad_norm": 13.913766960186566, "learning_rate": 4.976923110638794e-07, "logits/chosen": 3.09375, "logits/rejected": 3.109375, "logps/chosen": -1728.0, "logps/rejected": -1792.0, "loss": 0.6802, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.46484375, "rewards/margins": 0.0101318359375, "rewards/rejected": 0.455078125, "step": 482 }, { "epoch": 0.13933362180874082, "grad_norm": 12.677172340535385, "learning_rate": 4.976580501327767e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1688.0, "logps/rejected": -1584.0, "loss": 0.6474, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.39453125, "rewards/margins": 0.06396484375, "rewards/rejected": 0.330078125, "step": 483 }, { "epoch": 0.13962209721621233, "grad_norm": 11.556130226393334, "learning_rate": 4.976235379428004e-07, "logits/chosen": 3.109375, "logits/rejected": 3.046875, "logps/chosen": -1824.0, "logps/rejected": -1816.0, "loss": 0.7037, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.375, "rewards/margins": -0.030029296875, "rewards/rejected": 0.404296875, "step": 484 }, { "epoch": 0.13991057262368384, "grad_norm": 10.399818391535014, "learning_rate": 4.975887745289646e-07, "logits/chosen": 3.0, "logits/rejected": 3.078125, "logps/chosen": -1400.0, "logps/rejected": -1560.0, "loss": 0.694, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.29296875, "rewards/margins": -0.05322265625, "rewards/rejected": 0.345703125, "step": 485 }, { "epoch": 0.14019904803115535, "grad_norm": 12.129636704268066, "learning_rate": 4.97553759926538e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1224.0, "logps/rejected": -1184.0, "loss": 0.6909, "loss/demonstration_loss": -2432.0, "loss/preference_loss": -2448.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.27734375, "rewards/margins": -0.019775390625, "rewards/rejected": 0.296875, "step": 486 }, { "epoch": 0.14048752343862686, "grad_norm": 11.160443819370817, "learning_rate": 4.975184941710444e-07, "logits/chosen": 2.890625, "logits/rejected": 2.9375, "logps/chosen": -1680.0, "logps/rejected": -1712.0, "loss": 0.7097, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.341796875, "rewards/margins": -0.00286865234375, "rewards/rejected": 0.345703125, "step": 487 }, { "epoch": 0.14077599884609837, "grad_norm": 10.673503276953756, "learning_rate": 4.974829772982622e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0625, "logps/chosen": -1424.0, "logps/rejected": -1464.0, "loss": 0.7014, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.404296875, "rewards/margins": 0.0458984375, "rewards/rejected": 0.357421875, "step": 488 }, { "epoch": 0.14106447425356988, "grad_norm": 11.863813185530908, "learning_rate": 4.974472093442247e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -1696.0, "logps/rejected": -1696.0, "loss": 0.7107, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4296875, "rewards/margins": -0.04248046875, "rewards/rejected": 0.47265625, "step": 489 }, { "epoch": 0.1413529496610414, "grad_norm": 10.923678398397007, "learning_rate": 4.9741119034522e-07, "logits/chosen": 3.171875, "logits/rejected": 3.21875, "logps/chosen": -1688.0, "logps/rejected": -1664.0, "loss": 0.6685, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.40234375, "rewards/margins": 0.04736328125, "rewards/rejected": 0.35546875, "step": 490 }, { "epoch": 0.1416414250685129, "grad_norm": 11.75287871932818, "learning_rate": 4.973749203377906e-07, "logits/chosen": 3.0625, "logits/rejected": 3.1875, "logps/chosen": -1672.0, "logps/rejected": -1584.0, "loss": 0.6703, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.345703125, "rewards/margins": -0.056884765625, "rewards/rejected": 0.40234375, "step": 491 }, { "epoch": 0.14192990047598442, "grad_norm": 10.14461251472448, "learning_rate": 4.97338399358734e-07, "logits/chosen": 3.171875, "logits/rejected": 3.171875, "logps/chosen": -1520.0, "logps/rejected": -1472.0, "loss": 0.6993, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.34765625, "rewards/margins": -0.0291748046875, "rewards/rejected": 0.376953125, "step": 492 }, { "epoch": 0.14221837588345593, "grad_norm": 12.913836665899643, "learning_rate": 4.973016274451022e-07, "logits/chosen": 2.96875, "logits/rejected": 3.03125, "logps/chosen": -2016.0, "logps/rejected": -2080.0, "loss": 0.7001, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.361328125, "rewards/margins": -0.07080078125, "rewards/rejected": 0.431640625, "step": 493 }, { "epoch": 0.14250685129092744, "grad_norm": 12.800683838309105, "learning_rate": 4.972646046342018e-07, "logits/chosen": 3.078125, "logits/rejected": 3.046875, "logps/chosen": -1152.0, "logps/rejected": -1272.0, "loss": 0.6843, "loss/demonstration_loss": -2448.0, "loss/preference_loss": -2448.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2412109375, "rewards/margins": -0.005401611328125, "rewards/rejected": 0.2470703125, "step": 494 }, { "epoch": 0.14279532669839895, "grad_norm": 12.285986535124662, "learning_rate": 4.972273309635936e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1736.0, "logps/rejected": -1592.0, "loss": 0.683, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.447265625, "rewards/margins": 0.056396484375, "rewards/rejected": 0.390625, "step": 495 }, { "epoch": 0.14308380210587046, "grad_norm": 10.13839707498339, "learning_rate": 4.971898064710935e-07, "logits/chosen": 3.15625, "logits/rejected": 3.09375, "logps/chosen": -1640.0, "logps/rejected": -1328.0, "loss": 0.689, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.330078125, "rewards/margins": 0.018310546875, "rewards/rejected": 0.3125, "step": 496 }, { "epoch": 0.143372277513342, "grad_norm": 9.980473186457463, "learning_rate": 4.971520311947717e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1744.0, "logps/rejected": -1600.0, "loss": 0.6873, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4375, "rewards/margins": 0.03369140625, "rewards/rejected": 0.404296875, "step": 497 }, { "epoch": 0.1436607529208135, "grad_norm": 13.023225719010433, "learning_rate": 4.971140051729522e-07, "logits/chosen": 3.1875, "logits/rejected": 3.125, "logps/chosen": -1760.0, "logps/rejected": -1728.0, "loss": 0.7256, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.408203125, "rewards/margins": -0.03759765625, "rewards/rejected": 0.4453125, "step": 498 }, { "epoch": 0.14394922832828502, "grad_norm": 10.743599075053961, "learning_rate": 4.970757284442144e-07, "logits/chosen": 3.125, "logits/rejected": 3.09375, "logps/chosen": -1592.0, "logps/rejected": -1544.0, "loss": 0.71, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.392578125, "rewards/margins": -0.04052734375, "rewards/rejected": 0.431640625, "step": 499 }, { "epoch": 0.14423770373575653, "grad_norm": 13.06207089049958, "learning_rate": 4.970372010473914e-07, "logits/chosen": 3.046875, "logits/rejected": 3.171875, "logps/chosen": -1544.0, "logps/rejected": -1352.0, "loss": 0.6778, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.408203125, "rewards/margins": 0.11181640625, "rewards/rejected": 0.296875, "step": 500 }, { "epoch": 0.14452617914322805, "grad_norm": 11.510145092047358, "learning_rate": 4.969984230215707e-07, "logits/chosen": 3.109375, "logits/rejected": 3.109375, "logps/chosen": -1632.0, "logps/rejected": -1696.0, "loss": 0.6606, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.412109375, "rewards/margins": 0.1083984375, "rewards/rejected": 0.3046875, "step": 501 }, { "epoch": 0.14481465455069956, "grad_norm": 10.651575592315242, "learning_rate": 4.969593944060941e-07, "logits/chosen": 3.203125, "logits/rejected": 3.1875, "logps/chosen": -1368.0, "logps/rejected": -1408.0, "loss": 0.6675, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.380859375, "rewards/margins": 0.07470703125, "rewards/rejected": 0.306640625, "step": 502 }, { "epoch": 0.14510312995817107, "grad_norm": 11.880529427851066, "learning_rate": 4.969201152405579e-07, "logits/chosen": 3.046875, "logits/rejected": 3.078125, "logps/chosen": -1696.0, "logps/rejected": -1600.0, "loss": 0.6906, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.392578125, "rewards/margins": 0.002685546875, "rewards/rejected": 0.390625, "step": 503 }, { "epoch": 0.14539160536564258, "grad_norm": 10.413485604537877, "learning_rate": 4.968805855648121e-07, "logits/chosen": 2.953125, "logits/rejected": 2.9375, "logps/chosen": -1800.0, "logps/rejected": -1768.0, "loss": 0.7117, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.375, "rewards/margins": -0.09814453125, "rewards/rejected": 0.474609375, "step": 504 }, { "epoch": 0.1456800807731141, "grad_norm": 10.373490660400943, "learning_rate": 4.968408054189612e-07, "logits/chosen": 3.0625, "logits/rejected": 3.015625, "logps/chosen": -1608.0, "logps/rejected": -1648.0, "loss": 0.6699, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.357421875, "rewards/margins": -0.0152587890625, "rewards/rejected": 0.373046875, "step": 505 }, { "epoch": 0.1459685561805856, "grad_norm": 10.339528955237336, "learning_rate": 4.968007748433638e-07, "logits/chosen": 3.015625, "logits/rejected": 3.0, "logps/chosen": -1744.0, "logps/rejected": -1696.0, "loss": 0.7034, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.373046875, "rewards/margins": -0.052978515625, "rewards/rejected": 0.42578125, "step": 506 }, { "epoch": 0.1462570315880571, "grad_norm": 11.132867207070843, "learning_rate": 4.967604938786324e-07, "logits/chosen": 3.0, "logits/rejected": 3.109375, "logps/chosen": -1536.0, "logps/rejected": -1552.0, "loss": 0.6699, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.388671875, "rewards/margins": 0.08740234375, "rewards/rejected": 0.30078125, "step": 507 }, { "epoch": 0.14654550699552862, "grad_norm": 11.036855249285532, "learning_rate": 4.967199625656337e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1896.0, "logps/rejected": -1808.0, "loss": 0.7117, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.419921875, "rewards/margins": 0.0439453125, "rewards/rejected": 0.375, "step": 508 }, { "epoch": 0.14683398240300014, "grad_norm": 10.460695809387607, "learning_rate": 4.966791809454885e-07, "logits/chosen": 3.0625, "logits/rejected": 3.125, "logps/chosen": -1256.0, "logps/rejected": -1136.0, "loss": 0.6721, "loss/demonstration_loss": -2432.0, "loss/preference_loss": -2416.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.357421875, "rewards/margins": 0.078125, "rewards/rejected": 0.279296875, "step": 509 }, { "epoch": 0.14712245781047165, "grad_norm": 9.528923214109243, "learning_rate": 4.966381490595709e-07, "logits/chosen": 2.96875, "logits/rejected": 3.015625, "logps/chosen": -1464.0, "logps/rejected": -1216.0, "loss": 0.67, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.318359375, "rewards/margins": 0.043212890625, "rewards/rejected": 0.275390625, "step": 510 }, { "epoch": 0.14741093321794316, "grad_norm": 11.72081958272131, "learning_rate": 4.965968669495097e-07, "logits/chosen": 3.078125, "logits/rejected": 3.1875, "logps/chosen": -1792.0, "logps/rejected": -1552.0, "loss": 0.6562, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.50390625, "rewards/margins": 0.0830078125, "rewards/rejected": 0.421875, "step": 511 }, { "epoch": 0.1476994086254147, "grad_norm": 10.907725844354541, "learning_rate": 4.965553346571873e-07, "logits/chosen": 2.875, "logits/rejected": 2.875, "logps/chosen": -1792.0, "logps/rejected": -1504.0, "loss": 0.6899, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.34765625, "rewards/margins": 0.0264892578125, "rewards/rejected": 0.322265625, "step": 512 }, { "epoch": 0.1479878840328862, "grad_norm": 10.02020249462617, "learning_rate": 4.965135522247396e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1432.0, "logps/rejected": -1240.0, "loss": 0.663, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.3671875, "rewards/margins": 0.0927734375, "rewards/rejected": 0.275390625, "step": 513 }, { "epoch": 0.14827635944035772, "grad_norm": 10.842272867250761, "learning_rate": 4.964715196945567e-07, "logits/chosen": 3.125, "logits/rejected": 3.125, "logps/chosen": -2192.0, "logps/rejected": -2224.0, "loss": 0.7009, "loss/demonstration_loss": -4480.0, "loss/preference_loss": -4480.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.48046875, "rewards/margins": -0.023681640625, "rewards/rejected": 0.50390625, "step": 514 }, { "epoch": 0.14856483484782923, "grad_norm": 9.68066870407659, "learning_rate": 4.964292371092822e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1744.0, "logps/rejected": -1520.0, "loss": 0.6712, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.384765625, "rewards/margins": 0.036865234375, "rewards/rejected": 0.34765625, "step": 515 }, { "epoch": 0.14885331025530074, "grad_norm": 10.366629315944794, "learning_rate": 4.963867045118135e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1984.0, "logps/rejected": -1768.0, "loss": 0.6433, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.48828125, "rewards/margins": 0.0810546875, "rewards/rejected": 0.40625, "step": 516 }, { "epoch": 0.14914178566277225, "grad_norm": 10.620667795775566, "learning_rate": 4.963439219453015e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1816.0, "logps/rejected": -1840.0, "loss": 0.7, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.439453125, "rewards/margins": 0.004638671875, "rewards/rejected": 0.435546875, "step": 517 }, { "epoch": 0.14943026107024376, "grad_norm": 10.174217265487732, "learning_rate": 4.963008894531508e-07, "logits/chosen": 3.09375, "logits/rejected": 3.09375, "logps/chosen": -1928.0, "logps/rejected": -1648.0, "loss": 0.6749, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.421875, "rewards/margins": 0.049072265625, "rewards/rejected": 0.373046875, "step": 518 }, { "epoch": 0.14971873647771528, "grad_norm": 13.570250637417114, "learning_rate": 4.962576070790198e-07, "logits/chosen": 3.03125, "logits/rejected": 3.109375, "logps/chosen": -1968.0, "logps/rejected": -1760.0, "loss": 0.7209, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.35546875, "rewards/margins": -0.020751953125, "rewards/rejected": 0.375, "step": 519 }, { "epoch": 0.1500072118851868, "grad_norm": 11.494283585065263, "learning_rate": 4.962140748668199e-07, "logits/chosen": 2.875, "logits/rejected": 2.890625, "logps/chosen": -1656.0, "logps/rejected": -1856.0, "loss": 0.7144, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.45703125, "rewards/margins": -0.052490234375, "rewards/rejected": 0.51171875, "step": 520 }, { "epoch": 0.1502956872926583, "grad_norm": 10.13905200325323, "learning_rate": 4.961702928607165e-07, "logits/chosen": 3.140625, "logits/rejected": 3.09375, "logps/chosen": -2024.0, "logps/rejected": -1600.0, "loss": 0.643, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4765625, "rewards/margins": 0.169921875, "rewards/rejected": 0.306640625, "step": 521 }, { "epoch": 0.1505841627001298, "grad_norm": 11.13609701771337, "learning_rate": 4.961262611051278e-07, "logits/chosen": 3.125, "logits/rejected": 3.125, "logps/chosen": -1776.0, "logps/rejected": -1776.0, "loss": 0.6756, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4921875, "rewards/margins": 0.041015625, "rewards/rejected": 0.451171875, "step": 522 }, { "epoch": 0.15087263810760132, "grad_norm": 9.710250967343773, "learning_rate": 4.960819796447261e-07, "logits/chosen": 3.046875, "logits/rejected": 3.03125, "logps/chosen": -1256.0, "logps/rejected": -1272.0, "loss": 0.6805, "loss/demonstration_loss": -2576.0, "loss/preference_loss": -2560.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.359375, "rewards/margins": 0.03759765625, "rewards/rejected": 0.322265625, "step": 523 }, { "epoch": 0.15116111351507283, "grad_norm": 13.364431601252312, "learning_rate": 4.960374485244365e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -2040.0, "logps/rejected": -1792.0, "loss": 0.6866, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.55078125, "rewards/margins": 0.0113525390625, "rewards/rejected": 0.5390625, "step": 524 }, { "epoch": 0.15144958892254434, "grad_norm": 10.221468709368152, "learning_rate": 4.959926677894379e-07, "logits/chosen": 3.109375, "logits/rejected": 3.125, "logps/chosen": -1840.0, "logps/rejected": -2024.0, "loss": 0.6888, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3920.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.431640625, "rewards/margins": -0.021240234375, "rewards/rejected": 0.453125, "step": 525 }, { "epoch": 0.15173806433001585, "grad_norm": 10.345780175545148, "learning_rate": 4.959476374851616e-07, "logits/chosen": 3.171875, "logits/rejected": 3.125, "logps/chosen": -1296.0, "logps/rejected": -1592.0, "loss": 0.726, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.384765625, "rewards/margins": -0.07373046875, "rewards/rejected": 0.45703125, "step": 526 }, { "epoch": 0.1520265397374874, "grad_norm": 12.840249468504261, "learning_rate": 4.959023576572931e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1976.0, "logps/rejected": -1960.0, "loss": 0.6823, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3984.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.515625, "rewards/margins": 0.03662109375, "rewards/rejected": 0.4765625, "step": 527 }, { "epoch": 0.1523150151449589, "grad_norm": 12.082983395582415, "learning_rate": 4.958568283517702e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -1616.0, "logps/rejected": -1480.0, "loss": 0.6873, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.46484375, "rewards/margins": 0.053466796875, "rewards/rejected": 0.412109375, "step": 528 }, { "epoch": 0.15260349055243042, "grad_norm": 13.089640007219462, "learning_rate": 4.958110496147845e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1544.0, "logps/rejected": -1504.0, "loss": 0.7128, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.298828125, "rewards/margins": -0.046875, "rewards/rejected": 0.345703125, "step": 529 }, { "epoch": 0.15289196595990193, "grad_norm": 11.86952872353304, "learning_rate": 4.957650214927801e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1960.0, "logps/rejected": -1992.0, "loss": 0.6774, "loss/demonstration_loss": -4000.0, "loss/preference_loss": -4000.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.470703125, "rewards/margins": 0.021240234375, "rewards/rejected": 0.44921875, "step": 530 }, { "epoch": 0.15318044136737344, "grad_norm": 10.677294276490391, "learning_rate": 4.957187440324545e-07, "logits/chosen": 3.09375, "logits/rejected": 3.109375, "logps/chosen": -1488.0, "logps/rejected": -1888.0, "loss": 0.7001, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.3203125, "rewards/margins": -0.04296875, "rewards/rejected": 0.36328125, "step": 531 }, { "epoch": 0.15346891677484495, "grad_norm": 11.546477717803599, "learning_rate": 4.95672217280758e-07, "logits/chosen": 3.078125, "logits/rejected": 3.0625, "logps/chosen": -1448.0, "logps/rejected": -1536.0, "loss": 0.7168, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.271484375, "rewards/margins": -0.0274658203125, "rewards/rejected": 0.298828125, "step": 532 }, { "epoch": 0.15375739218231646, "grad_norm": 13.029540269827725, "learning_rate": 4.956254412848936e-07, "logits/chosen": 3.203125, "logits/rejected": 3.234375, "logps/chosen": -1976.0, "logps/rejected": -1832.0, "loss": 0.6652, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.482421875, "rewards/margins": 0.0576171875, "rewards/rejected": 0.42578125, "step": 533 }, { "epoch": 0.15404586758978797, "grad_norm": 11.632828272662968, "learning_rate": 4.955784160923176e-07, "logits/chosen": 3.265625, "logits/rejected": 3.203125, "logps/chosen": -1968.0, "logps/rejected": -1832.0, "loss": 0.7059, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.46484375, "rewards/margins": -0.003265380859375, "rewards/rejected": 0.46875, "step": 534 }, { "epoch": 0.15433434299725948, "grad_norm": 10.96310161450628, "learning_rate": 4.955311417507391e-07, "logits/chosen": 3.15625, "logits/rejected": 3.140625, "logps/chosen": -1472.0, "logps/rejected": -1280.0, "loss": 0.6476, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.375, "rewards/margins": 0.109375, "rewards/rejected": 0.265625, "step": 535 }, { "epoch": 0.154622818404731, "grad_norm": 10.926793381308396, "learning_rate": 4.954836183081194e-07, "logits/chosen": 3.171875, "logits/rejected": 3.078125, "logps/chosen": -1840.0, "logps/rejected": -1800.0, "loss": 0.6913, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4453125, "rewards/margins": 0.06689453125, "rewards/rejected": 0.37890625, "step": 536 }, { "epoch": 0.1549112938122025, "grad_norm": 12.892540389020454, "learning_rate": 4.954358458126731e-07, "logits/chosen": 3.125, "logits/rejected": 3.1875, "logps/chosen": -1920.0, "logps/rejected": -1808.0, "loss": 0.6653, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.53125, "rewards/margins": 0.1494140625, "rewards/rejected": 0.3828125, "step": 537 }, { "epoch": 0.15519976921967402, "grad_norm": 10.670508031829, "learning_rate": 4.953878243128673e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -1864.0, "logps/rejected": -1712.0, "loss": 0.6748, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4921875, "rewards/margins": 0.0810546875, "rewards/rejected": 0.41015625, "step": 538 }, { "epoch": 0.15548824462714553, "grad_norm": 11.223434535741061, "learning_rate": 4.953395538574218e-07, "logits/chosen": 3.046875, "logits/rejected": 3.046875, "logps/chosen": -1800.0, "logps/rejected": -1920.0, "loss": 0.7, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.416015625, "rewards/margins": -0.0264892578125, "rewards/rejected": 0.44140625, "step": 539 }, { "epoch": 0.15577672003461704, "grad_norm": 9.760028605257483, "learning_rate": 4.952910344953085e-07, "logits/chosen": 3.1875, "logits/rejected": 3.15625, "logps/chosen": -1656.0, "logps/rejected": -1392.0, "loss": 0.6932, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.375, "rewards/margins": 0.0859375, "rewards/rejected": 0.2890625, "step": 540 }, { "epoch": 0.15606519544208855, "grad_norm": 10.563072988719771, "learning_rate": 4.952422662757526e-07, "logits/chosen": 3.046875, "logits/rejected": 3.046875, "logps/chosen": -1648.0, "logps/rejected": -1736.0, "loss": 0.6512, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.458984375, "rewards/margins": -0.001953125, "rewards/rejected": 0.4609375, "step": 541 }, { "epoch": 0.15635367084956006, "grad_norm": 12.650461973480182, "learning_rate": 4.951932492482313e-07, "logits/chosen": 3.3125, "logits/rejected": 3.234375, "logps/chosen": -1312.0, "logps/rejected": -1552.0, "loss": 0.6984, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.330078125, "rewards/margins": 0.06005859375, "rewards/rejected": 0.26953125, "step": 542 }, { "epoch": 0.1566421462570316, "grad_norm": 11.042808094789574, "learning_rate": 4.951439834624742e-07, "logits/chosen": 3.140625, "logits/rejected": 3.078125, "logps/chosen": -1432.0, "logps/rejected": -1384.0, "loss": 0.7131, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.396484375, "rewards/margins": 0.0247802734375, "rewards/rejected": 0.37109375, "step": 543 }, { "epoch": 0.1569306216645031, "grad_norm": 10.576030928370736, "learning_rate": 4.950944689684636e-07, "logits/chosen": 3.109375, "logits/rejected": 3.1875, "logps/chosen": -1760.0, "logps/rejected": -1576.0, "loss": 0.6729, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.390625, "rewards/margins": 0.06494140625, "rewards/rejected": 0.326171875, "step": 544 }, { "epoch": 0.15721909707197462, "grad_norm": 14.316960361305092, "learning_rate": 4.950447058164335e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1648.0, "logps/rejected": -1552.0, "loss": 0.7203, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4609375, "rewards/margins": 0.09814453125, "rewards/rejected": 0.36328125, "step": 545 }, { "epoch": 0.15750757247944613, "grad_norm": 11.134769731051994, "learning_rate": 4.94994694056871e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1432.0, "logps/rejected": -1064.0, "loss": 0.6721, "loss/demonstration_loss": -2544.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.373046875, "rewards/margins": 0.1064453125, "rewards/rejected": 0.265625, "step": 546 }, { "epoch": 0.15779604788691765, "grad_norm": 9.849954202160367, "learning_rate": 4.949444337405149e-07, "logits/chosen": 3.046875, "logits/rejected": 3.09375, "logps/chosen": -1976.0, "logps/rejected": -1880.0, "loss": 0.7021, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.38671875, "rewards/margins": -0.029541015625, "rewards/rejected": 0.416015625, "step": 547 }, { "epoch": 0.15808452329438916, "grad_norm": 11.830963170816082, "learning_rate": 4.948939249183561e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1576.0, "logps/rejected": -1448.0, "loss": 0.6794, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.349609375, "rewards/margins": 0.06494140625, "rewards/rejected": 0.283203125, "step": 548 }, { "epoch": 0.15837299870186067, "grad_norm": 9.84218907015708, "learning_rate": 4.94843167641638e-07, "logits/chosen": 3.15625, "logits/rejected": 3.140625, "logps/chosen": -1216.0, "logps/rejected": -1336.0, "loss": 0.6709, "loss/demonstration_loss": -2576.0, "loss/preference_loss": -2576.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.33984375, "rewards/margins": 0.02490234375, "rewards/rejected": 0.314453125, "step": 549 }, { "epoch": 0.15866147410933218, "grad_norm": 11.750362458082861, "learning_rate": 4.947921619618558e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1776.0, "logps/rejected": -1744.0, "loss": 0.6757, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.328125, "rewards/margins": -0.044921875, "rewards/rejected": 0.373046875, "step": 550 }, { "epoch": 0.1589499495168037, "grad_norm": 11.347953790530728, "learning_rate": 4.947409079307567e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1640.0, "logps/rejected": -1568.0, "loss": 0.6455, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.392578125, "rewards/margins": 0.0869140625, "rewards/rejected": 0.306640625, "step": 551 }, { "epoch": 0.1592384249242752, "grad_norm": 12.639425609580405, "learning_rate": 4.9468940560034e-07, "logits/chosen": 3.15625, "logits/rejected": 3.234375, "logps/chosen": -1616.0, "logps/rejected": -1496.0, "loss": 0.6597, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.373046875, "rewards/margins": 0.04931640625, "rewards/rejected": 0.32421875, "step": 552 }, { "epoch": 0.1595269003317467, "grad_norm": 12.065778851709316, "learning_rate": 4.946376550228569e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1848.0, "logps/rejected": -1544.0, "loss": 0.6545, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.447265625, "rewards/margins": 0.125, "rewards/rejected": 0.322265625, "step": 553 }, { "epoch": 0.15981537573921822, "grad_norm": 10.705187479167527, "learning_rate": 4.945856562508103e-07, "logits/chosen": 3.046875, "logits/rejected": 2.96875, "logps/chosen": -1848.0, "logps/rejected": -1736.0, "loss": 0.6621, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5, "rewards/margins": 0.0546875, "rewards/rejected": 0.4453125, "step": 554 }, { "epoch": 0.16010385114668974, "grad_norm": 11.384753910838244, "learning_rate": 4.945334093369551e-07, "logits/chosen": 3.1875, "logits/rejected": 3.203125, "logps/chosen": -1808.0, "logps/rejected": -1816.0, "loss": 0.7045, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.33203125, "rewards/margins": -0.0086669921875, "rewards/rejected": 0.33984375, "step": 555 }, { "epoch": 0.16039232655416125, "grad_norm": 13.531740260839971, "learning_rate": 4.944809143342978e-07, "logits/chosen": 3.203125, "logits/rejected": 3.125, "logps/chosen": -1768.0, "logps/rejected": -1792.0, "loss": 0.681, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.45703125, "rewards/margins": 0.003997802734375, "rewards/rejected": 0.453125, "step": 556 }, { "epoch": 0.16068080196163276, "grad_norm": 10.678839024047571, "learning_rate": 4.944281712960966e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1264.0, "logps/rejected": -1240.0, "loss": 0.725, "loss/demonstration_loss": -2528.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.26171875, "rewards/margins": -0.06396484375, "rewards/rejected": 0.326171875, "step": 557 }, { "epoch": 0.1609692773691043, "grad_norm": 10.782655491579556, "learning_rate": 4.943751802758615e-07, "logits/chosen": 3.09375, "logits/rejected": 3.109375, "logps/chosen": -1680.0, "logps/rejected": -1368.0, "loss": 0.6752, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.515625, "rewards/margins": 0.1396484375, "rewards/rejected": 0.376953125, "step": 558 }, { "epoch": 0.1612577527765758, "grad_norm": 11.41244309532753, "learning_rate": 4.94321941327354e-07, "logits/chosen": 3.234375, "logits/rejected": 3.28125, "logps/chosen": -2176.0, "logps/rejected": -1832.0, "loss": 0.6719, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4048.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.56640625, "rewards/margins": 0.10888671875, "rewards/rejected": 0.45703125, "step": 559 }, { "epoch": 0.16154622818404732, "grad_norm": 13.550724062033822, "learning_rate": 4.94268454504587e-07, "logits/chosen": 3.0625, "logits/rejected": 3.203125, "logps/chosen": -1696.0, "logps/rejected": -1424.0, "loss": 0.709, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.38671875, "rewards/margins": 0.06591796875, "rewards/rejected": 0.322265625, "step": 560 }, { "epoch": 0.16183470359151883, "grad_norm": 11.742548621328552, "learning_rate": 4.942147198618252e-07, "logits/chosen": 3.0625, "logits/rejected": 3.03125, "logps/chosen": -1704.0, "logps/rejected": -1936.0, "loss": 0.7133, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.41796875, "rewards/margins": -0.06982421875, "rewards/rejected": 0.48828125, "step": 561 }, { "epoch": 0.16212317899899034, "grad_norm": 11.073705088880684, "learning_rate": 4.941607374535842e-07, "logits/chosen": 3.109375, "logits/rejected": 3.109375, "logps/chosen": -1896.0, "logps/rejected": -1872.0, "loss": 0.6747, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.51171875, "rewards/margins": 0.06005859375, "rewards/rejected": 0.451171875, "step": 562 }, { "epoch": 0.16241165440646185, "grad_norm": 15.90612267262837, "learning_rate": 4.941065073346315e-07, "logits/chosen": 3.28125, "logits/rejected": 3.203125, "logps/chosen": -1568.0, "logps/rejected": -1528.0, "loss": 0.6974, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.375, "rewards/margins": -0.0235595703125, "rewards/rejected": 0.3984375, "step": 563 }, { "epoch": 0.16270012981393336, "grad_norm": 13.352533989655855, "learning_rate": 4.940520295599858e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1712.0, "logps/rejected": -1544.0, "loss": 0.6791, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.453125, "rewards/margins": 0.0859375, "rewards/rejected": 0.3671875, "step": 564 }, { "epoch": 0.16298860522140488, "grad_norm": 13.705771691596883, "learning_rate": 4.939973041849167e-07, "logits/chosen": 3.140625, "logits/rejected": 3.0625, "logps/chosen": -1616.0, "logps/rejected": -1584.0, "loss": 0.6919, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.408203125, "rewards/margins": 0.0771484375, "rewards/rejected": 0.33203125, "step": 565 }, { "epoch": 0.1632770806288764, "grad_norm": 10.531353691490725, "learning_rate": 4.939423312649454e-07, "logits/chosen": 3.296875, "logits/rejected": 3.28125, "logps/chosen": -1808.0, "logps/rejected": -1720.0, "loss": 0.6892, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.546875, "rewards/margins": 0.142578125, "rewards/rejected": 0.40234375, "step": 566 }, { "epoch": 0.1635655560363479, "grad_norm": 10.918313592091286, "learning_rate": 4.93887110855844e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0625, "logps/chosen": -1144.0, "logps/rejected": -1384.0, "loss": 0.6977, "loss/demonstration_loss": -2560.0, "loss/preference_loss": -2560.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.32421875, "rewards/margins": 0.00628662109375, "rewards/rejected": 0.318359375, "step": 567 }, { "epoch": 0.1638540314438194, "grad_norm": 10.036954175094754, "learning_rate": 4.938316430136359e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -992.0, "logps/rejected": -1288.0, "loss": 0.6988, "loss/demonstration_loss": -2304.0, "loss/preference_loss": -2320.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3046875, "rewards/margins": -0.04541015625, "rewards/rejected": 0.349609375, "step": 568 }, { "epoch": 0.16414250685129092, "grad_norm": 11.911551589305803, "learning_rate": 4.937759277945954e-07, "logits/chosen": 3.21875, "logits/rejected": 3.125, "logps/chosen": -1640.0, "logps/rejected": -1584.0, "loss": 0.7003, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.353515625, "rewards/margins": 0.003631591796875, "rewards/rejected": 0.349609375, "step": 569 }, { "epoch": 0.16443098225876243, "grad_norm": 10.256236683486325, "learning_rate": 4.937199652552477e-07, "logits/chosen": 3.078125, "logits/rejected": 3.078125, "logps/chosen": -1904.0, "logps/rejected": -1496.0, "loss": 0.65, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.53125, "rewards/margins": 0.1689453125, "rewards/rejected": 0.36328125, "step": 570 }, { "epoch": 0.16471945766623394, "grad_norm": 10.931009748407062, "learning_rate": 4.936637554523691e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -2208.0, "logps/rejected": -2096.0, "loss": 0.6785, "loss/demonstration_loss": -4352.0, "loss/preference_loss": -4352.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.453125, "rewards/margins": -0.0223388671875, "rewards/rejected": 0.474609375, "step": 571 }, { "epoch": 0.16500793307370545, "grad_norm": 12.206924178661245, "learning_rate": 4.936072984429866e-07, "logits/chosen": 2.953125, "logits/rejected": 2.96875, "logps/chosen": -1672.0, "logps/rejected": -1600.0, "loss": 0.6725, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.39453125, "rewards/margins": 0.0089111328125, "rewards/rejected": 0.38671875, "step": 572 }, { "epoch": 0.165296408481177, "grad_norm": 11.448393418558922, "learning_rate": 4.935505942843781e-07, "logits/chosen": 2.96875, "logits/rejected": 3.0, "logps/chosen": -1600.0, "logps/rejected": -1624.0, "loss": 0.7087, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4375, "rewards/margins": 0.0361328125, "rewards/rejected": 0.40234375, "step": 573 }, { "epoch": 0.1655848838886485, "grad_norm": 11.696136392422394, "learning_rate": 4.934936430340724e-07, "logits/chosen": 3.09375, "logits/rejected": 3.078125, "logps/chosen": -1328.0, "logps/rejected": -1360.0, "loss": 0.7256, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2736.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.353515625, "rewards/margins": -0.07568359375, "rewards/rejected": 0.4296875, "step": 574 }, { "epoch": 0.16587335929612002, "grad_norm": 13.479571754239823, "learning_rate": 4.934364447498484e-07, "logits/chosen": 3.140625, "logits/rejected": 3.03125, "logps/chosen": -1888.0, "logps/rejected": -1744.0, "loss": 0.6829, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.42578125, "rewards/margins": 0.0301513671875, "rewards/rejected": 0.39453125, "step": 575 }, { "epoch": 0.16616183470359153, "grad_norm": 12.215958988474764, "learning_rate": 4.933789994897362e-07, "logits/chosen": 2.96875, "logits/rejected": 2.8125, "logps/chosen": -2064.0, "logps/rejected": -2048.0, "loss": 0.702, "loss/demonstration_loss": -4160.0, "loss/preference_loss": -4160.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.498046875, "rewards/margins": 0.004150390625, "rewards/rejected": 0.494140625, "step": 576 }, { "epoch": 0.16645031011106304, "grad_norm": 11.011025757107882, "learning_rate": 4.933213073120163e-07, "logits/chosen": 3.03125, "logits/rejected": 3.078125, "logps/chosen": -1640.0, "logps/rejected": -1712.0, "loss": 0.6902, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.4921875, "rewards/margins": -0.048095703125, "rewards/rejected": 0.54296875, "step": 577 }, { "epoch": 0.16673878551853455, "grad_norm": 13.528056439672381, "learning_rate": 4.932633682752199e-07, "logits/chosen": 2.890625, "logits/rejected": 2.890625, "logps/chosen": -1776.0, "logps/rejected": -1624.0, "loss": 0.7017, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.384765625, "rewards/margins": -0.03662109375, "rewards/rejected": 0.419921875, "step": 578 }, { "epoch": 0.16702726092600606, "grad_norm": 10.212937356123117, "learning_rate": 4.932051824381281e-07, "logits/chosen": 3.03125, "logits/rejected": 3.140625, "logps/chosen": -1840.0, "logps/rejected": -1696.0, "loss": 0.6677, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.416015625, "rewards/margins": 0.050537109375, "rewards/rejected": 0.365234375, "step": 579 }, { "epoch": 0.16731573633347757, "grad_norm": 11.950478176164427, "learning_rate": 4.931467498597728e-07, "logits/chosen": 3.125, "logits/rejected": 3.078125, "logps/chosen": -1936.0, "logps/rejected": -1712.0, "loss": 0.6985, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.32421875, "rewards/margins": -0.0274658203125, "rewards/rejected": 0.3515625, "step": 580 }, { "epoch": 0.16760421174094908, "grad_norm": 12.326379214219001, "learning_rate": 4.930880705994362e-07, "logits/chosen": 3.015625, "logits/rejected": 3.046875, "logps/chosen": -1576.0, "logps/rejected": -1632.0, "loss": 0.6763, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.44921875, "rewards/margins": 0.0205078125, "rewards/rejected": 0.4296875, "step": 581 }, { "epoch": 0.1678926871484206, "grad_norm": 11.451542913511348, "learning_rate": 4.930291447166509e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0625, "logps/chosen": -1784.0, "logps/rejected": -1672.0, "loss": 0.6608, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.435546875, "rewards/margins": 0.05712890625, "rewards/rejected": 0.37890625, "step": 582 }, { "epoch": 0.1681811625558921, "grad_norm": 11.333245878375362, "learning_rate": 4.929699722711993e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0, "logps/chosen": -1696.0, "logps/rejected": -1760.0, "loss": 0.6874, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.412109375, "rewards/margins": -0.007568359375, "rewards/rejected": 0.419921875, "step": 583 }, { "epoch": 0.16846963796336362, "grad_norm": 11.50176149428191, "learning_rate": 4.929105533231143e-07, "logits/chosen": 3.0, "logits/rejected": 2.984375, "logps/chosen": -1824.0, "logps/rejected": -1816.0, "loss": 0.6827, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.470703125, "rewards/margins": 0.03369140625, "rewards/rejected": 0.4375, "step": 584 }, { "epoch": 0.16875811337083513, "grad_norm": 12.006637934597457, "learning_rate": 4.928508879326787e-07, "logits/chosen": 2.90625, "logits/rejected": 3.0, "logps/chosen": -1584.0, "logps/rejected": -1616.0, "loss": 0.6583, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.41015625, "rewards/margins": 0.115234375, "rewards/rejected": 0.294921875, "step": 585 }, { "epoch": 0.16904658877830664, "grad_norm": 11.300936879759172, "learning_rate": 4.927909761604254e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -1768.0, "logps/rejected": -1800.0, "loss": 0.71, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.494140625, "rewards/margins": 0.0137939453125, "rewards/rejected": 0.48046875, "step": 586 }, { "epoch": 0.16933506418577815, "grad_norm": 11.018490069183825, "learning_rate": 4.927308180671375e-07, "logits/chosen": 3.078125, "logits/rejected": 3.09375, "logps/chosen": -1776.0, "logps/rejected": -1664.0, "loss": 0.6876, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.486328125, "rewards/margins": -0.037109375, "rewards/rejected": 0.5234375, "step": 587 }, { "epoch": 0.1696235395932497, "grad_norm": 10.37678633527871, "learning_rate": 4.926704137138473e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1408.0, "logps/rejected": -1520.0, "loss": 0.7058, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.30859375, "rewards/margins": -0.0693359375, "rewards/rejected": 0.37890625, "step": 588 }, { "epoch": 0.1699120150007212, "grad_norm": 11.352274123987936, "learning_rate": 4.926097631618378e-07, "logits/chosen": 2.765625, "logits/rejected": 2.859375, "logps/chosen": -1472.0, "logps/rejected": -1488.0, "loss": 0.69, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.50390625, "rewards/margins": 0.0947265625, "rewards/rejected": 0.412109375, "step": 589 }, { "epoch": 0.1702004904081927, "grad_norm": 9.35952563279581, "learning_rate": 4.925488664726413e-07, "logits/chosen": 3.109375, "logits/rejected": 3.09375, "logps/chosen": -1488.0, "logps/rejected": -1520.0, "loss": 0.6826, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.33203125, "rewards/margins": -0.00543212890625, "rewards/rejected": 0.337890625, "step": 590 }, { "epoch": 0.17048896581566422, "grad_norm": 10.682379410068636, "learning_rate": 4.924877237080397e-07, "logits/chosen": 3.09375, "logits/rejected": 3.140625, "logps/chosen": -1872.0, "logps/rejected": -1688.0, "loss": 0.6798, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5390625, "rewards/margins": 0.05615234375, "rewards/rejected": 0.48046875, "step": 591 }, { "epoch": 0.17077744122313573, "grad_norm": 10.630494997162494, "learning_rate": 4.924263349300649e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1664.0, "logps/rejected": -1608.0, "loss": 0.6752, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.498046875, "rewards/margins": 0.051025390625, "rewards/rejected": 0.447265625, "step": 592 }, { "epoch": 0.17106591663060725, "grad_norm": 10.758262688015513, "learning_rate": 4.923647002009983e-07, "logits/chosen": 3.0625, "logits/rejected": 3.1875, "logps/chosen": -2128.0, "logps/rejected": -1688.0, "loss": 0.6545, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.494140625, "rewards/margins": 0.126953125, "rewards/rejected": 0.3671875, "step": 593 }, { "epoch": 0.17135439203807876, "grad_norm": 11.000373621627954, "learning_rate": 4.923028195833706e-07, "logits/chosen": 2.984375, "logits/rejected": 3.0625, "logps/chosen": -1584.0, "logps/rejected": -1472.0, "loss": 0.7069, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3203125, "rewards/margins": -0.0194091796875, "rewards/rejected": 0.33984375, "step": 594 }, { "epoch": 0.17164286744555027, "grad_norm": 11.158457197307875, "learning_rate": 4.922406931399623e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1600.0, "logps/rejected": -1544.0, "loss": 0.6865, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.380859375, "rewards/margins": -0.059326171875, "rewards/rejected": 0.44140625, "step": 595 }, { "epoch": 0.17193134285302178, "grad_norm": 10.187945197873365, "learning_rate": 4.921783209338031e-07, "logits/chosen": 3.0625, "logits/rejected": 3.125, "logps/chosen": -1376.0, "logps/rejected": -1096.0, "loss": 0.6556, "loss/demonstration_loss": -2512.0, "loss/preference_loss": -2496.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.3515625, "rewards/margins": 0.11865234375, "rewards/rejected": 0.232421875, "step": 596 }, { "epoch": 0.1722198182604933, "grad_norm": 11.45351416140906, "learning_rate": 4.921157030281719e-07, "logits/chosen": 3.203125, "logits/rejected": 3.203125, "logps/chosen": -1832.0, "logps/rejected": -1920.0, "loss": 0.6691, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.515625, "rewards/margins": 0.0004425048828125, "rewards/rejected": 0.515625, "step": 597 }, { "epoch": 0.1725082936679648, "grad_norm": 12.095494544379603, "learning_rate": 4.920528394865973e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1288.0, "logps/rejected": -1448.0, "loss": 0.6811, "loss/demonstration_loss": -2768.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.3828125, "rewards/margins": 0.08837890625, "rewards/rejected": 0.29296875, "step": 598 }, { "epoch": 0.1727967690754363, "grad_norm": 11.033353921402762, "learning_rate": 4.919897303728565e-07, "logits/chosen": 2.984375, "logits/rejected": 2.890625, "logps/chosen": -1408.0, "logps/rejected": -1552.0, "loss": 0.6855, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.32421875, "rewards/margins": 0.052978515625, "rewards/rejected": 0.271484375, "step": 599 }, { "epoch": 0.17308524448290782, "grad_norm": 13.282549704639296, "learning_rate": 4.919263757509765e-07, "logits/chosen": 3.125, "logits/rejected": 3.125, "logps/chosen": -1888.0, "logps/rejected": -1904.0, "loss": 0.7312, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.63671875, "rewards/margins": -0.0140380859375, "rewards/rejected": 0.6484375, "step": 600 }, { "epoch": 0.17337371989037934, "grad_norm": 10.464614874282104, "learning_rate": 4.91862775685233e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1480.0, "logps/rejected": -1560.0, "loss": 0.6664, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.443359375, "rewards/margins": 0.04638671875, "rewards/rejected": 0.3984375, "step": 601 }, { "epoch": 0.17366219529785085, "grad_norm": 11.163597359640665, "learning_rate": 4.917989302401507e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1880.0, "logps/rejected": -1936.0, "loss": 0.6786, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.53515625, "rewards/margins": 0.0283203125, "rewards/rejected": 0.5078125, "step": 602 }, { "epoch": 0.17395067070532236, "grad_norm": 13.816618698580243, "learning_rate": 4.917348394805034e-07, "logits/chosen": 2.96875, "logits/rejected": 3.0, "logps/chosen": -1520.0, "logps/rejected": -1464.0, "loss": 0.666, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.462890625, "rewards/margins": 0.083984375, "rewards/rejected": 0.37890625, "step": 603 }, { "epoch": 0.1742391461127939, "grad_norm": 12.745696270952374, "learning_rate": 4.916705034713136e-07, "logits/chosen": 2.921875, "logits/rejected": 2.984375, "logps/chosen": -1720.0, "logps/rejected": -1816.0, "loss": 0.6978, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.44921875, "rewards/margins": 0.0419921875, "rewards/rejected": 0.408203125, "step": 604 }, { "epoch": 0.1745276215202654, "grad_norm": 11.53009242379067, "learning_rate": 4.916059222778529e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1312.0, "logps/rejected": -1320.0, "loss": 0.7162, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2672.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3984375, "rewards/margins": 0.03271484375, "rewards/rejected": 0.3671875, "step": 605 }, { "epoch": 0.17481609692773692, "grad_norm": 9.584784133565641, "learning_rate": 4.915410959656414e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -1512.0, "logps/rejected": -1656.0, "loss": 0.6971, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.341796875, "rewards/margins": -0.058837890625, "rewards/rejected": 0.400390625, "step": 606 }, { "epoch": 0.17510457233520843, "grad_norm": 11.434226357045482, "learning_rate": 4.914760246004477e-07, "logits/chosen": 3.046875, "logits/rejected": 2.96875, "logps/chosen": -1592.0, "logps/rejected": -1760.0, "loss": 0.6572, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.328125, "rewards/margins": 0.025634765625, "rewards/rejected": 0.302734375, "step": 607 }, { "epoch": 0.17539304774267994, "grad_norm": 11.784044376966978, "learning_rate": 4.914107082482897e-07, "logits/chosen": 3.09375, "logits/rejected": 3.109375, "logps/chosen": -1864.0, "logps/rejected": -1808.0, "loss": 0.6782, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.515625, "rewards/margins": -0.0205078125, "rewards/rejected": 0.53515625, "step": 608 }, { "epoch": 0.17568152315015145, "grad_norm": 11.845626202546088, "learning_rate": 4.91345146975433e-07, "logits/chosen": 3.171875, "logits/rejected": 3.109375, "logps/chosen": -1760.0, "logps/rejected": -1824.0, "loss": 0.692, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4765625, "rewards/margins": 0.045166015625, "rewards/rejected": 0.431640625, "step": 609 }, { "epoch": 0.17596999855762296, "grad_norm": 11.290759515734107, "learning_rate": 4.912793408483925e-07, "logits/chosen": 3.25, "logits/rejected": 3.125, "logps/chosen": -2048.0, "logps/rejected": -2080.0, "loss": 0.6589, "loss/demonstration_loss": -4160.0, "loss/preference_loss": -4160.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.5234375, "rewards/margins": 0.1103515625, "rewards/rejected": 0.412109375, "step": 610 }, { "epoch": 0.17625847396509448, "grad_norm": 11.386363632287404, "learning_rate": 4.912132899339309e-07, "logits/chosen": 3.140625, "logits/rejected": 3.109375, "logps/chosen": -1832.0, "logps/rejected": -2008.0, "loss": 0.6796, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.498046875, "rewards/margins": -0.0283203125, "rewards/rejected": 0.52734375, "step": 611 }, { "epoch": 0.176546949372566, "grad_norm": 10.247241951203067, "learning_rate": 4.911469942990593e-07, "logits/chosen": 3.046875, "logits/rejected": 2.984375, "logps/chosen": -1360.0, "logps/rejected": -1392.0, "loss": 0.6765, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.416015625, "rewards/margins": 0.07177734375, "rewards/rejected": 0.34375, "step": 612 }, { "epoch": 0.1768354247800375, "grad_norm": 11.623852092176875, "learning_rate": 4.910804540110377e-07, "logits/chosen": 3.015625, "logits/rejected": 3.03125, "logps/chosen": -1832.0, "logps/rejected": -1752.0, "loss": 0.6556, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.455078125, "rewards/margins": 0.09521484375, "rewards/rejected": 0.359375, "step": 613 }, { "epoch": 0.177123900187509, "grad_norm": 11.36474654173702, "learning_rate": 4.910136691373734e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1816.0, "logps/rejected": -1856.0, "loss": 0.6914, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5, "rewards/margins": 0.00396728515625, "rewards/rejected": 0.498046875, "step": 614 }, { "epoch": 0.17741237559498052, "grad_norm": 11.105889579445893, "learning_rate": 4.909466397458225e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1728.0, "logps/rejected": -1792.0, "loss": 0.7299, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.4375, "rewards/margins": -0.0301513671875, "rewards/rejected": 0.46875, "step": 615 }, { "epoch": 0.17770085100245203, "grad_norm": 10.543388807209881, "learning_rate": 4.90879365904389e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1640.0, "logps/rejected": -1728.0, "loss": 0.7005, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.365234375, "rewards/margins": -0.04541015625, "rewards/rejected": 0.41015625, "step": 616 }, { "epoch": 0.17798932640992354, "grad_norm": 11.728949498890294, "learning_rate": 4.908118476813246e-07, "logits/chosen": 3.171875, "logits/rejected": 3.1875, "logps/chosen": -1792.0, "logps/rejected": -1688.0, "loss": 0.6853, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.546875, "rewards/margins": 0.1044921875, "rewards/rejected": 0.44140625, "step": 617 }, { "epoch": 0.17827780181739505, "grad_norm": 9.667782757607464, "learning_rate": 4.907440851451296e-07, "logits/chosen": 3.078125, "logits/rejected": 3.0, "logps/chosen": -1416.0, "logps/rejected": -1528.0, "loss": 0.6843, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.462890625, "rewards/margins": 0.0198974609375, "rewards/rejected": 0.443359375, "step": 618 }, { "epoch": 0.1785662772248666, "grad_norm": 12.104784553070072, "learning_rate": 4.906760783645516e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -2064.0, "logps/rejected": -2192.0, "loss": 0.7223, "loss/demonstration_loss": -4320.0, "loss/preference_loss": -4320.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.57421875, "rewards/margins": -0.068359375, "rewards/rejected": 0.640625, "step": 619 }, { "epoch": 0.1788547526323381, "grad_norm": 12.267576427240048, "learning_rate": 4.906078274085861e-07, "logits/chosen": 3.1875, "logits/rejected": 3.078125, "logps/chosen": -1744.0, "logps/rejected": -1568.0, "loss": 0.6486, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.44921875, "rewards/margins": 0.1455078125, "rewards/rejected": 0.302734375, "step": 620 }, { "epoch": 0.17914322803980962, "grad_norm": 11.778584569235743, "learning_rate": 4.905393323464763e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1624.0, "logps/rejected": -1616.0, "loss": 0.7095, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.39453125, "rewards/margins": -0.03564453125, "rewards/rejected": 0.431640625, "step": 621 }, { "epoch": 0.17943170344728113, "grad_norm": 10.696015523265535, "learning_rate": 4.904705932477135e-07, "logits/chosen": 3.1875, "logits/rejected": 3.109375, "logps/chosen": -2112.0, "logps/rejected": -2024.0, "loss": 0.6873, "loss/demonstration_loss": -4192.0, "loss/preference_loss": -4192.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.4921875, "rewards/margins": 0.002197265625, "rewards/rejected": 0.490234375, "step": 622 }, { "epoch": 0.17972017885475264, "grad_norm": 12.117510568408344, "learning_rate": 4.904016101820359e-07, "logits/chosen": 3.15625, "logits/rejected": 3.109375, "logps/chosen": -1760.0, "logps/rejected": -1568.0, "loss": 0.6965, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.408203125, "rewards/margins": 0.10009765625, "rewards/rejected": 0.30859375, "step": 623 }, { "epoch": 0.18000865426222415, "grad_norm": 11.436652194332098, "learning_rate": 4.903323832194296e-07, "logits/chosen": 3.0, "logits/rejected": 2.984375, "logps/chosen": -1440.0, "logps/rejected": -1448.0, "loss": 0.6726, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.447265625, "rewards/margins": 0.048583984375, "rewards/rejected": 0.3984375, "step": 624 }, { "epoch": 0.18029712966969566, "grad_norm": 10.386732486544162, "learning_rate": 4.902629124301282e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1456.0, "logps/rejected": -1456.0, "loss": 0.6946, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.5, "rewards/margins": -0.01007080078125, "rewards/rejected": 0.51171875, "step": 625 }, { "epoch": 0.18058560507716717, "grad_norm": 10.337238726866888, "learning_rate": 4.901931978846125e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1656.0, "logps/rejected": -1760.0, "loss": 0.7064, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.443359375, "rewards/margins": -0.04150390625, "rewards/rejected": 0.484375, "step": 626 }, { "epoch": 0.18087408048463868, "grad_norm": 11.682357066808725, "learning_rate": 4.901232396536105e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1880.0, "logps/rejected": -1624.0, "loss": 0.6703, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.59765625, "rewards/margins": 0.08251953125, "rewards/rejected": 0.515625, "step": 627 }, { "epoch": 0.1811625558921102, "grad_norm": 10.479362323288035, "learning_rate": 4.90053037808098e-07, "logits/chosen": 3.078125, "logits/rejected": 3.0625, "logps/chosen": -1704.0, "logps/rejected": -1576.0, "loss": 0.6888, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.40625, "rewards/margins": 0.0400390625, "rewards/rejected": 0.3671875, "step": 628 }, { "epoch": 0.1814510312995817, "grad_norm": 11.814134280954951, "learning_rate": 4.899825924192972e-07, "logits/chosen": 3.25, "logits/rejected": 3.203125, "logps/chosen": -1832.0, "logps/rejected": -1800.0, "loss": 0.6847, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.470703125, "rewards/margins": 0.042236328125, "rewards/rejected": 0.427734375, "step": 629 }, { "epoch": 0.18173950670705322, "grad_norm": 11.627789784074366, "learning_rate": 4.899119035586778e-07, "logits/chosen": 3.234375, "logits/rejected": 3.1875, "logps/chosen": -1512.0, "logps/rejected": -1704.0, "loss": 0.7087, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.48828125, "rewards/margins": -0.029052734375, "rewards/rejected": 0.51953125, "step": 630 }, { "epoch": 0.18202798211452473, "grad_norm": 12.232679814842381, "learning_rate": 4.898409712979565e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1192.0, "logps/rejected": -1160.0, "loss": 0.6701, "loss/demonstration_loss": -2384.0, "loss/preference_loss": -2384.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.400390625, "rewards/margins": 0.0269775390625, "rewards/rejected": 0.373046875, "step": 631 }, { "epoch": 0.18231645752199624, "grad_norm": 10.452932775255238, "learning_rate": 4.897697957090968e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1744.0, "logps/rejected": -1728.0, "loss": 0.6725, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.4609375, "rewards/margins": 0.0224609375, "rewards/rejected": 0.439453125, "step": 632 }, { "epoch": 0.18260493292946775, "grad_norm": 13.91406027162934, "learning_rate": 4.896983768643091e-07, "logits/chosen": 3.15625, "logits/rejected": 3.140625, "logps/chosen": -2400.0, "logps/rejected": -2288.0, "loss": 0.6609, "loss/demonstration_loss": -4768.0, "loss/preference_loss": -4736.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.62890625, "rewards/margins": 0.1796875, "rewards/rejected": 0.447265625, "step": 633 }, { "epoch": 0.1828934083369393, "grad_norm": 11.955134698840961, "learning_rate": 4.896267148360509e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -2128.0, "logps/rejected": -1760.0, "loss": 0.6721, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3920.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.484375, "rewards/margins": 0.0299072265625, "rewards/rejected": 0.455078125, "step": 634 }, { "epoch": 0.1831818837444108, "grad_norm": 11.61545895947558, "learning_rate": 4.895548096970259e-07, "logits/chosen": 3.171875, "logits/rejected": 3.171875, "logps/chosen": -1496.0, "logps/rejected": -1608.0, "loss": 0.692, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.4375, "rewards/margins": -0.018310546875, "rewards/rejected": 0.45703125, "step": 635 }, { "epoch": 0.1834703591518823, "grad_norm": 11.61130649929906, "learning_rate": 4.894826615201849e-07, "logits/chosen": 3.1875, "logits/rejected": 3.078125, "logps/chosen": -1784.0, "logps/rejected": -1656.0, "loss": 0.7145, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.435546875, "rewards/margins": -0.005767822265625, "rewards/rejected": 0.44140625, "step": 636 }, { "epoch": 0.18375883455935382, "grad_norm": 10.595527960954731, "learning_rate": 4.894102703787249e-07, "logits/chosen": 3.140625, "logits/rejected": 3.203125, "logps/chosen": -1808.0, "logps/rejected": -1584.0, "loss": 0.6552, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.48046875, "rewards/margins": 0.0927734375, "rewards/rejected": 0.38671875, "step": 637 }, { "epoch": 0.18404730996682533, "grad_norm": 10.43162983484717, "learning_rate": 4.893376363460896e-07, "logits/chosen": 3.140625, "logits/rejected": 3.203125, "logps/chosen": -1632.0, "logps/rejected": -1648.0, "loss": 0.6871, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.466796875, "rewards/margins": -0.00634765625, "rewards/rejected": 0.474609375, "step": 638 }, { "epoch": 0.18433578537429685, "grad_norm": 11.031885564984536, "learning_rate": 4.892647594959691e-07, "logits/chosen": 3.0625, "logits/rejected": 3.09375, "logps/chosen": -1520.0, "logps/rejected": -1800.0, "loss": 0.6968, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.44140625, "rewards/margins": -0.01513671875, "rewards/rejected": 0.45703125, "step": 639 }, { "epoch": 0.18462426078176836, "grad_norm": 10.853815900375052, "learning_rate": 4.891916399022999e-07, "logits/chosen": 3.09375, "logits/rejected": 3.15625, "logps/chosen": -1632.0, "logps/rejected": -1512.0, "loss": 0.6938, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.427734375, "rewards/margins": 0.0439453125, "rewards/rejected": 0.3828125, "step": 640 }, { "epoch": 0.18491273618923987, "grad_norm": 11.618782679854233, "learning_rate": 4.891182776392647e-07, "logits/chosen": 2.984375, "logits/rejected": 3.03125, "logps/chosen": -1736.0, "logps/rejected": -1488.0, "loss": 0.6592, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.455078125, "rewards/margins": 0.08544921875, "rewards/rejected": 0.369140625, "step": 641 }, { "epoch": 0.18520121159671138, "grad_norm": 11.376716137136828, "learning_rate": 4.890446727812924e-07, "logits/chosen": 3.15625, "logits/rejected": 3.109375, "logps/chosen": -1848.0, "logps/rejected": -1744.0, "loss": 0.6695, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5625, "rewards/margins": 0.1298828125, "rewards/rejected": 0.431640625, "step": 642 }, { "epoch": 0.1854896870041829, "grad_norm": 9.360828919284325, "learning_rate": 4.889708254030581e-07, "logits/chosen": 2.96875, "logits/rejected": 2.96875, "logps/chosen": -1664.0, "logps/rejected": -1592.0, "loss": 0.6411, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.484375, "rewards/margins": 0.1328125, "rewards/rejected": 0.353515625, "step": 643 }, { "epoch": 0.1857781624116544, "grad_norm": 9.707904199639916, "learning_rate": 4.888967355794829e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1728.0, "logps/rejected": -1568.0, "loss": 0.6674, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.41015625, "rewards/margins": 0.10546875, "rewards/rejected": 0.3046875, "step": 644 }, { "epoch": 0.1860666378191259, "grad_norm": 11.937288760650981, "learning_rate": 4.888224033857337e-07, "logits/chosen": 3.15625, "logits/rejected": 3.140625, "logps/chosen": -1792.0, "logps/rejected": -1808.0, "loss": 0.6746, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.51953125, "rewards/margins": 0.00341796875, "rewards/rejected": 0.515625, "step": 645 }, { "epoch": 0.18635511322659742, "grad_norm": 12.499561749000547, "learning_rate": 4.887478288972234e-07, "logits/chosen": 3.21875, "logits/rejected": 3.171875, "logps/chosen": -1680.0, "logps/rejected": -1584.0, "loss": 0.6901, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.451171875, "rewards/margins": -0.01611328125, "rewards/rejected": 0.466796875, "step": 646 }, { "epoch": 0.18664358863406894, "grad_norm": 11.264640436866037, "learning_rate": 4.88673012189611e-07, "logits/chosen": 3.21875, "logits/rejected": 3.109375, "logps/chosen": -1648.0, "logps/rejected": -1664.0, "loss": 0.6767, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4296875, "rewards/margins": 0.0, "rewards/rejected": 0.4296875, "step": 647 }, { "epoch": 0.18693206404154045, "grad_norm": 11.338102487716444, "learning_rate": 4.885979533388009e-07, "logits/chosen": 3.09375, "logits/rejected": 3.015625, "logps/chosen": -1576.0, "logps/rejected": -1720.0, "loss": 0.7008, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.388671875, "rewards/margins": 0.0228271484375, "rewards/rejected": 0.3671875, "step": 648 }, { "epoch": 0.18722053944901196, "grad_norm": 11.314978052708812, "learning_rate": 4.885226524209432e-07, "logits/chosen": 3.078125, "logits/rejected": 3.109375, "logps/chosen": -1624.0, "logps/rejected": -1848.0, "loss": 0.709, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.52734375, "rewards/margins": 0.00408935546875, "rewards/rejected": 0.5234375, "step": 649 }, { "epoch": 0.1875090148564835, "grad_norm": 9.312481855897774, "learning_rate": 4.884471095124337e-07, "logits/chosen": 3.125, "logits/rejected": 3.09375, "logps/chosen": -1304.0, "logps/rejected": -1192.0, "loss": 0.6609, "loss/demonstration_loss": -2544.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.40234375, "rewards/margins": 0.056884765625, "rewards/rejected": 0.34375, "step": 650 }, { "epoch": 0.187797490263955, "grad_norm": 11.218547720768596, "learning_rate": 4.883713246899137e-07, "logits/chosen": 3.15625, "logits/rejected": 3.109375, "logps/chosen": -2000.0, "logps/rejected": -1896.0, "loss": 0.6613, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.62109375, "rewards/margins": 0.09814453125, "rewards/rejected": 0.5234375, "step": 651 }, { "epoch": 0.18808596567142652, "grad_norm": 10.364791336068805, "learning_rate": 4.882952980302699e-07, "logits/chosen": 3.046875, "logits/rejected": 3.09375, "logps/chosen": -1504.0, "logps/rejected": -1536.0, "loss": 0.6946, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.333984375, "rewards/margins": 0.01141357421875, "rewards/rejected": 0.322265625, "step": 652 }, { "epoch": 0.18837444107889803, "grad_norm": 11.563087481367752, "learning_rate": 4.882190296106343e-07, "logits/chosen": 3.09375, "logits/rejected": 3.078125, "logps/chosen": -976.0, "logps/rejected": -1112.0, "loss": 0.6973, "loss/demonstration_loss": -2112.0, "loss/preference_loss": -2112.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.21484375, "rewards/margins": -0.0400390625, "rewards/rejected": 0.25390625, "step": 653 }, { "epoch": 0.18866291648636954, "grad_norm": 11.412774475255304, "learning_rate": 4.881425195083842e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1592.0, "logps/rejected": -1288.0, "loss": 0.6699, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.447265625, "rewards/margins": 0.08447265625, "rewards/rejected": 0.36328125, "step": 654 }, { "epoch": 0.18895139189384105, "grad_norm": 11.356964004110743, "learning_rate": 4.880657678011422e-07, "logits/chosen": 3.109375, "logits/rejected": 3.03125, "logps/chosen": -1672.0, "logps/rejected": -1432.0, "loss": 0.647, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.515625, "rewards/margins": 0.10986328125, "rewards/rejected": 0.404296875, "step": 655 }, { "epoch": 0.18923986730131256, "grad_norm": 10.400530437105829, "learning_rate": 4.87988774566776e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1600.0, "logps/rejected": -1544.0, "loss": 0.6767, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.39453125, "rewards/margins": 0.060791015625, "rewards/rejected": 0.33203125, "step": 656 }, { "epoch": 0.18952834270878408, "grad_norm": 11.673465836776696, "learning_rate": 4.879115398833981e-07, "logits/chosen": 3.234375, "logits/rejected": 3.171875, "logps/chosen": -1864.0, "logps/rejected": -1608.0, "loss": 0.6953, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.49609375, "rewards/margins": 0.0556640625, "rewards/rejected": 0.439453125, "step": 657 }, { "epoch": 0.1898168181162556, "grad_norm": 9.708019364590635, "learning_rate": 4.878340638293663e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1352.0, "logps/rejected": -1568.0, "loss": 0.6778, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3203125, "rewards/margins": 0.029296875, "rewards/rejected": 0.291015625, "step": 658 }, { "epoch": 0.1901052935237271, "grad_norm": 11.595573804063251, "learning_rate": 4.87756346483283e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1616.0, "logps/rejected": -1440.0, "loss": 0.7052, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.439453125, "rewards/margins": 0.00701904296875, "rewards/rejected": 0.431640625, "step": 659 }, { "epoch": 0.1903937689311986, "grad_norm": 12.563441180207436, "learning_rate": 4.876783879239955e-07, "logits/chosen": 2.96875, "logits/rejected": 2.953125, "logps/chosen": -1664.0, "logps/rejected": -1608.0, "loss": 0.6707, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.486328125, "rewards/margins": 0.12158203125, "rewards/rejected": 0.365234375, "step": 660 }, { "epoch": 0.19068224433867012, "grad_norm": 10.065375150408824, "learning_rate": 4.876001882305959e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1808.0, "logps/rejected": -1696.0, "loss": 0.6614, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.54296875, "rewards/margins": 0.083984375, "rewards/rejected": 0.458984375, "step": 661 }, { "epoch": 0.19097071974614163, "grad_norm": 12.160144473767362, "learning_rate": 4.875217474824209e-07, "logits/chosen": 3.109375, "logits/rejected": 3.03125, "logps/chosen": -1256.0, "logps/rejected": -1312.0, "loss": 0.7008, "loss/demonstration_loss": -2608.0, "loss/preference_loss": -2608.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.490234375, "rewards/margins": -0.0230712890625, "rewards/rejected": 0.51171875, "step": 662 }, { "epoch": 0.19125919515361314, "grad_norm": 11.04545219522318, "learning_rate": 4.874430657590517e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -1648.0, "logps/rejected": -1544.0, "loss": 0.6885, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.453125, "rewards/margins": 0.033935546875, "rewards/rejected": 0.41796875, "step": 663 }, { "epoch": 0.19154767056108465, "grad_norm": 11.569681034757473, "learning_rate": 4.87364143140314e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0625, "logps/chosen": -1992.0, "logps/rejected": -1624.0, "loss": 0.6685, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.462890625, "rewards/margins": 0.06640625, "rewards/rejected": 0.396484375, "step": 664 }, { "epoch": 0.1918361459685562, "grad_norm": 9.64977957417266, "learning_rate": 4.87284979706278e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1680.0, "logps/rejected": -1344.0, "loss": 0.6656, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.353515625, "rewards/margins": 0.038818359375, "rewards/rejected": 0.314453125, "step": 665 }, { "epoch": 0.1921246213760277, "grad_norm": 10.14867875139304, "learning_rate": 4.87205575537258e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1584.0, "logps/rejected": -1400.0, "loss": 0.6777, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.455078125, "rewards/margins": 0.0849609375, "rewards/rejected": 0.37109375, "step": 666 }, { "epoch": 0.19241309678349922, "grad_norm": 11.971018077768433, "learning_rate": 4.871259307138128e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1720.0, "logps/rejected": -1824.0, "loss": 0.6723, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.53515625, "rewards/margins": 0.06494140625, "rewards/rejected": 0.470703125, "step": 667 }, { "epoch": 0.19270157219097073, "grad_norm": 11.784758492952502, "learning_rate": 4.870460453167451e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1552.0, "logps/rejected": -1680.0, "loss": 0.714, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.419921875, "rewards/margins": 0.007568359375, "rewards/rejected": 0.412109375, "step": 668 }, { "epoch": 0.19299004759844224, "grad_norm": 11.36392329957994, "learning_rate": 4.869659194271019e-07, "logits/chosen": 3.078125, "logits/rejected": 2.953125, "logps/chosen": -1680.0, "logps/rejected": -1680.0, "loss": 0.6641, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4921875, "rewards/margins": 0.05126953125, "rewards/rejected": 0.439453125, "step": 669 }, { "epoch": 0.19327852300591375, "grad_norm": 9.674440585539758, "learning_rate": 4.86885553126174e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -1744.0, "logps/rejected": -1776.0, "loss": 0.6851, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.51953125, "rewards/margins": 0.0498046875, "rewards/rejected": 0.470703125, "step": 670 }, { "epoch": 0.19356699841338526, "grad_norm": 10.617435786761819, "learning_rate": 4.868049464954962e-07, "logits/chosen": 3.0625, "logits/rejected": 2.984375, "logps/chosen": -1496.0, "logps/rejected": -1440.0, "loss": 0.679, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.48046875, "rewards/margins": 0.05224609375, "rewards/rejected": 0.4296875, "step": 671 }, { "epoch": 0.19385547382085677, "grad_norm": 11.911754871557925, "learning_rate": 4.867240996168471e-07, "logits/chosen": 3.03125, "logits/rejected": 3.015625, "logps/chosen": -1288.0, "logps/rejected": -1352.0, "loss": 0.6447, "loss/demonstration_loss": -2672.0, "loss/preference_loss": -2672.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.390625, "rewards/margins": 0.09423828125, "rewards/rejected": 0.296875, "step": 672 }, { "epoch": 0.19414394922832828, "grad_norm": 11.207796371810625, "learning_rate": 4.866430125722491e-07, "logits/chosen": 3.109375, "logits/rejected": 3.125, "logps/chosen": -1528.0, "logps/rejected": -1472.0, "loss": 0.6838, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.41796875, "rewards/margins": -0.034912109375, "rewards/rejected": 0.453125, "step": 673 }, { "epoch": 0.1944324246357998, "grad_norm": 10.63240051661404, "learning_rate": 4.865616854439681e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1720.0, "logps/rejected": -1712.0, "loss": 0.7007, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.470703125, "rewards/margins": -0.03271484375, "rewards/rejected": 0.50390625, "step": 674 }, { "epoch": 0.1947209000432713, "grad_norm": 9.946930580897428, "learning_rate": 4.864801183145138e-07, "logits/chosen": 3.03125, "logits/rejected": 3.125, "logps/chosen": -1784.0, "logps/rejected": -1448.0, "loss": 0.6475, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.50390625, "rewards/margins": 0.140625, "rewards/rejected": 0.36328125, "step": 675 }, { "epoch": 0.19500937545074282, "grad_norm": 10.578052170219387, "learning_rate": 4.863983112666393e-07, "logits/chosen": 3.171875, "logits/rejected": 3.234375, "logps/chosen": -1464.0, "logps/rejected": -1488.0, "loss": 0.6772, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.388671875, "rewards/margins": 0.01177978515625, "rewards/rejected": 0.375, "step": 676 }, { "epoch": 0.19529785085821433, "grad_norm": 10.711801539836088, "learning_rate": 4.863162643833411e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1704.0, "logps/rejected": -1480.0, "loss": 0.675, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.515625, "rewards/margins": 0.08154296875, "rewards/rejected": 0.435546875, "step": 677 }, { "epoch": 0.19558632626568584, "grad_norm": 11.920460055325549, "learning_rate": 4.862339777478587e-07, "logits/chosen": 3.265625, "logits/rejected": 3.265625, "logps/chosen": -2176.0, "logps/rejected": -2040.0, "loss": 0.6811, "loss/demonstration_loss": -4288.0, "loss/preference_loss": -4256.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.56640625, "rewards/margins": 0.041015625, "rewards/rejected": 0.52734375, "step": 678 }, { "epoch": 0.19587480167315735, "grad_norm": 12.109920260590938, "learning_rate": 4.861514514436755e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1840.0, "logps/rejected": -1744.0, "loss": 0.6787, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.4609375, "rewards/margins": 0.05615234375, "rewards/rejected": 0.404296875, "step": 679 }, { "epoch": 0.1961632770806289, "grad_norm": 9.723362757321727, "learning_rate": 4.860686855545175e-07, "logits/chosen": 3.328125, "logits/rejected": 3.21875, "logps/chosen": -1616.0, "logps/rejected": -1576.0, "loss": 0.6614, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.443359375, "rewards/margins": 0.053955078125, "rewards/rejected": 0.388671875, "step": 680 }, { "epoch": 0.1964517524881004, "grad_norm": 12.179933799639938, "learning_rate": 4.859856801643542e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1392.0, "logps/rejected": -1352.0, "loss": 0.7133, "loss/demonstration_loss": -2768.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.32421875, "rewards/margins": -0.01226806640625, "rewards/rejected": 0.3359375, "step": 681 }, { "epoch": 0.1967402278955719, "grad_norm": 9.742605886305688, "learning_rate": 4.859024353573975e-07, "logits/chosen": 3.0625, "logits/rejected": 3.109375, "logps/chosen": -1840.0, "logps/rejected": -1560.0, "loss": 0.6739, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.48046875, "rewards/margins": 0.1181640625, "rewards/rejected": 0.361328125, "step": 682 }, { "epoch": 0.19702870330304342, "grad_norm": 10.591025629184088, "learning_rate": 4.858189512181027e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1728.0, "logps/rejected": -1744.0, "loss": 0.6952, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.51171875, "rewards/margins": 0.0091552734375, "rewards/rejected": 0.50390625, "step": 683 }, { "epoch": 0.19731717871051493, "grad_norm": 10.851007533404983, "learning_rate": 4.857352278311679e-07, "logits/chosen": 3.125, "logits/rejected": 3.078125, "logps/chosen": -1608.0, "logps/rejected": -1536.0, "loss": 0.679, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.50390625, "rewards/margins": 0.111328125, "rewards/rejected": 0.390625, "step": 684 }, { "epoch": 0.19760565411798645, "grad_norm": 11.948588514401317, "learning_rate": 4.856512652815335e-07, "logits/chosen": 3.15625, "logits/rejected": 3.265625, "logps/chosen": -1656.0, "logps/rejected": -1536.0, "loss": 0.6744, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.54296875, "rewards/margins": 0.134765625, "rewards/rejected": 0.41015625, "step": 685 }, { "epoch": 0.19789412952545796, "grad_norm": 10.018760469895629, "learning_rate": 4.85567063654383e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1784.0, "logps/rejected": -1760.0, "loss": 0.689, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.474609375, "rewards/margins": -0.0108642578125, "rewards/rejected": 0.484375, "step": 686 }, { "epoch": 0.19818260493292947, "grad_norm": 14.102078162572806, "learning_rate": 4.854826230351425e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -1504.0, "logps/rejected": -1488.0, "loss": 0.6993, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.56640625, "rewards/margins": 0.083984375, "rewards/rejected": 0.482421875, "step": 687 }, { "epoch": 0.19847108034040098, "grad_norm": 10.799319125408367, "learning_rate": 4.853979435094798e-07, "logits/chosen": 3.109375, "logits/rejected": 3.046875, "logps/chosen": -1456.0, "logps/rejected": -1648.0, "loss": 0.6768, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4375, "rewards/margins": -0.02587890625, "rewards/rejected": 0.462890625, "step": 688 }, { "epoch": 0.1987595557478725, "grad_norm": 11.440696500060202, "learning_rate": 4.853130251633061e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1776.0, "logps/rejected": -1856.0, "loss": 0.6583, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.53125, "rewards/margins": 0.11181640625, "rewards/rejected": 0.419921875, "step": 689 }, { "epoch": 0.199048031155344, "grad_norm": 11.920239418737804, "learning_rate": 4.852278680827741e-07, "logits/chosen": 3.046875, "logits/rejected": 3.1875, "logps/chosen": -1712.0, "logps/rejected": -1752.0, "loss": 0.7269, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.439453125, "rewards/margins": -0.1640625, "rewards/rejected": 0.60546875, "step": 690 }, { "epoch": 0.1993365065628155, "grad_norm": 10.259290554511898, "learning_rate": 4.851424723542793e-07, "logits/chosen": 3.109375, "logits/rejected": 3.09375, "logps/chosen": -1600.0, "logps/rejected": -1312.0, "loss": 0.6951, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.41796875, "rewards/margins": 0.07568359375, "rewards/rejected": 0.341796875, "step": 691 }, { "epoch": 0.19962498197028702, "grad_norm": 11.312589039583735, "learning_rate": 4.850568380644587e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1296.0, "logps/rejected": -1336.0, "loss": 0.6781, "loss/demonstration_loss": -2656.0, "loss/preference_loss": -2656.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.34765625, "rewards/margins": -0.02685546875, "rewards/rejected": 0.375, "step": 692 }, { "epoch": 0.19991345737775854, "grad_norm": 12.263397013496872, "learning_rate": 4.849709653001921e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1440.0, "logps/rejected": -1440.0, "loss": 0.6759, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5, "rewards/margins": 0.099609375, "rewards/rejected": 0.40234375, "step": 693 }, { "epoch": 0.20020193278523005, "grad_norm": 10.957957872409663, "learning_rate": 4.848848541486005e-07, "logits/chosen": 3.015625, "logits/rejected": 3.03125, "logps/chosen": -1560.0, "logps/rejected": -1536.0, "loss": 0.688, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.416015625, "rewards/margins": -0.034423828125, "rewards/rejected": 0.451171875, "step": 694 }, { "epoch": 0.20049040819270159, "grad_norm": 10.271050783069304, "learning_rate": 4.847985046970471e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1432.0, "logps/rejected": -1424.0, "loss": 0.7087, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.447265625, "rewards/margins": 0.020263671875, "rewards/rejected": 0.42578125, "step": 695 }, { "epoch": 0.2007788836001731, "grad_norm": 11.926952083717993, "learning_rate": 4.847119170331369e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1784.0, "logps/rejected": -1664.0, "loss": 0.6847, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.455078125, "rewards/margins": 0.00732421875, "rewards/rejected": 0.447265625, "step": 696 }, { "epoch": 0.2010673590076446, "grad_norm": 10.520748430768954, "learning_rate": 4.846250912447164e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1576.0, "logps/rejected": -1528.0, "loss": 0.696, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.380859375, "rewards/margins": -0.0274658203125, "rewards/rejected": 0.408203125, "step": 697 }, { "epoch": 0.20135583441511612, "grad_norm": 10.999228935581225, "learning_rate": 4.84538027419874e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1720.0, "logps/rejected": -1632.0, "loss": 0.6985, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.59375, "rewards/margins": 0.028076171875, "rewards/rejected": 0.56640625, "step": 698 }, { "epoch": 0.20164430982258763, "grad_norm": 11.836114288510027, "learning_rate": 4.844507256469392e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1944.0, "logps/rejected": -1872.0, "loss": 0.6912, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.5859375, "rewards/margins": -0.0157470703125, "rewards/rejected": 0.6015625, "step": 699 }, { "epoch": 0.20193278523005914, "grad_norm": 11.081049944785299, "learning_rate": 4.843631860144831e-07, "logits/chosen": 3.28125, "logits/rejected": 3.3125, "logps/chosen": -1416.0, "logps/rejected": -1360.0, "loss": 0.7131, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.443359375, "rewards/margins": 0.0107421875, "rewards/rejected": 0.43359375, "step": 700 }, { "epoch": 0.20222126063753065, "grad_norm": 11.318437277350487, "learning_rate": 4.842754086113183e-07, "logits/chosen": 3.15625, "logits/rejected": 3.125, "logps/chosen": -1480.0, "logps/rejected": -1608.0, "loss": 0.6798, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.37890625, "rewards/margins": -0.043701171875, "rewards/rejected": 0.421875, "step": 701 }, { "epoch": 0.20250973604500216, "grad_norm": 11.823880302215205, "learning_rate": 4.841873935264982e-07, "logits/chosen": 3.125, "logits/rejected": 3.1875, "logps/chosen": -1816.0, "logps/rejected": -1720.0, "loss": 0.6569, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.58984375, "rewards/margins": 0.033203125, "rewards/rejected": 0.5546875, "step": 702 }, { "epoch": 0.20279821145247368, "grad_norm": 9.605328610704674, "learning_rate": 4.840991408493177e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1888.0, "logps/rejected": -1704.0, "loss": 0.6494, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.66015625, "rewards/margins": 0.1640625, "rewards/rejected": 0.49609375, "step": 703 }, { "epoch": 0.2030866868599452, "grad_norm": 9.842920213456006, "learning_rate": 4.840106506693127e-07, "logits/chosen": 3.046875, "logits/rejected": 3.09375, "logps/chosen": -1400.0, "logps/rejected": -1352.0, "loss": 0.6995, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.421875, "rewards/margins": -0.0361328125, "rewards/rejected": 0.458984375, "step": 704 }, { "epoch": 0.2033751622674167, "grad_norm": 13.645161505423774, "learning_rate": 4.839219230762598e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -2128.0, "logps/rejected": -1952.0, "loss": 0.6902, "loss/demonstration_loss": -4160.0, "loss/preference_loss": -4160.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.64453125, "rewards/margins": 0.05419921875, "rewards/rejected": 0.58984375, "step": 705 }, { "epoch": 0.2036636376748882, "grad_norm": 14.426707028631657, "learning_rate": 4.838329581601768e-07, "logits/chosen": 3.203125, "logits/rejected": 3.03125, "logps/chosen": -1400.0, "logps/rejected": -1328.0, "loss": 0.6826, "loss/demonstration_loss": -2768.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.451171875, "rewards/margins": 0.06494140625, "rewards/rejected": 0.38671875, "step": 706 }, { "epoch": 0.20395211308235972, "grad_norm": 12.634909182552262, "learning_rate": 4.837437560113221e-07, "logits/chosen": 3.046875, "logits/rejected": 3.1875, "logps/chosen": -1760.0, "logps/rejected": -1160.0, "loss": 0.6858, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.474609375, "rewards/margins": 0.10302734375, "rewards/rejected": 0.37109375, "step": 707 }, { "epoch": 0.20424058848983123, "grad_norm": 12.134777792175141, "learning_rate": 4.836543167201947e-07, "logits/chosen": 3.015625, "logits/rejected": 3.046875, "logps/chosen": -1568.0, "logps/rejected": -1584.0, "loss": 0.6671, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5078125, "rewards/margins": 0.08203125, "rewards/rejected": 0.42578125, "step": 708 }, { "epoch": 0.20452906389730274, "grad_norm": 10.734087179446105, "learning_rate": 4.835646403775344e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1536.0, "logps/rejected": -1416.0, "loss": 0.6769, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.404296875, "rewards/margins": 0.0235595703125, "rewards/rejected": 0.380859375, "step": 709 }, { "epoch": 0.20481753930477425, "grad_norm": 10.968610477029062, "learning_rate": 4.834747270743214e-07, "logits/chosen": 3.0, "logits/rejected": 3.109375, "logps/chosen": -2016.0, "logps/rejected": -1960.0, "loss": 0.6611, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4032.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5703125, "rewards/margins": 0.05029296875, "rewards/rejected": 0.51953125, "step": 710 }, { "epoch": 0.2051060147122458, "grad_norm": 9.880917503093556, "learning_rate": 4.833845769017762e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1640.0, "logps/rejected": -1648.0, "loss": 0.6823, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.55078125, "rewards/margins": 0.024658203125, "rewards/rejected": 0.52734375, "step": 711 }, { "epoch": 0.2053944901197173, "grad_norm": 10.754545624863882, "learning_rate": 4.832941899513599e-07, "logits/chosen": 3.171875, "logits/rejected": 3.09375, "logps/chosen": -1968.0, "logps/rejected": -1848.0, "loss": 0.6762, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.64453125, "rewards/margins": 0.1279296875, "rewards/rejected": 0.515625, "step": 712 }, { "epoch": 0.20568296552718882, "grad_norm": 9.816562259232935, "learning_rate": 4.832035663147733e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1616.0, "logps/rejected": -1512.0, "loss": 0.6635, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.6796875, "rewards/margins": 0.05908203125, "rewards/rejected": 0.62109375, "step": 713 }, { "epoch": 0.20597144093466033, "grad_norm": 9.781413257592055, "learning_rate": 4.831127060839579e-07, "logits/chosen": 3.125, "logits/rejected": 3.203125, "logps/chosen": -1760.0, "logps/rejected": -1360.0, "loss": 0.68, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5859375, "rewards/margins": 0.15625, "rewards/rejected": 0.4296875, "step": 714 }, { "epoch": 0.20625991634213184, "grad_norm": 9.729851726983869, "learning_rate": 4.830216093510951e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1608.0, "logps/rejected": -1648.0, "loss": 0.6412, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.6015625, "rewards/margins": 0.08056640625, "rewards/rejected": 0.5234375, "step": 715 }, { "epoch": 0.20654839174960335, "grad_norm": 13.884011002386666, "learning_rate": 4.829302762086058e-07, "logits/chosen": 3.171875, "logits/rejected": 3.25, "logps/chosen": -1760.0, "logps/rejected": -1560.0, "loss": 0.6591, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.51953125, "rewards/margins": 0.0263671875, "rewards/rejected": 0.494140625, "step": 716 }, { "epoch": 0.20683686715707486, "grad_norm": 11.022701363854448, "learning_rate": 4.828387067491514e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1960.0, "logps/rejected": -2040.0, "loss": 0.6531, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.625, "rewards/margins": 0.034423828125, "rewards/rejected": 0.58984375, "step": 717 }, { "epoch": 0.20712534256454637, "grad_norm": 11.196729465565406, "learning_rate": 4.827469010656325e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1576.0, "logps/rejected": -1336.0, "loss": 0.6645, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.58984375, "rewards/margins": 0.1279296875, "rewards/rejected": 0.462890625, "step": 718 }, { "epoch": 0.20741381797201788, "grad_norm": 10.716179400833628, "learning_rate": 4.826548592511897e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1496.0, "logps/rejected": -1200.0, "loss": 0.6959, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2720.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3515625, "rewards/margins": 0.02392578125, "rewards/rejected": 0.328125, "step": 719 }, { "epoch": 0.2077022933794894, "grad_norm": 10.811632977912513, "learning_rate": 4.825625813992032e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1968.0, "logps/rejected": -1648.0, "loss": 0.6373, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.70703125, "rewards/margins": 0.1259765625, "rewards/rejected": 0.578125, "step": 720 }, { "epoch": 0.2079907687869609, "grad_norm": 12.174280535165268, "learning_rate": 4.824700676032922e-07, "logits/chosen": 3.109375, "logits/rejected": 3.171875, "logps/chosen": -1776.0, "logps/rejected": -1784.0, "loss": 0.6888, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55078125, "rewards/margins": 0.04541015625, "rewards/rejected": 0.50390625, "step": 721 }, { "epoch": 0.20827924419443242, "grad_norm": 11.795632333869737, "learning_rate": 4.823773179573158e-07, "logits/chosen": 3.171875, "logits/rejected": 3.125, "logps/chosen": -1608.0, "logps/rejected": -1704.0, "loss": 0.707, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.5078125, "rewards/margins": -0.0185546875, "rewards/rejected": 0.5234375, "step": 722 }, { "epoch": 0.20856771960190393, "grad_norm": 9.078251008803598, "learning_rate": 4.822843325553721e-07, "logits/chosen": 3.234375, "logits/rejected": 3.203125, "logps/chosen": -1448.0, "logps/rejected": -1424.0, "loss": 0.6699, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.5234375, "rewards/margins": 0.072265625, "rewards/rejected": 0.451171875, "step": 723 }, { "epoch": 0.20885619500937544, "grad_norm": 10.222274664567664, "learning_rate": 4.821911114917986e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1856.0, "logps/rejected": -1840.0, "loss": 0.6993, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.494140625, "rewards/margins": -0.029296875, "rewards/rejected": 0.5234375, "step": 724 }, { "epoch": 0.20914467041684695, "grad_norm": 14.201053835839705, "learning_rate": 4.820976548611717e-07, "logits/chosen": 3.109375, "logits/rejected": 3.140625, "logps/chosen": -1792.0, "logps/rejected": -1704.0, "loss": 0.678, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.6328125, "rewards/margins": 0.18359375, "rewards/rejected": 0.447265625, "step": 725 }, { "epoch": 0.2094331458243185, "grad_norm": 10.649266406239299, "learning_rate": 4.820039627583066e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1824.0, "logps/rejected": -1784.0, "loss": 0.6754, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55859375, "rewards/margins": 0.0810546875, "rewards/rejected": 0.4765625, "step": 726 }, { "epoch": 0.20972162123179, "grad_norm": 10.88402246312366, "learning_rate": 4.819100352782581e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1656.0, "logps/rejected": -1648.0, "loss": 0.6921, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55859375, "rewards/margins": 0.0625, "rewards/rejected": 0.498046875, "step": 727 }, { "epoch": 0.2100100966392615, "grad_norm": 12.457949993147707, "learning_rate": 4.81815872516319e-07, "logits/chosen": 3.203125, "logits/rejected": 3.203125, "logps/chosen": -1656.0, "logps/rejected": -1456.0, "loss": 0.6895, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.53515625, "rewards/margins": 0.017333984375, "rewards/rejected": 0.51953125, "step": 728 }, { "epoch": 0.21029857204673302, "grad_norm": 10.745960850335821, "learning_rate": 4.817214745680212e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1528.0, "logps/rejected": -1536.0, "loss": 0.6649, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.6875, "rewards/margins": 0.033935546875, "rewards/rejected": 0.65625, "step": 729 }, { "epoch": 0.21058704745420453, "grad_norm": 10.316746638279156, "learning_rate": 4.816268415291352e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1600.0, "logps/rejected": -1648.0, "loss": 0.6833, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.42578125, "rewards/margins": -0.08203125, "rewards/rejected": 0.5078125, "step": 730 }, { "epoch": 0.21087552286167605, "grad_norm": 11.870927244951593, "learning_rate": 4.815319734956699e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -1936.0, "logps/rejected": -1912.0, "loss": 0.7054, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.50390625, "rewards/margins": -0.1025390625, "rewards/rejected": 0.60546875, "step": 731 }, { "epoch": 0.21116399826914756, "grad_norm": 9.826235486019833, "learning_rate": 4.814368705638726e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1416.0, "logps/rejected": -1512.0, "loss": 0.6851, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.357421875, "rewards/margins": -0.009765625, "rewards/rejected": 0.3671875, "step": 732 }, { "epoch": 0.21145247367661907, "grad_norm": 10.981168775767602, "learning_rate": 4.813415328302292e-07, "logits/chosen": 3.046875, "logits/rejected": 3.09375, "logps/chosen": -1512.0, "logps/rejected": -1448.0, "loss": 0.6843, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.419921875, "rewards/margins": 0.033935546875, "rewards/rejected": 0.384765625, "step": 733 }, { "epoch": 0.21174094908409058, "grad_norm": 11.419392207529663, "learning_rate": 4.812459603914635e-07, "logits/chosen": 3.171875, "logits/rejected": 3.1875, "logps/chosen": -1728.0, "logps/rejected": -1672.0, "loss": 0.685, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.546875, "rewards/margins": 0.033935546875, "rewards/rejected": 0.515625, "step": 734 }, { "epoch": 0.2120294244915621, "grad_norm": 10.514347986848705, "learning_rate": 4.811501533445374e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1688.0, "logps/rejected": -1888.0, "loss": 0.6918, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.515625, "rewards/margins": 0.0224609375, "rewards/rejected": 0.4921875, "step": 735 }, { "epoch": 0.2123178998990336, "grad_norm": 10.770207230190788, "learning_rate": 4.810541117866511e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1880.0, "logps/rejected": -1464.0, "loss": 0.675, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.58984375, "rewards/margins": 0.1611328125, "rewards/rejected": 0.427734375, "step": 736 }, { "epoch": 0.2126063753065051, "grad_norm": 10.095393334059818, "learning_rate": 4.809578358152423e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1384.0, "logps/rejected": -1416.0, "loss": 0.689, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.39453125, "rewards/margins": -0.0267333984375, "rewards/rejected": 0.421875, "step": 737 }, { "epoch": 0.21289485071397662, "grad_norm": 11.734817131212226, "learning_rate": 4.808613255279871e-07, "logits/chosen": 3.171875, "logits/rejected": 3.25, "logps/chosen": -2128.0, "logps/rejected": -1784.0, "loss": 0.6477, "loss/demonstration_loss": -3968.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.55859375, "rewards/margins": 0.1611328125, "rewards/rejected": 0.3984375, "step": 738 }, { "epoch": 0.21318332612144814, "grad_norm": 9.666998956048381, "learning_rate": 4.807645810227988e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -1960.0, "logps/rejected": -1992.0, "loss": 0.677, "loss/demonstration_loss": -4000.0, "loss/preference_loss": -4000.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.546875, "rewards/margins": 0.0458984375, "rewards/rejected": 0.5, "step": 739 }, { "epoch": 0.21347180152891965, "grad_norm": 17.63609448211562, "learning_rate": 4.806676023978285e-07, "logits/chosen": 3.1875, "logits/rejected": 3.0625, "logps/chosen": -1672.0, "logps/rejected": -1792.0, "loss": 0.7662, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.49609375, "rewards/margins": -0.12255859375, "rewards/rejected": 0.6171875, "step": 740 }, { "epoch": 0.21376027693639119, "grad_norm": 11.49331490384248, "learning_rate": 4.80570389751465e-07, "logits/chosen": 3.015625, "logits/rejected": 3.0625, "logps/chosen": -1680.0, "logps/rejected": -1520.0, "loss": 0.7121, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.376953125, "rewards/margins": 0.00201416015625, "rewards/rejected": 0.375, "step": 741 }, { "epoch": 0.2140487523438627, "grad_norm": 11.136275680249597, "learning_rate": 4.804729431823343e-07, "logits/chosen": 3.046875, "logits/rejected": 3.09375, "logps/chosen": -1512.0, "logps/rejected": -1384.0, "loss": 0.7009, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.482421875, "rewards/margins": 0.0245361328125, "rewards/rejected": 0.45703125, "step": 742 }, { "epoch": 0.2143372277513342, "grad_norm": 9.9162938139212, "learning_rate": 4.803752627892997e-07, "logits/chosen": 3.109375, "logits/rejected": 3.171875, "logps/chosen": -1160.0, "logps/rejected": -1120.0, "loss": 0.6873, "loss/demonstration_loss": -2320.0, "loss/preference_loss": -2304.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.32421875, "rewards/margins": 0.04052734375, "rewards/rejected": 0.283203125, "step": 743 }, { "epoch": 0.21462570315880572, "grad_norm": 12.526651487820564, "learning_rate": 4.80277348671462e-07, "logits/chosen": 3.390625, "logits/rejected": 3.375, "logps/chosen": -1944.0, "logps/rejected": -2064.0, "loss": 0.6979, "loss/demonstration_loss": -4080.0, "loss/preference_loss": -4080.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.609375, "rewards/margins": -0.0174560546875, "rewards/rejected": 0.625, "step": 744 }, { "epoch": 0.21491417856627723, "grad_norm": 11.414316313391137, "learning_rate": 4.801792009281588e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1472.0, "logps/rejected": -1384.0, "loss": 0.6796, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.462890625, "rewards/margins": 0.1025390625, "rewards/rejected": 0.359375, "step": 745 }, { "epoch": 0.21520265397374874, "grad_norm": 10.906583280987837, "learning_rate": 4.800808196589649e-07, "logits/chosen": 3.046875, "logits/rejected": 3.09375, "logps/chosen": -1888.0, "logps/rejected": -1720.0, "loss": 0.6701, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.55859375, "rewards/margins": 0.11083984375, "rewards/rejected": 0.447265625, "step": 746 }, { "epoch": 0.21549112938122025, "grad_norm": 9.726929692644552, "learning_rate": 4.799822049636919e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -2008.0, "logps/rejected": -1584.0, "loss": 0.6833, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55078125, "rewards/margins": 0.05419921875, "rewards/rejected": 0.49609375, "step": 747 }, { "epoch": 0.21577960478869176, "grad_norm": 10.07858396947618, "learning_rate": 4.798833569423885e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1472.0, "logps/rejected": -1288.0, "loss": 0.6721, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.453125, "rewards/margins": 0.0859375, "rewards/rejected": 0.3671875, "step": 748 }, { "epoch": 0.21606808019616328, "grad_norm": 12.567324884876328, "learning_rate": 4.797842756953396e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1896.0, "logps/rejected": -1600.0, "loss": 0.6531, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.64453125, "rewards/margins": 0.162109375, "rewards/rejected": 0.482421875, "step": 749 }, { "epoch": 0.2163565556036348, "grad_norm": 11.011619684047062, "learning_rate": 4.796849613230675e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1696.0, "logps/rejected": -1664.0, "loss": 0.7045, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5078125, "rewards/margins": 0.008056640625, "rewards/rejected": 0.5, "step": 750 }, { "epoch": 0.2166450310111063, "grad_norm": 11.627923604077534, "learning_rate": 4.795854139263301e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1808.0, "logps/rejected": -1736.0, "loss": 0.6995, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.54296875, "rewards/margins": 0.0018310546875, "rewards/rejected": 0.54296875, "step": 751 }, { "epoch": 0.2169335064185778, "grad_norm": 10.372329713893112, "learning_rate": 4.794856336061224e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1240.0, "logps/rejected": -1432.0, "loss": 0.7021, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2720.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.498046875, "rewards/margins": -0.03564453125, "rewards/rejected": 0.53125, "step": 752 }, { "epoch": 0.21722198182604932, "grad_norm": 9.674311966203964, "learning_rate": 4.793856204636755e-07, "logits/chosen": 3.21875, "logits/rejected": 3.265625, "logps/chosen": -1224.0, "logps/rejected": -1232.0, "loss": 0.7018, "loss/demonstration_loss": -2480.0, "loss/preference_loss": -2480.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.31640625, "rewards/margins": -0.03173828125, "rewards/rejected": 0.34765625, "step": 753 }, { "epoch": 0.21751045723352083, "grad_norm": 10.927400718305211, "learning_rate": 4.792853746004566e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1784.0, "logps/rejected": -2048.0, "loss": 0.6879, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.546875, "rewards/margins": -0.031982421875, "rewards/rejected": 0.578125, "step": 754 }, { "epoch": 0.21779893264099234, "grad_norm": 10.678696466368375, "learning_rate": 4.79184896118169e-07, "logits/chosen": 3.28125, "logits/rejected": 3.15625, "logps/chosen": -1792.0, "logps/rejected": -1728.0, "loss": 0.6694, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.490234375, "rewards/margins": 0.037353515625, "rewards/rejected": 0.453125, "step": 755 }, { "epoch": 0.21808740804846388, "grad_norm": 10.42729407783746, "learning_rate": 4.790841851187523e-07, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -1640.0, "logps/rejected": -1480.0, "loss": 0.6675, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4140625, "rewards/margins": 0.033447265625, "rewards/rejected": 0.380859375, "step": 756 }, { "epoch": 0.2183758834559354, "grad_norm": 11.294229027280197, "learning_rate": 4.789832417043817e-07, "logits/chosen": 3.1875, "logits/rejected": 3.078125, "logps/chosen": -1808.0, "logps/rejected": -1856.0, "loss": 0.6999, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.47265625, "rewards/margins": -0.06005859375, "rewards/rejected": 0.53125, "step": 757 }, { "epoch": 0.2186643588634069, "grad_norm": 10.129031655286697, "learning_rate": 4.788820659774682e-07, "logits/chosen": 3.09375, "logits/rejected": 3.09375, "logps/chosen": -1448.0, "logps/rejected": -1552.0, "loss": 0.696, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.57421875, "rewards/margins": 0.0084228515625, "rewards/rejected": 0.56640625, "step": 758 }, { "epoch": 0.21895283427087842, "grad_norm": 10.160425206339376, "learning_rate": 4.787806580406588e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1456.0, "logps/rejected": -1448.0, "loss": 0.6815, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.4609375, "rewards/margins": 0.02685546875, "rewards/rejected": 0.43359375, "step": 759 }, { "epoch": 0.21924130967834993, "grad_norm": 10.706784668951755, "learning_rate": 4.786790179968354e-07, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -1864.0, "logps/rejected": -1872.0, "loss": 0.681, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.578125, "rewards/margins": 0.037109375, "rewards/rejected": 0.5390625, "step": 760 }, { "epoch": 0.21952978508582144, "grad_norm": 10.483601920873845, "learning_rate": 4.785771459491164e-07, "logits/chosen": 3.25, "logits/rejected": 3.1875, "logps/chosen": -1424.0, "logps/rejected": -1216.0, "loss": 0.6832, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2688.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.51953125, "rewards/margins": 0.0654296875, "rewards/rejected": 0.453125, "step": 761 }, { "epoch": 0.21981826049329295, "grad_norm": 12.278500643181989, "learning_rate": 4.784750420008545e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -1704.0, "logps/rejected": -1632.0, "loss": 0.6714, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.54296875, "rewards/margins": 0.03857421875, "rewards/rejected": 0.50390625, "step": 762 }, { "epoch": 0.22010673590076446, "grad_norm": 11.456260638109258, "learning_rate": 4.783727062556386e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1544.0, "logps/rejected": -1576.0, "loss": 0.6907, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.51953125, "rewards/margins": -0.08544921875, "rewards/rejected": 0.60546875, "step": 763 }, { "epoch": 0.22039521130823597, "grad_norm": 9.891737451836569, "learning_rate": 4.782701388172922e-07, "logits/chosen": 3.3125, "logits/rejected": 3.34375, "logps/chosen": -1984.0, "logps/rejected": -1784.0, "loss": 0.639, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.59375, "rewards/margins": 0.2421875, "rewards/rejected": 0.349609375, "step": 764 }, { "epoch": 0.22068368671570748, "grad_norm": 12.995043382884173, "learning_rate": 4.781673397898739e-07, "logits/chosen": 2.96875, "logits/rejected": 3.046875, "logps/chosen": -1464.0, "logps/rejected": -1424.0, "loss": 0.733, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.478515625, "rewards/margins": -0.03662109375, "rewards/rejected": 0.515625, "step": 765 }, { "epoch": 0.220972162123179, "grad_norm": 10.688877235024968, "learning_rate": 4.780643092776776e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -2240.0, "logps/rejected": -1976.0, "loss": 0.6799, "loss/demonstration_loss": -4288.0, "loss/preference_loss": -4288.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.6796875, "rewards/margins": 0.10498046875, "rewards/rejected": 0.57421875, "step": 766 }, { "epoch": 0.2212606375306505, "grad_norm": 10.702590110214809, "learning_rate": 4.779610473852317e-07, "logits/chosen": 3.1875, "logits/rejected": 3.265625, "logps/chosen": -2128.0, "logps/rejected": -1920.0, "loss": 0.6915, "loss/demonstration_loss": -4096.0, "loss/preference_loss": -4096.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.5703125, "rewards/margins": 0.07275390625, "rewards/rejected": 0.498046875, "step": 767 }, { "epoch": 0.22154911293812202, "grad_norm": 12.544771235530325, "learning_rate": 4.778575542172994e-07, "logits/chosen": 2.9375, "logits/rejected": 3.0, "logps/chosen": -2096.0, "logps/rejected": -1576.0, "loss": 0.6801, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55859375, "rewards/margins": 0.062255859375, "rewards/rejected": 0.49609375, "step": 768 }, { "epoch": 0.22183758834559353, "grad_norm": 10.264003607868487, "learning_rate": 4.777538298788787e-07, "logits/chosen": 3.28125, "logits/rejected": 3.34375, "logps/chosen": -2096.0, "logps/rejected": -1824.0, "loss": 0.6567, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3968.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.734375, "rewards/margins": 0.1845703125, "rewards/rejected": 0.55078125, "step": 769 }, { "epoch": 0.22212606375306504, "grad_norm": 11.248404200359586, "learning_rate": 4.77649874475202e-07, "logits/chosen": 3.109375, "logits/rejected": 3.15625, "logps/chosen": -1624.0, "logps/rejected": -1648.0, "loss": 0.6671, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.458984375, "rewards/margins": -0.0186767578125, "rewards/rejected": 0.478515625, "step": 770 }, { "epoch": 0.22241453916053655, "grad_norm": 9.383476495257506, "learning_rate": 4.775456881117363e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1592.0, "logps/rejected": -1472.0, "loss": 0.653, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.4609375, "rewards/margins": 0.11474609375, "rewards/rejected": 0.345703125, "step": 771 }, { "epoch": 0.2227030145680081, "grad_norm": 10.416448032488953, "learning_rate": 4.774412708941825e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1840.0, "logps/rejected": -1528.0, "loss": 0.6714, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.45703125, "rewards/margins": 0.1259765625, "rewards/rejected": 0.33203125, "step": 772 }, { "epoch": 0.2229914899754796, "grad_norm": 10.282109197122923, "learning_rate": 4.773366229284762e-07, "logits/chosen": 3.25, "logits/rejected": 3.203125, "logps/chosen": -1504.0, "logps/rejected": -1496.0, "loss": 0.69, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.515625, "rewards/margins": -0.0927734375, "rewards/rejected": 0.609375, "step": 773 }, { "epoch": 0.2232799653829511, "grad_norm": 11.55962934443606, "learning_rate": 4.77231744320787e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1656.0, "logps/rejected": -1616.0, "loss": 0.6917, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.490234375, "rewards/margins": 0.109375, "rewards/rejected": 0.380859375, "step": 774 }, { "epoch": 0.22356844079042262, "grad_norm": 11.33381977631596, "learning_rate": 4.771266351775181e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0625, "logps/chosen": -1864.0, "logps/rejected": -1680.0, "loss": 0.6583, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.64453125, "rewards/margins": 0.1396484375, "rewards/rejected": 0.50390625, "step": 775 }, { "epoch": 0.22385691619789413, "grad_norm": 11.409222112409122, "learning_rate": 4.77021295605307e-07, "logits/chosen": 3.390625, "logits/rejected": 3.375, "logps/chosen": -1856.0, "logps/rejected": -1920.0, "loss": 0.6959, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5546875, "rewards/margins": -0.02978515625, "rewards/rejected": 0.58203125, "step": 776 }, { "epoch": 0.22414539160536565, "grad_norm": 10.512079896468661, "learning_rate": 4.769157257110249e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1208.0, "logps/rejected": -1208.0, "loss": 0.6825, "loss/demonstration_loss": -2448.0, "loss/preference_loss": -2448.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.380859375, "rewards/margins": 0.023681640625, "rewards/rejected": 0.357421875, "step": 777 }, { "epoch": 0.22443386701283716, "grad_norm": 10.270088913394073, "learning_rate": 4.7680992560177655e-07, "logits/chosen": 3.21875, "logits/rejected": 3.25, "logps/chosen": -1784.0, "logps/rejected": -2016.0, "loss": 0.7098, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.46484375, "rewards/margins": -0.059326171875, "rewards/rejected": 0.5234375, "step": 778 }, { "epoch": 0.22472234242030867, "grad_norm": 11.021085673216014, "learning_rate": 4.767038953849004e-07, "logits/chosen": 3.328125, "logits/rejected": 3.265625, "logps/chosen": -1560.0, "logps/rejected": -1536.0, "loss": 0.6766, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.447265625, "rewards/margins": 0.053955078125, "rewards/rejected": 0.392578125, "step": 779 }, { "epoch": 0.22501081782778018, "grad_norm": 11.74429204541403, "learning_rate": 4.7659763516796834e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1640.0, "logps/rejected": -1632.0, "loss": 0.6841, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.494140625, "rewards/margins": -0.0185546875, "rewards/rejected": 0.51171875, "step": 780 }, { "epoch": 0.2252992932352517, "grad_norm": 10.637778976965793, "learning_rate": 4.7649114505878554e-07, "logits/chosen": 3.25, "logits/rejected": 3.203125, "logps/chosen": -1648.0, "logps/rejected": -1480.0, "loss": 0.6826, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.578125, "rewards/margins": 0.10595703125, "rewards/rejected": 0.47265625, "step": 781 }, { "epoch": 0.2255877686427232, "grad_norm": 10.944275520852242, "learning_rate": 4.763844251653902e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1480.0, "logps/rejected": -1488.0, "loss": 0.685, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.365234375, "rewards/margins": -0.1005859375, "rewards/rejected": 0.466796875, "step": 782 }, { "epoch": 0.2258762440501947, "grad_norm": 10.12640516157928, "learning_rate": 4.7627747559605425e-07, "logits/chosen": 3.109375, "logits/rejected": 3.109375, "logps/chosen": -1576.0, "logps/rejected": -1664.0, "loss": 0.7059, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.53125, "rewards/margins": -0.006591796875, "rewards/rejected": 0.5390625, "step": 783 }, { "epoch": 0.22616471945766622, "grad_norm": 9.480851705203083, "learning_rate": 4.76170296459282e-07, "logits/chosen": 3.203125, "logits/rejected": 3.1875, "logps/chosen": -884.0, "logps/rejected": -1152.0, "loss": 0.6787, "loss/demonstration_loss": -2064.0, "loss/preference_loss": -2064.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.232421875, "rewards/margins": -0.0002899169921875, "rewards/rejected": 0.232421875, "step": 784 }, { "epoch": 0.22645319486513774, "grad_norm": 10.301037507342453, "learning_rate": 4.760628878638109e-07, "logits/chosen": 3.28125, "logits/rejected": 3.28125, "logps/chosen": -1528.0, "logps/rejected": -1552.0, "loss": 0.6366, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.47265625, "rewards/margins": 0.045166015625, "rewards/rejected": 0.427734375, "step": 785 }, { "epoch": 0.22674167027260925, "grad_norm": 11.505669544271175, "learning_rate": 4.759552499186113e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1496.0, "logps/rejected": -1416.0, "loss": 0.696, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.345703125, "rewards/margins": -0.06884765625, "rewards/rejected": 0.416015625, "step": 786 }, { "epoch": 0.22703014568008079, "grad_norm": 8.843767392481718, "learning_rate": 4.7584738273288615e-07, "logits/chosen": 3.296875, "logits/rejected": 3.28125, "logps/chosen": -1792.0, "logps/rejected": -1512.0, "loss": 0.6517, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.8125, "rewards/chosen": 0.51171875, "rewards/margins": 0.1474609375, "rewards/rejected": 0.36328125, "step": 787 }, { "epoch": 0.2273186210875523, "grad_norm": 11.870974186373632, "learning_rate": 4.757392864160709e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1800.0, "logps/rejected": -1768.0, "loss": 0.7079, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.408203125, "rewards/margins": -0.068359375, "rewards/rejected": 0.4765625, "step": 788 }, { "epoch": 0.2276070964950238, "grad_norm": 10.021499097336493, "learning_rate": 4.756309610778336e-07, "logits/chosen": 3.234375, "logits/rejected": 3.203125, "logps/chosen": -1776.0, "logps/rejected": -1624.0, "loss": 0.6849, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.474609375, "rewards/margins": 0.08837890625, "rewards/rejected": 0.384765625, "step": 789 }, { "epoch": 0.22789557190249532, "grad_norm": 11.042010895080205, "learning_rate": 4.7552240682807466e-07, "logits/chosen": 3.125, "logits/rejected": 3.25, "logps/chosen": -2128.0, "logps/rejected": -1936.0, "loss": 0.6693, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4096.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.66015625, "rewards/margins": 0.1484375, "rewards/rejected": 0.51171875, "step": 790 }, { "epoch": 0.22818404730996683, "grad_norm": 9.818454430936878, "learning_rate": 4.754136237769264e-07, "logits/chosen": 3.34375, "logits/rejected": 3.296875, "logps/chosen": -1608.0, "logps/rejected": -1560.0, "loss": 0.7092, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.515625, "rewards/margins": 0.0003662109375, "rewards/rejected": 0.515625, "step": 791 }, { "epoch": 0.22847252271743834, "grad_norm": 9.404236270857183, "learning_rate": 4.753046120347538e-07, "logits/chosen": 3.375, "logits/rejected": 3.4375, "logps/chosen": -1768.0, "logps/rejected": -1640.0, "loss": 0.6659, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5390625, "rewards/margins": 0.0576171875, "rewards/rejected": 0.482421875, "step": 792 }, { "epoch": 0.22876099812490985, "grad_norm": 11.664474784377166, "learning_rate": 4.751953717121534e-07, "logits/chosen": 3.15625, "logits/rejected": 3.125, "logps/chosen": -1784.0, "logps/rejected": -1712.0, "loss": 0.6915, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5078125, "rewards/margins": 0.11181640625, "rewards/rejected": 0.396484375, "step": 793 }, { "epoch": 0.22904947353238136, "grad_norm": 9.591498649773056, "learning_rate": 4.7508590291995387e-07, "logits/chosen": 3.28125, "logits/rejected": 3.34375, "logps/chosen": -1464.0, "logps/rejected": -1448.0, "loss": 0.6516, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.46484375, "rewards/margins": 0.078125, "rewards/rejected": 0.38671875, "step": 794 }, { "epoch": 0.22933794893985288, "grad_norm": 10.361518173349966, "learning_rate": 4.749762057692157e-07, "logits/chosen": 3.375, "logits/rejected": 3.359375, "logps/chosen": -1696.0, "logps/rejected": -1872.0, "loss": 0.6879, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5703125, "rewards/margins": 0.06005859375, "rewards/rejected": 0.51171875, "step": 795 }, { "epoch": 0.2296264243473244, "grad_norm": 8.818637581966785, "learning_rate": 4.748662803712309e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1552.0, "logps/rejected": -1264.0, "loss": 0.6754, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2864.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5078125, "rewards/margins": 0.10205078125, "rewards/rejected": 0.40625, "step": 796 }, { "epoch": 0.2299148997547959, "grad_norm": 13.867856803535817, "learning_rate": 4.7475612683752307e-07, "logits/chosen": 3.296875, "logits/rejected": 3.265625, "logps/chosen": -1592.0, "logps/rejected": -1728.0, "loss": 0.7239, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.48046875, "rewards/margins": -0.0252685546875, "rewards/rejected": 0.50390625, "step": 797 }, { "epoch": 0.2302033751622674, "grad_norm": 9.794300973888253, "learning_rate": 4.7464574527984746e-07, "logits/chosen": 3.34375, "logits/rejected": 3.34375, "logps/chosen": -1848.0, "logps/rejected": -1648.0, "loss": 0.6559, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.6796875, "rewards/margins": 0.142578125, "rewards/rejected": 0.53515625, "step": 798 }, { "epoch": 0.23049185056973892, "grad_norm": 10.7887130072238, "learning_rate": 4.7453513581019045e-07, "logits/chosen": 3.125, "logits/rejected": 3.078125, "logps/chosen": -2064.0, "logps/rejected": -1792.0, "loss": 0.6571, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3920.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.64453125, "rewards/margins": 0.169921875, "rewards/rejected": 0.474609375, "step": 799 }, { "epoch": 0.23078032597721043, "grad_norm": 9.787222627532174, "learning_rate": 4.744242985407697e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1736.0, "logps/rejected": -1536.0, "loss": 0.6699, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55078125, "rewards/margins": 0.09716796875, "rewards/rejected": 0.455078125, "step": 800 }, { "epoch": 0.23106880138468194, "grad_norm": 12.4120820523628, "learning_rate": 4.7431323358403397e-07, "logits/chosen": 3.3125, "logits/rejected": 3.21875, "logps/chosen": -1856.0, "logps/rejected": -1776.0, "loss": 0.6675, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.484375, "rewards/margins": 0.040283203125, "rewards/rejected": 0.443359375, "step": 801 }, { "epoch": 0.23135727679215348, "grad_norm": 12.132228232391645, "learning_rate": 4.742019410526632e-07, "logits/chosen": 3.21875, "logits/rejected": 3.171875, "logps/chosen": -1544.0, "logps/rejected": -1696.0, "loss": 0.7286, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.494140625, "rewards/margins": -0.07275390625, "rewards/rejected": 0.56640625, "step": 802 }, { "epoch": 0.231645752199625, "grad_norm": 10.83451110623162, "learning_rate": 4.740904210595679e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -1984.0, "logps/rejected": -1672.0, "loss": 0.6667, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.56640625, "rewards/margins": 0.142578125, "rewards/rejected": 0.421875, "step": 803 }, { "epoch": 0.2319342276070965, "grad_norm": 13.565057108496916, "learning_rate": 4.739786737178895e-07, "logits/chosen": 3.21875, "logits/rejected": 3.203125, "logps/chosen": -1376.0, "logps/rejected": -1488.0, "loss": 0.7132, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.435546875, "rewards/margins": 0.01513671875, "rewards/rejected": 0.419921875, "step": 804 }, { "epoch": 0.23222270301456802, "grad_norm": 11.679900047201592, "learning_rate": 4.7386669914100026e-07, "logits/chosen": 3.25, "logits/rejected": 3.1875, "logps/chosen": -1512.0, "logps/rejected": -1560.0, "loss": 0.6829, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5625, "rewards/margins": 0.033935546875, "rewards/rejected": 0.52734375, "step": 805 }, { "epoch": 0.23251117842203953, "grad_norm": 12.37063049798951, "learning_rate": 4.7375449744250264e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1496.0, "logps/rejected": -1456.0, "loss": 0.6904, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.458984375, "rewards/margins": 0.060546875, "rewards/rejected": 0.3984375, "step": 806 }, { "epoch": 0.23279965382951104, "grad_norm": 11.037190300011083, "learning_rate": 4.7364206873622974e-07, "logits/chosen": 3.09375, "logits/rejected": 3.09375, "logps/chosen": -1480.0, "logps/rejected": -1368.0, "loss": 0.6632, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4609375, "rewards/margins": 0.0712890625, "rewards/rejected": 0.390625, "step": 807 }, { "epoch": 0.23308812923698255, "grad_norm": 11.362658973782954, "learning_rate": 4.7352941313624495e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1888.0, "logps/rejected": -1680.0, "loss": 0.6901, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.63671875, "rewards/margins": 0.0146484375, "rewards/rejected": 0.62109375, "step": 808 }, { "epoch": 0.23337660464445406, "grad_norm": 10.122933636489352, "learning_rate": 4.7341653075684186e-07, "logits/chosen": 3.296875, "logits/rejected": 3.328125, "logps/chosen": -1936.0, "logps/rejected": -1792.0, "loss": 0.6382, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.51953125, "rewards/margins": 0.126953125, "rewards/rejected": 0.392578125, "step": 809 }, { "epoch": 0.23366508005192557, "grad_norm": 8.839221635012823, "learning_rate": 4.73303421712544e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -1464.0, "logps/rejected": -1280.0, "loss": 0.6655, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.484375, "rewards/margins": 0.09130859375, "rewards/rejected": 0.392578125, "step": 810 }, { "epoch": 0.23395355545939708, "grad_norm": 10.980117329676265, "learning_rate": 4.7319008611810504e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1696.0, "logps/rejected": -1680.0, "loss": 0.6497, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.498046875, "rewards/margins": -0.01019287109375, "rewards/rejected": 0.5078125, "step": 811 }, { "epoch": 0.2342420308668686, "grad_norm": 11.545790571748764, "learning_rate": 4.730765240885084e-07, "logits/chosen": 3.171875, "logits/rejected": 3.28125, "logps/chosen": -1728.0, "logps/rejected": -1408.0, "loss": 0.6575, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.4765625, "rewards/margins": 0.1416015625, "rewards/rejected": 0.333984375, "step": 812 }, { "epoch": 0.2345305062743401, "grad_norm": 11.506886564916305, "learning_rate": 4.7296273573896726e-07, "logits/chosen": 3.109375, "logits/rejected": 3.125, "logps/chosen": -1832.0, "logps/rejected": -1856.0, "loss": 0.6799, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.4140625, "rewards/margins": -0.0205078125, "rewards/rejected": 0.435546875, "step": 813 }, { "epoch": 0.23481898168181162, "grad_norm": 9.950823895843715, "learning_rate": 4.7284872118492436e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1928.0, "logps/rejected": -1824.0, "loss": 0.6603, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.578125, "rewards/margins": 0.181640625, "rewards/rejected": 0.396484375, "step": 814 }, { "epoch": 0.23510745708928313, "grad_norm": 11.477741502553926, "learning_rate": 4.72734480542052e-07, "logits/chosen": 3.34375, "logits/rejected": 3.3125, "logps/chosen": -1976.0, "logps/rejected": -1984.0, "loss": 0.6876, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4016.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.65234375, "rewards/margins": 0.06982421875, "rewards/rejected": 0.58203125, "step": 815 }, { "epoch": 0.23539593249675464, "grad_norm": 10.805987323200057, "learning_rate": 4.7262001392625186e-07, "logits/chosen": 3.265625, "logits/rejected": 3.265625, "logps/chosen": -1936.0, "logps/rejected": -1640.0, "loss": 0.6753, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.546875, "rewards/margins": 0.008056640625, "rewards/rejected": 0.5390625, "step": 816 }, { "epoch": 0.23568440790422618, "grad_norm": 9.73096815758516, "learning_rate": 4.725053214536547e-07, "logits/chosen": 3.421875, "logits/rejected": 3.34375, "logps/chosen": -1864.0, "logps/rejected": -1736.0, "loss": 0.6801, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5078125, "rewards/margins": 0.06787109375, "rewards/rejected": 0.439453125, "step": 817 }, { "epoch": 0.2359728833116977, "grad_norm": 9.548384861756809, "learning_rate": 4.723904032406206e-07, "logits/chosen": 3.203125, "logits/rejected": 3.140625, "logps/chosen": -1616.0, "logps/rejected": -1552.0, "loss": 0.6224, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.70703125, "rewards/margins": 0.21484375, "rewards/rejected": 0.4921875, "step": 818 }, { "epoch": 0.2362613587191692, "grad_norm": 10.49081696720962, "learning_rate": 4.722752594037388e-07, "logits/chosen": 3.359375, "logits/rejected": 3.375, "logps/chosen": -1672.0, "logps/rejected": -1816.0, "loss": 0.6826, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.466796875, "rewards/margins": -0.0025482177734375, "rewards/rejected": 0.46875, "step": 819 }, { "epoch": 0.2365498341266407, "grad_norm": 11.108943157346348, "learning_rate": 4.7215989005982714e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1320.0, "logps/rejected": -1440.0, "loss": 0.6708, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.52734375, "rewards/margins": -0.01123046875, "rewards/rejected": 0.5390625, "step": 820 }, { "epoch": 0.23683830953411222, "grad_norm": 9.613600091322146, "learning_rate": 4.7204429532593235e-07, "logits/chosen": 3.15625, "logits/rejected": 3.25, "logps/chosen": -1568.0, "logps/rejected": -1344.0, "loss": 0.6649, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5390625, "rewards/margins": 0.1474609375, "rewards/rejected": 0.390625, "step": 821 }, { "epoch": 0.23712678494158373, "grad_norm": 11.886570874261434, "learning_rate": 4.719284753193299e-07, "logits/chosen": 3.21875, "logits/rejected": 3.171875, "logps/chosen": -1664.0, "logps/rejected": -1408.0, "loss": 0.6855, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.55859375, "rewards/margins": 0.1181640625, "rewards/rejected": 0.44140625, "step": 822 }, { "epoch": 0.23741526034905525, "grad_norm": 10.050707587002151, "learning_rate": 4.718124301575238e-07, "logits/chosen": 3.28125, "logits/rejected": 3.34375, "logps/chosen": -1464.0, "logps/rejected": -1368.0, "loss": 0.6705, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2864.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.37109375, "rewards/margins": 0.030517578125, "rewards/rejected": 0.33984375, "step": 823 }, { "epoch": 0.23770373575652676, "grad_norm": 9.994920041113383, "learning_rate": 4.7169615995824637e-07, "logits/chosen": 3.3125, "logits/rejected": 3.28125, "logps/chosen": -1208.0, "logps/rejected": -1112.0, "loss": 0.6497, "loss/demonstration_loss": -2352.0, "loss/preference_loss": -2352.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.392578125, "rewards/margins": 0.04931640625, "rewards/rejected": 0.34375, "step": 824 }, { "epoch": 0.23799221116399827, "grad_norm": 12.190246233526418, "learning_rate": 4.7157966483945835e-07, "logits/chosen": 3.1875, "logits/rejected": 3.03125, "logps/chosen": -1560.0, "logps/rejected": -1904.0, "loss": 0.7321, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.5859375, "rewards/margins": -0.0703125, "rewards/rejected": 0.65625, "step": 825 }, { "epoch": 0.23828068657146978, "grad_norm": 11.557976031747671, "learning_rate": 4.7146294491934865e-07, "logits/chosen": 3.203125, "logits/rejected": 3.265625, "logps/chosen": -1760.0, "logps/rejected": -1648.0, "loss": 0.6562, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.56640625, "rewards/margins": 0.1533203125, "rewards/rejected": 0.412109375, "step": 826 }, { "epoch": 0.2385691619789413, "grad_norm": 10.536740357072409, "learning_rate": 4.713460003163342e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1656.0, "logps/rejected": -1576.0, "loss": 0.6671, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.62890625, "rewards/margins": 0.1025390625, "rewards/rejected": 0.52734375, "step": 827 }, { "epoch": 0.2388576373864128, "grad_norm": 10.934009091586816, "learning_rate": 4.7122883114905997e-07, "logits/chosen": 3.21875, "logits/rejected": 3.265625, "logps/chosen": -2008.0, "logps/rejected": -2016.0, "loss": 0.7086, "loss/demonstration_loss": -4080.0, "loss/preference_loss": -4080.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.58203125, "rewards/margins": -0.04052734375, "rewards/rejected": 0.62109375, "step": 828 }, { "epoch": 0.2391461127938843, "grad_norm": 9.512945452235257, "learning_rate": 4.711114375363987e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1448.0, "logps/rejected": -1232.0, "loss": 0.6549, "loss/demonstration_loss": -2736.0, "loss/preference_loss": -2720.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.453125, "rewards/margins": 0.09228515625, "rewards/rejected": 0.361328125, "step": 829 }, { "epoch": 0.23943458820135582, "grad_norm": 12.986326945426041, "learning_rate": 4.7099381959745077e-07, "logits/chosen": 3.296875, "logits/rejected": 3.328125, "logps/chosen": -1944.0, "logps/rejected": -1928.0, "loss": 0.6951, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.671875, "rewards/margins": -0.0303955078125, "rewards/rejected": 0.69921875, "step": 830 }, { "epoch": 0.23972306360882734, "grad_norm": 11.889624527518347, "learning_rate": 4.708759774515444e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1968.0, "logps/rejected": -1704.0, "loss": 0.6827, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.625, "rewards/margins": 0.03515625, "rewards/rejected": 0.58984375, "step": 831 }, { "epoch": 0.24001153901629885, "grad_norm": 9.671930946902412, "learning_rate": 4.7075791121823487e-07, "logits/chosen": 3.296875, "logits/rejected": 3.296875, "logps/chosen": -1480.0, "logps/rejected": -1392.0, "loss": 0.6969, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.48828125, "rewards/margins": 0.0751953125, "rewards/rejected": 0.4140625, "step": 832 }, { "epoch": 0.24030001442377039, "grad_norm": 11.575294438296122, "learning_rate": 4.7063962101730524e-07, "logits/chosen": 3.296875, "logits/rejected": 3.296875, "logps/chosen": -1696.0, "logps/rejected": -1608.0, "loss": 0.7339, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.53515625, "rewards/margins": -0.03515625, "rewards/rejected": 0.5703125, "step": 833 }, { "epoch": 0.2405884898312419, "grad_norm": 11.089579624254114, "learning_rate": 4.7052110696876545e-07, "logits/chosen": 3.03125, "logits/rejected": 3.046875, "logps/chosen": -1096.0, "logps/rejected": -1096.0, "loss": 0.7113, "loss/demonstration_loss": -2240.0, "loss/preference_loss": -2240.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.39453125, "rewards/margins": -0.043701171875, "rewards/rejected": 0.439453125, "step": 834 }, { "epoch": 0.2408769652387134, "grad_norm": 10.85282376573568, "learning_rate": 4.704023691928528e-07, "logits/chosen": 3.21875, "logits/rejected": 3.25, "logps/chosen": -1864.0, "logps/rejected": -1656.0, "loss": 0.627, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.703125, "rewards/margins": 0.12890625, "rewards/rejected": 0.57421875, "step": 835 }, { "epoch": 0.24116544064618492, "grad_norm": 8.973876395452272, "learning_rate": 4.702834078100314e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -2040.0, "logps/rejected": -1760.0, "loss": 0.653, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.63671875, "rewards/margins": 0.1083984375, "rewards/rejected": 0.52734375, "step": 836 }, { "epoch": 0.24145391605365643, "grad_norm": 9.85279863086701, "learning_rate": 4.701642229409922e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -1616.0, "logps/rejected": -1784.0, "loss": 0.6949, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5, "rewards/margins": -0.00848388671875, "rewards/rejected": 0.5078125, "step": 837 }, { "epoch": 0.24174239146112794, "grad_norm": 11.679607802472916, "learning_rate": 4.7004481470665305e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -1680.0, "logps/rejected": -1752.0, "loss": 0.6858, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.55078125, "rewards/margins": -0.0302734375, "rewards/rejected": 0.58203125, "step": 838 }, { "epoch": 0.24203086686859945, "grad_norm": 10.908930535095074, "learning_rate": 4.6992518322815835e-07, "logits/chosen": 3.3125, "logits/rejected": 3.34375, "logps/chosen": -1672.0, "logps/rejected": -1712.0, "loss": 0.7009, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.51171875, "rewards/margins": 0.040771484375, "rewards/rejected": 0.470703125, "step": 839 }, { "epoch": 0.24231934227607096, "grad_norm": 10.542724071414089, "learning_rate": 4.698053286268788e-07, "logits/chosen": 3.359375, "logits/rejected": 3.328125, "logps/chosen": -1384.0, "logps/rejected": -1368.0, "loss": 0.6782, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4375, "rewards/margins": 0.0059814453125, "rewards/rejected": 0.431640625, "step": 840 }, { "epoch": 0.24260781768354248, "grad_norm": 12.455285302848992, "learning_rate": 4.6968525102441175e-07, "logits/chosen": 3.25, "logits/rejected": 3.171875, "logps/chosen": -1664.0, "logps/rejected": -1688.0, "loss": 0.6554, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.48046875, "rewards/margins": 0.0079345703125, "rewards/rejected": 0.47265625, "step": 841 }, { "epoch": 0.242896293091014, "grad_norm": 10.842203346965643, "learning_rate": 4.695649505425807e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -2008.0, "logps/rejected": -2128.0, "loss": 0.7206, "loss/demonstration_loss": -4224.0, "loss/preference_loss": -4192.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.69140625, "rewards/margins": -0.0147705078125, "rewards/rejected": 0.70703125, "step": 842 }, { "epoch": 0.2431847684984855, "grad_norm": 8.700913787891386, "learning_rate": 4.694444273034351e-07, "logits/chosen": 3.3125, "logits/rejected": 3.375, "logps/chosen": -1720.0, "logps/rejected": -1728.0, "loss": 0.676, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.625, "rewards/margins": 0.059814453125, "rewards/rejected": 0.56640625, "step": 843 }, { "epoch": 0.243473243905957, "grad_norm": 9.87637023456039, "learning_rate": 4.6932368142925076e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1456.0, "logps/rejected": -1200.0, "loss": 0.6865, "loss/demonstration_loss": -2704.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.466796875, "rewards/margins": 0.076171875, "rewards/rejected": 0.390625, "step": 844 }, { "epoch": 0.24376171931342852, "grad_norm": 10.08791195452272, "learning_rate": 4.6920271304252893e-07, "logits/chosen": 3.078125, "logits/rejected": 3.09375, "logps/chosen": -1208.0, "logps/rejected": -1152.0, "loss": 0.657, "loss/demonstration_loss": -2400.0, "loss/preference_loss": -2400.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.42578125, "rewards/margins": 0.062255859375, "rewards/rejected": 0.36328125, "step": 845 }, { "epoch": 0.24405019472090003, "grad_norm": 10.539318405924906, "learning_rate": 4.6908152226599696e-07, "logits/chosen": 3.296875, "logits/rejected": 3.21875, "logps/chosen": -1384.0, "logps/rejected": -1464.0, "loss": 0.6971, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4765625, "rewards/margins": 0.017822265625, "rewards/rejected": 0.458984375, "step": 846 }, { "epoch": 0.24433867012837154, "grad_norm": 9.2156612814942, "learning_rate": 4.689601092226075e-07, "logits/chosen": 3.203125, "logits/rejected": 3.203125, "logps/chosen": -1968.0, "logps/rejected": -1784.0, "loss": 0.6456, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.734375, "rewards/margins": 0.21875, "rewards/rejected": 0.515625, "step": 847 }, { "epoch": 0.24462714553584308, "grad_norm": 11.090165248234777, "learning_rate": 4.688384740355391e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1784.0, "logps/rejected": -1888.0, "loss": 0.7018, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.66015625, "rewards/margins": 0.109375, "rewards/rejected": 0.5546875, "step": 848 }, { "epoch": 0.2449156209433146, "grad_norm": 10.142164098617345, "learning_rate": 4.687166168281953e-07, "logits/chosen": 3.0, "logits/rejected": 2.875, "logps/chosen": -1776.0, "logps/rejected": -1712.0, "loss": 0.6528, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.46875, "rewards/margins": 0.06689453125, "rewards/rejected": 0.400390625, "step": 849 }, { "epoch": 0.2452040963507861, "grad_norm": 10.812163238919457, "learning_rate": 4.685945377242051e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1920.0, "logps/rejected": -1824.0, "loss": 0.678, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5625, "rewards/margins": 0.058837890625, "rewards/rejected": 0.50390625, "step": 850 }, { "epoch": 0.24549257175825762, "grad_norm": 12.585251297162053, "learning_rate": 4.6847223684742255e-07, "logits/chosen": 3.296875, "logits/rejected": 3.25, "logps/chosen": -2064.0, "logps/rejected": -1816.0, "loss": 0.6902, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3920.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.63671875, "rewards/margins": 0.11962890625, "rewards/rejected": 0.51953125, "step": 851 }, { "epoch": 0.24578104716572913, "grad_norm": 10.985520203554621, "learning_rate": 4.6834971432192673e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -1760.0, "logps/rejected": -1592.0, "loss": 0.628, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.57421875, "rewards/margins": 0.12255859375, "rewards/rejected": 0.453125, "step": 852 }, { "epoch": 0.24606952257320064, "grad_norm": 10.043944675552694, "learning_rate": 4.6822697027202164e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1592.0, "logps/rejected": -1744.0, "loss": 0.6982, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.55078125, "rewards/margins": -0.031494140625, "rewards/rejected": 0.58203125, "step": 853 }, { "epoch": 0.24635799798067215, "grad_norm": 10.652715841911137, "learning_rate": 4.681040048222359e-07, "logits/chosen": 3.28125, "logits/rejected": 3.328125, "logps/chosen": -1848.0, "logps/rejected": -1672.0, "loss": 0.5963, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.61328125, "rewards/margins": 0.12890625, "rewards/rejected": 0.486328125, "step": 854 }, { "epoch": 0.24664647338814366, "grad_norm": 10.6133898281545, "learning_rate": 4.6798081809732286e-07, "logits/chosen": 3.3125, "logits/rejected": 3.34375, "logps/chosen": -1896.0, "logps/rejected": -1800.0, "loss": 0.6921, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.609375, "rewards/margins": 0.03466796875, "rewards/rejected": 0.57421875, "step": 855 }, { "epoch": 0.24693494879561517, "grad_norm": 11.894926314541017, "learning_rate": 4.6785741022226026e-07, "logits/chosen": 3.0625, "logits/rejected": 3.09375, "logps/chosen": -1232.0, "logps/rejected": -1360.0, "loss": 0.6938, "loss/demonstration_loss": -2624.0, "loss/preference_loss": -2640.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.373046875, "rewards/margins": -0.08642578125, "rewards/rejected": 0.4609375, "step": 856 }, { "epoch": 0.24722342420308668, "grad_norm": 12.40048594469965, "learning_rate": 4.677337813222503e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1112.0, "logps/rejected": -1208.0, "loss": 0.7119, "loss/demonstration_loss": -2368.0, "loss/preference_loss": -2368.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.49609375, "rewards/margins": 0.0084228515625, "rewards/rejected": 0.486328125, "step": 857 }, { "epoch": 0.2475118996105582, "grad_norm": 10.194155743754278, "learning_rate": 4.6760993152271944e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1352.0, "logps/rejected": -1392.0, "loss": 0.6817, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.46875, "rewards/margins": 0.0888671875, "rewards/rejected": 0.380859375, "step": 858 }, { "epoch": 0.2478003750180297, "grad_norm": 11.86449775886594, "learning_rate": 4.674858609493181e-07, "logits/chosen": 3.171875, "logits/rejected": 3.21875, "logps/chosen": -1512.0, "logps/rejected": -1704.0, "loss": 0.6878, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.490234375, "rewards/margins": -0.0252685546875, "rewards/rejected": 0.515625, "step": 859 }, { "epoch": 0.24808885042550122, "grad_norm": 10.846217791068653, "learning_rate": 4.6736156972792074e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1920.0, "logps/rejected": -1792.0, "loss": 0.6471, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.6015625, "rewards/margins": 0.125, "rewards/rejected": 0.4765625, "step": 860 }, { "epoch": 0.24837732583297273, "grad_norm": 10.196511433060962, "learning_rate": 4.672370579846259e-07, "logits/chosen": 3.140625, "logits/rejected": 3.203125, "logps/chosen": -1768.0, "logps/rejected": -1656.0, "loss": 0.6444, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.59375, "rewards/margins": 0.1435546875, "rewards/rejected": 0.451171875, "step": 861 }, { "epoch": 0.24866580124044424, "grad_norm": 9.97485184345224, "learning_rate": 4.6711232584575543e-07, "logits/chosen": 3.234375, "logits/rejected": 3.203125, "logps/chosen": -1504.0, "logps/rejected": -1384.0, "loss": 0.7112, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.53125, "rewards/margins": 0.00506591796875, "rewards/rejected": 0.52734375, "step": 862 }, { "epoch": 0.24895427664791578, "grad_norm": 10.291910474956397, "learning_rate": 4.6698737343785523e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1872.0, "logps/rejected": -1640.0, "loss": 0.624, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.58203125, "rewards/margins": 0.09375, "rewards/rejected": 0.48828125, "step": 863 }, { "epoch": 0.2492427520553873, "grad_norm": 10.245392198155434, "learning_rate": 4.6686220088769437e-07, "logits/chosen": 3.125, "logits/rejected": 3.09375, "logps/chosen": -1400.0, "logps/rejected": -1208.0, "loss": 0.6788, "loss/demonstration_loss": -2640.0, "loss/preference_loss": -2624.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3125, "rewards/margins": 0.07373046875, "rewards/rejected": 0.2373046875, "step": 864 }, { "epoch": 0.2495312274628588, "grad_norm": 10.928937576672332, "learning_rate": 4.667368083222652e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0, "logps/chosen": -1472.0, "logps/rejected": -1568.0, "loss": 0.6732, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.490234375, "rewards/margins": 0.05810546875, "rewards/rejected": 0.431640625, "step": 865 }, { "epoch": 0.2498197028703303, "grad_norm": 9.788955597305009, "learning_rate": 4.666111958687836e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -2096.0, "logps/rejected": -1736.0, "loss": 0.6232, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.8125, "rewards/chosen": 0.70703125, "rewards/margins": 0.30859375, "rewards/rejected": 0.3984375, "step": 866 }, { "epoch": 0.2501081782778018, "grad_norm": 10.294339369715468, "learning_rate": 4.664853636546884e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1496.0, "logps/rejected": -1560.0, "loss": 0.6798, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.498046875, "rewards/margins": 0.083984375, "rewards/rejected": 0.4140625, "step": 867 }, { "epoch": 0.2503966536852733, "grad_norm": 11.441166234335428, "learning_rate": 4.6635931180764114e-07, "logits/chosen": 3.25, "logits/rejected": 3.3125, "logps/chosen": -1576.0, "logps/rejected": -1504.0, "loss": 0.6924, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.48828125, "rewards/margins": 0.0142822265625, "rewards/rejected": 0.474609375, "step": 868 }, { "epoch": 0.2506851290927448, "grad_norm": 11.168272849852537, "learning_rate": 4.662330404555266e-07, "logits/chosen": 3.34375, "logits/rejected": 3.328125, "logps/chosen": -1592.0, "logps/rejected": -1648.0, "loss": 0.6674, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.56640625, "rewards/margins": 0.033203125, "rewards/rejected": 0.53515625, "step": 869 }, { "epoch": 0.25097360450021633, "grad_norm": 10.391460529116333, "learning_rate": 4.6610654972645205e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1552.0, "logps/rejected": -1472.0, "loss": 0.6923, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.53515625, "rewards/margins": 0.0216064453125, "rewards/rejected": 0.515625, "step": 870 }, { "epoch": 0.2512620799076879, "grad_norm": 10.478185318363, "learning_rate": 4.6597983974874715e-07, "logits/chosen": 3.125, "logits/rejected": 3.09375, "logps/chosen": -1696.0, "logps/rejected": -1384.0, "loss": 0.6859, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.6171875, "rewards/margins": 0.154296875, "rewards/rejected": 0.46484375, "step": 871 }, { "epoch": 0.2515505553151594, "grad_norm": 10.68005738922613, "learning_rate": 4.6585291065096433e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1496.0, "logps/rejected": -1640.0, "loss": 0.6859, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.54296875, "rewards/margins": 0.058349609375, "rewards/rejected": 0.486328125, "step": 872 }, { "epoch": 0.2518390307226309, "grad_norm": 12.0253173496019, "learning_rate": 4.657257625618782e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1392.0, "logps/rejected": -1304.0, "loss": 0.7023, "loss/demonstration_loss": -2736.0, "loss/preference_loss": -2736.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.408203125, "rewards/margins": 0.0517578125, "rewards/rejected": 0.35546875, "step": 873 }, { "epoch": 0.25212750613010243, "grad_norm": 10.849421679069927, "learning_rate": 4.655983956104854e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1608.0, "logps/rejected": -1656.0, "loss": 0.6778, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.484375, "rewards/margins": 0.04150390625, "rewards/rejected": 0.443359375, "step": 874 }, { "epoch": 0.25241598153757394, "grad_norm": 11.229638318892812, "learning_rate": 4.6547080992600476e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -1784.0, "logps/rejected": -1632.0, "loss": 0.624, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.66796875, "rewards/margins": 0.09521484375, "rewards/rejected": 0.57421875, "step": 875 }, { "epoch": 0.25270445694504545, "grad_norm": 10.287746748360936, "learning_rate": 4.6534300563787707e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -2096.0, "logps/rejected": -2064.0, "loss": 0.6576, "loss/demonstration_loss": -4256.0, "loss/preference_loss": -4224.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.7109375, "rewards/margins": 0.0595703125, "rewards/rejected": 0.65234375, "step": 876 }, { "epoch": 0.25299293235251696, "grad_norm": 11.070799087421612, "learning_rate": 4.6521498287576477e-07, "logits/chosen": 3.28125, "logits/rejected": 3.234375, "logps/chosen": -1520.0, "logps/rejected": -1560.0, "loss": 0.7085, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4296875, "rewards/margins": 0.034423828125, "rewards/rejected": 0.39453125, "step": 877 }, { "epoch": 0.2532814077599885, "grad_norm": 10.319685160440125, "learning_rate": 4.6508674176955196e-07, "logits/chosen": 3.328125, "logits/rejected": 3.25, "logps/chosen": -1352.0, "logps/rejected": -1648.0, "loss": 0.6921, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.435546875, "rewards/margins": -0.08154296875, "rewards/rejected": 0.515625, "step": 878 }, { "epoch": 0.25356988316746, "grad_norm": 10.737086190550391, "learning_rate": 4.6495828244934443e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1616.0, "logps/rejected": -1624.0, "loss": 0.6835, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4453125, "rewards/margins": -0.00537109375, "rewards/rejected": 0.451171875, "step": 879 }, { "epoch": 0.2538583585749315, "grad_norm": 10.46080949185506, "learning_rate": 4.6482960504546916e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -1448.0, "logps/rejected": -1464.0, "loss": 0.6647, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.484375, "rewards/margins": 0.06396484375, "rewards/rejected": 0.419921875, "step": 880 }, { "epoch": 0.254146833982403, "grad_norm": 10.104147094205809, "learning_rate": 4.647007096884744e-07, "logits/chosen": 3.171875, "logits/rejected": 3.25, "logps/chosen": -1608.0, "logps/rejected": -1520.0, "loss": 0.6442, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.6640625, "rewards/margins": 0.1552734375, "rewards/rejected": 0.5078125, "step": 881 }, { "epoch": 0.2544353093898745, "grad_norm": 10.37551174282143, "learning_rate": 4.6457159650912975e-07, "logits/chosen": 3.296875, "logits/rejected": 3.21875, "logps/chosen": -1880.0, "logps/rejected": -1624.0, "loss": 0.6432, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.60546875, "rewards/margins": 0.142578125, "rewards/rejected": 0.462890625, "step": 882 }, { "epoch": 0.25472378479734603, "grad_norm": 12.063460518113907, "learning_rate": 4.6444226563842547e-07, "logits/chosen": 3.25, "logits/rejected": 3.1875, "logps/chosen": -1600.0, "logps/rejected": -1664.0, "loss": 0.7275, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55859375, "rewards/margins": 0.001953125, "rewards/rejected": 0.55859375, "step": 883 }, { "epoch": 0.25501226020481754, "grad_norm": 10.208007822708796, "learning_rate": 4.643127172075729e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1672.0, "logps/rejected": -1448.0, "loss": 0.6816, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.52734375, "rewards/margins": 0.0166015625, "rewards/rejected": 0.51171875, "step": 884 }, { "epoch": 0.25530073561228905, "grad_norm": 12.909647109162057, "learning_rate": 4.641829513480041e-07, "logits/chosen": 3.15625, "logits/rejected": 3.125, "logps/chosen": -1816.0, "logps/rejected": -1800.0, "loss": 0.7251, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.5078125, "rewards/margins": -0.060302734375, "rewards/rejected": 0.56640625, "step": 885 }, { "epoch": 0.25558921101976056, "grad_norm": 12.562323003347004, "learning_rate": 4.640529681913715e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1816.0, "logps/rejected": -1752.0, "loss": 0.6373, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.56640625, "rewards/margins": 0.1015625, "rewards/rejected": 0.466796875, "step": 886 }, { "epoch": 0.2558776864272321, "grad_norm": 9.988680779972267, "learning_rate": 4.639227678695483e-07, "logits/chosen": 3.28125, "logits/rejected": 3.28125, "logps/chosen": -1440.0, "logps/rejected": -1552.0, "loss": 0.6855, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.44921875, "rewards/margins": 0.06396484375, "rewards/rejected": 0.384765625, "step": 887 }, { "epoch": 0.2561661618347036, "grad_norm": 10.56465139256315, "learning_rate": 4.6379235051462784e-07, "logits/chosen": 3.28125, "logits/rejected": 3.28125, "logps/chosen": -1600.0, "logps/rejected": -1568.0, "loss": 0.704, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.46875, "rewards/margins": 0.006561279296875, "rewards/rejected": 0.462890625, "step": 888 }, { "epoch": 0.2564546372421751, "grad_norm": 10.520984044908994, "learning_rate": 4.6366171625892356e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1152.0, "logps/rejected": -1360.0, "loss": 0.7077, "loss/demonstration_loss": -2560.0, "loss/preference_loss": -2560.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.373046875, "rewards/margins": -0.1376953125, "rewards/rejected": 0.51171875, "step": 889 }, { "epoch": 0.2567431126496466, "grad_norm": 10.858736566901005, "learning_rate": 4.635308652349692e-07, "logits/chosen": 3.1875, "logits/rejected": 3.078125, "logps/chosen": -1776.0, "logps/rejected": -1400.0, "loss": 0.6575, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.59765625, "rewards/margins": 0.1630859375, "rewards/rejected": 0.435546875, "step": 890 }, { "epoch": 0.2570315880571181, "grad_norm": 10.964177166043617, "learning_rate": 4.6339979757551827e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1464.0, "logps/rejected": -1416.0, "loss": 0.7239, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.392578125, "rewards/margins": 0.045166015625, "rewards/rejected": 0.34765625, "step": 891 }, { "epoch": 0.25732006346458963, "grad_norm": 9.945796855872725, "learning_rate": 4.6326851341354414e-07, "logits/chosen": 3.3125, "logits/rejected": 3.28125, "logps/chosen": -1784.0, "logps/rejected": -1432.0, "loss": 0.6578, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.46484375, "rewards/margins": 0.11767578125, "rewards/rejected": 0.34765625, "step": 892 }, { "epoch": 0.25760853887206114, "grad_norm": 9.452711379895465, "learning_rate": 4.631370128822396e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1560.0, "logps/rejected": -1440.0, "loss": 0.6603, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4296875, "rewards/margins": 0.060791015625, "rewards/rejected": 0.369140625, "step": 893 }, { "epoch": 0.25789701427953265, "grad_norm": 11.44738970038878, "learning_rate": 4.630052961150173e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1584.0, "logps/rejected": -1664.0, "loss": 0.6769, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.59765625, "rewards/margins": 0.08056640625, "rewards/rejected": 0.515625, "step": 894 }, { "epoch": 0.25818548968700417, "grad_norm": 12.360665721167132, "learning_rate": 4.6287336324550894e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1856.0, "logps/rejected": -1872.0, "loss": 0.7023, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.453125, "rewards/margins": -0.0595703125, "rewards/rejected": 0.51171875, "step": 895 }, { "epoch": 0.2584739650944757, "grad_norm": 12.862114412363448, "learning_rate": 4.627412144075658e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -2016.0, "logps/rejected": -1904.0, "loss": 0.6666, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3984.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.6640625, "rewards/margins": 0.0162353515625, "rewards/rejected": 0.64453125, "step": 896 }, { "epoch": 0.2587624405019472, "grad_norm": 11.352679163643932, "learning_rate": 4.6260884973525805e-07, "logits/chosen": 3.125, "logits/rejected": 3.125, "logps/chosen": -1688.0, "logps/rejected": -1488.0, "loss": 0.6733, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.6171875, "rewards/margins": 0.087890625, "rewards/rejected": 0.52734375, "step": 897 }, { "epoch": 0.2590509159094187, "grad_norm": 10.478473507027283, "learning_rate": 4.624762693628748e-07, "logits/chosen": 3.28125, "logits/rejected": 3.3125, "logps/chosen": -2000.0, "logps/rejected": -2008.0, "loss": 0.7024, "loss/demonstration_loss": -4096.0, "loss/preference_loss": -4096.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.76171875, "rewards/margins": 0.031494140625, "rewards/rejected": 0.73046875, "step": 898 }, { "epoch": 0.2593393913168902, "grad_norm": 10.86230041069102, "learning_rate": 4.623434734249242e-07, "logits/chosen": 3.359375, "logits/rejected": 3.296875, "logps/chosen": -1656.0, "logps/rejected": -1672.0, "loss": 0.6584, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.52734375, "rewards/margins": -0.0069580078125, "rewards/rejected": 0.53515625, "step": 899 }, { "epoch": 0.2596278667243617, "grad_norm": 9.61391318321402, "learning_rate": 4.6221046205613286e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1688.0, "logps/rejected": -1592.0, "loss": 0.6589, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.455078125, "rewards/margins": 0.08349609375, "rewards/rejected": 0.37109375, "step": 900 }, { "epoch": 0.2599163421318333, "grad_norm": 10.264457826451379, "learning_rate": 4.620772353914461e-07, "logits/chosen": 3.3125, "logits/rejected": 3.21875, "logps/chosen": -1672.0, "logps/rejected": -1504.0, "loss": 0.6906, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.384765625, "rewards/margins": 0.0101318359375, "rewards/rejected": 0.373046875, "step": 901 }, { "epoch": 0.2602048175393048, "grad_norm": 14.32097661463869, "learning_rate": 4.6194379356602766e-07, "logits/chosen": 3.34375, "logits/rejected": 3.3125, "logps/chosen": -1296.0, "logps/rejected": -1360.0, "loss": 0.7379, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2688.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.314453125, "rewards/margins": -0.08154296875, "rewards/rejected": 0.396484375, "step": 902 }, { "epoch": 0.2604932929467763, "grad_norm": 11.86760106922937, "learning_rate": 4.6181013671525955e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1848.0, "logps/rejected": -1928.0, "loss": 0.7009, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.6484375, "rewards/margins": -0.0115966796875, "rewards/rejected": 0.66015625, "step": 903 }, { "epoch": 0.2607817683542478, "grad_norm": 10.844722032987535, "learning_rate": 4.616762649747419e-07, "logits/chosen": 3.21875, "logits/rejected": 3.203125, "logps/chosen": -2144.0, "logps/rejected": -1824.0, "loss": 0.7081, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4032.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5625, "rewards/margins": -0.08642578125, "rewards/rejected": 0.6484375, "step": 904 }, { "epoch": 0.26107024376171933, "grad_norm": 10.291725140539793, "learning_rate": 4.615421784802928e-07, "logits/chosen": 3.265625, "logits/rejected": 3.28125, "logps/chosen": -1888.0, "logps/rejected": -1808.0, "loss": 0.6825, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.6953125, "rewards/margins": 0.1455078125, "rewards/rejected": 0.546875, "step": 905 }, { "epoch": 0.26135871916919085, "grad_norm": 10.583078078739183, "learning_rate": 4.614078773679484e-07, "logits/chosen": 3.296875, "logits/rejected": 3.34375, "logps/chosen": -1800.0, "logps/rejected": -1744.0, "loss": 0.687, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.62890625, "rewards/margins": 0.043701171875, "rewards/rejected": 0.5859375, "step": 906 }, { "epoch": 0.26164719457666236, "grad_norm": 10.922708497340988, "learning_rate": 4.612733617739625e-07, "logits/chosen": 3.421875, "logits/rejected": 3.34375, "logps/chosen": -1648.0, "logps/rejected": -1640.0, "loss": 0.6925, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5546875, "rewards/margins": 0.03857421875, "rewards/rejected": 0.515625, "step": 907 }, { "epoch": 0.26193566998413387, "grad_norm": 11.682761380305216, "learning_rate": 4.6113863183480637e-07, "logits/chosen": 3.15625, "logits/rejected": 3.265625, "logps/chosen": -1728.0, "logps/rejected": -1456.0, "loss": 0.6741, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.59765625, "rewards/margins": 0.103515625, "rewards/rejected": 0.494140625, "step": 908 }, { "epoch": 0.2622241453916054, "grad_norm": 11.09898031546255, "learning_rate": 4.61003687687169e-07, "logits/chosen": 3.28125, "logits/rejected": 3.328125, "logps/chosen": -1704.0, "logps/rejected": -1600.0, "loss": 0.6779, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.53125, "rewards/margins": 0.1318359375, "rewards/rejected": 0.400390625, "step": 909 }, { "epoch": 0.2625126207990769, "grad_norm": 10.851917078634518, "learning_rate": 4.6086852946795646e-07, "logits/chosen": 3.34375, "logits/rejected": 3.3125, "logps/chosen": -1696.0, "logps/rejected": -1632.0, "loss": 0.6374, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.69140625, "rewards/margins": 0.158203125, "rewards/rejected": 0.53125, "step": 910 }, { "epoch": 0.2628010962065484, "grad_norm": 11.697947647800692, "learning_rate": 4.607331573142921e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -2032.0, "logps/rejected": -1712.0, "loss": 0.6476, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.64453125, "rewards/margins": 0.10107421875, "rewards/rejected": 0.54296875, "step": 911 }, { "epoch": 0.2630895716140199, "grad_norm": 10.335457072007767, "learning_rate": 4.605975713635163e-07, "logits/chosen": 3.328125, "logits/rejected": 3.390625, "logps/chosen": -1792.0, "logps/rejected": -1896.0, "loss": 0.7025, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.408203125, "rewards/margins": -0.009521484375, "rewards/rejected": 0.41796875, "step": 912 }, { "epoch": 0.2633780470214914, "grad_norm": 10.896962704997595, "learning_rate": 4.604617717531865e-07, "logits/chosen": 3.296875, "logits/rejected": 3.296875, "logps/chosen": -1736.0, "logps/rejected": -1760.0, "loss": 0.7059, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.498046875, "rewards/margins": 0.040283203125, "rewards/rejected": 0.45703125, "step": 913 }, { "epoch": 0.26366652242896294, "grad_norm": 10.72063677224275, "learning_rate": 4.603257586210766e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1480.0, "logps/rejected": -1704.0, "loss": 0.7031, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.447265625, "rewards/margins": 0.0074462890625, "rewards/rejected": 0.439453125, "step": 914 }, { "epoch": 0.26395499783643445, "grad_norm": 10.624299723002819, "learning_rate": 4.601895321051774e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1584.0, "logps/rejected": -1392.0, "loss": 0.6585, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.51171875, "rewards/margins": 0.10595703125, "rewards/rejected": 0.408203125, "step": 915 }, { "epoch": 0.26424347324390596, "grad_norm": 12.922084550213027, "learning_rate": 4.6005309234369605e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1336.0, "logps/rejected": -1408.0, "loss": 0.6757, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.494140625, "rewards/margins": 0.031982421875, "rewards/rejected": 0.4609375, "step": 916 }, { "epoch": 0.26453194865137747, "grad_norm": 10.549364403217549, "learning_rate": 4.5991643947505605e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1624.0, "logps/rejected": -1528.0, "loss": 0.7162, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.40625, "rewards/margins": -0.016357421875, "rewards/rejected": 0.421875, "step": 917 }, { "epoch": 0.264820424058849, "grad_norm": 11.725558179523782, "learning_rate": 4.5977957363789717e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1304.0, "logps/rejected": -1448.0, "loss": 0.7201, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.48828125, "rewards/margins": -0.095703125, "rewards/rejected": 0.58203125, "step": 918 }, { "epoch": 0.2651088994663205, "grad_norm": 11.237955345790029, "learning_rate": 4.5964249497107515e-07, "logits/chosen": 3.25, "logits/rejected": 3.34375, "logps/chosen": -1360.0, "logps/rejected": -1136.0, "loss": 0.6854, "loss/demonstration_loss": -2528.0, "loss/preference_loss": -2544.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.412109375, "rewards/margins": -0.017333984375, "rewards/rejected": 0.4296875, "step": 919 }, { "epoch": 0.265397374873792, "grad_norm": 12.127182650464324, "learning_rate": 4.5950520361366174e-07, "logits/chosen": 3.203125, "logits/rejected": 3.046875, "logps/chosen": -1368.0, "logps/rejected": -1288.0, "loss": 0.6876, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2688.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3515625, "rewards/margins": 0.04345703125, "rewards/rejected": 0.30859375, "step": 920 }, { "epoch": 0.2656858502812635, "grad_norm": 11.588734078301309, "learning_rate": 4.5936769970494453e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -2080.0, "logps/rejected": -1752.0, "loss": 0.6776, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.60546875, "rewards/margins": 0.12451171875, "rewards/rejected": 0.482421875, "step": 921 }, { "epoch": 0.265974325688735, "grad_norm": 11.915185126797628, "learning_rate": 4.592299833844266e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1960.0, "logps/rejected": -1680.0, "loss": 0.6293, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.65625, "rewards/margins": 0.173828125, "rewards/rejected": 0.48046875, "step": 922 }, { "epoch": 0.26626280109620654, "grad_norm": 13.688771694467569, "learning_rate": 4.5909205479182657e-07, "logits/chosen": 3.140625, "logits/rejected": 3.203125, "logps/chosen": -1576.0, "logps/rejected": -1640.0, "loss": 0.6945, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.53515625, "rewards/margins": 0.1953125, "rewards/rejected": 0.33984375, "step": 923 }, { "epoch": 0.26655127650367805, "grad_norm": 10.70851650584107, "learning_rate": 4.589539140670784e-07, "logits/chosen": 3.15625, "logits/rejected": 3.109375, "logps/chosen": -1192.0, "logps/rejected": -1256.0, "loss": 0.6708, "loss/demonstration_loss": -2496.0, "loss/preference_loss": -2480.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.443359375, "rewards/margins": 0.0771484375, "rewards/rejected": 0.365234375, "step": 924 }, { "epoch": 0.26683975191114956, "grad_norm": 11.299161331986914, "learning_rate": 4.5881556135033147e-07, "logits/chosen": 3.265625, "logits/rejected": 3.28125, "logps/chosen": -1640.0, "logps/rejected": -1400.0, "loss": 0.6642, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.52734375, "rewards/margins": 0.0028533935546875, "rewards/rejected": 0.5234375, "step": 925 }, { "epoch": 0.26712822731862107, "grad_norm": 12.550149018593496, "learning_rate": 4.5867699678194994e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1904.0, "logps/rejected": -1552.0, "loss": 0.686, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.396484375, "rewards/margins": 0.00347900390625, "rewards/rejected": 0.392578125, "step": 926 }, { "epoch": 0.2674167027260926, "grad_norm": 10.269958355210981, "learning_rate": 4.585382205025131e-07, "logits/chosen": 3.328125, "logits/rejected": 3.359375, "logps/chosen": -1968.0, "logps/rejected": -1960.0, "loss": 0.6758, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3984.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.58203125, "rewards/margins": 0.0625, "rewards/rejected": 0.51953125, "step": 927 }, { "epoch": 0.2677051781335641, "grad_norm": 11.59885762877722, "learning_rate": 4.583992326528149e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1840.0, "logps/rejected": -1832.0, "loss": 0.6812, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5234375, "rewards/margins": 0.0150146484375, "rewards/rejected": 0.5078125, "step": 928 }, { "epoch": 0.2679936535410356, "grad_norm": 11.847260015510527, "learning_rate": 4.582600333738641e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -1520.0, "logps/rejected": -1704.0, "loss": 0.7462, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.337890625, "rewards/margins": -0.1845703125, "rewards/rejected": 0.5234375, "step": 929 }, { "epoch": 0.2682821289485071, "grad_norm": 10.907285003407916, "learning_rate": 4.581206228068838e-07, "logits/chosen": 3.296875, "logits/rejected": 3.21875, "logps/chosen": -1816.0, "logps/rejected": -1832.0, "loss": 0.65, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.609375, "rewards/margins": 0.09912109375, "rewards/rejected": 0.51171875, "step": 930 }, { "epoch": 0.2685706043559786, "grad_norm": 10.38710655330245, "learning_rate": 4.5798100109331154e-07, "logits/chosen": 3.25, "logits/rejected": 3.28125, "logps/chosen": -1944.0, "logps/rejected": -1896.0, "loss": 0.6765, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.53515625, "rewards/margins": -0.0223388671875, "rewards/rejected": 0.55859375, "step": 931 }, { "epoch": 0.2688590797634502, "grad_norm": 10.529417558703228, "learning_rate": 4.578411683747991e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1280.0, "logps/rejected": -1496.0, "loss": 0.659, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.38671875, "rewards/margins": 0.02587890625, "rewards/rejected": 0.359375, "step": 932 }, { "epoch": 0.2691475551709217, "grad_norm": 11.385953152697091, "learning_rate": 4.577011247932122e-07, "logits/chosen": 3.265625, "logits/rejected": 3.265625, "logps/chosen": -1680.0, "logps/rejected": -1744.0, "loss": 0.7181, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.390625, "rewards/margins": -0.01300048828125, "rewards/rejected": 0.404296875, "step": 933 }, { "epoch": 0.2694360305783932, "grad_norm": 13.21984684209804, "learning_rate": 4.5756087049063077e-07, "logits/chosen": 3.125, "logits/rejected": 2.9375, "logps/chosen": -1368.0, "logps/rejected": -1528.0, "loss": 0.6995, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.31640625, "rewards/margins": -0.061279296875, "rewards/rejected": 0.376953125, "step": 934 }, { "epoch": 0.2697245059858647, "grad_norm": 11.010340775086508, "learning_rate": 4.574204056093481e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -1600.0, "logps/rejected": -1496.0, "loss": 0.6628, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.326171875, "rewards/margins": -0.0419921875, "rewards/rejected": 0.369140625, "step": 935 }, { "epoch": 0.27001298139333624, "grad_norm": 9.700924017479833, "learning_rate": 4.572797302918715e-07, "logits/chosen": 3.28125, "logits/rejected": 3.234375, "logps/chosen": -1480.0, "logps/rejected": -1648.0, "loss": 0.7125, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.41015625, "rewards/margins": -0.046630859375, "rewards/rejected": 0.455078125, "step": 936 }, { "epoch": 0.27030145680080775, "grad_norm": 11.545608732035248, "learning_rate": 4.571388446809216e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1600.0, "logps/rejected": -1464.0, "loss": 0.7067, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.50390625, "rewards/margins": 0.0020751953125, "rewards/rejected": 0.5, "step": 937 }, { "epoch": 0.27058993220827926, "grad_norm": 10.09897713589775, "learning_rate": 4.569977489194324e-07, "logits/chosen": 3.359375, "logits/rejected": 3.359375, "logps/chosen": -2112.0, "logps/rejected": -2112.0, "loss": 0.6516, "loss/demonstration_loss": -4256.0, "loss/preference_loss": -4256.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.453125, "rewards/margins": 0.08740234375, "rewards/rejected": 0.365234375, "step": 938 }, { "epoch": 0.27087840761575077, "grad_norm": 12.28518616478594, "learning_rate": 4.5685644315055126e-07, "logits/chosen": 3.265625, "logits/rejected": 3.0, "logps/chosen": -1600.0, "logps/rejected": -1456.0, "loss": 0.703, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4765625, "rewards/margins": 0.0751953125, "rewards/rejected": 0.40234375, "step": 939 }, { "epoch": 0.2711668830232223, "grad_norm": 11.479374518551127, "learning_rate": 4.567149275176383e-07, "logits/chosen": 3.3125, "logits/rejected": 3.25, "logps/chosen": -1472.0, "logps/rejected": -1552.0, "loss": 0.6702, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4609375, "rewards/margins": 0.0400390625, "rewards/rejected": 0.419921875, "step": 940 }, { "epoch": 0.2714553584306938, "grad_norm": 11.180617721697066, "learning_rate": 4.565732021642668e-07, "logits/chosen": 3.296875, "logits/rejected": 3.328125, "logps/chosen": -1608.0, "logps/rejected": -1464.0, "loss": 0.6783, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.478515625, "rewards/margins": -0.002593994140625, "rewards/rejected": 0.482421875, "step": 941 }, { "epoch": 0.2717438338381653, "grad_norm": 9.936490901949039, "learning_rate": 4.5643126723422267e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1536.0, "logps/rejected": -1280.0, "loss": 0.6754, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.458984375, "rewards/margins": 0.09814453125, "rewards/rejected": 0.361328125, "step": 942 }, { "epoch": 0.2720323092456368, "grad_norm": 12.882232756444198, "learning_rate": 4.562891228715046e-07, "logits/chosen": 3.078125, "logits/rejected": 3.15625, "logps/chosen": -1624.0, "logps/rejected": -1376.0, "loss": 0.6752, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.455078125, "rewards/margins": 0.1669921875, "rewards/rejected": 0.287109375, "step": 943 }, { "epoch": 0.2723207846531083, "grad_norm": 11.051097521419868, "learning_rate": 4.561467692203235e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1640.0, "logps/rejected": -1696.0, "loss": 0.7046, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.431640625, "rewards/margins": -0.0341796875, "rewards/rejected": 0.46484375, "step": 944 }, { "epoch": 0.27260926006057984, "grad_norm": 11.649599458378562, "learning_rate": 4.560042064251029e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1472.0, "logps/rejected": -1640.0, "loss": 0.692, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.43359375, "rewards/margins": 0.001190185546875, "rewards/rejected": 0.431640625, "step": 945 }, { "epoch": 0.27289773546805135, "grad_norm": 10.657568505395876, "learning_rate": 4.558614346304783e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1936.0, "logps/rejected": -1664.0, "loss": 0.649, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5546875, "rewards/margins": 0.12890625, "rewards/rejected": 0.423828125, "step": 946 }, { "epoch": 0.27318621087552286, "grad_norm": 9.471539729341357, "learning_rate": 4.5571845398129747e-07, "logits/chosen": 3.375, "logits/rejected": 3.359375, "logps/chosen": -1600.0, "logps/rejected": -1632.0, "loss": 0.6711, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4375, "rewards/margins": 0.0125732421875, "rewards/rejected": 0.42578125, "step": 947 }, { "epoch": 0.2734746862829944, "grad_norm": 12.005934480957475, "learning_rate": 4.5557526462261986e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1848.0, "logps/rejected": -1544.0, "loss": 0.7048, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.490234375, "rewards/margins": 0.08837890625, "rewards/rejected": 0.40234375, "step": 948 }, { "epoch": 0.2737631616904659, "grad_norm": 11.659956622386694, "learning_rate": 4.5543186669971665e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1488.0, "logps/rejected": -1544.0, "loss": 0.6498, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.435546875, "rewards/margins": 0.0546875, "rewards/rejected": 0.380859375, "step": 949 }, { "epoch": 0.2740516370979374, "grad_norm": 10.912399181192729, "learning_rate": 4.552882603580708e-07, "logits/chosen": 3.203125, "logits/rejected": 3.140625, "logps/chosen": -1552.0, "logps/rejected": -1704.0, "loss": 0.707, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.419921875, "rewards/margins": 0.01043701171875, "rewards/rejected": 0.41015625, "step": 950 }, { "epoch": 0.2743401125054089, "grad_norm": 10.121750567145934, "learning_rate": 4.5514444574337646e-07, "logits/chosen": 3.296875, "logits/rejected": 3.265625, "logps/chosen": -1472.0, "logps/rejected": -1520.0, "loss": 0.6638, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6015625, "rewards/margins": 0.11962890625, "rewards/rejected": 0.482421875, "step": 951 }, { "epoch": 0.2746285879128804, "grad_norm": 9.948714820656734, "learning_rate": 4.550004230015394e-07, "logits/chosen": 3.15625, "logits/rejected": 3.125, "logps/chosen": -1712.0, "logps/rejected": -1656.0, "loss": 0.6656, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.470703125, "rewards/margins": 0.09326171875, "rewards/rejected": 0.376953125, "step": 952 }, { "epoch": 0.27491706332035193, "grad_norm": 11.61056973275793, "learning_rate": 4.548561922786763e-07, "logits/chosen": 3.265625, "logits/rejected": 3.125, "logps/chosen": -1656.0, "logps/rejected": -1768.0, "loss": 0.6881, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.37109375, "rewards/margins": -0.039794921875, "rewards/rejected": 0.412109375, "step": 953 }, { "epoch": 0.27520553872782344, "grad_norm": 10.309604798565307, "learning_rate": 4.54711753721115e-07, "logits/chosen": 3.28125, "logits/rejected": 3.296875, "logps/chosen": -1696.0, "logps/rejected": -1408.0, "loss": 0.6804, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3359375, "rewards/margins": 0.076171875, "rewards/rejected": 0.259765625, "step": 954 }, { "epoch": 0.27549401413529495, "grad_norm": 10.1682469776998, "learning_rate": 4.545671074753941e-07, "logits/chosen": 3.15625, "logits/rejected": 3.25, "logps/chosen": -1800.0, "logps/rejected": -1680.0, "loss": 0.6782, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.412109375, "rewards/margins": 0.03759765625, "rewards/rejected": 0.373046875, "step": 955 }, { "epoch": 0.27578248954276646, "grad_norm": 11.504953234766683, "learning_rate": 4.5442225368826285e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -1696.0, "logps/rejected": -1624.0, "loss": 0.7076, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.416015625, "rewards/margins": 0.006439208984375, "rewards/rejected": 0.408203125, "step": 956 }, { "epoch": 0.276070964950238, "grad_norm": 12.168369097902977, "learning_rate": 4.542771925066812e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1520.0, "logps/rejected": -1624.0, "loss": 0.6718, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.50390625, "rewards/margins": 0.060302734375, "rewards/rejected": 0.443359375, "step": 957 }, { "epoch": 0.2763594403577095, "grad_norm": 10.488007577533148, "learning_rate": 4.541319240778194e-07, "logits/chosen": 3.21875, "logits/rejected": 3.125, "logps/chosen": -2176.0, "logps/rejected": -2096.0, "loss": 0.6818, "loss/demonstration_loss": -4320.0, "loss/preference_loss": -4320.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.59375, "rewards/margins": 0.125, "rewards/rejected": 0.470703125, "step": 958 }, { "epoch": 0.276647915765181, "grad_norm": 10.48390299437886, "learning_rate": 4.539864485490581e-07, "logits/chosen": 3.3125, "logits/rejected": 3.296875, "logps/chosen": -1768.0, "logps/rejected": -1816.0, "loss": 0.6907, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.41015625, "rewards/margins": 0.01324462890625, "rewards/rejected": 0.3984375, "step": 959 }, { "epoch": 0.2769363911726525, "grad_norm": 13.463700722711927, "learning_rate": 4.538407660679879e-07, "logits/chosen": 3.21875, "logits/rejected": 3.25, "logps/chosen": -1384.0, "logps/rejected": -1336.0, "loss": 0.7136, "loss/demonstration_loss": -2752.0, "loss/preference_loss": -2752.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.369140625, "rewards/margins": 0.00616455078125, "rewards/rejected": 0.36328125, "step": 960 }, { "epoch": 0.277224866580124, "grad_norm": 12.609660001285349, "learning_rate": 4.5369487678240946e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1648.0, "logps/rejected": -1616.0, "loss": 0.7378, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.443359375, "rewards/margins": -0.02978515625, "rewards/rejected": 0.47265625, "step": 961 }, { "epoch": 0.27751334198759553, "grad_norm": 10.27041151151368, "learning_rate": 4.5354878084033313e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1752.0, "logps/rejected": -1760.0, "loss": 0.6745, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.498046875, "rewards/margins": 0.125, "rewards/rejected": 0.373046875, "step": 962 }, { "epoch": 0.2778018173950671, "grad_norm": 11.385464655029894, "learning_rate": 4.5340247838997917e-07, "logits/chosen": 3.25, "logits/rejected": 3.28125, "logps/chosen": -1824.0, "logps/rejected": -1760.0, "loss": 0.6702, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.50390625, "rewards/margins": 0.02880859375, "rewards/rejected": 0.474609375, "step": 963 }, { "epoch": 0.2780902928025386, "grad_norm": 11.05099343854332, "learning_rate": 4.532559695797771e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1640.0, "logps/rejected": -1544.0, "loss": 0.7059, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.447265625, "rewards/margins": 0.06494140625, "rewards/rejected": 0.3828125, "step": 964 }, { "epoch": 0.2783787682100101, "grad_norm": 11.707483514684181, "learning_rate": 4.531092545583659e-07, "logits/chosen": 3.25, "logits/rejected": 3.265625, "logps/chosen": -1960.0, "logps/rejected": -1752.0, "loss": 0.6795, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.390625, "rewards/margins": 0.030517578125, "rewards/rejected": 0.361328125, "step": 965 }, { "epoch": 0.27866724361748163, "grad_norm": 12.576312068577296, "learning_rate": 4.5296233347459377e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1544.0, "logps/rejected": -1352.0, "loss": 0.691, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.306640625, "rewards/margins": 0.051025390625, "rewards/rejected": 0.25390625, "step": 966 }, { "epoch": 0.27895571902495314, "grad_norm": 12.064003132879426, "learning_rate": 4.52815206477518e-07, "logits/chosen": 3.1875, "logits/rejected": 3.203125, "logps/chosen": -1784.0, "logps/rejected": -1976.0, "loss": 0.7139, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.484375, "rewards/margins": 0.024169921875, "rewards/rejected": 0.4609375, "step": 967 }, { "epoch": 0.27924419443242465, "grad_norm": 9.566324066539504, "learning_rate": 4.5266787371640464e-07, "logits/chosen": 3.3125, "logits/rejected": 3.28125, "logps/chosen": -1392.0, "logps/rejected": -1600.0, "loss": 0.6783, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.423828125, "rewards/margins": 0.0419921875, "rewards/rejected": 0.3828125, "step": 968 }, { "epoch": 0.27953266983989616, "grad_norm": 12.405435269443675, "learning_rate": 4.5252033534072867e-07, "logits/chosen": 3.28125, "logits/rejected": 3.203125, "logps/chosen": -1600.0, "logps/rejected": -1640.0, "loss": 0.6738, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.33984375, "rewards/margins": 0.1083984375, "rewards/rejected": 0.2314453125, "step": 969 }, { "epoch": 0.2798211452473677, "grad_norm": 11.69870452479298, "learning_rate": 4.523725915001735e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -1616.0, "logps/rejected": -1624.0, "loss": 0.6736, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.46484375, "rewards/margins": 0.07861328125, "rewards/rejected": 0.38671875, "step": 970 }, { "epoch": 0.2801096206548392, "grad_norm": 9.115535838774612, "learning_rate": 4.522246423446312e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1736.0, "logps/rejected": -1608.0, "loss": 0.6614, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.466796875, "rewards/margins": 0.130859375, "rewards/rejected": 0.3359375, "step": 971 }, { "epoch": 0.2803980960623107, "grad_norm": 10.938440071405513, "learning_rate": 4.520764880242021e-07, "logits/chosen": 3.15625, "logits/rejected": 3.234375, "logps/chosen": -1736.0, "logps/rejected": -1392.0, "loss": 0.6569, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.423828125, "rewards/margins": 0.12109375, "rewards/rejected": 0.302734375, "step": 972 }, { "epoch": 0.2806865714697822, "grad_norm": 9.953261382665383, "learning_rate": 4.519281286891943e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1336.0, "logps/rejected": -1352.0, "loss": 0.6762, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2720.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.263671875, "rewards/margins": -0.021484375, "rewards/rejected": 0.28515625, "step": 973 }, { "epoch": 0.2809750468772537, "grad_norm": 10.376730985362087, "learning_rate": 4.5177956449012454e-07, "logits/chosen": 3.328125, "logits/rejected": 3.34375, "logps/chosen": -1824.0, "logps/rejected": -1776.0, "loss": 0.699, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4453125, "rewards/margins": 0.02001953125, "rewards/rejected": 0.42578125, "step": 974 }, { "epoch": 0.28126352228472523, "grad_norm": 12.997210794188563, "learning_rate": 4.516307955777169e-07, "logits/chosen": 3.203125, "logits/rejected": 3.140625, "logps/chosen": -1632.0, "logps/rejected": -1656.0, "loss": 0.7223, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.375, "rewards/margins": -0.041015625, "rewards/rejected": 0.416015625, "step": 975 }, { "epoch": 0.28155199769219674, "grad_norm": 10.317418362600666, "learning_rate": 4.514818221029034e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1624.0, "logps/rejected": -1912.0, "loss": 0.6409, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.53515625, "rewards/margins": 0.0966796875, "rewards/rejected": 0.439453125, "step": 976 }, { "epoch": 0.28184047309966825, "grad_norm": 12.375737232105662, "learning_rate": 4.513326442168235e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -1864.0, "logps/rejected": -2000.0, "loss": 0.6981, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.4453125, "rewards/margins": -0.0673828125, "rewards/rejected": 0.51171875, "step": 977 }, { "epoch": 0.28212894850713977, "grad_norm": 11.796289569828206, "learning_rate": 4.511832620708239e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1568.0, "logps/rejected": -1608.0, "loss": 0.6755, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.35546875, "rewards/margins": 0.0869140625, "rewards/rejected": 0.267578125, "step": 978 }, { "epoch": 0.2824174239146113, "grad_norm": 13.30386279288329, "learning_rate": 4.510336758164589e-07, "logits/chosen": 3.21875, "logits/rejected": 3.28125, "logps/chosen": -1448.0, "logps/rejected": -1656.0, "loss": 0.7361, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.3359375, "rewards/margins": -0.1240234375, "rewards/rejected": 0.458984375, "step": 979 }, { "epoch": 0.2827058993220828, "grad_norm": 9.743310505386415, "learning_rate": 4.508838856054896e-07, "logits/chosen": 3.3125, "logits/rejected": 3.359375, "logps/chosen": -1408.0, "logps/rejected": -1264.0, "loss": 0.669, "loss/demonstration_loss": -2704.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.380859375, "rewards/margins": 0.016845703125, "rewards/rejected": 0.36328125, "step": 980 }, { "epoch": 0.2829943747295543, "grad_norm": 9.850574818600123, "learning_rate": 4.50733891589884e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1592.0, "logps/rejected": -1424.0, "loss": 0.6611, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3984375, "rewards/margins": 0.0947265625, "rewards/rejected": 0.3046875, "step": 981 }, { "epoch": 0.2832828501370258, "grad_norm": 12.486022893711196, "learning_rate": 4.5058369392181707e-07, "logits/chosen": 3.109375, "logits/rejected": 3.28125, "logps/chosen": -1984.0, "logps/rejected": -1824.0, "loss": 0.7059, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.51953125, "rewards/margins": 0.134765625, "rewards/rejected": 0.3828125, "step": 982 }, { "epoch": 0.2835713255444973, "grad_norm": 10.926752997714912, "learning_rate": 4.504332927536702e-07, "logits/chosen": 3.203125, "logits/rejected": 3.140625, "logps/chosen": -1728.0, "logps/rejected": -1648.0, "loss": 0.6906, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.45703125, "rewards/margins": -0.007415771484375, "rewards/rejected": 0.462890625, "step": 983 }, { "epoch": 0.28385980095196883, "grad_norm": 11.624839981421283, "learning_rate": 4.502826882380313e-07, "logits/chosen": 3.359375, "logits/rejected": 3.296875, "logps/chosen": -1744.0, "logps/rejected": -1616.0, "loss": 0.6613, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.376953125, "rewards/margins": 0.0546875, "rewards/rejected": 0.322265625, "step": 984 }, { "epoch": 0.28414827635944034, "grad_norm": 13.974501302543205, "learning_rate": 4.501318805276947e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0625, "logps/chosen": -1728.0, "logps/rejected": -1704.0, "loss": 0.6823, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.41015625, "rewards/margins": 0.0673828125, "rewards/rejected": 0.341796875, "step": 985 }, { "epoch": 0.28443675176691186, "grad_norm": 10.14777759121773, "learning_rate": 4.4998086977566067e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1440.0, "logps/rejected": -1496.0, "loss": 0.6658, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.359375, "rewards/margins": 0.0179443359375, "rewards/rejected": 0.33984375, "step": 986 }, { "epoch": 0.28472522717438337, "grad_norm": 9.416608047664688, "learning_rate": 4.4982965613513566e-07, "logits/chosen": 3.1875, "logits/rejected": 3.125, "logps/chosen": -1264.0, "logps/rejected": -1048.0, "loss": 0.6708, "loss/demonstration_loss": -2352.0, "loss/preference_loss": -2336.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.31640625, "rewards/margins": 0.0849609375, "rewards/rejected": 0.2314453125, "step": 987 }, { "epoch": 0.2850137025818549, "grad_norm": 10.413555532071907, "learning_rate": 4.4967823975953185e-07, "logits/chosen": 3.1875, "logits/rejected": 3.203125, "logps/chosen": -1456.0, "logps/rejected": -1584.0, "loss": 0.6857, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.271484375, "rewards/margins": 0.053955078125, "rewards/rejected": 0.2177734375, "step": 988 }, { "epoch": 0.2853021779893264, "grad_norm": 9.715969066531516, "learning_rate": 4.495266208024671e-07, "logits/chosen": 3.0625, "logits/rejected": 3.109375, "logps/chosen": -1504.0, "logps/rejected": -1544.0, "loss": 0.6699, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.365234375, "rewards/margins": 0.03955078125, "rewards/rejected": 0.32421875, "step": 989 }, { "epoch": 0.2855906533967979, "grad_norm": 10.59297515613841, "learning_rate": 4.493747994177649e-07, "logits/chosen": 3.25, "logits/rejected": 3.296875, "logps/chosen": -1864.0, "logps/rejected": -1784.0, "loss": 0.6391, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.486328125, "rewards/margins": 0.1689453125, "rewards/rejected": 0.318359375, "step": 990 }, { "epoch": 0.2858791288042694, "grad_norm": 10.350104899989711, "learning_rate": 4.49222775759454e-07, "logits/chosen": 2.9375, "logits/rejected": 3.015625, "logps/chosen": -1448.0, "logps/rejected": -1232.0, "loss": 0.6608, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.33203125, "rewards/margins": 0.08837890625, "rewards/rejected": 0.244140625, "step": 991 }, { "epoch": 0.2861676042117409, "grad_norm": 10.78936749452285, "learning_rate": 4.4907054998176843e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1464.0, "logps/rejected": -1192.0, "loss": 0.66, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2672.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.349609375, "rewards/margins": 0.158203125, "rewards/rejected": 0.1923828125, "step": 992 }, { "epoch": 0.2864560796192125, "grad_norm": 10.437666605710469, "learning_rate": 4.4891812223914714e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -1560.0, "logps/rejected": -1464.0, "loss": 0.6668, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.41015625, "rewards/margins": 0.087890625, "rewards/rejected": 0.322265625, "step": 993 }, { "epoch": 0.286744555026684, "grad_norm": 11.278607243510134, "learning_rate": 4.487654926862343e-07, "logits/chosen": 3.265625, "logits/rejected": 3.28125, "logps/chosen": -1608.0, "logps/rejected": -1560.0, "loss": 0.6757, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.359375, "rewards/margins": 0.09814453125, "rewards/rejected": 0.259765625, "step": 994 }, { "epoch": 0.2870330304341555, "grad_norm": 11.662771886704727, "learning_rate": 4.486126614778785e-07, "logits/chosen": 3.21875, "logits/rejected": 3.171875, "logps/chosen": -1640.0, "logps/rejected": -1592.0, "loss": 0.6931, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.333984375, "rewards/margins": 0.0264892578125, "rewards/rejected": 0.30859375, "step": 995 }, { "epoch": 0.287321505841627, "grad_norm": 11.15241005953297, "learning_rate": 4.4845962876913303e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1680.0, "logps/rejected": -1592.0, "loss": 0.6566, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.37109375, "rewards/margins": 0.05517578125, "rewards/rejected": 0.31640625, "step": 996 }, { "epoch": 0.28760998124909853, "grad_norm": 10.711424116386574, "learning_rate": 4.4830639471525555e-07, "logits/chosen": 3.390625, "logits/rejected": 3.34375, "logps/chosen": -1952.0, "logps/rejected": -1968.0, "loss": 0.6738, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.365234375, "rewards/margins": 0.09130859375, "rewards/rejected": 0.2734375, "step": 997 }, { "epoch": 0.28789845665657005, "grad_norm": 10.068915704386708, "learning_rate": 4.4815295947170824e-07, "logits/chosen": 3.078125, "logits/rejected": 3.25, "logps/chosen": -1408.0, "logps/rejected": -1184.0, "loss": 0.6385, "loss/demonstration_loss": -2624.0, "loss/preference_loss": -2608.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.322265625, "rewards/margins": 0.17578125, "rewards/rejected": 0.1455078125, "step": 998 }, { "epoch": 0.28818693206404156, "grad_norm": 10.757947704091364, "learning_rate": 4.479993231941571e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1416.0, "logps/rejected": -1504.0, "loss": 0.6823, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.376953125, "rewards/margins": -0.022705078125, "rewards/rejected": 0.3984375, "step": 999 }, { "epoch": 0.28847540747151307, "grad_norm": 10.722620086975104, "learning_rate": 4.4784548603847214e-07, "logits/chosen": 3.125, "logits/rejected": 3.046875, "logps/chosen": -1832.0, "logps/rejected": -1720.0, "loss": 0.6542, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3046875, "rewards/margins": 0.0625, "rewards/rejected": 0.2431640625, "step": 1000 }, { "epoch": 0.2887638828789846, "grad_norm": 11.129193803210033, "learning_rate": 4.4769144816072743e-07, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -1448.0, "logps/rejected": -1004.0, "loss": 0.6658, "loss/demonstration_loss": -2496.0, "loss/preference_loss": -2464.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.37890625, "rewards/margins": 0.185546875, "rewards/rejected": 0.193359375, "step": 1001 }, { "epoch": 0.2890523582864561, "grad_norm": 10.561876252521115, "learning_rate": 4.475372097172003e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1896.0, "logps/rejected": -2064.0, "loss": 0.6937, "loss/demonstration_loss": -4000.0, "loss/preference_loss": -4000.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.416015625, "rewards/margins": 0.020751953125, "rewards/rejected": 0.39453125, "step": 1002 }, { "epoch": 0.2893408336939276, "grad_norm": 11.559118321164453, "learning_rate": 4.4738277086437183e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -2128.0, "logps/rejected": -1960.0, "loss": 0.678, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.34375, "rewards/margins": -0.0269775390625, "rewards/rejected": 0.37109375, "step": 1003 }, { "epoch": 0.2896293091013991, "grad_norm": 11.268206051285386, "learning_rate": 4.472281317589263e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1768.0, "logps/rejected": -1648.0, "loss": 0.6608, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4296875, "rewards/margins": 0.0712890625, "rewards/rejected": 0.359375, "step": 1004 }, { "epoch": 0.2899177845088706, "grad_norm": 10.803036286773263, "learning_rate": 4.4707329255775115e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1768.0, "logps/rejected": -1920.0, "loss": 0.7422, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.421875, "rewards/margins": -0.01507568359375, "rewards/rejected": 0.4375, "step": 1005 }, { "epoch": 0.29020625991634214, "grad_norm": 8.933277300630738, "learning_rate": 4.4691825341793706e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1320.0, "logps/rejected": -1168.0, "loss": 0.6586, "loss/demonstration_loss": -2512.0, "loss/preference_loss": -2496.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.251953125, "rewards/margins": 0.05517578125, "rewards/rejected": 0.1962890625, "step": 1006 }, { "epoch": 0.29049473532381365, "grad_norm": 11.798056181397305, "learning_rate": 4.4676301449677713e-07, "logits/chosen": 3.3125, "logits/rejected": 3.21875, "logps/chosen": -1528.0, "logps/rejected": -1864.0, "loss": 0.6841, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.51953125, "rewards/margins": -0.0213623046875, "rewards/rejected": 0.54296875, "step": 1007 }, { "epoch": 0.29078321073128516, "grad_norm": 10.69219364693706, "learning_rate": 4.4660757595176745e-07, "logits/chosen": 3.1875, "logits/rejected": 3.125, "logps/chosen": -1496.0, "logps/rejected": -1736.0, "loss": 0.6692, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.345703125, "rewards/margins": 0.0673828125, "rewards/rejected": 0.279296875, "step": 1008 }, { "epoch": 0.29107168613875667, "grad_norm": 11.004899148027592, "learning_rate": 4.4645193794060655e-07, "logits/chosen": 3.28125, "logits/rejected": 3.28125, "logps/chosen": -1552.0, "logps/rejected": -1752.0, "loss": 0.6768, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3984375, "rewards/margins": 0.11328125, "rewards/rejected": 0.28515625, "step": 1009 }, { "epoch": 0.2913601615462282, "grad_norm": 11.283551697444814, "learning_rate": 4.4629610062119544e-07, "logits/chosen": 3.171875, "logits/rejected": 3.203125, "logps/chosen": -1872.0, "logps/rejected": -1824.0, "loss": 0.652, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.49609375, "rewards/margins": 0.1064453125, "rewards/rejected": 0.388671875, "step": 1010 }, { "epoch": 0.2916486369536997, "grad_norm": 10.597379611134045, "learning_rate": 4.461400641516371e-07, "logits/chosen": 3.109375, "logits/rejected": 3.125, "logps/chosen": -1624.0, "logps/rejected": -1704.0, "loss": 0.6859, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.380859375, "rewards/margins": 0.037353515625, "rewards/rejected": 0.34375, "step": 1011 }, { "epoch": 0.2919371123611712, "grad_norm": 12.275416407080403, "learning_rate": 4.459838286902368e-07, "logits/chosen": 3.203125, "logits/rejected": 3.15625, "logps/chosen": -1584.0, "logps/rejected": -1736.0, "loss": 0.7167, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.310546875, "rewards/margins": -0.040771484375, "rewards/rejected": 0.3515625, "step": 1012 }, { "epoch": 0.2922255877686427, "grad_norm": 10.867720526624751, "learning_rate": 4.4582739439550153e-07, "logits/chosen": 3.203125, "logits/rejected": 3.125, "logps/chosen": -1272.0, "logps/rejected": -1304.0, "loss": 0.6702, "loss/demonstration_loss": -2608.0, "loss/preference_loss": -2592.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.3359375, "rewards/margins": 0.099609375, "rewards/rejected": 0.2353515625, "step": 1013 }, { "epoch": 0.2925140631761142, "grad_norm": 12.282673187851413, "learning_rate": 4.456707614261401e-07, "logits/chosen": 3.109375, "logits/rejected": 3.125, "logps/chosen": -1568.0, "logps/rejected": -1680.0, "loss": 0.6844, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.43359375, "rewards/margins": 0.08203125, "rewards/rejected": 0.3515625, "step": 1014 }, { "epoch": 0.29280253858358574, "grad_norm": 11.63405925402273, "learning_rate": 4.4551392994106275e-07, "logits/chosen": 3.15625, "logits/rejected": 3.09375, "logps/chosen": -1640.0, "logps/rejected": -1592.0, "loss": 0.7138, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.35546875, "rewards/margins": 0.007354736328125, "rewards/rejected": 0.34765625, "step": 1015 }, { "epoch": 0.29309101399105725, "grad_norm": 11.777567280944552, "learning_rate": 4.453569000993813e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1856.0, "logps/rejected": -1616.0, "loss": 0.6879, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.34375, "rewards/margins": 0.0311279296875, "rewards/rejected": 0.310546875, "step": 1016 }, { "epoch": 0.29337948939852876, "grad_norm": 11.136777305828762, "learning_rate": 4.4519967206040877e-07, "logits/chosen": 3.125, "logits/rejected": 3.25, "logps/chosen": -1648.0, "logps/rejected": -1440.0, "loss": 0.6465, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4453125, "rewards/margins": 0.177734375, "rewards/rejected": 0.267578125, "step": 1017 }, { "epoch": 0.29366796480600027, "grad_norm": 11.510737567604044, "learning_rate": 4.4504224598365916e-07, "logits/chosen": 3.0625, "logits/rejected": 3.015625, "logps/chosen": -1712.0, "logps/rejected": -1736.0, "loss": 0.6938, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.423828125, "rewards/margins": -0.005889892578125, "rewards/rejected": 0.4296875, "step": 1018 }, { "epoch": 0.2939564402134718, "grad_norm": 11.077878080350898, "learning_rate": 4.4488462202884733e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1472.0, "logps/rejected": -1360.0, "loss": 0.6751, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2864.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.240234375, "rewards/margins": -0.0269775390625, "rewards/rejected": 0.267578125, "step": 1019 }, { "epoch": 0.2942449156209433, "grad_norm": 18.88115469119005, "learning_rate": 4.447268003558892e-07, "logits/chosen": 3.078125, "logits/rejected": 3.09375, "logps/chosen": -1768.0, "logps/rejected": -1616.0, "loss": 0.6963, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.408203125, "rewards/margins": -0.0361328125, "rewards/rejected": 0.443359375, "step": 1020 }, { "epoch": 0.2945333910284148, "grad_norm": 10.9235990367153, "learning_rate": 4.445687811249009e-07, "logits/chosen": 3.25, "logits/rejected": 3.1875, "logps/chosen": -1928.0, "logps/rejected": -1632.0, "loss": 0.655, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.419921875, "rewards/margins": 0.1328125, "rewards/rejected": 0.287109375, "step": 1021 }, { "epoch": 0.2948218664358863, "grad_norm": 11.314173501306511, "learning_rate": 4.444105644961994e-07, "logits/chosen": 3.171875, "logits/rejected": 3.109375, "logps/chosen": -1664.0, "logps/rejected": -1528.0, "loss": 0.6829, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4453125, "rewards/margins": 0.030029296875, "rewards/rejected": 0.416015625, "step": 1022 }, { "epoch": 0.2951103418433578, "grad_norm": 9.003809028651654, "learning_rate": 4.442521506303015e-07, "logits/chosen": 3.3125, "logits/rejected": 3.328125, "logps/chosen": -2112.0, "logps/rejected": -1912.0, "loss": 0.687, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.39453125, "rewards/margins": 0.006195068359375, "rewards/rejected": 0.388671875, "step": 1023 }, { "epoch": 0.2953988172508294, "grad_norm": 14.692557678898737, "learning_rate": 4.440935396879245e-07, "logits/chosen": 3.109375, "logits/rejected": 3.15625, "logps/chosen": -1320.0, "logps/rejected": -1384.0, "loss": 0.681, "loss/demonstration_loss": -2736.0, "loss/preference_loss": -2736.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.314453125, "rewards/margins": 0.0242919921875, "rewards/rejected": 0.2890625, "step": 1024 }, { "epoch": 0.2956872926583009, "grad_norm": 11.638718887654086, "learning_rate": 4.4393473182998544e-07, "logits/chosen": 3.171875, "logits/rejected": 3.21875, "logps/chosen": -1488.0, "logps/rejected": -1472.0, "loss": 0.7111, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.35546875, "rewards/margins": -0.0008392333984375, "rewards/rejected": 0.35546875, "step": 1025 }, { "epoch": 0.2959757680657724, "grad_norm": 12.953793471008641, "learning_rate": 4.4377572721760105e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1816.0, "logps/rejected": -1880.0, "loss": 0.7134, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.431640625, "rewards/margins": 0.044921875, "rewards/rejected": 0.38671875, "step": 1026 }, { "epoch": 0.2962642434732439, "grad_norm": 11.176485915743152, "learning_rate": 4.436165260120879e-07, "logits/chosen": 3.375, "logits/rejected": 3.40625, "logps/chosen": -1952.0, "logps/rejected": -1872.0, "loss": 0.7111, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.341796875, "rewards/margins": -0.03125, "rewards/rejected": 0.373046875, "step": 1027 }, { "epoch": 0.29655271888071544, "grad_norm": 10.059378284593764, "learning_rate": 4.434571283749618e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1592.0, "logps/rejected": -1608.0, "loss": 0.6696, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.421875, "rewards/margins": 0.0081787109375, "rewards/rejected": 0.412109375, "step": 1028 }, { "epoch": 0.29684119428818695, "grad_norm": 10.670638904777086, "learning_rate": 4.4329753446793806e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1632.0, "logps/rejected": -1624.0, "loss": 0.6876, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.388671875, "rewards/margins": -0.00177001953125, "rewards/rejected": 0.388671875, "step": 1029 }, { "epoch": 0.29712966969565846, "grad_norm": 11.14820599901039, "learning_rate": 4.4313774445293097e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1896.0, "logps/rejected": -1880.0, "loss": 0.6791, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.58984375, "rewards/margins": 0.10009765625, "rewards/rejected": 0.48828125, "step": 1030 }, { "epoch": 0.29741814510312997, "grad_norm": 9.454338091620606, "learning_rate": 4.4297775849205365e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1424.0, "logps/rejected": -1352.0, "loss": 0.6728, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3203125, "rewards/margins": 0.059326171875, "rewards/rejected": 0.259765625, "step": 1031 }, { "epoch": 0.2977066205106015, "grad_norm": 12.41192727049395, "learning_rate": 4.428175767476184e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1880.0, "logps/rejected": -2008.0, "loss": 0.6824, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.5078125, "rewards/margins": 0.046142578125, "rewards/rejected": 0.4609375, "step": 1032 }, { "epoch": 0.297995095918073, "grad_norm": 12.377032880347572, "learning_rate": 4.426571993821359e-07, "logits/chosen": 3.265625, "logits/rejected": 3.296875, "logps/chosen": -1368.0, "logps/rejected": -1432.0, "loss": 0.6934, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2832.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.26171875, "rewards/margins": -0.057373046875, "rewards/rejected": 0.318359375, "step": 1033 }, { "epoch": 0.2982835713255445, "grad_norm": 9.95151473774507, "learning_rate": 4.424966265583152e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -1632.0, "logps/rejected": -1744.0, "loss": 0.681, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.48046875, "rewards/margins": 0.07421875, "rewards/rejected": 0.40625, "step": 1034 }, { "epoch": 0.298572046733016, "grad_norm": 11.367216827256211, "learning_rate": 4.423358584390639e-07, "logits/chosen": 3.28125, "logits/rejected": 3.28125, "logps/chosen": -1688.0, "logps/rejected": -1688.0, "loss": 0.7254, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.333984375, "rewards/margins": -0.0908203125, "rewards/rejected": 0.42578125, "step": 1035 }, { "epoch": 0.29886052214048753, "grad_norm": 11.74417259921719, "learning_rate": 4.4217489518748753e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1464.0, "logps/rejected": -1264.0, "loss": 0.7052, "loss/demonstration_loss": -2752.0, "loss/preference_loss": -2752.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2451171875, "rewards/margins": 0.021240234375, "rewards/rejected": 0.2236328125, "step": 1036 }, { "epoch": 0.29914899754795904, "grad_norm": 10.66541803953039, "learning_rate": 4.4201373696688967e-07, "logits/chosen": 3.203125, "logits/rejected": 3.25, "logps/chosen": -1824.0, "logps/rejected": -1840.0, "loss": 0.6599, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.455078125, "rewards/margins": 0.138671875, "rewards/rejected": 0.31640625, "step": 1037 }, { "epoch": 0.29943747295543055, "grad_norm": 11.47063053001984, "learning_rate": 4.4185238394077167e-07, "logits/chosen": 3.296875, "logits/rejected": 3.328125, "logps/chosen": -1856.0, "logps/rejected": -2000.0, "loss": 0.6921, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.44921875, "rewards/margins": 0.0169677734375, "rewards/rejected": 0.43359375, "step": 1038 }, { "epoch": 0.29972594836290206, "grad_norm": 11.531669306695642, "learning_rate": 4.4169083627283264e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1344.0, "logps/rejected": -1280.0, "loss": 0.6667, "loss/demonstration_loss": -2672.0, "loss/preference_loss": -2656.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3828125, "rewards/margins": 0.08203125, "rewards/rejected": 0.30078125, "step": 1039 }, { "epoch": 0.3000144237703736, "grad_norm": 12.85030395789688, "learning_rate": 4.41529094126969e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -2040.0, "logps/rejected": -1808.0, "loss": 0.697, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.58203125, "rewards/margins": 0.01953125, "rewards/rejected": 0.5625, "step": 1040 }, { "epoch": 0.3003028991778451, "grad_norm": 10.156820907951008, "learning_rate": 4.413671576672745e-07, "logits/chosen": 3.296875, "logits/rejected": 3.3125, "logps/chosen": -1576.0, "logps/rejected": -1600.0, "loss": 0.6551, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.498046875, "rewards/margins": 0.0859375, "rewards/rejected": 0.412109375, "step": 1041 }, { "epoch": 0.3005913745853166, "grad_norm": 10.96774179539688, "learning_rate": 4.412050270580402e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1336.0, "logps/rejected": -1568.0, "loss": 0.6876, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.380859375, "rewards/margins": 0.021728515625, "rewards/rejected": 0.359375, "step": 1042 }, { "epoch": 0.3008798499927881, "grad_norm": 10.26203078171827, "learning_rate": 4.4104270246375397e-07, "logits/chosen": 3.265625, "logits/rejected": 3.1875, "logps/chosen": -1824.0, "logps/rejected": -1776.0, "loss": 0.6624, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.54296875, "rewards/margins": 0.06640625, "rewards/rejected": 0.4765625, "step": 1043 }, { "epoch": 0.3011683254002596, "grad_norm": 10.545231165689852, "learning_rate": 4.4088018404910043e-07, "logits/chosen": 3.125, "logits/rejected": 3.046875, "logps/chosen": -1816.0, "logps/rejected": -1816.0, "loss": 0.6857, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.423828125, "rewards/margins": -0.049072265625, "rewards/rejected": 0.474609375, "step": 1044 }, { "epoch": 0.30145680080773113, "grad_norm": 11.567764862834972, "learning_rate": 4.407174719789611e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1656.0, "logps/rejected": -1576.0, "loss": 0.6888, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.41796875, "rewards/margins": 0.023681640625, "rewards/rejected": 0.39453125, "step": 1045 }, { "epoch": 0.30174527621520264, "grad_norm": 12.007998303641164, "learning_rate": 4.405545664184136e-07, "logits/chosen": 3.15625, "logits/rejected": 3.09375, "logps/chosen": -1552.0, "logps/rejected": -1608.0, "loss": 0.6815, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.462890625, "rewards/margins": -0.01025390625, "rewards/rejected": 0.47265625, "step": 1046 }, { "epoch": 0.30203375162267415, "grad_norm": 9.960620502492322, "learning_rate": 4.403914675327322e-07, "logits/chosen": 3.234375, "logits/rejected": 3.203125, "logps/chosen": -1608.0, "logps/rejected": -1696.0, "loss": 0.6492, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.478515625, "rewards/margins": 0.15625, "rewards/rejected": 0.322265625, "step": 1047 }, { "epoch": 0.30232222703014566, "grad_norm": 10.199605205688002, "learning_rate": 4.402281754873871e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1496.0, "logps/rejected": -1440.0, "loss": 0.6937, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.41796875, "rewards/margins": 0.016357421875, "rewards/rejected": 0.40234375, "step": 1048 }, { "epoch": 0.3026107024376172, "grad_norm": 13.242753505198397, "learning_rate": 4.4006469044804454e-07, "logits/chosen": 3.09375, "logits/rejected": 3.109375, "logps/chosen": -1680.0, "logps/rejected": -1544.0, "loss": 0.6695, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.546875, "rewards/margins": 0.1806640625, "rewards/rejected": 0.3671875, "step": 1049 }, { "epoch": 0.3028991778450887, "grad_norm": 10.868260554685914, "learning_rate": 4.399010125805666e-07, "logits/chosen": 3.171875, "logits/rejected": 3.203125, "logps/chosen": -1936.0, "logps/rejected": -2008.0, "loss": 0.6995, "loss/demonstration_loss": -4000.0, "loss/preference_loss": -3984.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.51953125, "rewards/margins": 0.03759765625, "rewards/rejected": 0.48046875, "step": 1050 }, { "epoch": 0.3031876532525602, "grad_norm": 10.449313221078908, "learning_rate": 4.397371420510108e-07, "logits/chosen": 3.140625, "logits/rejected": 3.109375, "logps/chosen": -2008.0, "logps/rejected": -1792.0, "loss": 0.6232, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.57421875, "rewards/margins": 0.2275390625, "rewards/rejected": 0.345703125, "step": 1051 }, { "epoch": 0.3034761286600317, "grad_norm": 9.545849479115878, "learning_rate": 4.3957307902563043e-07, "logits/chosen": 3.390625, "logits/rejected": 3.375, "logps/chosen": -1808.0, "logps/rejected": -1696.0, "loss": 0.6603, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5546875, "rewards/margins": 0.08056640625, "rewards/rejected": 0.474609375, "step": 1052 }, { "epoch": 0.3037646040675032, "grad_norm": 10.071343070601001, "learning_rate": 4.394088236708738e-07, "logits/chosen": 3.09375, "logits/rejected": 3.09375, "logps/chosen": -1392.0, "logps/rejected": -1384.0, "loss": 0.6742, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.32421875, "rewards/margins": 0.04736328125, "rewards/rejected": 0.27734375, "step": 1053 }, { "epoch": 0.3040530794749748, "grad_norm": 10.793802524685072, "learning_rate": 4.392443761533846e-07, "logits/chosen": 3.078125, "logits/rejected": 3.03125, "logps/chosen": -1328.0, "logps/rejected": -1416.0, "loss": 0.6731, "loss/demonstration_loss": -2768.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.341796875, "rewards/margins": 0.09375, "rewards/rejected": 0.2470703125, "step": 1054 }, { "epoch": 0.3043415548824463, "grad_norm": 10.460205711825303, "learning_rate": 4.3907973664000113e-07, "logits/chosen": 3.390625, "logits/rejected": 3.375, "logps/chosen": -2080.0, "logps/rejected": -1968.0, "loss": 0.6514, "loss/demonstration_loss": -4080.0, "loss/preference_loss": -4080.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4609375, "rewards/margins": 0.0693359375, "rewards/rejected": 0.390625, "step": 1055 }, { "epoch": 0.3046300302899178, "grad_norm": 9.927944464628002, "learning_rate": 4.389149052977568e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -1664.0, "logps/rejected": -1728.0, "loss": 0.7085, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.408203125, "rewards/margins": 0.001983642578125, "rewards/rejected": 0.40625, "step": 1056 }, { "epoch": 0.3049185056973893, "grad_norm": 10.118494224451947, "learning_rate": 4.387498822938795e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1544.0, "logps/rejected": -1328.0, "loss": 0.6917, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.419921875, "rewards/margins": 0.0947265625, "rewards/rejected": 0.32421875, "step": 1057 }, { "epoch": 0.30520698110486083, "grad_norm": 11.446795630909975, "learning_rate": 4.385846677957916e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1560.0, "logps/rejected": -1496.0, "loss": 0.7209, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.369140625, "rewards/margins": -0.0419921875, "rewards/rejected": 0.41015625, "step": 1058 }, { "epoch": 0.30549545651233234, "grad_norm": 9.83134059349555, "learning_rate": 4.3841926197110967e-07, "logits/chosen": 3.125, "logits/rejected": 3.1875, "logps/chosen": -1248.0, "logps/rejected": -1496.0, "loss": 0.6356, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.416015625, "rewards/margins": 0.06884765625, "rewards/rejected": 0.345703125, "step": 1059 }, { "epoch": 0.30578393191980385, "grad_norm": 8.975430158885743, "learning_rate": 4.382536649876445e-07, "logits/chosen": 3.203125, "logits/rejected": 3.1875, "logps/chosen": -1904.0, "logps/rejected": -1888.0, "loss": 0.6712, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.40234375, "rewards/margins": 0.0751953125, "rewards/rejected": 0.328125, "step": 1060 }, { "epoch": 0.30607240732727536, "grad_norm": 9.725347570040716, "learning_rate": 4.3808787701340075e-07, "logits/chosen": 3.28125, "logits/rejected": 3.203125, "logps/chosen": -2016.0, "logps/rejected": -1896.0, "loss": 0.6513, "loss/demonstration_loss": -3968.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.52734375, "rewards/margins": 0.1298828125, "rewards/rejected": 0.3984375, "step": 1061 }, { "epoch": 0.3063608827347469, "grad_norm": 9.902936543435803, "learning_rate": 4.3792189821657695e-07, "logits/chosen": 3.203125, "logits/rejected": 3.125, "logps/chosen": -1856.0, "logps/rejected": -1696.0, "loss": 0.6813, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.462890625, "rewards/margins": 0.050048828125, "rewards/rejected": 0.4140625, "step": 1062 }, { "epoch": 0.3066493581422184, "grad_norm": 9.27244034248201, "learning_rate": 4.3775572876556504e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -1216.0, "logps/rejected": -1200.0, "loss": 0.6893, "loss/demonstration_loss": -2448.0, "loss/preference_loss": -2448.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.30078125, "rewards/margins": 0.02490234375, "rewards/rejected": 0.275390625, "step": 1063 }, { "epoch": 0.3069378335496899, "grad_norm": 11.271431667081364, "learning_rate": 4.3758936882895046e-07, "logits/chosen": 3.28125, "logits/rejected": 3.140625, "logps/chosen": -1648.0, "logps/rejected": -1832.0, "loss": 0.6499, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.51171875, "rewards/margins": 0.1796875, "rewards/rejected": 0.33203125, "step": 1064 }, { "epoch": 0.3072263089571614, "grad_norm": 9.5791873118906, "learning_rate": 4.374228185755121e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1936.0, "logps/rejected": -1656.0, "loss": 0.672, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.51953125, "rewards/margins": 0.06396484375, "rewards/rejected": 0.45703125, "step": 1065 }, { "epoch": 0.3075147843646329, "grad_norm": 12.241701521904064, "learning_rate": 4.372560781742216e-07, "logits/chosen": 3.03125, "logits/rejected": 3.09375, "logps/chosen": -1528.0, "logps/rejected": -1488.0, "loss": 0.6649, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.365234375, "rewards/margins": 0.171875, "rewards/rejected": 0.1943359375, "step": 1066 }, { "epoch": 0.30780325977210443, "grad_norm": 10.937796100133534, "learning_rate": 4.370891477942439e-07, "logits/chosen": 3.125, "logits/rejected": 3.125, "logps/chosen": -1632.0, "logps/rejected": -1384.0, "loss": 0.673, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.51953125, "rewards/margins": 0.091796875, "rewards/rejected": 0.427734375, "step": 1067 }, { "epoch": 0.30809173517957594, "grad_norm": 11.827708482688212, "learning_rate": 4.369220276049362e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -2048.0, "logps/rejected": -1944.0, "loss": 0.6866, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.6484375, "rewards/margins": 0.0147705078125, "rewards/rejected": 0.63671875, "step": 1068 }, { "epoch": 0.30838021058704745, "grad_norm": 10.209626008337633, "learning_rate": 4.3675471777584867e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1696.0, "logps/rejected": -1704.0, "loss": 0.6394, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.451171875, "rewards/margins": 0.05908203125, "rewards/rejected": 0.392578125, "step": 1069 }, { "epoch": 0.30866868599451897, "grad_norm": 10.813931493175227, "learning_rate": 4.3658721847672374e-07, "logits/chosen": 3.34375, "logits/rejected": 3.3125, "logps/chosen": -1352.0, "logps/rejected": -1320.0, "loss": 0.6512, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.466796875, "rewards/margins": 0.1923828125, "rewards/rejected": 0.275390625, "step": 1070 }, { "epoch": 0.3089571614019905, "grad_norm": 12.781476936616123, "learning_rate": 4.3641952987749604e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1616.0, "logps/rejected": -1440.0, "loss": 0.7042, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.427734375, "rewards/margins": 0.04345703125, "rewards/rejected": 0.384765625, "step": 1071 }, { "epoch": 0.309245636809462, "grad_norm": 10.480854534217741, "learning_rate": 4.362516521482923e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -1472.0, "logps/rejected": -1456.0, "loss": 0.6577, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.45703125, "rewards/margins": 0.16015625, "rewards/rejected": 0.296875, "step": 1072 }, { "epoch": 0.3095341122169335, "grad_norm": 9.74032546417054, "learning_rate": 4.3608358545943105e-07, "logits/chosen": 3.203125, "logits/rejected": 3.265625, "logps/chosen": -2160.0, "logps/rejected": -1656.0, "loss": 0.637, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.76171875, "rewards/margins": 0.263671875, "rewards/rejected": 0.498046875, "step": 1073 }, { "epoch": 0.309822587624405, "grad_norm": 11.741331183220632, "learning_rate": 4.3591532998142266e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1856.0, "logps/rejected": -1696.0, "loss": 0.6885, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4921875, "rewards/margins": 0.09033203125, "rewards/rejected": 0.40234375, "step": 1074 }, { "epoch": 0.3101110630318765, "grad_norm": 9.98462904954572, "learning_rate": 4.3574688588496896e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1024.0, "logps/rejected": -1168.0, "loss": 0.6736, "loss/demonstration_loss": -2224.0, "loss/preference_loss": -2224.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.306640625, "rewards/margins": 0.041259765625, "rewards/rejected": 0.265625, "step": 1075 }, { "epoch": 0.31039953843934803, "grad_norm": 10.81813828702515, "learning_rate": 4.3557825334096306e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1616.0, "logps/rejected": -1616.0, "loss": 0.6821, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.36328125, "rewards/margins": 0.0262451171875, "rewards/rejected": 0.337890625, "step": 1076 }, { "epoch": 0.31068801384681954, "grad_norm": 10.156942234471277, "learning_rate": 4.354094325204894e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1336.0, "logps/rejected": -1248.0, "loss": 0.6641, "loss/demonstration_loss": -2624.0, "loss/preference_loss": -2608.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.310546875, "rewards/margins": 0.06103515625, "rewards/rejected": 0.25, "step": 1077 }, { "epoch": 0.31097648925429106, "grad_norm": 12.540158102646249, "learning_rate": 4.352404235948233e-07, "logits/chosen": 3.453125, "logits/rejected": 3.328125, "logps/chosen": -1584.0, "logps/rejected": -1720.0, "loss": 0.7334, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.482421875, "rewards/margins": -0.0732421875, "rewards/rejected": 0.5546875, "step": 1078 }, { "epoch": 0.31126496466176257, "grad_norm": 9.958180223700081, "learning_rate": 4.350712267354311e-07, "logits/chosen": 3.34375, "logits/rejected": 3.359375, "logps/chosen": -1392.0, "logps/rejected": -1280.0, "loss": 0.661, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.470703125, "rewards/margins": 0.1396484375, "rewards/rejected": 0.33203125, "step": 1079 }, { "epoch": 0.3115534400692341, "grad_norm": 11.095589732013739, "learning_rate": 4.3490184211396963e-07, "logits/chosen": 3.203125, "logits/rejected": 3.1875, "logps/chosen": -1640.0, "logps/rejected": -1872.0, "loss": 0.6682, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.48046875, "rewards/margins": 0.036376953125, "rewards/rejected": 0.443359375, "step": 1080 }, { "epoch": 0.3118419154767056, "grad_norm": 10.62922833344852, "learning_rate": 4.347322699022863e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -2000.0, "logps/rejected": -1712.0, "loss": 0.655, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.62109375, "rewards/margins": 0.12109375, "rewards/rejected": 0.5, "step": 1081 }, { "epoch": 0.3121303908841771, "grad_norm": 10.809439703632, "learning_rate": 4.3456251027241876e-07, "logits/chosen": 3.328125, "logits/rejected": 3.421875, "logps/chosen": -1640.0, "logps/rejected": -1480.0, "loss": 0.673, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.412109375, "rewards/margins": 0.0869140625, "rewards/rejected": 0.32421875, "step": 1082 }, { "epoch": 0.3124188662916486, "grad_norm": 11.558772145283713, "learning_rate": 4.343925633965949e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1520.0, "logps/rejected": -1712.0, "loss": 0.7331, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.447265625, "rewards/margins": -0.09716796875, "rewards/rejected": 0.54296875, "step": 1083 }, { "epoch": 0.3127073416991201, "grad_norm": 9.600416604496514, "learning_rate": 4.342224294472326e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -2040.0, "logps/rejected": -1784.0, "loss": 0.653, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.796875, "rewards/margins": 0.271484375, "rewards/rejected": 0.5234375, "step": 1084 }, { "epoch": 0.3129958171065917, "grad_norm": 11.641016808151411, "learning_rate": 4.3405210859693935e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1480.0, "logps/rejected": -1504.0, "loss": 0.6728, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3046875, "rewards/margins": 0.04248046875, "rewards/rejected": 0.26171875, "step": 1085 }, { "epoch": 0.3132842925140632, "grad_norm": 11.879649323097299, "learning_rate": 4.3388160101851244e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1928.0, "logps/rejected": -2128.0, "loss": 0.7119, "loss/demonstration_loss": -4096.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.5234375, "rewards/margins": -0.06103515625, "rewards/rejected": 0.5859375, "step": 1086 }, { "epoch": 0.3135727679215347, "grad_norm": 9.323380293363014, "learning_rate": 4.337109068849386e-07, "logits/chosen": 3.25, "logits/rejected": 3.3125, "logps/chosen": -1824.0, "logps/rejected": -1776.0, "loss": 0.6538, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.490234375, "rewards/margins": 0.0546875, "rewards/rejected": 0.435546875, "step": 1087 }, { "epoch": 0.3138612433290062, "grad_norm": 11.202696120029435, "learning_rate": 4.335400263693937e-07, "logits/chosen": 3.28125, "logits/rejected": 3.28125, "logps/chosen": -1576.0, "logps/rejected": -1472.0, "loss": 0.6697, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.51953125, "rewards/margins": 0.150390625, "rewards/rejected": 0.369140625, "step": 1088 }, { "epoch": 0.31414971873647773, "grad_norm": 9.943585237587477, "learning_rate": 4.3336895964524276e-07, "logits/chosen": 3.34375, "logits/rejected": 3.296875, "logps/chosen": -1832.0, "logps/rejected": -1864.0, "loss": 0.6633, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.6171875, "rewards/margins": -0.0380859375, "rewards/rejected": 0.65625, "step": 1089 }, { "epoch": 0.31443819414394925, "grad_norm": 10.660608823952572, "learning_rate": 4.3319770688603975e-07, "logits/chosen": 3.28125, "logits/rejected": 3.3125, "logps/chosen": -1584.0, "logps/rejected": -1728.0, "loss": 0.7169, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.45703125, "rewards/margins": -0.03515625, "rewards/rejected": 0.4921875, "step": 1090 }, { "epoch": 0.31472666955142076, "grad_norm": 9.962700899735973, "learning_rate": 4.3302626826552733e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1600.0, "logps/rejected": -1712.0, "loss": 0.6934, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.455078125, "rewards/margins": -0.0269775390625, "rewards/rejected": 0.482421875, "step": 1091 }, { "epoch": 0.31501514495889227, "grad_norm": 10.959556846793634, "learning_rate": 4.3285464395763694e-07, "logits/chosen": 3.1875, "logits/rejected": 3.15625, "logps/chosen": -1824.0, "logps/rejected": -1816.0, "loss": 0.6589, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.6328125, "rewards/margins": 0.11572265625, "rewards/rejected": 0.51953125, "step": 1092 }, { "epoch": 0.3153036203663638, "grad_norm": 10.07789154443979, "learning_rate": 4.3268283413648786e-07, "logits/chosen": 3.125, "logits/rejected": 3.078125, "logps/chosen": -1576.0, "logps/rejected": -1384.0, "loss": 0.6336, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.462890625, "rewards/margins": 0.1923828125, "rewards/rejected": 0.271484375, "step": 1093 }, { "epoch": 0.3155920957738353, "grad_norm": 11.767494127444492, "learning_rate": 4.325108389763883e-07, "logits/chosen": 3.203125, "logits/rejected": 3.140625, "logps/chosen": -1696.0, "logps/rejected": -1936.0, "loss": 0.7125, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.40234375, "rewards/margins": 0.03955078125, "rewards/rejected": 0.36328125, "step": 1094 }, { "epoch": 0.3158805711813068, "grad_norm": 12.36861140594807, "learning_rate": 4.3233865865183396e-07, "logits/chosen": 3.0, "logits/rejected": 3.09375, "logps/chosen": -1608.0, "logps/rejected": -1424.0, "loss": 0.6807, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.3359375, "rewards/margins": -0.0146484375, "rewards/rejected": 0.3515625, "step": 1095 }, { "epoch": 0.3161690465887783, "grad_norm": 10.625365820116428, "learning_rate": 4.321662933375085e-07, "logits/chosen": 3.09375, "logits/rejected": 3.15625, "logps/chosen": -2320.0, "logps/rejected": -2208.0, "loss": 0.6663, "loss/demonstration_loss": -4608.0, "loss/preference_loss": -4608.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.8515625, "rewards/margins": 0.047119140625, "rewards/rejected": 0.8046875, "step": 1096 }, { "epoch": 0.3164575219962498, "grad_norm": 10.689177670928903, "learning_rate": 4.3199374320828357e-07, "logits/chosen": 3.28125, "logits/rejected": 3.296875, "logps/chosen": -1872.0, "logps/rejected": -1736.0, "loss": 0.6848, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.484375, "rewards/margins": 0.08251953125, "rewards/rejected": 0.400390625, "step": 1097 }, { "epoch": 0.31674599740372134, "grad_norm": 11.486609689176712, "learning_rate": 4.3182100843921794e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1824.0, "logps/rejected": -1984.0, "loss": 0.6559, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.57421875, "rewards/margins": 0.09716796875, "rewards/rejected": 0.4765625, "step": 1098 }, { "epoch": 0.31703447281119285, "grad_norm": 10.913265392848576, "learning_rate": 4.3164808920555783e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1808.0, "logps/rejected": -1672.0, "loss": 0.694, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.470703125, "rewards/margins": 0.0302734375, "rewards/rejected": 0.439453125, "step": 1099 }, { "epoch": 0.31732294821866436, "grad_norm": 11.751779066090771, "learning_rate": 4.3147498568273674e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1840.0, "logps/rejected": -1632.0, "loss": 0.6732, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4296875, "rewards/margins": 0.09765625, "rewards/rejected": 0.33203125, "step": 1100 }, { "epoch": 0.31761142362613587, "grad_norm": 11.018008193645517, "learning_rate": 4.3130169804637497e-07, "logits/chosen": 3.140625, "logits/rejected": 3.265625, "logps/chosen": -2000.0, "logps/rejected": -1912.0, "loss": 0.6755, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.453125, "rewards/margins": -0.010498046875, "rewards/rejected": 0.462890625, "step": 1101 }, { "epoch": 0.3178998990336074, "grad_norm": 11.12508340848376, "learning_rate": 4.311282264722796e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1216.0, "logps/rejected": -1024.0, "loss": 0.6775, "loss/demonstration_loss": -2288.0, "loss/preference_loss": -2272.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3671875, "rewards/margins": 0.07470703125, "rewards/rejected": 0.291015625, "step": 1102 }, { "epoch": 0.3181883744410789, "grad_norm": 10.817337869101193, "learning_rate": 4.3095457113644456e-07, "logits/chosen": 3.203125, "logits/rejected": 3.15625, "logps/chosen": -1376.0, "logps/rejected": -1592.0, "loss": 0.7084, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.443359375, "rewards/margins": -0.0625, "rewards/rejected": 0.5078125, "step": 1103 }, { "epoch": 0.3184768498485504, "grad_norm": 11.285619032441735, "learning_rate": 4.3078073221504997e-07, "logits/chosen": 3.015625, "logits/rejected": 3.0625, "logps/chosen": -1640.0, "logps/rejected": -1576.0, "loss": 0.6654, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.51953125, "rewards/margins": 0.028076171875, "rewards/rejected": 0.4921875, "step": 1104 }, { "epoch": 0.3187653252560219, "grad_norm": 9.543128841084139, "learning_rate": 4.3060670988446226e-07, "logits/chosen": 3.140625, "logits/rejected": 3.203125, "logps/chosen": -1080.0, "logps/rejected": -892.0, "loss": 0.6893, "loss/demonstration_loss": -1992.0, "loss/preference_loss": -1976.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2197265625, "rewards/margins": 0.12890625, "rewards/rejected": 0.0908203125, "step": 1105 }, { "epoch": 0.3190538006634934, "grad_norm": 10.96143869200322, "learning_rate": 4.304325043212339e-07, "logits/chosen": 3.3125, "logits/rejected": 3.28125, "logps/chosen": -1720.0, "logps/rejected": -1832.0, "loss": 0.6803, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5, "rewards/margins": 0.0133056640625, "rewards/rejected": 0.48828125, "step": 1106 }, { "epoch": 0.31934227607096494, "grad_norm": 10.958561735749933, "learning_rate": 4.302581157021034e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1664.0, "logps/rejected": -1680.0, "loss": 0.6748, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.466796875, "rewards/margins": 0.029541015625, "rewards/rejected": 0.4375, "step": 1107 }, { "epoch": 0.31963075147843645, "grad_norm": 9.249602364134974, "learning_rate": 4.300835442039949e-07, "logits/chosen": 3.234375, "logits/rejected": 3.203125, "logps/chosen": -1400.0, "logps/rejected": -1344.0, "loss": 0.6719, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.46484375, "rewards/margins": 0.06640625, "rewards/rejected": 0.3984375, "step": 1108 }, { "epoch": 0.31991922688590796, "grad_norm": 10.054629788603743, "learning_rate": 4.299087900040181e-07, "logits/chosen": 3.375, "logits/rejected": 3.34375, "logps/chosen": -1816.0, "logps/rejected": -1664.0, "loss": 0.6697, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.53515625, "rewards/margins": 0.05712890625, "rewards/rejected": 0.478515625, "step": 1109 }, { "epoch": 0.32020770229337947, "grad_norm": 9.498590751331028, "learning_rate": 4.2973385327946796e-07, "logits/chosen": 3.25, "logits/rejected": 3.1875, "logps/chosen": -1624.0, "logps/rejected": -1544.0, "loss": 0.6692, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4375, "rewards/margins": 0.055419921875, "rewards/rejected": 0.3828125, "step": 1110 }, { "epoch": 0.320496177700851, "grad_norm": 9.865754437646652, "learning_rate": 4.295587342078247e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1752.0, "logps/rejected": -1736.0, "loss": 0.6766, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.55859375, "rewards/margins": -0.020263671875, "rewards/rejected": 0.578125, "step": 1111 }, { "epoch": 0.3207846531083225, "grad_norm": 9.605724895544492, "learning_rate": 4.2938343296675365e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1608.0, "logps/rejected": -1520.0, "loss": 0.6618, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5703125, "rewards/margins": 0.10986328125, "rewards/rejected": 0.462890625, "step": 1112 }, { "epoch": 0.321073128515794, "grad_norm": 11.046802334784493, "learning_rate": 4.2920794973410476e-07, "logits/chosen": 3.171875, "logits/rejected": 3.25, "logps/chosen": -1752.0, "logps/rejected": -1400.0, "loss": 0.6615, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.376953125, "rewards/margins": 0.17578125, "rewards/rejected": 0.2021484375, "step": 1113 }, { "epoch": 0.3213616039232655, "grad_norm": 11.953063907399423, "learning_rate": 4.290322846879126e-07, "logits/chosen": 3.34375, "logits/rejected": 3.203125, "logps/chosen": -1344.0, "logps/rejected": -1496.0, "loss": 0.6938, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.40234375, "rewards/margins": -0.006134033203125, "rewards/rejected": 0.408203125, "step": 1114 }, { "epoch": 0.3216500793307371, "grad_norm": 13.186866721082433, "learning_rate": 4.2885643800639657e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1936.0, "logps/rejected": -1568.0, "loss": 0.6515, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.6015625, "rewards/margins": 0.15234375, "rewards/rejected": 0.447265625, "step": 1115 }, { "epoch": 0.3219385547382086, "grad_norm": 9.39494627570994, "learning_rate": 4.2868040986795985e-07, "logits/chosen": 3.328125, "logits/rejected": 3.328125, "logps/chosen": -1864.0, "logps/rejected": -1824.0, "loss": 0.6553, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5546875, "rewards/margins": 0.060791015625, "rewards/rejected": 0.494140625, "step": 1116 }, { "epoch": 0.3222270301456801, "grad_norm": 9.498534393571315, "learning_rate": 4.2850420045118993e-07, "logits/chosen": 3.3125, "logits/rejected": 3.359375, "logps/chosen": -2008.0, "logps/rejected": -2080.0, "loss": 0.6542, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.53125, "rewards/margins": -0.00372314453125, "rewards/rejected": 0.53515625, "step": 1117 }, { "epoch": 0.3225155055531516, "grad_norm": 11.915042788541308, "learning_rate": 4.283278099348584e-07, "logits/chosen": 3.28125, "logits/rejected": 3.203125, "logps/chosen": -1576.0, "logps/rejected": -1720.0, "loss": 0.6633, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.50390625, "rewards/margins": 0.049072265625, "rewards/rejected": 0.45703125, "step": 1118 }, { "epoch": 0.3228039809606231, "grad_norm": 10.987924428308617, "learning_rate": 4.2815123849792024e-07, "logits/chosen": 3.296875, "logits/rejected": 3.265625, "logps/chosen": -1552.0, "logps/rejected": -1360.0, "loss": 0.7063, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.453125, "rewards/margins": 0.0164794921875, "rewards/rejected": 0.4375, "step": 1119 }, { "epoch": 0.32309245636809464, "grad_norm": 10.160955119843878, "learning_rate": 4.279744863195142e-07, "logits/chosen": 3.328125, "logits/rejected": 3.34375, "logps/chosen": -1632.0, "logps/rejected": -1680.0, "loss": 0.6783, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4140625, "rewards/margins": -0.0147705078125, "rewards/rejected": 0.427734375, "step": 1120 }, { "epoch": 0.32338093177556615, "grad_norm": 11.225946867998706, "learning_rate": 4.277975535789623e-07, "logits/chosen": 3.1875, "logits/rejected": 3.203125, "logps/chosen": -1488.0, "logps/rejected": -1200.0, "loss": 0.6802, "loss/demonstration_loss": -2752.0, "loss/preference_loss": -2720.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.5, "rewards/margins": 0.11181640625, "rewards/rejected": 0.388671875, "step": 1121 }, { "epoch": 0.32366940718303766, "grad_norm": 9.448584759046966, "learning_rate": 4.276204404557698e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1480.0, "logps/rejected": -1488.0, "loss": 0.6595, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.390625, "rewards/margins": 0.02197265625, "rewards/rejected": 0.369140625, "step": 1122 }, { "epoch": 0.32395788259050917, "grad_norm": 10.568033714034645, "learning_rate": 4.2744314712962516e-07, "logits/chosen": 3.171875, "logits/rejected": 3.21875, "logps/chosen": -1688.0, "logps/rejected": -1704.0, "loss": 0.652, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.546875, "rewards/margins": 0.06396484375, "rewards/rejected": 0.482421875, "step": 1123 }, { "epoch": 0.3242463579979807, "grad_norm": 11.01463413733453, "learning_rate": 4.2726567378039926e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1528.0, "logps/rejected": -1392.0, "loss": 0.6906, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5625, "rewards/margins": 0.0130615234375, "rewards/rejected": 0.55078125, "step": 1124 }, { "epoch": 0.3245348334054522, "grad_norm": 10.402394081822598, "learning_rate": 4.2708802058814586e-07, "logits/chosen": 3.109375, "logits/rejected": 3.140625, "logps/chosen": -1656.0, "logps/rejected": -1608.0, "loss": 0.6529, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.55859375, "rewards/margins": 0.09814453125, "rewards/rejected": 0.4609375, "step": 1125 }, { "epoch": 0.3248233088129237, "grad_norm": 11.03117060755873, "learning_rate": 4.269101877331011e-07, "logits/chosen": 3.109375, "logits/rejected": 3.125, "logps/chosen": -1792.0, "logps/rejected": -1880.0, "loss": 0.7263, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4375, "rewards/margins": -0.0103759765625, "rewards/rejected": 0.447265625, "step": 1126 }, { "epoch": 0.3251117842203952, "grad_norm": 10.644485543976, "learning_rate": 4.267321753956835e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -2352.0, "logps/rejected": -2288.0, "loss": 0.6652, "loss/demonstration_loss": -4704.0, "loss/preference_loss": -4704.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.76171875, "rewards/margins": 0.080078125, "rewards/rejected": 0.6796875, "step": 1127 }, { "epoch": 0.32540025962786673, "grad_norm": 10.39211679108361, "learning_rate": 4.265539837564936e-07, "logits/chosen": 3.25, "logits/rejected": 3.15625, "logps/chosen": -1504.0, "logps/rejected": -1544.0, "loss": 0.6548, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.44921875, "rewards/margins": 0.0537109375, "rewards/rejected": 0.39453125, "step": 1128 }, { "epoch": 0.32568873503533824, "grad_norm": 11.48942717572012, "learning_rate": 4.263756129963138e-07, "logits/chosen": 3.015625, "logits/rejected": 3.109375, "logps/chosen": -1728.0, "logps/rejected": -1608.0, "loss": 0.68, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.60546875, "rewards/margins": 0.11328125, "rewards/rejected": 0.4921875, "step": 1129 }, { "epoch": 0.32597721044280975, "grad_norm": 10.052919449483925, "learning_rate": 4.261970632961084e-07, "logits/chosen": 3.3125, "logits/rejected": 3.296875, "logps/chosen": -1360.0, "logps/rejected": -1408.0, "loss": 0.6945, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.41015625, "rewards/margins": -0.08056640625, "rewards/rejected": 0.4921875, "step": 1130 }, { "epoch": 0.32626568585028126, "grad_norm": 9.4922394428165, "learning_rate": 4.2601833483702297e-07, "logits/chosen": 3.265625, "logits/rejected": 3.25, "logps/chosen": -1544.0, "logps/rejected": -1584.0, "loss": 0.6729, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.455078125, "rewards/margins": 0.0693359375, "rewards/rejected": 0.384765625, "step": 1131 }, { "epoch": 0.3265541612577528, "grad_norm": 11.038417929831315, "learning_rate": 4.258394278003847e-07, "logits/chosen": 3.296875, "logits/rejected": 3.265625, "logps/chosen": -2024.0, "logps/rejected": -1888.0, "loss": 0.6564, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.515625, "rewards/margins": 0.10302734375, "rewards/rejected": 0.4140625, "step": 1132 }, { "epoch": 0.3268426366652243, "grad_norm": 9.891690587284414, "learning_rate": 4.2566034236770186e-07, "logits/chosen": 3.1875, "logits/rejected": 3.125, "logps/chosen": -2224.0, "logps/rejected": -1992.0, "loss": 0.633, "loss/demonstration_loss": -4288.0, "loss/preference_loss": -4288.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.6328125, "rewards/margins": 0.1357421875, "rewards/rejected": 0.498046875, "step": 1133 }, { "epoch": 0.3271311120726958, "grad_norm": 9.223117953425657, "learning_rate": 4.2548107872066364e-07, "logits/chosen": 3.34375, "logits/rejected": 3.3125, "logps/chosen": -1672.0, "logps/rejected": -1656.0, "loss": 0.6504, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.48046875, "rewards/margins": 0.12353515625, "rewards/rejected": 0.357421875, "step": 1134 }, { "epoch": 0.3274195874801673, "grad_norm": 11.155238346951988, "learning_rate": 4.2530163704114006e-07, "logits/chosen": 3.125, "logits/rejected": 3.125, "logps/chosen": -1544.0, "logps/rejected": -1616.0, "loss": 0.6556, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.392578125, "rewards/margins": 0.041748046875, "rewards/rejected": 0.3515625, "step": 1135 }, { "epoch": 0.3277080628876388, "grad_norm": 9.190078299527917, "learning_rate": 4.2512201751118194e-07, "logits/chosen": 3.203125, "logits/rejected": 3.234375, "logps/chosen": -1432.0, "logps/rejected": -1048.0, "loss": 0.6297, "loss/demonstration_loss": -2512.0, "loss/preference_loss": -2496.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.33984375, "rewards/margins": 0.1630859375, "rewards/rejected": 0.17578125, "step": 1136 }, { "epoch": 0.32799653829511033, "grad_norm": 11.349245867175151, "learning_rate": 4.249422203130201e-07, "logits/chosen": 3.28125, "logits/rejected": 3.3125, "logps/chosen": -1752.0, "logps/rejected": -1640.0, "loss": 0.6863, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4140625, "rewards/margins": 0.00201416015625, "rewards/rejected": 0.412109375, "step": 1137 }, { "epoch": 0.32828501370258184, "grad_norm": 9.911915662201984, "learning_rate": 4.2476224562906616e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1664.0, "logps/rejected": -1480.0, "loss": 0.6906, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.396484375, "rewards/margins": 0.06591796875, "rewards/rejected": 0.33203125, "step": 1138 }, { "epoch": 0.32857348911005335, "grad_norm": 10.13273038028613, "learning_rate": 4.245820936419115e-07, "logits/chosen": 3.09375, "logits/rejected": 3.078125, "logps/chosen": -1648.0, "logps/rejected": -1488.0, "loss": 0.6768, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.376953125, "rewards/margins": 0.0576171875, "rewards/rejected": 0.318359375, "step": 1139 }, { "epoch": 0.32886196451752486, "grad_norm": 11.821421491096762, "learning_rate": 4.2440176453432734e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1856.0, "logps/rejected": -1648.0, "loss": 0.6696, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.484375, "rewards/margins": 0.1171875, "rewards/rejected": 0.3671875, "step": 1140 }, { "epoch": 0.3291504399249964, "grad_norm": 11.201923070900376, "learning_rate": 4.2422125848926485e-07, "logits/chosen": 3.0, "logits/rejected": 2.75, "logps/chosen": -1384.0, "logps/rejected": -1528.0, "loss": 0.6661, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.326171875, "rewards/margins": 0.12890625, "rewards/rejected": 0.1982421875, "step": 1141 }, { "epoch": 0.3294389153324679, "grad_norm": 12.043609996358251, "learning_rate": 4.240405756898543e-07, "logits/chosen": 3.125, "logits/rejected": 3.1875, "logps/chosen": -1856.0, "logps/rejected": -1688.0, "loss": 0.6896, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.419921875, "rewards/margins": 0.044189453125, "rewards/rejected": 0.376953125, "step": 1142 }, { "epoch": 0.3297273907399394, "grad_norm": 10.50533730843436, "learning_rate": 4.2385971631940566e-07, "logits/chosen": 3.3125, "logits/rejected": 3.296875, "logps/chosen": -1312.0, "logps/rejected": -1272.0, "loss": 0.682, "loss/demonstration_loss": -2608.0, "loss/preference_loss": -2608.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.318359375, "rewards/margins": 0.09619140625, "rewards/rejected": 0.2216796875, "step": 1143 }, { "epoch": 0.3300158661474109, "grad_norm": 11.808515536777824, "learning_rate": 4.236786805614079e-07, "logits/chosen": 3.171875, "logits/rejected": 3.265625, "logps/chosen": -1600.0, "logps/rejected": -1432.0, "loss": 0.7482, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.34375, "rewards/margins": -0.0654296875, "rewards/rejected": 0.408203125, "step": 1144 }, { "epoch": 0.3303043415548824, "grad_norm": 12.242090967844083, "learning_rate": 4.2349746859952894e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -1352.0, "logps/rejected": -1320.0, "loss": 0.7022, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5, "rewards/margins": 0.12451171875, "rewards/rejected": 0.375, "step": 1145 }, { "epoch": 0.330592816962354, "grad_norm": 10.853033861983233, "learning_rate": 4.233160806176155e-07, "logits/chosen": 3.125, "logits/rejected": 3.1875, "logps/chosen": -1912.0, "logps/rejected": -1584.0, "loss": 0.6714, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.453125, "rewards/margins": -0.00799560546875, "rewards/rejected": 0.4609375, "step": 1146 }, { "epoch": 0.3308812923698255, "grad_norm": 9.736436329938586, "learning_rate": 4.2313451679969283e-07, "logits/chosen": 3.28125, "logits/rejected": 3.3125, "logps/chosen": -1880.0, "logps/rejected": -1808.0, "loss": 0.691, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55078125, "rewards/margins": 0.07275390625, "rewards/rejected": 0.48046875, "step": 1147 }, { "epoch": 0.331169767777297, "grad_norm": 10.167725352894609, "learning_rate": 4.229527773299645e-07, "logits/chosen": 3.21875, "logits/rejected": 3.15625, "logps/chosen": -1616.0, "logps/rejected": -1592.0, "loss": 0.6571, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.50390625, "rewards/margins": 0.1318359375, "rewards/rejected": 0.373046875, "step": 1148 }, { "epoch": 0.3314582431847685, "grad_norm": 11.660646932229469, "learning_rate": 4.2277086239281256e-07, "logits/chosen": 3.328125, "logits/rejected": 3.265625, "logps/chosen": -1952.0, "logps/rejected": -1704.0, "loss": 0.7161, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5625, "rewards/margins": 0.0966796875, "rewards/rejected": 0.466796875, "step": 1149 }, { "epoch": 0.33174671859224003, "grad_norm": 13.041432753936327, "learning_rate": 4.225887721727968e-07, "logits/chosen": 3.15625, "logits/rejected": 3.109375, "logps/chosen": -1856.0, "logps/rejected": -1920.0, "loss": 0.6829, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5234375, "rewards/margins": 0.1171875, "rewards/rejected": 0.408203125, "step": 1150 }, { "epoch": 0.33203519399971154, "grad_norm": 9.965438308467057, "learning_rate": 4.2240650685465493e-07, "logits/chosen": 2.984375, "logits/rejected": 2.96875, "logps/chosen": -1264.0, "logps/rejected": -1312.0, "loss": 0.6862, "loss/demonstration_loss": -2624.0, "loss/preference_loss": -2608.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.447265625, "rewards/margins": 0.07421875, "rewards/rejected": 0.373046875, "step": 1151 }, { "epoch": 0.33232366940718305, "grad_norm": 8.49525303603664, "learning_rate": 4.2222406662330233e-07, "logits/chosen": 3.046875, "logits/rejected": 3.09375, "logps/chosen": -1784.0, "logps/rejected": -1456.0, "loss": 0.609, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.578125, "rewards/margins": 0.2353515625, "rewards/rejected": 0.341796875, "step": 1152 }, { "epoch": 0.33261214481465456, "grad_norm": 10.693450766992868, "learning_rate": 4.2204145166383185e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1560.0, "logps/rejected": -1552.0, "loss": 0.7242, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.41015625, "rewards/margins": -0.039306640625, "rewards/rejected": 0.44921875, "step": 1153 }, { "epoch": 0.3329006202221261, "grad_norm": 10.248540609208248, "learning_rate": 4.218586621615136e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -1632.0, "logps/rejected": -1616.0, "loss": 0.6858, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5625, "rewards/margins": 0.016357421875, "rewards/rejected": 0.546875, "step": 1154 }, { "epoch": 0.3331890956295976, "grad_norm": 13.241247193449121, "learning_rate": 4.216756983017946e-07, "logits/chosen": 3.265625, "logits/rejected": 3.21875, "logps/chosen": -1488.0, "logps/rejected": -1528.0, "loss": 0.649, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.462890625, "rewards/margins": 0.16015625, "rewards/rejected": 0.3046875, "step": 1155 }, { "epoch": 0.3334775710370691, "grad_norm": 11.303538123352176, "learning_rate": 4.2149256027029914e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1488.0, "logps/rejected": -1488.0, "loss": 0.6941, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4375, "rewards/margins": 0.01019287109375, "rewards/rejected": 0.427734375, "step": 1156 }, { "epoch": 0.3337660464445406, "grad_norm": 11.556663034465197, "learning_rate": 4.2130924825282777e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1656.0, "logps/rejected": -1584.0, "loss": 0.6186, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.27734375, "rewards/margins": 0.1708984375, "rewards/rejected": 0.10546875, "step": 1157 }, { "epoch": 0.3340545218520121, "grad_norm": 10.527528895772924, "learning_rate": 4.211257624353579e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1840.0, "logps/rejected": -1656.0, "loss": 0.6434, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.52734375, "rewards/margins": 0.08203125, "rewards/rejected": 0.4453125, "step": 1158 }, { "epoch": 0.33434299725948363, "grad_norm": 9.83585989965221, "learning_rate": 4.2094210300404306e-07, "logits/chosen": 3.171875, "logits/rejected": 3.1875, "logps/chosen": -1976.0, "logps/rejected": -2032.0, "loss": 0.6603, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4048.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.56640625, "rewards/margins": 0.11328125, "rewards/rejected": 0.451171875, "step": 1159 }, { "epoch": 0.33463147266695514, "grad_norm": 11.189821775386244, "learning_rate": 4.2075827014521304e-07, "logits/chosen": 3.265625, "logits/rejected": 3.21875, "logps/chosen": -1400.0, "logps/rejected": -1472.0, "loss": 0.7112, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.33203125, "rewards/margins": 0.0142822265625, "rewards/rejected": 0.318359375, "step": 1160 }, { "epoch": 0.33491994807442665, "grad_norm": 9.65783390216607, "learning_rate": 4.2057426404537357e-07, "logits/chosen": 3.265625, "logits/rejected": 3.265625, "logps/chosen": -1544.0, "logps/rejected": -1768.0, "loss": 0.6645, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.546875, "rewards/margins": 0.0262451171875, "rewards/rejected": 0.51953125, "step": 1161 }, { "epoch": 0.33520842348189817, "grad_norm": 10.798214811995859, "learning_rate": 4.2039008489120604e-07, "logits/chosen": 3.1875, "logits/rejected": 3.15625, "logps/chosen": -1656.0, "logps/rejected": -1640.0, "loss": 0.6792, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5234375, "rewards/margins": 0.0712890625, "rewards/rejected": 0.453125, "step": 1162 }, { "epoch": 0.3354968988893697, "grad_norm": 13.42015298250084, "learning_rate": 4.202057328695675e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1624.0, "logps/rejected": -1480.0, "loss": 0.6995, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.421875, "rewards/margins": -0.01116943359375, "rewards/rejected": 0.43359375, "step": 1163 }, { "epoch": 0.3357853742968412, "grad_norm": 11.301053398264328, "learning_rate": 4.200212081674904e-07, "logits/chosen": 3.125, "logits/rejected": 3.09375, "logps/chosen": -1408.0, "logps/rejected": -1440.0, "loss": 0.707, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.421875, "rewards/margins": -0.0159912109375, "rewards/rejected": 0.4375, "step": 1164 }, { "epoch": 0.3360738497043127, "grad_norm": 11.201672563859537, "learning_rate": 4.198365109721823e-07, "logits/chosen": 3.15625, "logits/rejected": 3.125, "logps/chosen": -1560.0, "logps/rejected": -1456.0, "loss": 0.6524, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.52734375, "rewards/margins": 0.10986328125, "rewards/rejected": 0.41796875, "step": 1165 }, { "epoch": 0.3363623251117842, "grad_norm": 11.269175375241158, "learning_rate": 4.196516414710258e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1592.0, "logps/rejected": -1680.0, "loss": 0.6998, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.45703125, "rewards/margins": -0.0625, "rewards/rejected": 0.51953125, "step": 1166 }, { "epoch": 0.3366508005192557, "grad_norm": 11.47215685353217, "learning_rate": 4.194665998515783e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1912.0, "logps/rejected": -1728.0, "loss": 0.7243, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.53125, "rewards/margins": -0.0537109375, "rewards/rejected": 0.5859375, "step": 1167 }, { "epoch": 0.33693927592672723, "grad_norm": 11.38601269178146, "learning_rate": 4.192813863015719e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1832.0, "logps/rejected": -1680.0, "loss": 0.679, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.5078125, "rewards/margins": 0.03955078125, "rewards/rejected": 0.46875, "step": 1168 }, { "epoch": 0.33722775133419874, "grad_norm": 11.147829750320977, "learning_rate": 4.19096001008913e-07, "logits/chosen": 3.03125, "logits/rejected": 3.046875, "logps/chosen": -1616.0, "logps/rejected": -1480.0, "loss": 0.7078, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.408203125, "rewards/margins": -0.0849609375, "rewards/rejected": 0.494140625, "step": 1169 }, { "epoch": 0.33751622674167026, "grad_norm": 10.76755103946222, "learning_rate": 4.189104441616823e-07, "logits/chosen": 3.21875, "logits/rejected": 3.171875, "logps/chosen": -1344.0, "logps/rejected": -1144.0, "loss": 0.6754, "loss/demonstration_loss": -2528.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.419921875, "rewards/margins": 0.08740234375, "rewards/rejected": 0.33203125, "step": 1170 }, { "epoch": 0.33780470214914177, "grad_norm": 10.199894516434862, "learning_rate": 4.187247159481345e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1976.0, "logps/rejected": -1920.0, "loss": 0.6576, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.59765625, "rewards/margins": 0.06396484375, "rewards/rejected": 0.53125, "step": 1171 }, { "epoch": 0.3380931775566133, "grad_norm": 11.033576485906467, "learning_rate": 4.185388165566983e-07, "logits/chosen": 3.21875, "logits/rejected": 3.125, "logps/chosen": -1584.0, "logps/rejected": -1536.0, "loss": 0.6862, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5546875, "rewards/margins": 0.07666015625, "rewards/rejected": 0.4765625, "step": 1172 }, { "epoch": 0.3383816529640848, "grad_norm": 10.797517689260367, "learning_rate": 4.1835274617597596e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -1552.0, "logps/rejected": -1376.0, "loss": 0.6643, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.53515625, "rewards/margins": 0.173828125, "rewards/rejected": 0.361328125, "step": 1173 }, { "epoch": 0.3386701283715563, "grad_norm": 9.96713535535957, "learning_rate": 4.181665049947433e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1672.0, "logps/rejected": -1784.0, "loss": 0.6404, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.49609375, "rewards/margins": 0.185546875, "rewards/rejected": 0.310546875, "step": 1174 }, { "epoch": 0.3389586037790278, "grad_norm": 11.159489652410565, "learning_rate": 4.17980093201949e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -1904.0, "logps/rejected": -1616.0, "loss": 0.6763, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.427734375, "rewards/margins": 0.0673828125, "rewards/rejected": 0.359375, "step": 1175 }, { "epoch": 0.3392470791864994, "grad_norm": 10.487960293414185, "learning_rate": 4.1779351098671573e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1648.0, "logps/rejected": -1432.0, "loss": 0.6453, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.453125, "rewards/margins": 0.12109375, "rewards/rejected": 0.33203125, "step": 1176 }, { "epoch": 0.3395355545939709, "grad_norm": 12.10421145151164, "learning_rate": 4.176067585383382e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1872.0, "logps/rejected": -2016.0, "loss": 0.7471, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.66796875, "rewards/margins": -0.012451171875, "rewards/rejected": 0.6796875, "step": 1177 }, { "epoch": 0.3398240300014424, "grad_norm": 10.665589122195815, "learning_rate": 4.174198360462841e-07, "logits/chosen": 3.15625, "logits/rejected": 3.234375, "logps/chosen": -1664.0, "logps/rejected": -1520.0, "loss": 0.6653, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.49609375, "rewards/margins": 0.12353515625, "rewards/rejected": 0.37109375, "step": 1178 }, { "epoch": 0.3401125054089139, "grad_norm": 9.440306643909677, "learning_rate": 4.1723274370019373e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1944.0, "logps/rejected": -1976.0, "loss": 0.6567, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3984.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.66015625, "rewards/margins": 0.0966796875, "rewards/rejected": 0.5625, "step": 1179 }, { "epoch": 0.3404009808163854, "grad_norm": 10.707031687381827, "learning_rate": 4.170454816898798e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1648.0, "logps/rejected": -1328.0, "loss": 0.6641, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5390625, "rewards/margins": 0.0859375, "rewards/rejected": 0.455078125, "step": 1180 }, { "epoch": 0.34068945622385693, "grad_norm": 12.083004587730889, "learning_rate": 4.1685805020532683e-07, "logits/chosen": 3.21875, "logits/rejected": 3.296875, "logps/chosen": -1720.0, "logps/rejected": -1728.0, "loss": 0.6876, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.45703125, "rewards/margins": 0.10205078125, "rewards/rejected": 0.353515625, "step": 1181 }, { "epoch": 0.34097793163132845, "grad_norm": 10.79867636871633, "learning_rate": 4.166704494366916e-07, "logits/chosen": 3.265625, "logits/rejected": 3.296875, "logps/chosen": -1800.0, "logps/rejected": -1552.0, "loss": 0.6362, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.578125, "rewards/margins": 0.15234375, "rewards/rejected": 0.423828125, "step": 1182 }, { "epoch": 0.34126640703879996, "grad_norm": 11.607845874521145, "learning_rate": 4.164826795743025e-07, "logits/chosen": 3.203125, "logits/rejected": 3.15625, "logps/chosen": -1472.0, "logps/rejected": -1736.0, "loss": 0.7083, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.46875, "rewards/margins": -0.09033203125, "rewards/rejected": 0.55859375, "step": 1183 }, { "epoch": 0.34155488244627147, "grad_norm": 10.992957628024477, "learning_rate": 4.1629474080865936e-07, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -1440.0, "logps/rejected": -1432.0, "loss": 0.6772, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.408203125, "rewards/margins": 0.08203125, "rewards/rejected": 0.326171875, "step": 1184 }, { "epoch": 0.341843357853743, "grad_norm": 9.783187473727502, "learning_rate": 4.161066333304336e-07, "logits/chosen": 3.21875, "logits/rejected": 3.265625, "logps/chosen": -1688.0, "logps/rejected": -1688.0, "loss": 0.6606, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.435546875, "rewards/margins": 0.115234375, "rewards/rejected": 0.3203125, "step": 1185 }, { "epoch": 0.3421318332612145, "grad_norm": 9.444421637467595, "learning_rate": 4.159183573304675e-07, "logits/chosen": 3.203125, "logits/rejected": 3.15625, "logps/chosen": -1256.0, "logps/rejected": -1240.0, "loss": 0.6806, "loss/demonstration_loss": -2528.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.37890625, "rewards/margins": 0.037109375, "rewards/rejected": 0.341796875, "step": 1186 }, { "epoch": 0.342420308668686, "grad_norm": 10.596573558664689, "learning_rate": 4.157299129997748e-07, "logits/chosen": 3.015625, "logits/rejected": 3.015625, "logps/chosen": -1424.0, "logps/rejected": -1336.0, "loss": 0.6703, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.44921875, "rewards/margins": 0.12353515625, "rewards/rejected": 0.32421875, "step": 1187 }, { "epoch": 0.3427087840761575, "grad_norm": 10.232235543749564, "learning_rate": 4.155413005295394e-07, "logits/chosen": 3.15625, "logits/rejected": 3.0625, "logps/chosen": -2040.0, "logps/rejected": -1928.0, "loss": 0.6551, "loss/demonstration_loss": -4016.0, "loss/preference_loss": -4000.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.46484375, "rewards/margins": 0.140625, "rewards/rejected": 0.32421875, "step": 1188 }, { "epoch": 0.342997259483629, "grad_norm": 8.869981902020646, "learning_rate": 4.1535252011111633e-07, "logits/chosen": 3.09375, "logits/rejected": 3.09375, "logps/chosen": -1496.0, "logps/rejected": -1632.0, "loss": 0.6584, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.458984375, "rewards/margins": -0.002471923828125, "rewards/rejected": 0.4609375, "step": 1189 }, { "epoch": 0.34328573489110054, "grad_norm": 11.175947012805695, "learning_rate": 4.151635719360307e-07, "logits/chosen": 3.15625, "logits/rejected": 3.25, "logps/chosen": -1672.0, "logps/rejected": -1520.0, "loss": 0.739, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.326171875, "rewards/margins": -0.06494140625, "rewards/rejected": 0.390625, "step": 1190 }, { "epoch": 0.34357421029857205, "grad_norm": 10.717793583926355, "learning_rate": 4.149744561959779e-07, "logits/chosen": 3.234375, "logits/rejected": 3.15625, "logps/chosen": -1496.0, "logps/rejected": -1616.0, "loss": 0.6863, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.470703125, "rewards/margins": 0.049560546875, "rewards/rejected": 0.421875, "step": 1191 }, { "epoch": 0.34386268570604356, "grad_norm": 13.148875037968324, "learning_rate": 4.1478517308282324e-07, "logits/chosen": 2.96875, "logits/rejected": 3.0625, "logps/chosen": -1496.0, "logps/rejected": -1696.0, "loss": 0.7001, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.294921875, "rewards/margins": -0.018798828125, "rewards/rejected": 0.314453125, "step": 1192 }, { "epoch": 0.34415116111351507, "grad_norm": 10.163981515009384, "learning_rate": 4.14595722788602e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1776.0, "logps/rejected": -1712.0, "loss": 0.6955, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.337890625, "rewards/margins": 0.03125, "rewards/rejected": 0.306640625, "step": 1193 }, { "epoch": 0.3444396365209866, "grad_norm": 10.32738887342833, "learning_rate": 4.14406105505519e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1288.0, "logps/rejected": -1136.0, "loss": 0.6482, "loss/demonstration_loss": -2464.0, "loss/preference_loss": -2448.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.30078125, "rewards/margins": 0.0634765625, "rewards/rejected": 0.2373046875, "step": 1194 }, { "epoch": 0.3447281119284581, "grad_norm": 10.384757746187962, "learning_rate": 4.142163214259484e-07, "logits/chosen": 3.296875, "logits/rejected": 3.234375, "logps/chosen": -1488.0, "logps/rejected": -1448.0, "loss": 0.674, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.478515625, "rewards/margins": 0.1279296875, "rewards/rejected": 0.3515625, "step": 1195 }, { "epoch": 0.3450165873359296, "grad_norm": 11.667809095858859, "learning_rate": 4.140263707424337e-07, "logits/chosen": 2.984375, "logits/rejected": 3.0, "logps/chosen": -1592.0, "logps/rejected": -1560.0, "loss": 0.6945, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.48828125, "rewards/margins": 0.029052734375, "rewards/rejected": 0.458984375, "step": 1196 }, { "epoch": 0.3453050627434011, "grad_norm": 12.822285800547911, "learning_rate": 4.1383625364768736e-07, "logits/chosen": 3.15625, "logits/rejected": 3.078125, "logps/chosen": -1672.0, "logps/rejected": -1280.0, "loss": 0.7132, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.490234375, "rewards/margins": 0.0849609375, "rewards/rejected": 0.404296875, "step": 1197 }, { "epoch": 0.3455935381508726, "grad_norm": 11.00139319274122, "learning_rate": 4.136459703345907e-07, "logits/chosen": 3.140625, "logits/rejected": 3.109375, "logps/chosen": -1432.0, "logps/rejected": -1392.0, "loss": 0.679, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5234375, "rewards/margins": -0.003509521484375, "rewards/rejected": 0.52734375, "step": 1198 }, { "epoch": 0.34588201355834414, "grad_norm": 10.006378299767858, "learning_rate": 4.134555209961936e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1896.0, "logps/rejected": -1584.0, "loss": 0.6124, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.47265625, "rewards/margins": 0.2451171875, "rewards/rejected": 0.2275390625, "step": 1199 }, { "epoch": 0.34617048896581565, "grad_norm": 10.228589565980382, "learning_rate": 4.1326490582571444e-07, "logits/chosen": 3.171875, "logits/rejected": 3.171875, "logps/chosen": -1640.0, "logps/rejected": -1656.0, "loss": 0.6747, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.447265625, "rewards/margins": 0.0203857421875, "rewards/rejected": 0.427734375, "step": 1200 }, { "epoch": 0.34645896437328716, "grad_norm": 11.063630002430141, "learning_rate": 4.1307412501653987e-07, "logits/chosen": 3.09375, "logits/rejected": 3.15625, "logps/chosen": -1856.0, "logps/rejected": -1648.0, "loss": 0.6999, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.58203125, "rewards/margins": 0.078125, "rewards/rejected": 0.50390625, "step": 1201 }, { "epoch": 0.34674743978075867, "grad_norm": 9.52957027784144, "learning_rate": 4.128831787622246e-07, "logits/chosen": 3.125, "logits/rejected": 3.1875, "logps/chosen": -1336.0, "logps/rejected": -1336.0, "loss": 0.6465, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.392578125, "rewards/margins": 0.16015625, "rewards/rejected": 0.232421875, "step": 1202 }, { "epoch": 0.3470359151882302, "grad_norm": 13.360297381685506, "learning_rate": 4.12692067256491e-07, "logits/chosen": 3.109375, "logits/rejected": 3.1875, "logps/chosen": -1936.0, "logps/rejected": -1912.0, "loss": 0.6837, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.5, "rewards/margins": -0.038818359375, "rewards/rejected": 0.5390625, "step": 1203 }, { "epoch": 0.3473243905957017, "grad_norm": 10.034674232099421, "learning_rate": 4.125007906932294e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1808.0, "logps/rejected": -1632.0, "loss": 0.6771, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5703125, "rewards/margins": 0.04345703125, "rewards/rejected": 0.5234375, "step": 1204 }, { "epoch": 0.3476128660031732, "grad_norm": 11.015711643987565, "learning_rate": 4.1230934926649736e-07, "logits/chosen": 3.21875, "logits/rejected": 3.171875, "logps/chosen": -1992.0, "logps/rejected": -2128.0, "loss": 0.6714, "loss/demonstration_loss": -4192.0, "loss/preference_loss": -4192.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.73046875, "rewards/margins": 0.0712890625, "rewards/rejected": 0.66015625, "step": 1205 }, { "epoch": 0.3479013414106447, "grad_norm": 9.351317176476279, "learning_rate": 4.1211774317051973e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -2000.0, "logps/rejected": -1992.0, "loss": 0.666, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4048.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6953125, "rewards/margins": 0.177734375, "rewards/rejected": 0.51953125, "step": 1206 }, { "epoch": 0.3481898168181163, "grad_norm": 11.503135769328699, "learning_rate": 4.119259725996886e-07, "logits/chosen": 3.109375, "logits/rejected": 3.109375, "logps/chosen": -1624.0, "logps/rejected": -1528.0, "loss": 0.6959, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.36328125, "rewards/margins": 0.0125732421875, "rewards/rejected": 0.3515625, "step": 1207 }, { "epoch": 0.3484782922255878, "grad_norm": 11.166304634273079, "learning_rate": 4.1173403774856264e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -1552.0, "logps/rejected": -1600.0, "loss": 0.7046, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5625, "rewards/margins": 0.04296875, "rewards/rejected": 0.51953125, "step": 1208 }, { "epoch": 0.3487667676330593, "grad_norm": 11.294867745634383, "learning_rate": 4.115419388118674e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1472.0, "logps/rejected": -1544.0, "loss": 0.6943, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.361328125, "rewards/margins": -0.037109375, "rewards/rejected": 0.3984375, "step": 1209 }, { "epoch": 0.3490552430405308, "grad_norm": 9.142591171113695, "learning_rate": 4.113496759844948e-07, "logits/chosen": 3.03125, "logits/rejected": 3.078125, "logps/chosen": -1176.0, "logps/rejected": -1256.0, "loss": 0.662, "loss/demonstration_loss": -2464.0, "loss/preference_loss": -2448.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.326171875, "rewards/margins": 0.03466796875, "rewards/rejected": 0.29296875, "step": 1210 }, { "epoch": 0.3493437184480023, "grad_norm": 12.197461037548551, "learning_rate": 4.111572494615031e-07, "logits/chosen": 3.234375, "logits/rejected": 3.15625, "logps/chosen": -1824.0, "logps/rejected": -2176.0, "loss": 0.7894, "loss/demonstration_loss": -4048.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.39453125, "rewards/margins": -0.13671875, "rewards/rejected": 0.53125, "step": 1211 }, { "epoch": 0.34963219385547384, "grad_norm": 10.72024228935816, "learning_rate": 4.1096465943811666e-07, "logits/chosen": 3.109375, "logits/rejected": 3.03125, "logps/chosen": -1464.0, "logps/rejected": -1440.0, "loss": 0.6882, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.404296875, "rewards/margins": 0.07275390625, "rewards/rejected": 0.33203125, "step": 1212 }, { "epoch": 0.34992066926294535, "grad_norm": 12.549032756973414, "learning_rate": 4.1077190610972555e-07, "logits/chosen": 2.90625, "logits/rejected": 2.796875, "logps/chosen": -1280.0, "logps/rejected": -1608.0, "loss": 0.6478, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.373046875, "rewards/margins": 0.1142578125, "rewards/rejected": 0.2578125, "step": 1213 }, { "epoch": 0.35020914467041686, "grad_norm": 10.641341868219293, "learning_rate": 4.1057898967188575e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1584.0, "logps/rejected": -1504.0, "loss": 0.6714, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.40234375, "rewards/margins": 0.1181640625, "rewards/rejected": 0.283203125, "step": 1214 }, { "epoch": 0.35049762007788837, "grad_norm": 10.321655524550449, "learning_rate": 4.1038591032031853e-07, "logits/chosen": 3.09375, "logits/rejected": 3.109375, "logps/chosen": -1376.0, "logps/rejected": -1336.0, "loss": 0.6794, "loss/demonstration_loss": -2736.0, "loss/preference_loss": -2736.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.267578125, "rewards/margins": 0.0458984375, "rewards/rejected": 0.220703125, "step": 1215 }, { "epoch": 0.3507860954853599, "grad_norm": 10.209069340440468, "learning_rate": 4.101926682509106e-07, "logits/chosen": 3.140625, "logits/rejected": 3.1875, "logps/chosen": -1760.0, "logps/rejected": -1736.0, "loss": 0.6542, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.412109375, "rewards/margins": 0.1328125, "rewards/rejected": 0.279296875, "step": 1216 }, { "epoch": 0.3510745708928314, "grad_norm": 11.255312610539976, "learning_rate": 4.0999926365971354e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -1504.0, "logps/rejected": -1376.0, "loss": 0.6752, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.431640625, "rewards/margins": 0.1025390625, "rewards/rejected": 0.328125, "step": 1217 }, { "epoch": 0.3513630463003029, "grad_norm": 10.609593452090905, "learning_rate": 4.098056967429441e-07, "logits/chosen": 2.96875, "logits/rejected": 3.0625, "logps/chosen": -1440.0, "logps/rejected": -1288.0, "loss": 0.6724, "loss/demonstration_loss": -2768.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.41796875, "rewards/margins": 0.0498046875, "rewards/rejected": 0.3671875, "step": 1218 }, { "epoch": 0.3516515217077744, "grad_norm": 11.616146803065622, "learning_rate": 4.096119676969834e-07, "logits/chosen": 3.03125, "logits/rejected": 3.046875, "logps/chosen": -1384.0, "logps/rejected": -1400.0, "loss": 0.6874, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.353515625, "rewards/margins": 0.10693359375, "rewards/rejected": 0.2470703125, "step": 1219 }, { "epoch": 0.35193999711524593, "grad_norm": 11.51356497614385, "learning_rate": 4.0941807671837736e-07, "logits/chosen": 3.203125, "logits/rejected": 3.265625, "logps/chosen": -1880.0, "logps/rejected": -1720.0, "loss": 0.6271, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.7109375, "rewards/margins": 0.2041015625, "rewards/rejected": 0.5078125, "step": 1220 }, { "epoch": 0.35222847252271744, "grad_norm": 11.442237706173277, "learning_rate": 4.0922402400383594e-07, "logits/chosen": 3.015625, "logits/rejected": 3.0625, "logps/chosen": -1752.0, "logps/rejected": -1536.0, "loss": 0.6962, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.34375, "rewards/margins": -0.007659912109375, "rewards/rejected": 0.3515625, "step": 1221 }, { "epoch": 0.35251694793018895, "grad_norm": 11.433299502337318, "learning_rate": 4.0902980975023333e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1392.0, "logps/rejected": -1256.0, "loss": 0.6847, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2672.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.349609375, "rewards/margins": 0.0791015625, "rewards/rejected": 0.26953125, "step": 1222 }, { "epoch": 0.35280542333766046, "grad_norm": 10.504823701963634, "learning_rate": 4.088354341546075e-07, "logits/chosen": 3.203125, "logits/rejected": 3.203125, "logps/chosen": -1840.0, "logps/rejected": -1752.0, "loss": 0.7106, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5078125, "rewards/margins": 0.07275390625, "rewards/rejected": 0.43359375, "step": 1223 }, { "epoch": 0.353093898745132, "grad_norm": 10.53658315043906, "learning_rate": 4.086408974141603e-07, "logits/chosen": 3.265625, "logits/rejected": 3.203125, "logps/chosen": -1336.0, "logps/rejected": -1432.0, "loss": 0.6942, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.2275390625, "rewards/margins": -0.07177734375, "rewards/rejected": 0.298828125, "step": 1224 }, { "epoch": 0.3533823741526035, "grad_norm": 11.020135901959646, "learning_rate": 4.084461997262568e-07, "logits/chosen": 3.0, "logits/rejected": 2.953125, "logps/chosen": -2008.0, "logps/rejected": -1920.0, "loss": 0.6665, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3968.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.54296875, "rewards/margins": 0.1650390625, "rewards/rejected": 0.37890625, "step": 1225 }, { "epoch": 0.353670849560075, "grad_norm": 10.18382192812381, "learning_rate": 4.0825134128842553e-07, "logits/chosen": 3.078125, "logits/rejected": 3.078125, "logps/chosen": -1464.0, "logps/rejected": -1536.0, "loss": 0.7198, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.392578125, "rewards/margins": -0.01519775390625, "rewards/rejected": 0.408203125, "step": 1226 }, { "epoch": 0.3539593249675465, "grad_norm": 11.09982612299353, "learning_rate": 4.0805632229835805e-07, "logits/chosen": 3.125, "logits/rejected": 3.0625, "logps/chosen": -1872.0, "logps/rejected": -1880.0, "loss": 0.6848, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4921875, "rewards/margins": 0.042724609375, "rewards/rejected": 0.44921875, "step": 1227 }, { "epoch": 0.354247800375018, "grad_norm": 11.124833218998273, "learning_rate": 4.0786114295390893e-07, "logits/chosen": 3.21875, "logits/rejected": 3.15625, "logps/chosen": -1512.0, "logps/rejected": -1512.0, "loss": 0.7101, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.30078125, "rewards/margins": -0.037353515625, "rewards/rejected": 0.337890625, "step": 1228 }, { "epoch": 0.35453627578248953, "grad_norm": 10.652363995478082, "learning_rate": 4.076658034530953e-07, "logits/chosen": 3.140625, "logits/rejected": 3.09375, "logps/chosen": -1392.0, "logps/rejected": -1392.0, "loss": 0.6691, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.34765625, "rewards/margins": 0.029296875, "rewards/rejected": 0.318359375, "step": 1229 }, { "epoch": 0.35482475118996104, "grad_norm": 10.1175322010966, "learning_rate": 4.0747030399409663e-07, "logits/chosen": 3.140625, "logits/rejected": 3.0, "logps/chosen": -1768.0, "logps/rejected": -1792.0, "loss": 0.6733, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.392578125, "rewards/margins": 0.04833984375, "rewards/rejected": 0.34375, "step": 1230 }, { "epoch": 0.35511322659743255, "grad_norm": 13.285611126719353, "learning_rate": 4.072746447752551e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1864.0, "logps/rejected": -1824.0, "loss": 0.7203, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.39453125, "rewards/margins": -0.09375, "rewards/rejected": 0.48828125, "step": 1231 }, { "epoch": 0.35540170200490406, "grad_norm": 9.94212127679558, "learning_rate": 4.070788259950745e-07, "logits/chosen": 3.21875, "logits/rejected": 3.25, "logps/chosen": -1696.0, "logps/rejected": -1752.0, "loss": 0.6833, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.427734375, "rewards/margins": -0.0250244140625, "rewards/rejected": 0.453125, "step": 1232 }, { "epoch": 0.3556901774123756, "grad_norm": 9.695569175614988, "learning_rate": 4.068828478522208e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1736.0, "logps/rejected": -1512.0, "loss": 0.6272, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.59765625, "rewards/margins": 0.2578125, "rewards/rejected": 0.341796875, "step": 1233 }, { "epoch": 0.3559786528198471, "grad_norm": 11.552814260861226, "learning_rate": 4.066867105455216e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1312.0, "logps/rejected": -1192.0, "loss": 0.6837, "loss/demonstration_loss": -2544.0, "loss/preference_loss": -2544.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.349609375, "rewards/margins": 0.02587890625, "rewards/rejected": 0.32421875, "step": 1234 }, { "epoch": 0.3562671282273186, "grad_norm": 10.74267503893525, "learning_rate": 4.0649041427396593e-07, "logits/chosen": 3.0, "logits/rejected": 3.0625, "logps/chosen": -1944.0, "logps/rejected": -2144.0, "loss": 0.6954, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.421875, "rewards/margins": -0.0001220703125, "rewards/rejected": 0.423828125, "step": 1235 }, { "epoch": 0.3565556036347901, "grad_norm": 14.278704332197602, "learning_rate": 4.062939592367041e-07, "logits/chosen": 3.140625, "logits/rejected": 3.109375, "logps/chosen": -1536.0, "logps/rejected": -1552.0, "loss": 0.6797, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.373046875, "rewards/margins": -0.0517578125, "rewards/rejected": 0.42578125, "step": 1236 }, { "epoch": 0.3568440790422617, "grad_norm": 10.087521814145758, "learning_rate": 4.060973456330474e-07, "logits/chosen": 3.015625, "logits/rejected": 2.921875, "logps/chosen": -1608.0, "logps/rejected": -1656.0, "loss": 0.6661, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.28515625, "rewards/margins": 0.119140625, "rewards/rejected": 0.166015625, "step": 1237 }, { "epoch": 0.3571325544497332, "grad_norm": 10.03717005152117, "learning_rate": 4.0590057366246835e-07, "logits/chosen": 3.171875, "logits/rejected": 3.125, "logps/chosen": -1480.0, "logps/rejected": -1608.0, "loss": 0.7037, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.25390625, "rewards/margins": -0.039794921875, "rewards/rejected": 0.29296875, "step": 1238 }, { "epoch": 0.3574210298572047, "grad_norm": 15.489038957949608, "learning_rate": 4.057036435245996e-07, "logits/chosen": 3.09375, "logits/rejected": 3.09375, "logps/chosen": -1632.0, "logps/rejected": -1640.0, "loss": 0.6624, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.443359375, "rewards/margins": 0.134765625, "rewards/rejected": 0.30859375, "step": 1239 }, { "epoch": 0.3577095052646762, "grad_norm": 11.184135470319777, "learning_rate": 4.0550655541923475e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -1768.0, "logps/rejected": -1544.0, "loss": 0.6364, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.384765625, "rewards/margins": 0.12890625, "rewards/rejected": 0.255859375, "step": 1240 }, { "epoch": 0.3579979806721477, "grad_norm": 11.938426000634484, "learning_rate": 4.0530930954632736e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1720.0, "logps/rejected": -1552.0, "loss": 0.6837, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.515625, "rewards/margins": 0.2001953125, "rewards/rejected": 0.31640625, "step": 1241 }, { "epoch": 0.35828645607961923, "grad_norm": 12.457016555737873, "learning_rate": 4.0511190610599123e-07, "logits/chosen": 3.078125, "logits/rejected": 3.015625, "logps/chosen": -2016.0, "logps/rejected": -1992.0, "loss": 0.7114, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.5625, "rewards/margins": 0.11279296875, "rewards/rejected": 0.44921875, "step": 1242 }, { "epoch": 0.35857493148709074, "grad_norm": 11.537539904187014, "learning_rate": 4.049143452984999e-07, "logits/chosen": 3.109375, "logits/rejected": 3.125, "logps/chosen": -1656.0, "logps/rejected": -1744.0, "loss": 0.7536, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3046875, "rewards/margins": -0.134765625, "rewards/rejected": 0.439453125, "step": 1243 }, { "epoch": 0.35886340689456225, "grad_norm": 11.670549779837993, "learning_rate": 4.0471662732428665e-07, "logits/chosen": 3.046875, "logits/rejected": 3.046875, "logps/chosen": -1520.0, "logps/rejected": -1368.0, "loss": 0.709, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.16796875, "rewards/margins": -0.0615234375, "rewards/rejected": 0.228515625, "step": 1244 }, { "epoch": 0.35915188230203376, "grad_norm": 13.655157700573195, "learning_rate": 4.045187523839441e-07, "logits/chosen": 2.90625, "logits/rejected": 2.875, "logps/chosen": -1272.0, "logps/rejected": -1240.0, "loss": 0.6927, "loss/demonstration_loss": -2528.0, "loss/preference_loss": -2512.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.1787109375, "rewards/margins": 0.130859375, "rewards/rejected": 0.048095703125, "step": 1245 }, { "epoch": 0.3594403577095053, "grad_norm": 12.46107323221246, "learning_rate": 4.0432072067822434e-07, "logits/chosen": 3.078125, "logits/rejected": 2.96875, "logps/chosen": -1480.0, "logps/rejected": -1544.0, "loss": 0.6813, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3046875, "rewards/margins": 0.1142578125, "rewards/rejected": 0.1904296875, "step": 1246 }, { "epoch": 0.3597288331169768, "grad_norm": 10.244311602395129, "learning_rate": 4.041225324080382e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0, "logps/chosen": -1704.0, "logps/rejected": -1672.0, "loss": 0.6378, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.396484375, "rewards/margins": 0.12890625, "rewards/rejected": 0.265625, "step": 1247 }, { "epoch": 0.3600173085244483, "grad_norm": 11.917569739552388, "learning_rate": 4.039241877744556e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1936.0, "logps/rejected": -1912.0, "loss": 0.6782, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3828125, "rewards/margins": 0.0888671875, "rewards/rejected": 0.294921875, "step": 1248 }, { "epoch": 0.3603057839319198, "grad_norm": 11.093510006923282, "learning_rate": 4.037256869787049e-07, "logits/chosen": 3.078125, "logits/rejected": 3.109375, "logps/chosen": -1904.0, "logps/rejected": -2000.0, "loss": 0.6832, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.294921875, "rewards/margins": 0.095703125, "rewards/rejected": 0.2001953125, "step": 1249 }, { "epoch": 0.3605942593393913, "grad_norm": 12.588195066679393, "learning_rate": 4.035270302221732e-07, "logits/chosen": 3.203125, "logits/rejected": 3.15625, "logps/chosen": -1536.0, "logps/rejected": -1672.0, "loss": 0.705, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.421875, "rewards/margins": 0.0947265625, "rewards/rejected": 0.328125, "step": 1250 }, { "epoch": 0.36088273474686283, "grad_norm": 11.501715676588695, "learning_rate": 4.0332821770640535e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0625, "logps/chosen": -1384.0, "logps/rejected": -1336.0, "loss": 0.6939, "loss/demonstration_loss": -2736.0, "loss/preference_loss": -2736.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1611328125, "rewards/margins": -0.037109375, "rewards/rejected": 0.19921875, "step": 1251 }, { "epoch": 0.36117121015433434, "grad_norm": 11.251298637465409, "learning_rate": 4.031292496331047e-07, "logits/chosen": 2.984375, "logits/rejected": 2.953125, "logps/chosen": -1616.0, "logps/rejected": -1784.0, "loss": 0.6865, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2099609375, "rewards/margins": -0.0286865234375, "rewards/rejected": 0.2392578125, "step": 1252 }, { "epoch": 0.36145968556180585, "grad_norm": 13.118916015022132, "learning_rate": 4.0293012620413224e-07, "logits/chosen": 2.96875, "logits/rejected": 3.03125, "logps/chosen": -1464.0, "logps/rejected": -1408.0, "loss": 0.7103, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2734375, "rewards/margins": -0.02001953125, "rewards/rejected": 0.29296875, "step": 1253 }, { "epoch": 0.36174816096927737, "grad_norm": 9.931187199086231, "learning_rate": 4.027308476215064e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1624.0, "logps/rejected": -1456.0, "loss": 0.6636, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.259765625, "rewards/margins": 0.01287841796875, "rewards/rejected": 0.24609375, "step": 1254 }, { "epoch": 0.3620366363767489, "grad_norm": 10.89305215170687, "learning_rate": 4.0253141408740325e-07, "logits/chosen": 2.953125, "logits/rejected": 2.96875, "logps/chosen": -1832.0, "logps/rejected": -1720.0, "loss": 0.6681, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2734375, "rewards/margins": 0.12451171875, "rewards/rejected": 0.1484375, "step": 1255 }, { "epoch": 0.3623251117842204, "grad_norm": 11.713311771807788, "learning_rate": 4.02331825804156e-07, "logits/chosen": 3.015625, "logits/rejected": 3.078125, "logps/chosen": -1696.0, "logps/rejected": -1720.0, "loss": 0.7479, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.33203125, "rewards/margins": -0.146484375, "rewards/rejected": 0.478515625, "step": 1256 }, { "epoch": 0.3626135871916919, "grad_norm": 9.437006245338177, "learning_rate": 4.0213208297425486e-07, "logits/chosen": 3.21875, "logits/rejected": 3.140625, "logps/chosen": -1736.0, "logps/rejected": -1528.0, "loss": 0.6428, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.439453125, "rewards/margins": 0.259765625, "rewards/rejected": 0.1787109375, "step": 1257 }, { "epoch": 0.3629020625991634, "grad_norm": 12.573423922064407, "learning_rate": 4.019321858003468e-07, "logits/chosen": 3.171875, "logits/rejected": 3.1875, "logps/chosen": -1496.0, "logps/rejected": -1464.0, "loss": 0.7266, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.255859375, "rewards/margins": -0.0810546875, "rewards/rejected": 0.3359375, "step": 1258 }, { "epoch": 0.3631905380066349, "grad_norm": 11.801353247488077, "learning_rate": 4.017321344852354e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0625, "logps/chosen": -1288.0, "logps/rejected": -1240.0, "loss": 0.6866, "loss/demonstration_loss": -2560.0, "loss/preference_loss": -2544.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.26171875, "rewards/margins": 0.0908203125, "rewards/rejected": 0.1708984375, "step": 1259 }, { "epoch": 0.36347901341410643, "grad_norm": 10.531897234328193, "learning_rate": 4.015319292318806e-07, "logits/chosen": 3.046875, "logits/rejected": 3.078125, "logps/chosen": -1608.0, "logps/rejected": -1328.0, "loss": 0.6207, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.416015625, "rewards/margins": 0.271484375, "rewards/rejected": 0.14453125, "step": 1260 }, { "epoch": 0.36376748882157794, "grad_norm": 12.955951911397776, "learning_rate": 4.013315702433986e-07, "logits/chosen": 3.046875, "logits/rejected": 3.109375, "logps/chosen": -1752.0, "logps/rejected": -1680.0, "loss": 0.6813, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.39453125, "rewards/margins": 0.00030517578125, "rewards/rejected": 0.39453125, "step": 1261 }, { "epoch": 0.36405596422904946, "grad_norm": 11.923890955328783, "learning_rate": 4.0113105772306143e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1704.0, "logps/rejected": -1712.0, "loss": 0.6939, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2373046875, "rewards/margins": 0.05322265625, "rewards/rejected": 0.1845703125, "step": 1262 }, { "epoch": 0.36434443963652097, "grad_norm": 11.33155065346091, "learning_rate": 4.00930391874297e-07, "logits/chosen": 3.0, "logits/rejected": 2.984375, "logps/chosen": -1624.0, "logps/rejected": -1544.0, "loss": 0.7014, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2236328125, "rewards/margins": -0.026123046875, "rewards/rejected": 0.25, "step": 1263 }, { "epoch": 0.3646329150439925, "grad_norm": 11.857339437582848, "learning_rate": 4.007295729006888e-07, "logits/chosen": 2.984375, "logits/rejected": 2.9375, "logps/chosen": -1808.0, "logps/rejected": -1840.0, "loss": 0.6733, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.369140625, "rewards/margins": 0.07666015625, "rewards/rejected": 0.291015625, "step": 1264 }, { "epoch": 0.364921390451464, "grad_norm": 12.113647886242498, "learning_rate": 4.0052860100597535e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1472.0, "logps/rejected": -1432.0, "loss": 0.6879, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.25390625, "rewards/margins": 0.038818359375, "rewards/rejected": 0.21484375, "step": 1265 }, { "epoch": 0.3652098658589355, "grad_norm": 11.206608483338035, "learning_rate": 4.003274763940509e-07, "logits/chosen": 3.0625, "logits/rejected": 2.984375, "logps/chosen": -1880.0, "logps/rejected": -1800.0, "loss": 0.6611, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.40234375, "rewards/margins": 0.1435546875, "rewards/rejected": 0.2578125, "step": 1266 }, { "epoch": 0.365498341266407, "grad_norm": 12.267812826918988, "learning_rate": 4.0012619926896414e-07, "logits/chosen": 3.046875, "logits/rejected": 2.984375, "logps/chosen": -1728.0, "logps/rejected": -1720.0, "loss": 0.6833, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.412109375, "rewards/margins": 0.0634765625, "rewards/rejected": 0.34765625, "step": 1267 }, { "epoch": 0.3657868166738786, "grad_norm": 10.650627825858022, "learning_rate": 3.999247698349187e-07, "logits/chosen": 2.96875, "logits/rejected": 2.875, "logps/chosen": -1752.0, "logps/rejected": -1688.0, "loss": 0.6709, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.50390625, "rewards/margins": 0.1318359375, "rewards/rejected": 0.373046875, "step": 1268 }, { "epoch": 0.3660752920813501, "grad_norm": 12.068252124892172, "learning_rate": 3.9972318829627275e-07, "logits/chosen": 3.015625, "logits/rejected": 3.046875, "logps/chosen": -2096.0, "logps/rejected": -1744.0, "loss": 0.6227, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.498046875, "rewards/margins": 0.2021484375, "rewards/rejected": 0.294921875, "step": 1269 }, { "epoch": 0.3663637674888216, "grad_norm": 11.604748372620438, "learning_rate": 3.9952145485753864e-07, "logits/chosen": 3.0625, "logits/rejected": 3.015625, "logps/chosen": -1464.0, "logps/rejected": -1560.0, "loss": 0.7193, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.30859375, "rewards/margins": 0.050537109375, "rewards/rejected": 0.2578125, "step": 1270 }, { "epoch": 0.3666522428962931, "grad_norm": 9.853525963837496, "learning_rate": 3.9931956972338295e-07, "logits/chosen": 3.0, "logits/rejected": 2.921875, "logps/chosen": -1208.0, "logps/rejected": -1184.0, "loss": 0.6489, "loss/demonstration_loss": -2416.0, "loss/preference_loss": -2400.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.275390625, "rewards/margins": 0.1279296875, "rewards/rejected": 0.1474609375, "step": 1271 }, { "epoch": 0.3669407183037646, "grad_norm": 10.253161845846412, "learning_rate": 3.991175330986261e-07, "logits/chosen": 3.015625, "logits/rejected": 3.03125, "logps/chosen": -1808.0, "logps/rejected": -1752.0, "loss": 0.6408, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5625, "rewards/margins": 0.212890625, "rewards/rejected": 0.3515625, "step": 1272 }, { "epoch": 0.36722919371123613, "grad_norm": 10.603741507959823, "learning_rate": 3.989153451882422e-07, "logits/chosen": 3.0625, "logits/rejected": 3.15625, "logps/chosen": -1736.0, "logps/rejected": -1440.0, "loss": 0.6688, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.1923828125, "rewards/margins": 0.00311279296875, "rewards/rejected": 0.1884765625, "step": 1273 }, { "epoch": 0.36751766911870765, "grad_norm": 11.008733143182452, "learning_rate": 3.9871300619735905e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1552.0, "logps/rejected": -1408.0, "loss": 0.6758, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.251953125, "rewards/margins": 0.07421875, "rewards/rejected": 0.1787109375, "step": 1274 }, { "epoch": 0.36780614452617916, "grad_norm": 11.961868030763327, "learning_rate": 3.9851051633125733e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0625, "logps/chosen": -1856.0, "logps/rejected": -1824.0, "loss": 0.6711, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.353515625, "rewards/margins": 0.06640625, "rewards/rejected": 0.287109375, "step": 1275 }, { "epoch": 0.36809461993365067, "grad_norm": 10.89029653309017, "learning_rate": 3.983078757953711e-07, "logits/chosen": 3.140625, "logits/rejected": 3.0625, "logps/chosen": -1832.0, "logps/rejected": -1864.0, "loss": 0.6665, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.4921875, "rewards/margins": 0.1298828125, "rewards/rejected": 0.361328125, "step": 1276 }, { "epoch": 0.3683830953411222, "grad_norm": 13.884388927635102, "learning_rate": 3.981050847952871e-07, "logits/chosen": 3.125, "logits/rejected": 3.078125, "logps/chosen": -1800.0, "logps/rejected": -1816.0, "loss": 0.7044, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.330078125, "rewards/margins": 0.0262451171875, "rewards/rejected": 0.3046875, "step": 1277 }, { "epoch": 0.3686715707485937, "grad_norm": 11.871776413169744, "learning_rate": 3.979021435367449e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1480.0, "logps/rejected": -1520.0, "loss": 0.6681, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.376953125, "rewards/margins": 0.06396484375, "rewards/rejected": 0.314453125, "step": 1278 }, { "epoch": 0.3689600461560652, "grad_norm": 10.313029121317424, "learning_rate": 3.9769905222563647e-07, "logits/chosen": 3.0, "logits/rejected": 2.984375, "logps/chosen": -1744.0, "logps/rejected": -1704.0, "loss": 0.6875, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2734375, "rewards/margins": 0.07470703125, "rewards/rejected": 0.19921875, "step": 1279 }, { "epoch": 0.3692485215635367, "grad_norm": 10.362736003116298, "learning_rate": 3.974958110680059e-07, "logits/chosen": 3.0, "logits/rejected": 3.03125, "logps/chosen": -1384.0, "logps/rejected": -1240.0, "loss": 0.6716, "loss/demonstration_loss": -2656.0, "loss/preference_loss": -2640.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2412109375, "rewards/margins": 0.09521484375, "rewards/rejected": 0.1455078125, "step": 1280 }, { "epoch": 0.3695369969710082, "grad_norm": 10.520858644058022, "learning_rate": 3.9729242027004937e-07, "logits/chosen": 2.90625, "logits/rejected": 2.96875, "logps/chosen": -1864.0, "logps/rejected": -1952.0, "loss": 0.7301, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.271484375, "rewards/margins": -0.018798828125, "rewards/rejected": 0.291015625, "step": 1281 }, { "epoch": 0.36982547237847974, "grad_norm": 10.502517234981248, "learning_rate": 3.9708888003811487e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1752.0, "logps/rejected": -1824.0, "loss": 0.7171, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.0625, "rewards/chosen": 0.1982421875, "rewards/margins": -0.134765625, "rewards/rejected": 0.33203125, "step": 1282 }, { "epoch": 0.37011394778595125, "grad_norm": 12.677737589428888, "learning_rate": 3.96885190578702e-07, "logits/chosen": 3.171875, "logits/rejected": 3.09375, "logps/chosen": -1304.0, "logps/rejected": -1376.0, "loss": 0.673, "loss/demonstration_loss": -2704.0, "loss/preference_loss": -2688.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2353515625, "rewards/margins": 0.064453125, "rewards/rejected": 0.1708984375, "step": 1283 }, { "epoch": 0.37040242319342276, "grad_norm": 10.529804641501595, "learning_rate": 3.9668135209846177e-07, "logits/chosen": 2.984375, "logits/rejected": 3.109375, "logps/chosen": -1864.0, "logps/rejected": -1672.0, "loss": 0.6664, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.361328125, "rewards/margins": 0.08544921875, "rewards/rejected": 0.275390625, "step": 1284 }, { "epoch": 0.37069089860089427, "grad_norm": 12.156094409288738, "learning_rate": 3.964773648041964e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0, "logps/chosen": -1944.0, "logps/rejected": -2080.0, "loss": 0.6691, "loss/demonstration_loss": -4048.0, "loss/preference_loss": -4048.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.349609375, "rewards/margins": 0.06005859375, "rewards/rejected": 0.2890625, "step": 1285 }, { "epoch": 0.3709793740083658, "grad_norm": 10.640789097759042, "learning_rate": 3.9627322890285903e-07, "logits/chosen": 3.015625, "logits/rejected": 3.171875, "logps/chosen": -1488.0, "logps/rejected": -1208.0, "loss": 0.6634, "loss/demonstration_loss": -2736.0, "loss/preference_loss": -2720.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.28125, "rewards/margins": 0.1513671875, "rewards/rejected": 0.130859375, "step": 1286 }, { "epoch": 0.3712678494158373, "grad_norm": 12.30840419428015, "learning_rate": 3.960689446015536e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0625, "logps/chosen": -2048.0, "logps/rejected": -1880.0, "loss": 0.6749, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3984.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.515625, "rewards/margins": 0.002685546875, "rewards/rejected": 0.51171875, "step": 1287 }, { "epoch": 0.3715563248233088, "grad_norm": 10.693499044062825, "learning_rate": 3.958645121075347e-07, "logits/chosen": 3.109375, "logits/rejected": 3.140625, "logps/chosen": -1792.0, "logps/rejected": -1752.0, "loss": 0.6469, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.8125, "rewards/chosen": 0.42578125, "rewards/margins": 0.2490234375, "rewards/rejected": 0.177734375, "step": 1288 }, { "epoch": 0.3718448002307803, "grad_norm": 10.103059828947796, "learning_rate": 3.9565993162820685e-07, "logits/chosen": 3.015625, "logits/rejected": 2.984375, "logps/chosen": -1688.0, "logps/rejected": -1504.0, "loss": 0.668, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.287109375, "rewards/margins": -0.04150390625, "rewards/rejected": 0.328125, "step": 1289 }, { "epoch": 0.3721332756382518, "grad_norm": 11.99122284873429, "learning_rate": 3.9545520337112546e-07, "logits/chosen": 3.0, "logits/rejected": 3.109375, "logps/chosen": -1936.0, "logps/rejected": -1672.0, "loss": 0.712, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.19140625, "rewards/margins": 0.0242919921875, "rewards/rejected": 0.1669921875, "step": 1290 }, { "epoch": 0.37242175104572334, "grad_norm": 10.35646483085471, "learning_rate": 3.952503275439951e-07, "logits/chosen": 3.140625, "logits/rejected": 3.109375, "logps/chosen": -1712.0, "logps/rejected": -1504.0, "loss": 0.6732, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.259765625, "rewards/margins": 0.18359375, "rewards/rejected": 0.0771484375, "step": 1291 }, { "epoch": 0.37271022645319485, "grad_norm": 11.217378813959431, "learning_rate": 3.950453043546706e-07, "logits/chosen": 3.0625, "logits/rejected": 3.015625, "logps/chosen": -1920.0, "logps/rejected": -1872.0, "loss": 0.6913, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.310546875, "rewards/margins": -0.001373291015625, "rewards/rejected": 0.3125, "step": 1292 }, { "epoch": 0.37299870186066636, "grad_norm": 12.198469021608128, "learning_rate": 3.948401340111559e-07, "logits/chosen": 3.09375, "logits/rejected": 3.140625, "logps/chosen": -1472.0, "logps/rejected": -1552.0, "loss": 0.6628, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.30078125, "rewards/margins": 0.0184326171875, "rewards/rejected": 0.283203125, "step": 1293 }, { "epoch": 0.37328717726813787, "grad_norm": 11.45192956024991, "learning_rate": 3.946348167216046e-07, "logits/chosen": 2.984375, "logits/rejected": 2.984375, "logps/chosen": -1760.0, "logps/rejected": -1680.0, "loss": 0.6688, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.294921875, "rewards/margins": 0.05224609375, "rewards/rejected": 0.2412109375, "step": 1294 }, { "epoch": 0.3735756526756094, "grad_norm": 11.761034050668208, "learning_rate": 3.94429352694319e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1752.0, "logps/rejected": -1576.0, "loss": 0.6743, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.234375, "rewards/margins": 0.091796875, "rewards/rejected": 0.142578125, "step": 1295 }, { "epoch": 0.3738641280830809, "grad_norm": 12.946000588423907, "learning_rate": 3.9422374213775065e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1520.0, "logps/rejected": -1504.0, "loss": 0.6896, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.255859375, "rewards/margins": 0.10791015625, "rewards/rejected": 0.1474609375, "step": 1296 }, { "epoch": 0.3741526034905524, "grad_norm": 10.195209424163581, "learning_rate": 3.940179852604995e-07, "logits/chosen": 3.0, "logits/rejected": 3.0, "logps/chosen": -1264.0, "logps/rejected": -1536.0, "loss": 0.6448, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.212890625, "rewards/margins": 0.076171875, "rewards/rejected": 0.1376953125, "step": 1297 }, { "epoch": 0.3744410788980239, "grad_norm": 10.588385115520685, "learning_rate": 3.9381208227131406e-07, "logits/chosen": 2.921875, "logits/rejected": 2.9375, "logps/chosen": -1864.0, "logps/rejected": -1632.0, "loss": 0.6175, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.396484375, "rewards/margins": 0.2431640625, "rewards/rejected": 0.154296875, "step": 1298 }, { "epoch": 0.3747295543054955, "grad_norm": 12.250245377682738, "learning_rate": 3.93606033379091e-07, "logits/chosen": 3.046875, "logits/rejected": 2.90625, "logps/chosen": -1488.0, "logps/rejected": -1664.0, "loss": 0.6581, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.25390625, "rewards/margins": 0.04443359375, "rewards/rejected": 0.2099609375, "step": 1299 }, { "epoch": 0.375018029712967, "grad_norm": 12.313145270385382, "learning_rate": 3.933998387928751e-07, "logits/chosen": 3.125, "logits/rejected": 3.0, "logps/chosen": -1800.0, "logps/rejected": -1760.0, "loss": 0.6779, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.2890625, "rewards/margins": 0.07958984375, "rewards/rejected": 0.2099609375, "step": 1300 }, { "epoch": 0.3753065051204385, "grad_norm": 12.785025271343446, "learning_rate": 3.931934987218589e-07, "logits/chosen": 2.96875, "logits/rejected": 2.953125, "logps/chosen": -1472.0, "logps/rejected": -1680.0, "loss": 0.7155, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2333984375, "rewards/margins": -0.01153564453125, "rewards/rejected": 0.2451171875, "step": 1301 }, { "epoch": 0.37559498052791, "grad_norm": 11.500316437536918, "learning_rate": 3.9298701337538255e-07, "logits/chosen": 3.109375, "logits/rejected": 3.140625, "logps/chosen": -1680.0, "logps/rejected": -1496.0, "loss": 0.689, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.265625, "rewards/margins": 0.10546875, "rewards/rejected": 0.16015625, "step": 1302 }, { "epoch": 0.3758834559353815, "grad_norm": 13.908586601294262, "learning_rate": 3.927803829629336e-07, "logits/chosen": 2.921875, "logits/rejected": 3.046875, "logps/chosen": -2224.0, "logps/rejected": -2040.0, "loss": 0.7136, "loss/demonstration_loss": -4288.0, "loss/preference_loss": -4288.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.2431640625, "rewards/margins": -0.050537109375, "rewards/rejected": 0.29296875, "step": 1303 }, { "epoch": 0.37617193134285304, "grad_norm": 12.665459028622772, "learning_rate": 3.925736076941467e-07, "logits/chosen": 3.203125, "logits/rejected": 3.203125, "logps/chosen": -1696.0, "logps/rejected": -1536.0, "loss": 0.6698, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.244140625, "rewards/margins": 0.07275390625, "rewards/rejected": 0.171875, "step": 1304 }, { "epoch": 0.37646040675032455, "grad_norm": 11.599603626351664, "learning_rate": 3.9236668777880355e-07, "logits/chosen": 3.125, "logits/rejected": 3.125, "logps/chosen": -1768.0, "logps/rejected": -1888.0, "loss": 0.6906, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2109375, "rewards/margins": 0.02099609375, "rewards/rejected": 0.1904296875, "step": 1305 }, { "epoch": 0.37674888215779606, "grad_norm": 11.086085065575784, "learning_rate": 3.9215962342683266e-07, "logits/chosen": 2.96875, "logits/rejected": 3.03125, "logps/chosen": -1664.0, "logps/rejected": -1544.0, "loss": 0.6872, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1552734375, "rewards/margins": -0.04833984375, "rewards/rejected": 0.203125, "step": 1306 }, { "epoch": 0.3770373575652676, "grad_norm": 10.464538772184198, "learning_rate": 3.91952414848309e-07, "logits/chosen": 3.0, "logits/rejected": 3.0625, "logps/chosen": -1832.0, "logps/rejected": -1864.0, "loss": 0.683, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.205078125, "rewards/margins": -0.033203125, "rewards/rejected": 0.23828125, "step": 1307 }, { "epoch": 0.3773258329727391, "grad_norm": 10.559774990618118, "learning_rate": 3.9174506225345373e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1824.0, "logps/rejected": -1624.0, "loss": 0.6612, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3125, "rewards/margins": 0.1611328125, "rewards/rejected": 0.15234375, "step": 1308 }, { "epoch": 0.3776143083802106, "grad_norm": 11.501113010606543, "learning_rate": 3.915375658526343e-07, "logits/chosen": 2.875, "logits/rejected": 2.890625, "logps/chosen": -1632.0, "logps/rejected": -1720.0, "loss": 0.6979, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1025390625, "rewards/margins": -0.027587890625, "rewards/rejected": 0.1298828125, "step": 1309 }, { "epoch": 0.3779027837876821, "grad_norm": 12.792367570953155, "learning_rate": 3.9132992585636406e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1096.0, "logps/rejected": -1304.0, "loss": 0.7051, "loss/demonstration_loss": -2400.0, "loss/preference_loss": -2416.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.107421875, "rewards/margins": -0.04248046875, "rewards/rejected": 0.150390625, "step": 1310 }, { "epoch": 0.3781912591951536, "grad_norm": 11.664082855726287, "learning_rate": 3.911221424753019e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1504.0, "logps/rejected": -1480.0, "loss": 0.6755, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.212890625, "rewards/margins": 0.06689453125, "rewards/rejected": 0.1455078125, "step": 1311 }, { "epoch": 0.37847973460262513, "grad_norm": 10.933817609782864, "learning_rate": 3.909142159202523e-07, "logits/chosen": 3.0625, "logits/rejected": 3.1875, "logps/chosen": -1800.0, "logps/rejected": -1776.0, "loss": 0.6661, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.1474609375, "rewards/margins": 0.06982421875, "rewards/rejected": 0.07763671875, "step": 1312 }, { "epoch": 0.37876821001009664, "grad_norm": 12.337783587066738, "learning_rate": 3.9070614640216503e-07, "logits/chosen": 3.03125, "logits/rejected": 3.09375, "logps/chosen": -1344.0, "logps/rejected": -1496.0, "loss": 0.7125, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2864.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1748046875, "rewards/margins": -0.1064453125, "rewards/rejected": 0.28125, "step": 1313 }, { "epoch": 0.37905668541756815, "grad_norm": 10.192238319533546, "learning_rate": 3.904979341321348e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -2048.0, "logps/rejected": -1800.0, "loss": 0.6696, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.34375, "rewards/margins": 0.173828125, "rewards/rejected": 0.1689453125, "step": 1314 }, { "epoch": 0.37934516082503966, "grad_norm": 13.807593183105208, "learning_rate": 3.902895793214011e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0, "logps/chosen": -1576.0, "logps/rejected": -1720.0, "loss": 0.7178, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.33984375, "rewards/margins": -0.0042724609375, "rewards/rejected": 0.34375, "step": 1315 }, { "epoch": 0.3796336362325112, "grad_norm": 12.336995914001294, "learning_rate": 3.900810821813482e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0625, "logps/chosen": -1976.0, "logps/rejected": -1912.0, "loss": 0.6672, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3920.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.4765625, "rewards/margins": 0.12060546875, "rewards/rejected": 0.35546875, "step": 1316 }, { "epoch": 0.3799221116399827, "grad_norm": 11.07192044233282, "learning_rate": 3.898724429235046e-07, "logits/chosen": 3.109375, "logits/rejected": 3.15625, "logps/chosen": -1960.0, "logps/rejected": -2016.0, "loss": 0.6863, "loss/demonstration_loss": -4000.0, "loss/preference_loss": -4016.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.33203125, "rewards/margins": -0.0087890625, "rewards/rejected": 0.341796875, "step": 1317 }, { "epoch": 0.3802105870474542, "grad_norm": 10.16552468544801, "learning_rate": 3.8966366175954323e-07, "logits/chosen": 2.96875, "logits/rejected": 2.96875, "logps/chosen": -1664.0, "logps/rejected": -1800.0, "loss": 0.671, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.32421875, "rewards/margins": 0.1669921875, "rewards/rejected": 0.15625, "step": 1318 }, { "epoch": 0.3804990624549257, "grad_norm": 12.337054319881316, "learning_rate": 3.8945473890128066e-07, "logits/chosen": 3.109375, "logits/rejected": 3.15625, "logps/chosen": -2144.0, "logps/rejected": -1832.0, "loss": 0.6798, "loss/demonstration_loss": -4000.0, "loss/preference_loss": -4000.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.328125, "rewards/margins": 0.10986328125, "rewards/rejected": 0.2177734375, "step": 1319 }, { "epoch": 0.3807875378623972, "grad_norm": 11.415406563828665, "learning_rate": 3.8924567456067747e-07, "logits/chosen": 3.109375, "logits/rejected": 3.140625, "logps/chosen": -1888.0, "logps/rejected": -1616.0, "loss": 0.6793, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.337890625, "rewards/margins": 0.13671875, "rewards/rejected": 0.203125, "step": 1320 }, { "epoch": 0.38107601326986873, "grad_norm": 10.206101516211046, "learning_rate": 3.8903646894983765e-07, "logits/chosen": 2.8125, "logits/rejected": 2.859375, "logps/chosen": -1720.0, "logps/rejected": -1552.0, "loss": 0.6549, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1943359375, "rewards/margins": 0.0595703125, "rewards/rejected": 0.134765625, "step": 1321 }, { "epoch": 0.38136448867734024, "grad_norm": 10.014761563192863, "learning_rate": 3.8882712228100854e-07, "logits/chosen": 3.15625, "logits/rejected": 3.03125, "logps/chosen": -1296.0, "logps/rejected": -1448.0, "loss": 0.6841, "loss/demonstration_loss": -2768.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.1484375, "rewards/margins": 0.00048828125, "rewards/rejected": 0.1484375, "step": 1322 }, { "epoch": 0.38165296408481175, "grad_norm": 11.060297814267088, "learning_rate": 3.8861763476658074e-07, "logits/chosen": 2.9375, "logits/rejected": 2.984375, "logps/chosen": -1928.0, "logps/rejected": -1952.0, "loss": 0.6707, "loss/demonstration_loss": -3920.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.3359375, "rewards/margins": 0.0498046875, "rewards/rejected": 0.28515625, "step": 1323 }, { "epoch": 0.38194143949228326, "grad_norm": 11.12051303946696, "learning_rate": 3.884080066190874e-07, "logits/chosen": 3.078125, "logits/rejected": 3.0, "logps/chosen": -1856.0, "logps/rejected": -1904.0, "loss": 0.6624, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.306640625, "rewards/margins": 0.126953125, "rewards/rejected": 0.1806640625, "step": 1324 }, { "epoch": 0.3822299148997548, "grad_norm": 11.992897182442796, "learning_rate": 3.8819823805120474e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1888.0, "logps/rejected": -2048.0, "loss": 0.7081, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.1943359375, "rewards/margins": -0.0625, "rewards/rejected": 0.2578125, "step": 1325 }, { "epoch": 0.3825183903072263, "grad_norm": 12.041075268194472, "learning_rate": 3.879883292757511e-07, "logits/chosen": 3.0625, "logits/rejected": 3.109375, "logps/chosen": -1808.0, "logps/rejected": -1776.0, "loss": 0.723, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2021484375, "rewards/margins": -0.0712890625, "rewards/rejected": 0.2734375, "step": 1326 }, { "epoch": 0.3828068657146978, "grad_norm": 12.625450253386129, "learning_rate": 3.8777828050568735e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1776.0, "logps/rejected": -1640.0, "loss": 0.6385, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.8125, "rewards/chosen": 0.375, "rewards/margins": 0.2197265625, "rewards/rejected": 0.1552734375, "step": 1327 }, { "epoch": 0.3830953411221693, "grad_norm": 11.825084181263108, "learning_rate": 3.875680919541162e-07, "logits/chosen": 3.078125, "logits/rejected": 3.078125, "logps/chosen": -1576.0, "logps/rejected": -1504.0, "loss": 0.6569, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.208984375, "rewards/margins": 0.026123046875, "rewards/rejected": 0.1826171875, "step": 1328 }, { "epoch": 0.3833838165296409, "grad_norm": 9.84833110363681, "learning_rate": 3.873577638342823e-07, "logits/chosen": 2.984375, "logits/rejected": 3.078125, "logps/chosen": -2080.0, "logps/rejected": -1864.0, "loss": 0.6588, "loss/demonstration_loss": -3968.0, "loss/preference_loss": -3968.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3359375, "rewards/margins": 0.12158203125, "rewards/rejected": 0.2138671875, "step": 1329 }, { "epoch": 0.3836722919371124, "grad_norm": 9.924558047227473, "learning_rate": 3.871472963595717e-07, "logits/chosen": 3.0, "logits/rejected": 3.015625, "logps/chosen": -1896.0, "logps/rejected": -1832.0, "loss": 0.6385, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.45703125, "rewards/margins": 0.1455078125, "rewards/rejected": 0.310546875, "step": 1330 }, { "epoch": 0.3839607673445839, "grad_norm": 11.812324278447832, "learning_rate": 3.86936689743512e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1592.0, "logps/rejected": -1376.0, "loss": 0.6623, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.271484375, "rewards/margins": 0.1650390625, "rewards/rejected": 0.10546875, "step": 1331 }, { "epoch": 0.3842492427520554, "grad_norm": 12.33186081272834, "learning_rate": 3.867259441997721e-07, "logits/chosen": 2.875, "logits/rejected": 2.90625, "logps/chosen": -1760.0, "logps/rejected": -1448.0, "loss": 0.674, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.14453125, "rewards/margins": 0.109375, "rewards/rejected": 0.03515625, "step": 1332 }, { "epoch": 0.3845377181595269, "grad_norm": 11.100998626421617, "learning_rate": 3.865150599421615e-07, "logits/chosen": 2.734375, "logits/rejected": 2.84375, "logps/chosen": -1696.0, "logps/rejected": -1816.0, "loss": 0.6913, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.291015625, "rewards/margins": 0.1083984375, "rewards/rejected": 0.1826171875, "step": 1333 }, { "epoch": 0.38482619356699843, "grad_norm": 10.026343311711983, "learning_rate": 3.863040371846307e-07, "logits/chosen": 3.0, "logits/rejected": 2.96875, "logps/chosen": -1120.0, "logps/rejected": -1120.0, "loss": 0.6519, "loss/demonstration_loss": -2240.0, "loss/preference_loss": -2224.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.072265625, "rewards/margins": 0.1298828125, "rewards/rejected": -0.056884765625, "step": 1334 }, { "epoch": 0.38511466897446994, "grad_norm": 9.305094915348548, "learning_rate": 3.860928761412705e-07, "logits/chosen": 3.109375, "logits/rejected": 3.078125, "logps/chosen": -1720.0, "logps/rejected": -1760.0, "loss": 0.6697, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13671875, "rewards/margins": 0.048828125, "rewards/rejected": 0.08740234375, "step": 1335 }, { "epoch": 0.38540314438194145, "grad_norm": 10.899976416896294, "learning_rate": 3.8588157702631235e-07, "logits/chosen": 3.0625, "logits/rejected": 3.109375, "logps/chosen": -1352.0, "logps/rejected": -1184.0, "loss": 0.6993, "loss/demonstration_loss": -2560.0, "loss/preference_loss": -2560.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.17578125, "rewards/margins": 0.01324462890625, "rewards/rejected": 0.1630859375, "step": 1336 }, { "epoch": 0.38569161978941296, "grad_norm": 11.667203214597203, "learning_rate": 3.8567014005412733e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1456.0, "logps/rejected": -1552.0, "loss": 0.668, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.306640625, "rewards/margins": 0.166015625, "rewards/rejected": 0.1416015625, "step": 1337 }, { "epoch": 0.3859800951968845, "grad_norm": 12.892471100540002, "learning_rate": 3.854585654392267e-07, "logits/chosen": 3.03125, "logits/rejected": 3.109375, "logps/chosen": -1720.0, "logps/rejected": -1624.0, "loss": 0.713, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.365234375, "rewards/margins": 0.1416015625, "rewards/rejected": 0.2236328125, "step": 1338 }, { "epoch": 0.386268570604356, "grad_norm": 11.061038781517992, "learning_rate": 3.8524685339626123e-07, "logits/chosen": 3.0, "logits/rejected": 3.078125, "logps/chosen": -1712.0, "logps/rejected": -1344.0, "loss": 0.7069, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.322265625, "rewards/margins": 0.1259765625, "rewards/rejected": 0.1962890625, "step": 1339 }, { "epoch": 0.3865570460118275, "grad_norm": 12.102606742884733, "learning_rate": 3.8503500414002116e-07, "logits/chosen": 3.1875, "logits/rejected": 3.203125, "logps/chosen": -2024.0, "logps/rejected": -1944.0, "loss": 0.6797, "loss/demonstration_loss": -4016.0, "loss/preference_loss": -4016.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.451171875, "rewards/margins": 0.022705078125, "rewards/rejected": 0.4296875, "step": 1340 }, { "epoch": 0.386845521419299, "grad_norm": 11.359686357459275, "learning_rate": 3.848230178854359e-07, "logits/chosen": 2.9375, "logits/rejected": 2.96875, "logps/chosen": -1864.0, "logps/rejected": -1952.0, "loss": 0.6365, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.333984375, "rewards/margins": 0.140625, "rewards/rejected": 0.193359375, "step": 1341 }, { "epoch": 0.3871339968267705, "grad_norm": 10.556664842274381, "learning_rate": 3.846108948475739e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0, "logps/chosen": -2240.0, "logps/rejected": -1848.0, "loss": 0.6451, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.51171875, "rewards/margins": 0.2255859375, "rewards/rejected": 0.28515625, "step": 1342 }, { "epoch": 0.38742247223424203, "grad_norm": 9.614234889079466, "learning_rate": 3.843986352416424e-07, "logits/chosen": 3.0, "logits/rejected": 3.0625, "logps/chosen": -1680.0, "logps/rejected": -1560.0, "loss": 0.6378, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.279296875, "rewards/margins": 0.1337890625, "rewards/rejected": 0.146484375, "step": 1343 }, { "epoch": 0.38771094764171354, "grad_norm": 13.07384355354088, "learning_rate": 3.841862392829871e-07, "logits/chosen": 3.015625, "logits/rejected": 3.03125, "logps/chosen": -1776.0, "logps/rejected": -1376.0, "loss": 0.6483, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.283203125, "rewards/margins": 0.08984375, "rewards/rejected": 0.193359375, "step": 1344 }, { "epoch": 0.38799942304918505, "grad_norm": 10.496379282287489, "learning_rate": 3.839737071870922e-07, "logits/chosen": 3.109375, "logits/rejected": 2.953125, "logps/chosen": -2272.0, "logps/rejected": -2224.0, "loss": 0.6858, "loss/demonstration_loss": -4544.0, "loss/preference_loss": -4544.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.4296875, "rewards/margins": 0.07421875, "rewards/rejected": 0.35546875, "step": 1345 }, { "epoch": 0.38828789845665657, "grad_norm": 11.039847914322003, "learning_rate": 3.837610391695797e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1504.0, "logps/rejected": -1464.0, "loss": 0.6662, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.34765625, "rewards/margins": 0.125, "rewards/rejected": 0.22265625, "step": 1346 }, { "epoch": 0.3885763738641281, "grad_norm": 10.76695953507155, "learning_rate": 3.835482354462098e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0, "logps/chosen": -1672.0, "logps/rejected": -1544.0, "loss": 0.6761, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.37109375, "rewards/margins": 0.130859375, "rewards/rejected": 0.2412109375, "step": 1347 }, { "epoch": 0.3888648492715996, "grad_norm": 10.868246063809314, "learning_rate": 3.8333529623288035e-07, "logits/chosen": 3.140625, "logits/rejected": 3.21875, "logps/chosen": -1728.0, "logps/rejected": -1680.0, "loss": 0.6672, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.46875, "rewards/margins": 0.07958984375, "rewards/rejected": 0.390625, "step": 1348 }, { "epoch": 0.3891533246790711, "grad_norm": 10.080346986521903, "learning_rate": 3.8312222174562655e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1632.0, "logps/rejected": -1568.0, "loss": 0.6699, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1943359375, "rewards/margins": -0.006439208984375, "rewards/rejected": 0.201171875, "step": 1349 }, { "epoch": 0.3894418000865426, "grad_norm": 12.20409589874611, "learning_rate": 3.8290901220062086e-07, "logits/chosen": 3.046875, "logits/rejected": 3.09375, "logps/chosen": -1960.0, "logps/rejected": -1752.0, "loss": 0.697, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.318359375, "rewards/margins": 0.0556640625, "rewards/rejected": 0.263671875, "step": 1350 }, { "epoch": 0.3897302754940141, "grad_norm": 10.73859605566082, "learning_rate": 3.8269566781417274e-07, "logits/chosen": 3.0, "logits/rejected": 3.0625, "logps/chosen": -1768.0, "logps/rejected": -1624.0, "loss": 0.6434, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3203125, "rewards/margins": 0.1103515625, "rewards/rejected": 0.2099609375, "step": 1351 }, { "epoch": 0.39001875090148563, "grad_norm": 11.550203952385035, "learning_rate": 3.8248218880272864e-07, "logits/chosen": 3.046875, "logits/rejected": 2.984375, "logps/chosen": -1968.0, "logps/rejected": -2096.0, "loss": 0.679, "loss/demonstration_loss": -4096.0, "loss/preference_loss": -4080.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.296875, "rewards/margins": 0.06884765625, "rewards/rejected": 0.228515625, "step": 1352 }, { "epoch": 0.39030722630895714, "grad_norm": 11.126824280258022, "learning_rate": 3.822685753828714e-07, "logits/chosen": 3.015625, "logits/rejected": 3.0, "logps/chosen": -1576.0, "logps/rejected": -1736.0, "loss": 0.6789, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.271484375, "rewards/margins": 0.0859375, "rewards/rejected": 0.185546875, "step": 1353 }, { "epoch": 0.39059570171642866, "grad_norm": 10.933375145729544, "learning_rate": 3.8205482777132016e-07, "logits/chosen": 3.109375, "logits/rejected": 3.171875, "logps/chosen": -1760.0, "logps/rejected": -2008.0, "loss": 0.6794, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.365234375, "rewards/margins": 0.06884765625, "rewards/rejected": 0.296875, "step": 1354 }, { "epoch": 0.39088417712390017, "grad_norm": 12.505520595950992, "learning_rate": 3.8184094618493035e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1864.0, "logps/rejected": -1800.0, "loss": 0.6827, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41015625, "rewards/margins": 0.10986328125, "rewards/rejected": 0.30078125, "step": 1355 }, { "epoch": 0.3911726525313717, "grad_norm": 9.65454026520546, "learning_rate": 3.816269308406934e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1648.0, "logps/rejected": -1536.0, "loss": 0.6626, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.353515625, "rewards/margins": 0.0888671875, "rewards/rejected": 0.263671875, "step": 1356 }, { "epoch": 0.3914611279388432, "grad_norm": 12.229723173659552, "learning_rate": 3.8141278195573623e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1464.0, "logps/rejected": -1536.0, "loss": 0.6883, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.341796875, "rewards/margins": 0.0732421875, "rewards/rejected": 0.267578125, "step": 1357 }, { "epoch": 0.3917496033463147, "grad_norm": 12.040253231296099, "learning_rate": 3.8119849974732145e-07, "logits/chosen": 3.03125, "logits/rejected": 3.0625, "logps/chosen": -1768.0, "logps/rejected": -1736.0, "loss": 0.6794, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.244140625, "rewards/margins": 0.0006866455078125, "rewards/rejected": 0.2431640625, "step": 1358 }, { "epoch": 0.3920380787537862, "grad_norm": 10.318467411874689, "learning_rate": 3.809840844328466e-07, "logits/chosen": 3.0, "logits/rejected": 2.984375, "logps/chosen": -1712.0, "logps/rejected": -1480.0, "loss": 0.6581, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.44921875, "rewards/margins": 0.09619140625, "rewards/rejected": 0.353515625, "step": 1359 }, { "epoch": 0.3923265541612578, "grad_norm": 12.243879111749385, "learning_rate": 3.8076953622984467e-07, "logits/chosen": 3.03125, "logits/rejected": 3.171875, "logps/chosen": -1608.0, "logps/rejected": -1416.0, "loss": 0.7198, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2001953125, "rewards/margins": -0.08447265625, "rewards/rejected": 0.28515625, "step": 1360 }, { "epoch": 0.3926150295687293, "grad_norm": 11.781847816637676, "learning_rate": 3.805548553559833e-07, "logits/chosen": 3.125, "logits/rejected": 3.03125, "logps/chosen": -1600.0, "logps/rejected": -1904.0, "loss": 0.7139, "loss/demonstration_loss": -3536.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.3203125, "rewards/margins": 0.059814453125, "rewards/rejected": 0.259765625, "step": 1361 }, { "epoch": 0.3929035049762008, "grad_norm": 11.480501214551937, "learning_rate": 3.8034004202906464e-07, "logits/chosen": 3.0625, "logits/rejected": 3.015625, "logps/chosen": -1632.0, "logps/rejected": -1544.0, "loss": 0.6816, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2373046875, "rewards/margins": 0.037109375, "rewards/rejected": 0.2001953125, "step": 1362 }, { "epoch": 0.3931919803836723, "grad_norm": 10.470357408802219, "learning_rate": 3.801250964670253e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1440.0, "logps/rejected": -1336.0, "loss": 0.6275, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39453125, "rewards/margins": 0.216796875, "rewards/rejected": 0.1787109375, "step": 1363 }, { "epoch": 0.3934804557911438, "grad_norm": 12.325714808891378, "learning_rate": 3.7991001888793604e-07, "logits/chosen": 2.890625, "logits/rejected": 2.984375, "logps/chosen": -1456.0, "logps/rejected": -1864.0, "loss": 0.7493, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.248046875, "rewards/margins": -0.185546875, "rewards/rejected": 0.43359375, "step": 1364 }, { "epoch": 0.39376893119861534, "grad_norm": 12.986212830845139, "learning_rate": 3.796948095100016e-07, "logits/chosen": 3.0, "logits/rejected": 3.03125, "logps/chosen": -1600.0, "logps/rejected": -1592.0, "loss": 0.7638, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.220703125, "rewards/margins": -0.1552734375, "rewards/rejected": 0.376953125, "step": 1365 }, { "epoch": 0.39405740660608685, "grad_norm": 12.470036061226049, "learning_rate": 3.794794685515604e-07, "logits/chosen": 3.015625, "logits/rejected": 3.046875, "logps/chosen": -1568.0, "logps/rejected": -1280.0, "loss": 0.6902, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.36328125, "rewards/margins": 0.03271484375, "rewards/rejected": 0.33203125, "step": 1366 }, { "epoch": 0.39434588201355836, "grad_norm": 10.99052175249744, "learning_rate": 3.792639962310843e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1656.0, "logps/rejected": -1632.0, "loss": 0.6445, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.31640625, "rewards/margins": 0.08447265625, "rewards/rejected": 0.2314453125, "step": 1367 }, { "epoch": 0.39463435742102987, "grad_norm": 11.268494598420116, "learning_rate": 3.790483927671785e-07, "logits/chosen": 2.875, "logits/rejected": 2.96875, "logps/chosen": -1672.0, "logps/rejected": -1200.0, "loss": 0.6528, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.3046875, "rewards/margins": 0.21484375, "rewards/rejected": 0.09033203125, "step": 1368 }, { "epoch": 0.3949228328285014, "grad_norm": 11.0056605286496, "learning_rate": 3.7883265837858113e-07, "logits/chosen": 3.046875, "logits/rejected": 3.125, "logps/chosen": -1512.0, "logps/rejected": -1432.0, "loss": 0.6931, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2138671875, "rewards/margins": 0.0286865234375, "rewards/rejected": 0.185546875, "step": 1369 }, { "epoch": 0.3952113082359729, "grad_norm": 11.905407093673032, "learning_rate": 3.786167932841634e-07, "logits/chosen": 3.21875, "logits/rejected": 3.15625, "logps/chosen": -2304.0, "logps/rejected": -2080.0, "loss": 0.6664, "loss/demonstration_loss": -4448.0, "loss/preference_loss": -4416.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.51953125, "rewards/margins": 0.208984375, "rewards/rejected": 0.310546875, "step": 1370 }, { "epoch": 0.3954997836434444, "grad_norm": 11.906840026211704, "learning_rate": 3.78400797702929e-07, "logits/chosen": 3.078125, "logits/rejected": 3.03125, "logps/chosen": -1520.0, "logps/rejected": -1744.0, "loss": 0.6989, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2275390625, "rewards/margins": -0.04638671875, "rewards/rejected": 0.2734375, "step": 1371 }, { "epoch": 0.3957882590509159, "grad_norm": 12.102952826717768, "learning_rate": 3.7818467185401395e-07, "logits/chosen": 3.109375, "logits/rejected": 3.109375, "logps/chosen": -1664.0, "logps/rejected": -1640.0, "loss": 0.7003, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.380859375, "rewards/margins": 0.083984375, "rewards/rejected": 0.296875, "step": 1372 }, { "epoch": 0.3960767344583874, "grad_norm": 11.203280671650758, "learning_rate": 3.7796841595668614e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1504.0, "logps/rejected": -1472.0, "loss": 0.6673, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.353515625, "rewards/margins": 0.1455078125, "rewards/rejected": 0.208984375, "step": 1373 }, { "epoch": 0.39636520986585894, "grad_norm": 11.336706634656963, "learning_rate": 3.7775203023034617e-07, "logits/chosen": 3.125, "logits/rejected": 3.09375, "logps/chosen": -1488.0, "logps/rejected": -1656.0, "loss": 0.7113, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2197265625, "rewards/margins": 0.0048828125, "rewards/rejected": 0.21484375, "step": 1374 }, { "epoch": 0.39665368527333045, "grad_norm": 10.744365767136983, "learning_rate": 3.775355148945257e-07, "logits/chosen": 2.984375, "logits/rejected": 2.984375, "logps/chosen": -1608.0, "logps/rejected": -1512.0, "loss": 0.6526, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.267578125, "rewards/margins": -0.0093994140625, "rewards/rejected": 0.27734375, "step": 1375 }, { "epoch": 0.39694216068080196, "grad_norm": 9.158056439949062, "learning_rate": 3.773188701688881e-07, "logits/chosen": 2.96875, "logits/rejected": 2.96875, "logps/chosen": -1480.0, "logps/rejected": -1480.0, "loss": 0.6863, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2734375, "rewards/margins": 0.03271484375, "rewards/rejected": 0.240234375, "step": 1376 }, { "epoch": 0.39723063608827347, "grad_norm": 11.515527318046763, "learning_rate": 3.771020962732281e-07, "logits/chosen": 3.109375, "logits/rejected": 3.140625, "logps/chosen": -1768.0, "logps/rejected": -1648.0, "loss": 0.6592, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.345703125, "rewards/margins": 0.01007080078125, "rewards/rejected": 0.3359375, "step": 1377 }, { "epoch": 0.397519111495745, "grad_norm": 11.236653091790737, "learning_rate": 3.768851934274712e-07, "logits/chosen": 2.96875, "logits/rejected": 2.953125, "logps/chosen": -2160.0, "logps/rejected": -2128.0, "loss": 0.7079, "loss/demonstration_loss": -4320.0, "loss/preference_loss": -4320.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4609375, "rewards/margins": 0.0260009765625, "rewards/rejected": 0.43359375, "step": 1378 }, { "epoch": 0.3978075869032165, "grad_norm": 12.145344469991288, "learning_rate": 3.76668161851674e-07, "logits/chosen": 3.0, "logits/rejected": 3.03125, "logps/chosen": -1664.0, "logps/rejected": -1456.0, "loss": 0.6705, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.21875, "rewards/margins": -0.001495361328125, "rewards/rejected": 0.2197265625, "step": 1379 }, { "epoch": 0.398096062310688, "grad_norm": 10.980631119278355, "learning_rate": 3.764510017660236e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1664.0, "logps/rejected": -1496.0, "loss": 0.6542, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.259765625, "rewards/margins": 0.21875, "rewards/rejected": 0.04052734375, "step": 1380 }, { "epoch": 0.3983845377181595, "grad_norm": 11.120170658415976, "learning_rate": 3.762337133908375e-07, "logits/chosen": 3.109375, "logits/rejected": 3.15625, "logps/chosen": -1832.0, "logps/rejected": -1944.0, "loss": 0.6989, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2265625, "rewards/margins": -0.021240234375, "rewards/rejected": 0.248046875, "step": 1381 }, { "epoch": 0.398673013125631, "grad_norm": 13.71060202869591, "learning_rate": 3.7601629694656335e-07, "logits/chosen": 3.078125, "logits/rejected": 3.078125, "logps/chosen": -1544.0, "logps/rejected": -1488.0, "loss": 0.7083, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.13671875, "rewards/margins": -0.044921875, "rewards/rejected": 0.181640625, "step": 1382 }, { "epoch": 0.39896148853310254, "grad_norm": 9.966469146257252, "learning_rate": 3.757987526537787e-07, "logits/chosen": 3.203125, "logits/rejected": 3.15625, "logps/chosen": -1320.0, "logps/rejected": -1544.0, "loss": 0.704, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.26953125, "rewards/margins": -0.07470703125, "rewards/rejected": 0.345703125, "step": 1383 }, { "epoch": 0.39924996394057405, "grad_norm": 12.274157913828146, "learning_rate": 3.7558108073319075e-07, "logits/chosen": 3.078125, "logits/rejected": 3.09375, "logps/chosen": -1672.0, "logps/rejected": -1368.0, "loss": 0.6885, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.36328125, "rewards/margins": 0.04833984375, "rewards/rejected": 0.31640625, "step": 1384 }, { "epoch": 0.39953843934804556, "grad_norm": 12.134508144994225, "learning_rate": 3.7536328140563644e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -1712.0, "logps/rejected": -1608.0, "loss": 0.6463, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.39453125, "rewards/margins": 0.251953125, "rewards/rejected": 0.1416015625, "step": 1385 }, { "epoch": 0.39982691475551707, "grad_norm": 12.630642875861069, "learning_rate": 3.7514535489208155e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -1624.0, "logps/rejected": -1656.0, "loss": 0.6865, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.30859375, "rewards/margins": -0.058349609375, "rewards/rejected": 0.3671875, "step": 1386 }, { "epoch": 0.4001153901629886, "grad_norm": 15.527601778733017, "learning_rate": 3.749273014136213e-07, "logits/chosen": 3.015625, "logits/rejected": 2.96875, "logps/chosen": -2240.0, "logps/rejected": -2128.0, "loss": 0.6904, "loss/demonstration_loss": -4384.0, "loss/preference_loss": -4384.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.228515625, "rewards/margins": 0.0791015625, "rewards/rejected": 0.150390625, "step": 1387 }, { "epoch": 0.4004038655704601, "grad_norm": 11.928344845671907, "learning_rate": 3.747091211914796e-07, "logits/chosen": 2.859375, "logits/rejected": 2.890625, "logps/chosen": -1592.0, "logps/rejected": -1544.0, "loss": 0.6718, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2333984375, "rewards/margins": 0.1591796875, "rewards/rejected": 0.0751953125, "step": 1388 }, { "epoch": 0.4006923409779316, "grad_norm": 9.831527296931709, "learning_rate": 3.7449081444700883e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -1488.0, "logps/rejected": -1624.0, "loss": 0.626, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.361328125, "rewards/margins": 0.1484375, "rewards/rejected": 0.212890625, "step": 1389 }, { "epoch": 0.40098081638540317, "grad_norm": 20.268959916633097, "learning_rate": 3.7427238140168974e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1928.0, "logps/rejected": -1904.0, "loss": 0.6844, "loss/demonstration_loss": -3872.0, "loss/preference_loss": -3872.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.390625, "rewards/margins": 0.056640625, "rewards/rejected": 0.333984375, "step": 1390 }, { "epoch": 0.4012692917928747, "grad_norm": 12.538307808945405, "learning_rate": 3.740538222771314e-07, "logits/chosen": 2.890625, "logits/rejected": 2.953125, "logps/chosen": -1488.0, "logps/rejected": -1336.0, "loss": 0.6573, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.353515625, "rewards/margins": 0.1796875, "rewards/rejected": 0.1728515625, "step": 1391 }, { "epoch": 0.4015577672003462, "grad_norm": 11.925766712077214, "learning_rate": 3.7383513729507055e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1304.0, "logps/rejected": -1488.0, "loss": 0.6857, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.33203125, "rewards/margins": 0.051025390625, "rewards/rejected": 0.279296875, "step": 1392 }, { "epoch": 0.4018462426078177, "grad_norm": 10.756370347506724, "learning_rate": 3.7361632667737187e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1368.0, "logps/rejected": -1216.0, "loss": 0.6603, "loss/demonstration_loss": -2608.0, "loss/preference_loss": -2592.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2578125, "rewards/margins": 0.13671875, "rewards/rejected": 0.1201171875, "step": 1393 }, { "epoch": 0.4021347180152892, "grad_norm": 11.80577392427376, "learning_rate": 3.733973906460273e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1560.0, "logps/rejected": -1480.0, "loss": 0.686, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2255859375, "rewards/margins": -0.037353515625, "rewards/rejected": 0.263671875, "step": 1394 }, { "epoch": 0.4024231934227607, "grad_norm": 10.541930387057253, "learning_rate": 3.731783294231561e-07, "logits/chosen": 2.984375, "logits/rejected": 3.03125, "logps/chosen": -1816.0, "logps/rejected": -1800.0, "loss": 0.6609, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.353515625, "rewards/margins": 0.0927734375, "rewards/rejected": 0.26171875, "step": 1395 }, { "epoch": 0.40271166883023224, "grad_norm": 10.062561509224166, "learning_rate": 3.729591432310045e-07, "logits/chosen": 3.046875, "logits/rejected": 3.015625, "logps/chosen": -1728.0, "logps/rejected": -1472.0, "loss": 0.6734, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2578125, "rewards/margins": 0.1298828125, "rewards/rejected": 0.12890625, "step": 1396 }, { "epoch": 0.40300014423770375, "grad_norm": 10.555801747508836, "learning_rate": 3.7273983229194564e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1504.0, "logps/rejected": -1888.0, "loss": 0.6598, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.369140625, "rewards/margins": 0.107421875, "rewards/rejected": 0.26171875, "step": 1397 }, { "epoch": 0.40328861964517526, "grad_norm": 10.839586544957013, "learning_rate": 3.7252039682847907e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0625, "logps/chosen": -1736.0, "logps/rejected": -1712.0, "loss": 0.6826, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.43359375, "rewards/margins": 0.005462646484375, "rewards/rejected": 0.4296875, "step": 1398 }, { "epoch": 0.4035770950526468, "grad_norm": 12.326347718252821, "learning_rate": 3.723008370632308e-07, "logits/chosen": 3.078125, "logits/rejected": 3.125, "logps/chosen": -1608.0, "logps/rejected": -1664.0, "loss": 0.7119, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.263671875, "rewards/margins": -0.02490234375, "rewards/rejected": 0.287109375, "step": 1399 }, { "epoch": 0.4038655704601183, "grad_norm": 11.204368014144857, "learning_rate": 3.7208115321895265e-07, "logits/chosen": 2.890625, "logits/rejected": 2.96875, "logps/chosen": -1776.0, "logps/rejected": -1568.0, "loss": 0.6639, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.306640625, "rewards/margins": 0.134765625, "rewards/rejected": 0.1708984375, "step": 1400 }, { "epoch": 0.4041540458675898, "grad_norm": 10.756107706064277, "learning_rate": 3.7186134551852287e-07, "logits/chosen": 3.1875, "logits/rejected": 3.15625, "logps/chosen": -1704.0, "logps/rejected": -1568.0, "loss": 0.6777, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.232421875, "rewards/margins": 0.032958984375, "rewards/rejected": 0.19921875, "step": 1401 }, { "epoch": 0.4044425212750613, "grad_norm": 11.12544501702627, "learning_rate": 3.7164141418494494e-07, "logits/chosen": 3.140625, "logits/rejected": 3.09375, "logps/chosen": -1768.0, "logps/rejected": -1832.0, "loss": 0.6691, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2197265625, "rewards/margins": 0.0625, "rewards/rejected": 0.1572265625, "step": 1402 }, { "epoch": 0.4047309966825328, "grad_norm": 11.38833971702098, "learning_rate": 3.7142135944134777e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1808.0, "logps/rejected": -1888.0, "loss": 0.6982, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1787109375, "rewards/margins": -0.048828125, "rewards/rejected": 0.2275390625, "step": 1403 }, { "epoch": 0.40501947209000433, "grad_norm": 10.95284458710104, "learning_rate": 3.7120118151098574e-07, "logits/chosen": 3.0625, "logits/rejected": 3.09375, "logps/chosen": -1744.0, "logps/rejected": -1608.0, "loss": 0.6341, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.484375, "rewards/margins": 0.22265625, "rewards/rejected": 0.26171875, "step": 1404 }, { "epoch": 0.40530794749747584, "grad_norm": 10.5961858911717, "learning_rate": 3.7098088061723796e-07, "logits/chosen": 3.125, "logits/rejected": 3.171875, "logps/chosen": -1440.0, "logps/rejected": -1312.0, "loss": 0.6956, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1943359375, "rewards/margins": 0.0419921875, "rewards/rejected": 0.1513671875, "step": 1405 }, { "epoch": 0.40559642290494735, "grad_norm": 12.869756717025295, "learning_rate": 3.7076045698360847e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1848.0, "logps/rejected": -1768.0, "loss": 0.7134, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.296875, "rewards/margins": 0.03173828125, "rewards/rejected": 0.263671875, "step": 1406 }, { "epoch": 0.40588489831241886, "grad_norm": 10.251453879356593, "learning_rate": 3.705399108337257e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1712.0, "logps/rejected": -1512.0, "loss": 0.6443, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.2490234375, "rewards/margins": 0.1025390625, "rewards/rejected": 0.146484375, "step": 1407 }, { "epoch": 0.4061733737198904, "grad_norm": 10.534555354754037, "learning_rate": 3.703192423913424e-07, "logits/chosen": 3.203125, "logits/rejected": 3.15625, "logps/chosen": -1784.0, "logps/rejected": -1728.0, "loss": 0.6663, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.484375, "rewards/margins": 0.1435546875, "rewards/rejected": 0.33984375, "step": 1408 }, { "epoch": 0.4064618491273619, "grad_norm": 12.155557117387959, "learning_rate": 3.7009845188033543e-07, "logits/chosen": 3.046875, "logits/rejected": 2.984375, "logps/chosen": -1680.0, "logps/rejected": -1464.0, "loss": 0.7018, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.310546875, "rewards/margins": 0.024658203125, "rewards/rejected": 0.28515625, "step": 1409 }, { "epoch": 0.4067503245348334, "grad_norm": 11.364593550007232, "learning_rate": 3.698775395247056e-07, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -1672.0, "logps/rejected": -1624.0, "loss": 0.7057, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.197265625, "rewards/margins": 0.01043701171875, "rewards/rejected": 0.1875, "step": 1410 }, { "epoch": 0.4070387999423049, "grad_norm": 12.565424634853716, "learning_rate": 3.696565055485771e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0, "logps/chosen": -1648.0, "logps/rejected": -1624.0, "loss": 0.6498, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.357421875, "rewards/margins": 0.08251953125, "rewards/rejected": 0.275390625, "step": 1411 }, { "epoch": 0.4073272753497764, "grad_norm": 10.262384716323979, "learning_rate": 3.6943535017619765e-07, "logits/chosen": 3.109375, "logits/rejected": 3.140625, "logps/chosen": -1984.0, "logps/rejected": -1816.0, "loss": 0.6331, "loss/demonstration_loss": -3840.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.306640625, "rewards/margins": 0.0859375, "rewards/rejected": 0.220703125, "step": 1412 }, { "epoch": 0.40761575075724793, "grad_norm": 13.0962401317729, "learning_rate": 3.692140736319381e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -1864.0, "logps/rejected": -1456.0, "loss": 0.701, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.328125, "rewards/margins": 0.038818359375, "rewards/rejected": 0.291015625, "step": 1413 }, { "epoch": 0.40790422616471944, "grad_norm": 11.69446479732212, "learning_rate": 3.6899267614029226e-07, "logits/chosen": 3.0625, "logits/rejected": 3.046875, "logps/chosen": -2128.0, "logps/rejected": -2000.0, "loss": 0.6784, "loss/demonstration_loss": -4160.0, "loss/preference_loss": -4160.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.294921875, "rewards/margins": 0.07275390625, "rewards/rejected": 0.22265625, "step": 1414 }, { "epoch": 0.40819270157219095, "grad_norm": 11.692005369738197, "learning_rate": 3.6877115792587673e-07, "logits/chosen": 3.09375, "logits/rejected": 3.109375, "logps/chosen": -1632.0, "logps/rejected": -1424.0, "loss": 0.6797, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.322265625, "rewards/margins": 0.1123046875, "rewards/rejected": 0.208984375, "step": 1415 }, { "epoch": 0.40848117697966246, "grad_norm": 10.727445434001131, "learning_rate": 3.685495192134303e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1376.0, "logps/rejected": -1456.0, "loss": 0.7109, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2255859375, "rewards/margins": 0.031982421875, "rewards/rejected": 0.193359375, "step": 1416 }, { "epoch": 0.408769652387134, "grad_norm": 11.302926833031483, "learning_rate": 3.683277602278143e-07, "logits/chosen": 3.109375, "logits/rejected": 3.078125, "logps/chosen": -1808.0, "logps/rejected": -1800.0, "loss": 0.6863, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.400390625, "rewards/margins": 0.00433349609375, "rewards/rejected": 0.396484375, "step": 1417 }, { "epoch": 0.4090581277946055, "grad_norm": 10.977197944094765, "learning_rate": 3.6810588119401196e-07, "logits/chosen": 3.125, "logits/rejected": 3.203125, "logps/chosen": -1552.0, "logps/rejected": -1280.0, "loss": 0.6544, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.1630859375, "rewards/margins": 0.09716796875, "rewards/rejected": 0.0654296875, "step": 1418 }, { "epoch": 0.409346603202077, "grad_norm": 11.012789089180847, "learning_rate": 3.678838823371283e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1568.0, "logps/rejected": -1368.0, "loss": 0.6827, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2158203125, "rewards/margins": -0.01123046875, "rewards/rejected": 0.2275390625, "step": 1419 }, { "epoch": 0.4096350786095485, "grad_norm": 12.1094565590526, "learning_rate": 3.6766176388239005e-07, "logits/chosen": 3.234375, "logits/rejected": 3.28125, "logps/chosen": -1456.0, "logps/rejected": -1648.0, "loss": 0.7296, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.234375, "rewards/margins": -0.0361328125, "rewards/rejected": 0.26953125, "step": 1420 }, { "epoch": 0.4099235540170201, "grad_norm": 12.696097130619057, "learning_rate": 3.6743952605514506e-07, "logits/chosen": 3.265625, "logits/rejected": 3.328125, "logps/chosen": -1888.0, "logps/rejected": -1632.0, "loss": 0.6933, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3125, "rewards/margins": 0.08740234375, "rewards/rejected": 0.2255859375, "step": 1421 }, { "epoch": 0.4102120294244916, "grad_norm": 12.206874522742618, "learning_rate": 3.672171690808623e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1832.0, "logps/rejected": -1776.0, "loss": 0.6679, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2216796875, "rewards/margins": 0.1513671875, "rewards/rejected": 0.0712890625, "step": 1422 }, { "epoch": 0.4105005048319631, "grad_norm": 12.191887095644232, "learning_rate": 3.6699469318513187e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -1720.0, "logps/rejected": -1832.0, "loss": 0.7068, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.263671875, "rewards/margins": -0.072265625, "rewards/rejected": 0.3359375, "step": 1423 }, { "epoch": 0.4107889802394346, "grad_norm": 10.268853974892556, "learning_rate": 3.667720985936643e-07, "logits/chosen": 3.0625, "logits/rejected": 3.140625, "logps/chosen": -1680.0, "logps/rejected": -1384.0, "loss": 0.6804, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.376953125, "rewards/margins": 0.09228515625, "rewards/rejected": 0.28515625, "step": 1424 }, { "epoch": 0.4110774556469061, "grad_norm": 11.687883109081131, "learning_rate": 3.6654938553229054e-07, "logits/chosen": 3.265625, "logits/rejected": 3.3125, "logps/chosen": -1808.0, "logps/rejected": -1608.0, "loss": 0.6547, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.33203125, "rewards/margins": 0.06640625, "rewards/rejected": 0.265625, "step": 1425 }, { "epoch": 0.41136593105437763, "grad_norm": 13.259898993857163, "learning_rate": 3.663265542269618e-07, "logits/chosen": 3.15625, "logits/rejected": 3.09375, "logps/chosen": -1728.0, "logps/rejected": -1840.0, "loss": 0.6532, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.48828125, "rewards/margins": 0.06494140625, "rewards/rejected": 0.423828125, "step": 1426 }, { "epoch": 0.41165440646184914, "grad_norm": 11.391535438646725, "learning_rate": 3.6610360490374924e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1904.0, "logps/rejected": -1696.0, "loss": 0.6628, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.29296875, "rewards/margins": 0.0986328125, "rewards/rejected": 0.1943359375, "step": 1427 }, { "epoch": 0.41194288186932065, "grad_norm": 11.820450681871057, "learning_rate": 3.6588053778884383e-07, "logits/chosen": 3.21875, "logits/rejected": 3.296875, "logps/chosen": -1656.0, "logps/rejected": -1504.0, "loss": 0.6381, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.474609375, "rewards/margins": 0.1806640625, "rewards/rejected": 0.29296875, "step": 1428 }, { "epoch": 0.41223135727679217, "grad_norm": 11.738650351186301, "learning_rate": 3.656573531085559e-07, "logits/chosen": 3.234375, "logits/rejected": 3.1875, "logps/chosen": -1632.0, "logps/rejected": -1632.0, "loss": 0.6722, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.267578125, "rewards/margins": 0.050048828125, "rewards/rejected": 0.2177734375, "step": 1429 }, { "epoch": 0.4125198326842637, "grad_norm": 11.15405910299464, "learning_rate": 3.654340510893151e-07, "logits/chosen": 3.09375, "logits/rejected": 3.15625, "logps/chosen": -1656.0, "logps/rejected": -1656.0, "loss": 0.7001, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.41015625, "rewards/margins": 0.07958984375, "rewards/rejected": 0.330078125, "step": 1430 }, { "epoch": 0.4128083080917352, "grad_norm": 10.026177807685471, "learning_rate": 3.652106319576702e-07, "logits/chosen": 3.09375, "logits/rejected": 3.078125, "logps/chosen": -1472.0, "logps/rejected": -1448.0, "loss": 0.6566, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.27734375, "rewards/margins": 0.11376953125, "rewards/rejected": 0.1640625, "step": 1431 }, { "epoch": 0.4130967834992067, "grad_norm": 12.622292901106505, "learning_rate": 3.6498709594028877e-07, "logits/chosen": 3.34375, "logits/rejected": 3.203125, "logps/chosen": -1696.0, "logps/rejected": -1880.0, "loss": 0.7127, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.353515625, "rewards/margins": 0.0174560546875, "rewards/rejected": 0.3359375, "step": 1432 }, { "epoch": 0.4133852589066782, "grad_norm": 11.683668149785372, "learning_rate": 3.6476344326395674e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1552.0, "logps/rejected": -1520.0, "loss": 0.6762, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3088.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.279296875, "rewards/margins": 0.11962890625, "rewards/rejected": 0.16015625, "step": 1433 }, { "epoch": 0.4136737343141497, "grad_norm": 10.325876383907781, "learning_rate": 3.6453967415557887e-07, "logits/chosen": 3.15625, "logits/rejected": 3.125, "logps/chosen": -1392.0, "logps/rejected": -1368.0, "loss": 0.6577, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1796875, "rewards/margins": 0.1162109375, "rewards/rejected": 0.06396484375, "step": 1434 }, { "epoch": 0.41396220972162123, "grad_norm": 11.919956287974754, "learning_rate": 3.6431578884217753e-07, "logits/chosen": 3.15625, "logits/rejected": 3.078125, "logps/chosen": -2016.0, "logps/rejected": -1800.0, "loss": 0.6448, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.4765625, "rewards/margins": 0.244140625, "rewards/rejected": 0.232421875, "step": 1435 }, { "epoch": 0.41425068512909274, "grad_norm": 10.815510973650063, "learning_rate": 3.640917875508933e-07, "logits/chosen": 2.9375, "logits/rejected": 2.953125, "logps/chosen": -1568.0, "logps/rejected": -1448.0, "loss": 0.6537, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2236328125, "rewards/margins": 0.10546875, "rewards/rejected": 0.11767578125, "step": 1436 }, { "epoch": 0.41453916053656426, "grad_norm": 10.214991346460826, "learning_rate": 3.6386767050898433e-07, "logits/chosen": 3.125, "logits/rejected": 3.09375, "logps/chosen": -1440.0, "logps/rejected": -1304.0, "loss": 0.6505, "loss/demonstration_loss": -2768.0, "loss/preference_loss": -2752.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.21484375, "rewards/margins": 0.0693359375, "rewards/rejected": 0.1455078125, "step": 1437 }, { "epoch": 0.41482763594403577, "grad_norm": 11.540583460797453, "learning_rate": 3.636434379438262e-07, "logits/chosen": 3.203125, "logits/rejected": 3.15625, "logps/chosen": -1928.0, "logps/rejected": -1880.0, "loss": 0.6906, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.134765625, "rewards/margins": -0.05029296875, "rewards/rejected": 0.1845703125, "step": 1438 }, { "epoch": 0.4151161113515073, "grad_norm": 11.028301760614935, "learning_rate": 3.634190900829117e-07, "logits/chosen": 3.109375, "logits/rejected": 3.015625, "logps/chosen": -1728.0, "logps/rejected": -1456.0, "loss": 0.6435, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.33984375, "rewards/margins": 0.146484375, "rewards/rejected": 0.1923828125, "step": 1439 }, { "epoch": 0.4154045867589788, "grad_norm": 10.491212609001602, "learning_rate": 3.6319462715385063e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -1512.0, "logps/rejected": -1408.0, "loss": 0.6603, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.35546875, "rewards/margins": 0.056640625, "rewards/rejected": 0.298828125, "step": 1440 }, { "epoch": 0.4156930621664503, "grad_norm": 11.017226258611302, "learning_rate": 3.6297004938436946e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1088.0, "logps/rejected": -1464.0, "loss": 0.7047, "loss/demonstration_loss": -2560.0, "loss/preference_loss": -2560.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.1904296875, "rewards/margins": 0.044189453125, "rewards/rejected": 0.146484375, "step": 1441 }, { "epoch": 0.4159815375739218, "grad_norm": 13.660578113998382, "learning_rate": 3.6274535700231127e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1976.0, "logps/rejected": -1896.0, "loss": 0.6631, "loss/demonstration_loss": -3920.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.39453125, "rewards/margins": 0.146484375, "rewards/rejected": 0.248046875, "step": 1442 }, { "epoch": 0.4162700129813933, "grad_norm": 11.35993497872218, "learning_rate": 3.6252055023563533e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1552.0, "logps/rejected": -1616.0, "loss": 0.677, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.259765625, "rewards/margins": 0.0277099609375, "rewards/rejected": 0.232421875, "step": 1443 }, { "epoch": 0.41655848838886483, "grad_norm": 12.111398884681277, "learning_rate": 3.622956293124168e-07, "logits/chosen": 3.28125, "logits/rejected": 3.296875, "logps/chosen": -1992.0, "logps/rejected": -2112.0, "loss": 0.7147, "loss/demonstration_loss": -4160.0, "loss/preference_loss": -4160.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.46875, "rewards/margins": -0.07470703125, "rewards/rejected": 0.54296875, "step": 1444 }, { "epoch": 0.41684696379633634, "grad_norm": 10.721734781623606, "learning_rate": 3.620705944608472e-07, "logits/chosen": 3.15625, "logits/rejected": 3.078125, "logps/chosen": -1840.0, "logps/rejected": -1392.0, "loss": 0.6317, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.4765625, "rewards/margins": 0.287109375, "rewards/rejected": 0.1904296875, "step": 1445 }, { "epoch": 0.41713543920380786, "grad_norm": 10.6842336521273, "learning_rate": 3.6184544590923293e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1512.0, "logps/rejected": -1384.0, "loss": 0.6302, "loss/demonstration_loss": -2928.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.341796875, "rewards/margins": 0.1748046875, "rewards/rejected": 0.16796875, "step": 1446 }, { "epoch": 0.41742391461127937, "grad_norm": 11.750520518640997, "learning_rate": 3.6162018388599636e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1920.0, "logps/rejected": -1424.0, "loss": 0.6511, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.439453125, "rewards/margins": 0.1640625, "rewards/rejected": 0.275390625, "step": 1447 }, { "epoch": 0.4177123900187509, "grad_norm": 13.74312256116365, "learning_rate": 3.613948086196745e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1752.0, "logps/rejected": -1672.0, "loss": 0.6517, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.376953125, "rewards/margins": 0.16015625, "rewards/rejected": 0.2177734375, "step": 1448 }, { "epoch": 0.4180008654262224, "grad_norm": 10.662334200385676, "learning_rate": 3.6116932033891955e-07, "logits/chosen": 3.0, "logits/rejected": 3.046875, "logps/chosen": -1584.0, "logps/rejected": -1568.0, "loss": 0.698, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4375, "rewards/margins": 0.0888671875, "rewards/rejected": 0.34765625, "step": 1449 }, { "epoch": 0.4182893408336939, "grad_norm": 9.351814372817023, "learning_rate": 3.6094371927249833e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1504.0, "logps/rejected": -1264.0, "loss": 0.6325, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.32421875, "rewards/margins": 0.244140625, "rewards/rejected": 0.0791015625, "step": 1450 }, { "epoch": 0.41857781624116547, "grad_norm": 11.388130442381131, "learning_rate": 3.6071800564929203e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1496.0, "logps/rejected": -1440.0, "loss": 0.6949, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.306640625, "rewards/margins": 0.06640625, "rewards/rejected": 0.2412109375, "step": 1451 }, { "epoch": 0.418866291648637, "grad_norm": 11.556995170180452, "learning_rate": 3.604921796982958e-07, "logits/chosen": 3.109375, "logits/rejected": 3.09375, "logps/chosen": -1744.0, "logps/rejected": -1544.0, "loss": 0.6627, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.484375, "rewards/margins": 0.1630859375, "rewards/rejected": 0.322265625, "step": 1452 }, { "epoch": 0.4191547670561085, "grad_norm": 11.034114543354196, "learning_rate": 3.6026624164861924e-07, "logits/chosen": 3.1875, "logits/rejected": 3.125, "logps/chosen": -1624.0, "logps/rejected": -1520.0, "loss": 0.6439, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.36328125, "rewards/margins": 0.08349609375, "rewards/rejected": 0.279296875, "step": 1453 }, { "epoch": 0.41944324246358, "grad_norm": 10.50495607951258, "learning_rate": 3.6004019172948536e-07, "logits/chosen": 3.109375, "logits/rejected": 3.15625, "logps/chosen": -1584.0, "logps/rejected": -1400.0, "loss": 0.6702, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.50390625, "rewards/margins": 0.1591796875, "rewards/rejected": 0.345703125, "step": 1454 }, { "epoch": 0.4197317178710515, "grad_norm": 11.274932405920156, "learning_rate": 3.5981403017023075e-07, "logits/chosen": 3.09375, "logits/rejected": 3.171875, "logps/chosen": -1656.0, "logps/rejected": -1736.0, "loss": 0.7018, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.365234375, "rewards/margins": 0.060546875, "rewards/rejected": 0.3046875, "step": 1455 }, { "epoch": 0.420020193278523, "grad_norm": 10.984688850160628, "learning_rate": 3.5958775720030526e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1440.0, "logps/rejected": -1552.0, "loss": 0.6998, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.302734375, "rewards/margins": -0.01177978515625, "rewards/rejected": 0.314453125, "step": 1456 }, { "epoch": 0.42030866868599454, "grad_norm": 10.754272265175567, "learning_rate": 3.5936137304927166e-07, "logits/chosen": 3.203125, "logits/rejected": 3.125, "logps/chosen": -1632.0, "logps/rejected": -1928.0, "loss": 0.6976, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.345703125, "rewards/margins": -0.0390625, "rewards/rejected": 0.384765625, "step": 1457 }, { "epoch": 0.42059714409346605, "grad_norm": 10.658615351240256, "learning_rate": 3.591348779468056e-07, "logits/chosen": 3.265625, "logits/rejected": 3.21875, "logps/chosen": -1808.0, "logps/rejected": -1904.0, "loss": 0.6738, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.36328125, "rewards/margins": 0.1025390625, "rewards/rejected": 0.259765625, "step": 1458 }, { "epoch": 0.42088561950093756, "grad_norm": 12.096013865912218, "learning_rate": 3.5890827212269554e-07, "logits/chosen": 3.21875, "logits/rejected": 3.296875, "logps/chosen": -1640.0, "logps/rejected": -1560.0, "loss": 0.6888, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.29296875, "rewards/margins": 0.051513671875, "rewards/rejected": 0.2421875, "step": 1459 }, { "epoch": 0.42117409490840907, "grad_norm": 10.719007931907088, "learning_rate": 3.586815558068417e-07, "logits/chosen": 3.109375, "logits/rejected": 3.03125, "logps/chosen": -1760.0, "logps/rejected": -1520.0, "loss": 0.6136, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.546875, "rewards/margins": 0.259765625, "rewards/rejected": 0.2890625, "step": 1460 }, { "epoch": 0.4214625703158806, "grad_norm": 9.338823539807724, "learning_rate": 3.584547292292571e-07, "logits/chosen": 3.125, "logits/rejected": 3.03125, "logps/chosen": -1632.0, "logps/rejected": -1408.0, "loss": 0.6274, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.462890625, "rewards/margins": 0.1416015625, "rewards/rejected": 0.322265625, "step": 1461 }, { "epoch": 0.4217510457233521, "grad_norm": 11.475211810643314, "learning_rate": 3.5822779262006585e-07, "logits/chosen": 3.21875, "logits/rejected": 3.203125, "logps/chosen": -2160.0, "logps/rejected": -1864.0, "loss": 0.7166, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.318359375, "rewards/margins": -0.006378173828125, "rewards/rejected": 0.32421875, "step": 1462 }, { "epoch": 0.4220395211308236, "grad_norm": 10.599393354314644, "learning_rate": 3.5800074620950445e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1952.0, "logps/rejected": -1792.0, "loss": 0.6044, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.470703125, "rewards/margins": 0.2080078125, "rewards/rejected": 0.26171875, "step": 1463 }, { "epoch": 0.4223279965382951, "grad_norm": 11.24534671107371, "learning_rate": 3.577735902279203e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -1368.0, "logps/rejected": -1416.0, "loss": 0.6747, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2109375, "rewards/margins": 0.048095703125, "rewards/rejected": 0.162109375, "step": 1464 }, { "epoch": 0.4226164719457666, "grad_norm": 11.524128506135431, "learning_rate": 3.5754632490577217e-07, "logits/chosen": 3.078125, "logits/rejected": 3.078125, "logps/chosen": -1568.0, "logps/rejected": -1552.0, "loss": 0.6863, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.392578125, "rewards/margins": -0.006256103515625, "rewards/rejected": 0.3984375, "step": 1465 }, { "epoch": 0.42290494735323814, "grad_norm": 12.028380128721462, "learning_rate": 3.573189504736296e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1848.0, "logps/rejected": -1712.0, "loss": 0.6814, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.33203125, "rewards/margins": 0.0263671875, "rewards/rejected": 0.3046875, "step": 1466 }, { "epoch": 0.42319342276070965, "grad_norm": 11.490971015061676, "learning_rate": 3.5709146716217314e-07, "logits/chosen": 3.171875, "logits/rejected": 3.125, "logps/chosen": -1744.0, "logps/rejected": -1560.0, "loss": 0.7112, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.357421875, "rewards/margins": -0.00909423828125, "rewards/rejected": 0.3671875, "step": 1467 }, { "epoch": 0.42348189816818116, "grad_norm": 10.591383614157886, "learning_rate": 3.5686387520219334e-07, "logits/chosen": 3.25, "logits/rejected": 3.328125, "logps/chosen": -2048.0, "logps/rejected": -1968.0, "loss": 0.7042, "loss/demonstration_loss": -4064.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.44921875, "rewards/margins": 0.03857421875, "rewards/rejected": 0.41015625, "step": 1468 }, { "epoch": 0.42377037357565267, "grad_norm": 18.18874131949807, "learning_rate": 3.566361748245915e-07, "logits/chosen": 3.171875, "logits/rejected": 3.1875, "logps/chosen": -1912.0, "logps/rejected": -1928.0, "loss": 0.6893, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.48828125, "rewards/margins": 0.0791015625, "rewards/rejected": 0.408203125, "step": 1469 }, { "epoch": 0.4240588489831242, "grad_norm": 10.069922765865112, "learning_rate": 3.5640836626037835e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1464.0, "logps/rejected": -1392.0, "loss": 0.6451, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2864.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2001953125, "rewards/margins": 0.07568359375, "rewards/rejected": 0.12451171875, "step": 1470 }, { "epoch": 0.4243473243905957, "grad_norm": 12.104243013174909, "learning_rate": 3.561804497406748e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1520.0, "logps/rejected": -1480.0, "loss": 0.7076, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.34375, "rewards/margins": 0.06005859375, "rewards/rejected": 0.28515625, "step": 1471 }, { "epoch": 0.4246357997980672, "grad_norm": 8.933401577253049, "learning_rate": 3.559524254967114e-07, "logits/chosen": 3.0625, "logits/rejected": 3.171875, "logps/chosen": -1568.0, "logps/rejected": -1312.0, "loss": 0.6716, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.306640625, "rewards/margins": 0.0025177001953125, "rewards/rejected": 0.302734375, "step": 1472 }, { "epoch": 0.4249242752055387, "grad_norm": 11.636055138453058, "learning_rate": 3.557242937598274e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1640.0, "logps/rejected": -1728.0, "loss": 0.7321, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.255859375, "rewards/margins": 0.02099609375, "rewards/rejected": 0.234375, "step": 1473 }, { "epoch": 0.4252127506130102, "grad_norm": 10.857286000385852, "learning_rate": 3.554960547614716e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -2024.0, "logps/rejected": -2000.0, "loss": 0.6852, "loss/demonstration_loss": -4080.0, "loss/preference_loss": -4064.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.48828125, "rewards/margins": 0.072265625, "rewards/rejected": 0.416015625, "step": 1474 }, { "epoch": 0.42550122602048174, "grad_norm": 13.022104543379063, "learning_rate": 3.552677087332015e-07, "logits/chosen": 3.34375, "logits/rejected": 3.296875, "logps/chosen": -1544.0, "logps/rejected": -1872.0, "loss": 0.7245, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.23046875, "rewards/margins": -0.10546875, "rewards/rejected": 0.3359375, "step": 1475 }, { "epoch": 0.42578970142795325, "grad_norm": 11.401791001816868, "learning_rate": 3.550392559066831e-07, "logits/chosen": 3.21875, "logits/rejected": 3.28125, "logps/chosen": -1544.0, "logps/rejected": -1456.0, "loss": 0.6519, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.375, "rewards/margins": 0.1416015625, "rewards/rejected": 0.234375, "step": 1476 }, { "epoch": 0.42607817683542476, "grad_norm": 10.92018613656943, "learning_rate": 3.5481069651369094e-07, "logits/chosen": 3.15625, "logits/rejected": 3.21875, "logps/chosen": -1376.0, "logps/rejected": -1288.0, "loss": 0.7062, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2688.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2001953125, "rewards/margins": -0.007781982421875, "rewards/rejected": 0.2080078125, "step": 1477 }, { "epoch": 0.42636665224289627, "grad_norm": 11.29848873932859, "learning_rate": 3.545820307861075e-07, "logits/chosen": 3.0625, "logits/rejected": 3.046875, "logps/chosen": -1968.0, "logps/rejected": -1760.0, "loss": 0.7017, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.36328125, "rewards/margins": 0.111328125, "rewards/rejected": 0.251953125, "step": 1478 }, { "epoch": 0.4266551276503678, "grad_norm": 12.351038068854823, "learning_rate": 3.5435325895592306e-07, "logits/chosen": 3.234375, "logits/rejected": 3.171875, "logps/chosen": -1856.0, "logps/rejected": -1928.0, "loss": 0.6968, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3824.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.36328125, "rewards/margins": 0.0032958984375, "rewards/rejected": 0.359375, "step": 1479 }, { "epoch": 0.4269436030578393, "grad_norm": 9.760906515927875, "learning_rate": 3.5412438125523576e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1784.0, "logps/rejected": -1752.0, "loss": 0.6622, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.375, "rewards/margins": 0.1318359375, "rewards/rejected": 0.2431640625, "step": 1480 }, { "epoch": 0.4272320784653108, "grad_norm": 10.97592202940016, "learning_rate": 3.5389539791625115e-07, "logits/chosen": 3.03125, "logits/rejected": 3.03125, "logps/chosen": -1824.0, "logps/rejected": -1792.0, "loss": 0.6846, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.337890625, "rewards/margins": 0.0133056640625, "rewards/rejected": 0.32421875, "step": 1481 }, { "epoch": 0.42752055387278237, "grad_norm": 11.415929299624795, "learning_rate": 3.5366630917128184e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -2144.0, "logps/rejected": -1752.0, "loss": 0.6274, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3920.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.46484375, "rewards/margins": 0.21875, "rewards/rejected": 0.24609375, "step": 1482 }, { "epoch": 0.4278090292802539, "grad_norm": 10.07627410815915, "learning_rate": 3.534371152527473e-07, "logits/chosen": 3.296875, "logits/rejected": 3.28125, "logps/chosen": -1344.0, "logps/rejected": -1352.0, "loss": 0.6995, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2736.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2734375, "rewards/margins": -0.0654296875, "rewards/rejected": 0.33984375, "step": 1483 }, { "epoch": 0.4280975046877254, "grad_norm": 11.408048765193723, "learning_rate": 3.532078163931739e-07, "logits/chosen": 3.09375, "logits/rejected": 3.09375, "logps/chosen": -1544.0, "logps/rejected": -1264.0, "loss": 0.6546, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3203125, "rewards/margins": 0.1826171875, "rewards/rejected": 0.1376953125, "step": 1484 }, { "epoch": 0.4283859800951969, "grad_norm": 10.541827352240697, "learning_rate": 3.5297841282519436e-07, "logits/chosen": 3.140625, "logits/rejected": 3.03125, "logps/chosen": -1880.0, "logps/rejected": -1760.0, "loss": 0.691, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.453125, "rewards/margins": 0.126953125, "rewards/rejected": 0.326171875, "step": 1485 }, { "epoch": 0.4286744555026684, "grad_norm": 12.497894178831125, "learning_rate": 3.527489047815478e-07, "logits/chosen": 3.0, "logits/rejected": 2.96875, "logps/chosen": -1592.0, "logps/rejected": -1704.0, "loss": 0.6682, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.546875, "rewards/margins": 0.10205078125, "rewards/rejected": 0.4453125, "step": 1486 }, { "epoch": 0.42896293091013993, "grad_norm": 10.651432605591802, "learning_rate": 3.5251929249507896e-07, "logits/chosen": 3.1875, "logits/rejected": 3.15625, "logps/chosen": -1648.0, "logps/rejected": -1536.0, "loss": 0.676, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.41015625, "rewards/margins": 0.0089111328125, "rewards/rejected": 0.40234375, "step": 1487 }, { "epoch": 0.42925140631761144, "grad_norm": 13.718201424486463, "learning_rate": 3.5228957619873874e-07, "logits/chosen": 3.109375, "logits/rejected": 3.171875, "logps/chosen": -1728.0, "logps/rejected": -1640.0, "loss": 0.693, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.28125, "rewards/margins": 0.07177734375, "rewards/rejected": 0.2099609375, "step": 1488 }, { "epoch": 0.42953988172508295, "grad_norm": 10.250272815428048, "learning_rate": 3.520597561255834e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1496.0, "logps/rejected": -1520.0, "loss": 0.6668, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.42578125, "rewards/margins": 0.051513671875, "rewards/rejected": 0.375, "step": 1489 }, { "epoch": 0.42982835713255446, "grad_norm": 10.34411047323811, "learning_rate": 3.518298325087743e-07, "logits/chosen": 3.234375, "logits/rejected": 3.1875, "logps/chosen": -1496.0, "logps/rejected": -1680.0, "loss": 0.6757, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.205078125, "rewards/margins": 0.043701171875, "rewards/rejected": 0.1611328125, "step": 1490 }, { "epoch": 0.430116832540026, "grad_norm": 10.58101444119307, "learning_rate": 3.515998055815782e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1656.0, "logps/rejected": -1672.0, "loss": 0.7004, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.390625, "rewards/margins": 0.0106201171875, "rewards/rejected": 0.37890625, "step": 1491 }, { "epoch": 0.4304053079474975, "grad_norm": 11.99524345002724, "learning_rate": 3.513696755773665e-07, "logits/chosen": 3.28125, "logits/rejected": 3.3125, "logps/chosen": -1640.0, "logps/rejected": -1592.0, "loss": 0.7647, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.29296875, "rewards/margins": -0.06396484375, "rewards/rejected": 0.357421875, "step": 1492 }, { "epoch": 0.430693783354969, "grad_norm": 12.074562094676685, "learning_rate": 3.511394427296151e-07, "logits/chosen": 3.15625, "logits/rejected": 3.109375, "logps/chosen": -1760.0, "logps/rejected": -1664.0, "loss": 0.676, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.306640625, "rewards/margins": 0.025390625, "rewards/rejected": 0.28125, "step": 1493 }, { "epoch": 0.4309822587624405, "grad_norm": 10.291934724167026, "learning_rate": 3.5090910727190435e-07, "logits/chosen": 3.15625, "logits/rejected": 3.125, "logps/chosen": -1672.0, "logps/rejected": -1656.0, "loss": 0.6695, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.484375, "rewards/margins": 0.1337890625, "rewards/rejected": 0.349609375, "step": 1494 }, { "epoch": 0.431270734169912, "grad_norm": 12.18476995600669, "learning_rate": 3.5067866943791874e-07, "logits/chosen": 3.296875, "logits/rejected": 3.28125, "logps/chosen": -1496.0, "logps/rejected": -1392.0, "loss": 0.6832, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.25390625, "rewards/margins": -0.0458984375, "rewards/rejected": 0.30078125, "step": 1495 }, { "epoch": 0.43155920957738353, "grad_norm": 11.29213144970456, "learning_rate": 3.5044812946144646e-07, "logits/chosen": 3.078125, "logits/rejected": 3.078125, "logps/chosen": -1616.0, "logps/rejected": -1704.0, "loss": 0.6919, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.31640625, "rewards/margins": 0.1337890625, "rewards/rejected": 0.1826171875, "step": 1496 }, { "epoch": 0.43184768498485504, "grad_norm": 12.035781256974701, "learning_rate": 3.502174875763794e-07, "logits/chosen": 3.234375, "logits/rejected": 3.203125, "logps/chosen": -2112.0, "logps/rejected": -1984.0, "loss": 0.6764, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4296875, "rewards/margins": 0.0400390625, "rewards/rejected": 0.390625, "step": 1497 }, { "epoch": 0.43213616039232655, "grad_norm": 11.115259547939155, "learning_rate": 3.49986744016713e-07, "logits/chosen": 3.171875, "logits/rejected": 3.21875, "logps/chosen": -1704.0, "logps/rejected": -1800.0, "loss": 0.6672, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.48046875, "rewards/margins": 0.06689453125, "rewards/rejected": 0.412109375, "step": 1498 }, { "epoch": 0.43242463579979806, "grad_norm": 9.1792430654426, "learning_rate": 3.4975589901654555e-07, "logits/chosen": 3.109375, "logits/rejected": 3.1875, "logps/chosen": -1336.0, "logps/rejected": -1336.0, "loss": 0.6729, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2688.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1318359375, "rewards/margins": 0.0111083984375, "rewards/rejected": 0.12060546875, "step": 1499 }, { "epoch": 0.4327131112072696, "grad_norm": 11.16928600759935, "learning_rate": 3.495249528100786e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -2048.0, "logps/rejected": -1992.0, "loss": 0.6832, "loss/demonstration_loss": -4096.0, "loss/preference_loss": -4080.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.5546875, "rewards/margins": 0.19140625, "rewards/rejected": 0.361328125, "step": 1500 }, { "epoch": 0.4330015866147411, "grad_norm": 11.179808598153329, "learning_rate": 3.4929390563161606e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1600.0, "logps/rejected": -1368.0, "loss": 0.6486, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.349609375, "rewards/margins": 0.1533203125, "rewards/rejected": 0.1962890625, "step": 1501 }, { "epoch": 0.4332900620222126, "grad_norm": 11.084748611247903, "learning_rate": 3.4906275771556435e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1496.0, "logps/rejected": -1568.0, "loss": 0.6989, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.287109375, "rewards/margins": 0.02587890625, "rewards/rejected": 0.26171875, "step": 1502 }, { "epoch": 0.4335785374296841, "grad_norm": 10.5528167332915, "learning_rate": 3.4883150929643236e-07, "logits/chosen": 3.125, "logits/rejected": 3.125, "logps/chosen": -1360.0, "logps/rejected": -1136.0, "loss": 0.6586, "loss/demonstration_loss": -2512.0, "loss/preference_loss": -2496.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.138671875, "rewards/margins": 0.055419921875, "rewards/rejected": 0.08349609375, "step": 1503 }, { "epoch": 0.4338670128371556, "grad_norm": 10.384056551323711, "learning_rate": 3.486001606088307e-07, "logits/chosen": 3.03125, "logits/rejected": 3.09375, "logps/chosen": -1440.0, "logps/rejected": -1648.0, "loss": 0.6469, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2734375, "rewards/margins": 0.142578125, "rewards/rejected": 0.1318359375, "step": 1504 }, { "epoch": 0.43415548824462713, "grad_norm": 10.566809235732396, "learning_rate": 3.4836871188747165e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1592.0, "logps/rejected": -1600.0, "loss": 0.6807, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.265625, "rewards/margins": -0.0108642578125, "rewards/rejected": 0.27734375, "step": 1505 }, { "epoch": 0.43444396365209864, "grad_norm": 11.197892260093418, "learning_rate": 3.48137163367169e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1544.0, "logps/rejected": -1480.0, "loss": 0.6611, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1533203125, "rewards/margins": -0.061767578125, "rewards/rejected": 0.2158203125, "step": 1506 }, { "epoch": 0.43473243905957015, "grad_norm": 11.455916758922642, "learning_rate": 3.479055152828382e-07, "logits/chosen": 3.21875, "logits/rejected": 3.203125, "logps/chosen": -1600.0, "logps/rejected": -1776.0, "loss": 0.6713, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.30859375, "rewards/margins": 0.051513671875, "rewards/rejected": 0.2578125, "step": 1507 }, { "epoch": 0.43502091446704166, "grad_norm": 13.166538113964373, "learning_rate": 3.476737678694951e-07, "logits/chosen": 3.140625, "logits/rejected": 3.1875, "logps/chosen": -1808.0, "logps/rejected": -1632.0, "loss": 0.6819, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.291015625, "rewards/margins": 0.1171875, "rewards/rejected": 0.1748046875, "step": 1508 }, { "epoch": 0.4353093898745132, "grad_norm": 13.407125853963413, "learning_rate": 3.474419213622567e-07, "logits/chosen": 3.171875, "logits/rejected": 3.125, "logps/chosen": -1872.0, "logps/rejected": -1984.0, "loss": 0.6786, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.322265625, "rewards/margins": 0.0235595703125, "rewards/rejected": 0.296875, "step": 1509 }, { "epoch": 0.4355978652819847, "grad_norm": 10.465238306107826, "learning_rate": 3.472099759963404e-07, "logits/chosen": 3.09375, "logits/rejected": 3.125, "logps/chosen": -1592.0, "logps/rejected": -1792.0, "loss": 0.6251, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2265625, "rewards/margins": 0.0301513671875, "rewards/rejected": 0.1962890625, "step": 1510 }, { "epoch": 0.4358863406894562, "grad_norm": 10.30714250308569, "learning_rate": 3.4697793200706395e-07, "logits/chosen": 3.203125, "logits/rejected": 3.25, "logps/chosen": -1424.0, "logps/rejected": -1384.0, "loss": 0.6339, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2832.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3125, "rewards/margins": 0.1123046875, "rewards/rejected": 0.201171875, "step": 1511 }, { "epoch": 0.43617481609692776, "grad_norm": 10.594098814439167, "learning_rate": 3.467457896298452e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1584.0, "logps/rejected": -1728.0, "loss": 0.6526, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.28125, "rewards/margins": 0.095703125, "rewards/rejected": 0.185546875, "step": 1512 }, { "epoch": 0.4364632915043993, "grad_norm": 11.18977581108728, "learning_rate": 3.465135491002017e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -1848.0, "logps/rejected": -1824.0, "loss": 0.6675, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.421875, "rewards/margins": 0.0634765625, "rewards/rejected": 0.357421875, "step": 1513 }, { "epoch": 0.4367517669118708, "grad_norm": 11.67466983083644, "learning_rate": 3.462812106537506e-07, "logits/chosen": 3.015625, "logits/rejected": 3.03125, "logps/chosen": -1800.0, "logps/rejected": -1800.0, "loss": 0.6767, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.322265625, "rewards/margins": 0.1083984375, "rewards/rejected": 0.212890625, "step": 1514 }, { "epoch": 0.4370402423193423, "grad_norm": 22.602851819045313, "learning_rate": 3.4604877452620853e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1768.0, "logps/rejected": -1688.0, "loss": 0.6974, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.353515625, "rewards/margins": -0.0089111328125, "rewards/rejected": 0.361328125, "step": 1515 }, { "epoch": 0.4373287177268138, "grad_norm": 11.26426053911973, "learning_rate": 3.4581624095339114e-07, "logits/chosen": 3.28125, "logits/rejected": 3.28125, "logps/chosen": -1960.0, "logps/rejected": -1944.0, "loss": 0.6805, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.34375, "rewards/margins": 0.04296875, "rewards/rejected": 0.30078125, "step": 1516 }, { "epoch": 0.4376171931342853, "grad_norm": 11.020470055369165, "learning_rate": 3.4558361017121275e-07, "logits/chosen": 3.296875, "logits/rejected": 3.296875, "logps/chosen": -1864.0, "logps/rejected": -1864.0, "loss": 0.6659, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.40625, "rewards/margins": 0.08642578125, "rewards/rejected": 0.3203125, "step": 1517 }, { "epoch": 0.43790566854175683, "grad_norm": 11.02489383490104, "learning_rate": 3.453508824156866e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1712.0, "logps/rejected": -1600.0, "loss": 0.6729, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4921875, "rewards/margins": 0.078125, "rewards/rejected": 0.4140625, "step": 1518 }, { "epoch": 0.43819414394922834, "grad_norm": 11.641776554706913, "learning_rate": 3.451180579229242e-07, "logits/chosen": 3.234375, "logits/rejected": 3.1875, "logps/chosen": -1656.0, "logps/rejected": -1640.0, "loss": 0.6934, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3828125, "rewards/margins": 0.03173828125, "rewards/rejected": 0.3515625, "step": 1519 }, { "epoch": 0.43848261935669985, "grad_norm": 10.702828371236077, "learning_rate": 3.448851369291351e-07, "logits/chosen": 3.15625, "logits/rejected": 3.03125, "logps/chosen": -1720.0, "logps/rejected": -1568.0, "loss": 0.6484, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.46484375, "rewards/margins": 0.1923828125, "rewards/rejected": 0.2734375, "step": 1520 }, { "epoch": 0.43877109476417137, "grad_norm": 12.82441133727652, "learning_rate": 3.446521196706271e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1776.0, "logps/rejected": -1896.0, "loss": 0.7014, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4609375, "rewards/margins": 0.07275390625, "rewards/rejected": 0.388671875, "step": 1521 }, { "epoch": 0.4390595701716429, "grad_norm": 12.00846089074656, "learning_rate": 3.4441900638380503e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1808.0, "logps/rejected": -1824.0, "loss": 0.6988, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.48046875, "rewards/margins": -0.013671875, "rewards/rejected": 0.494140625, "step": 1522 }, { "epoch": 0.4393480455791144, "grad_norm": 12.468991620479162, "learning_rate": 3.4418579730517185e-07, "logits/chosen": 3.03125, "logits/rejected": 3.09375, "logps/chosen": -1992.0, "logps/rejected": -1784.0, "loss": 0.6887, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.349609375, "rewards/margins": 0.115234375, "rewards/rejected": 0.234375, "step": 1523 }, { "epoch": 0.4396365209865859, "grad_norm": 9.023467586083413, "learning_rate": 3.439524926713272e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -1672.0, "logps/rejected": -1704.0, "loss": 0.6411, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.33203125, "rewards/margins": 0.076171875, "rewards/rejected": 0.255859375, "step": 1524 }, { "epoch": 0.4399249963940574, "grad_norm": 11.096406358094628, "learning_rate": 3.4371909271896786e-07, "logits/chosen": 3.21875, "logits/rejected": 3.15625, "logps/chosen": -1800.0, "logps/rejected": -1864.0, "loss": 0.6544, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5078125, "rewards/margins": 0.1357421875, "rewards/rejected": 0.373046875, "step": 1525 }, { "epoch": 0.4402134718015289, "grad_norm": 10.846299160475898, "learning_rate": 3.4348559768488747e-07, "logits/chosen": 3.15625, "logits/rejected": 3.25, "logps/chosen": -1576.0, "logps/rejected": -1112.0, "loss": 0.6783, "loss/demonstration_loss": -2704.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2001953125, "rewards/margins": 0.08154296875, "rewards/rejected": 0.119140625, "step": 1526 }, { "epoch": 0.44050194720900043, "grad_norm": 12.50277006010662, "learning_rate": 3.432520078059758e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1560.0, "logps/rejected": -1784.0, "loss": 0.7116, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.380859375, "rewards/margins": 0.0281982421875, "rewards/rejected": 0.3515625, "step": 1527 }, { "epoch": 0.44079042261647194, "grad_norm": 10.438881187852047, "learning_rate": 3.4301832331921894e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1304.0, "logps/rejected": -1256.0, "loss": 0.682, "loss/demonstration_loss": -2576.0, "loss/preference_loss": -2576.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.134765625, "rewards/margins": 0.0269775390625, "rewards/rejected": 0.10791015625, "step": 1528 }, { "epoch": 0.44107889802394346, "grad_norm": 12.043717891923857, "learning_rate": 3.4278454446169926e-07, "logits/chosen": 3.203125, "logits/rejected": 3.125, "logps/chosen": -1512.0, "logps/rejected": -1656.0, "loss": 0.7047, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.267578125, "rewards/margins": 0.0205078125, "rewards/rejected": 0.24609375, "step": 1529 }, { "epoch": 0.44136737343141497, "grad_norm": 10.909474402337759, "learning_rate": 3.4255067147059446e-07, "logits/chosen": 3.140625, "logits/rejected": 3.15625, "logps/chosen": -1720.0, "logps/rejected": -1384.0, "loss": 0.688, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.14453125, "rewards/margins": 0.0361328125, "rewards/rejected": 0.10888671875, "step": 1530 }, { "epoch": 0.4416558488388865, "grad_norm": 11.163215450835171, "learning_rate": 3.42316704583178e-07, "logits/chosen": 3.171875, "logits/rejected": 3.203125, "logps/chosen": -1840.0, "logps/rejected": -1864.0, "loss": 0.6673, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.330078125, "rewards/margins": -0.007415771484375, "rewards/rejected": 0.337890625, "step": 1531 }, { "epoch": 0.441944324246358, "grad_norm": 11.428731644688176, "learning_rate": 3.420826440368185e-07, "logits/chosen": 3.125, "logits/rejected": 3.140625, "logps/chosen": -1928.0, "logps/rejected": -2048.0, "loss": 0.7112, "loss/demonstration_loss": -4016.0, "loss/preference_loss": -4016.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.337890625, "rewards/margins": 0.005889892578125, "rewards/rejected": 0.33203125, "step": 1532 }, { "epoch": 0.4422327996538295, "grad_norm": 13.242785426143541, "learning_rate": 3.4184849006897965e-07, "logits/chosen": 3.078125, "logits/rejected": 3.046875, "logps/chosen": -1592.0, "logps/rejected": -1616.0, "loss": 0.7047, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.34375, "rewards/margins": 0.12060546875, "rewards/rejected": 0.2236328125, "step": 1533 }, { "epoch": 0.442521275061301, "grad_norm": 10.695306665147804, "learning_rate": 3.4161424291722e-07, "logits/chosen": 3.15625, "logits/rejected": 3.1875, "logps/chosen": -1432.0, "logps/rejected": -1528.0, "loss": 0.6439, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.28125, "rewards/margins": 0.056884765625, "rewards/rejected": 0.224609375, "step": 1534 }, { "epoch": 0.4428097504687725, "grad_norm": 11.559363719089523, "learning_rate": 3.413799028191923e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -1528.0, "logps/rejected": -1776.0, "loss": 0.6787, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.2236328125, "rewards/margins": -0.003021240234375, "rewards/rejected": 0.2265625, "step": 1535 }, { "epoch": 0.44309822587624403, "grad_norm": 11.80060863439122, "learning_rate": 3.41145470012644e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1680.0, "logps/rejected": -1560.0, "loss": 0.6838, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.40625, "rewards/margins": 0.05810546875, "rewards/rejected": 0.34765625, "step": 1536 }, { "epoch": 0.44338670128371555, "grad_norm": 11.28455687625612, "learning_rate": 3.4091094473541643e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1360.0, "logps/rejected": -1400.0, "loss": 0.7161, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.39453125, "rewards/margins": 0.025390625, "rewards/rejected": 0.369140625, "step": 1537 }, { "epoch": 0.44367517669118706, "grad_norm": 12.514969802051278, "learning_rate": 3.406763272254447e-07, "logits/chosen": 3.0, "logits/rejected": 3.03125, "logps/chosen": -1640.0, "logps/rejected": -1496.0, "loss": 0.6927, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4609375, "rewards/margins": 0.193359375, "rewards/rejected": 0.267578125, "step": 1538 }, { "epoch": 0.44396365209865857, "grad_norm": 11.270246395427481, "learning_rate": 3.404416177207576e-07, "logits/chosen": 3.234375, "logits/rejected": 3.328125, "logps/chosen": -1560.0, "logps/rejected": -1232.0, "loss": 0.6985, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.173828125, "rewards/margins": 0.05322265625, "rewards/rejected": 0.12060546875, "step": 1539 }, { "epoch": 0.4442521275061301, "grad_norm": 12.486459281296215, "learning_rate": 3.4020681645947714e-07, "logits/chosen": 3.25, "logits/rejected": 3.34375, "logps/chosen": -1792.0, "logps/rejected": -1552.0, "loss": 0.6809, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.28515625, "rewards/margins": 0.1728515625, "rewards/rejected": 0.11279296875, "step": 1540 }, { "epoch": 0.4445406029136016, "grad_norm": 11.513183998244546, "learning_rate": 3.3997192367981846e-07, "logits/chosen": 3.09375, "logits/rejected": 3.1875, "logps/chosen": -1568.0, "logps/rejected": -1696.0, "loss": 0.6741, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.234375, "rewards/margins": 0.0849609375, "rewards/rejected": 0.1494140625, "step": 1541 }, { "epoch": 0.4448290783210731, "grad_norm": 12.169513875846395, "learning_rate": 3.3973693962008964e-07, "logits/chosen": 3.203125, "logits/rejected": 3.203125, "logps/chosen": -1616.0, "logps/rejected": -1576.0, "loss": 0.7222, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.296875, "rewards/margins": -0.04833984375, "rewards/rejected": 0.345703125, "step": 1542 }, { "epoch": 0.44511755372854467, "grad_norm": 11.185228664591719, "learning_rate": 3.395018645186913e-07, "logits/chosen": 3.109375, "logits/rejected": 3.109375, "logps/chosen": -1888.0, "logps/rejected": -1752.0, "loss": 0.637, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.546875, "rewards/margins": 0.11376953125, "rewards/rejected": 0.43359375, "step": 1543 }, { "epoch": 0.4454060291360162, "grad_norm": 11.48268926144227, "learning_rate": 3.3926669861411623e-07, "logits/chosen": 3.3125, "logits/rejected": 3.28125, "logps/chosen": -1560.0, "logps/rejected": -1520.0, "loss": 0.6781, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1826171875, "rewards/margins": 0.0216064453125, "rewards/rejected": 0.1611328125, "step": 1544 }, { "epoch": 0.4456945045434877, "grad_norm": 10.466847863688741, "learning_rate": 3.3903144214494976e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1672.0, "logps/rejected": -1536.0, "loss": 0.654, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.30859375, "rewards/margins": 0.031494140625, "rewards/rejected": 0.27734375, "step": 1545 }, { "epoch": 0.4459829799509592, "grad_norm": 12.004655549239814, "learning_rate": 3.387960953498687e-07, "logits/chosen": 3.296875, "logits/rejected": 3.296875, "logps/chosen": -1472.0, "logps/rejected": -1232.0, "loss": 0.6689, "loss/demonstration_loss": -2720.0, "loss/preference_loss": -2704.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1953125, "rewards/margins": 0.1259765625, "rewards/rejected": 0.06982421875, "step": 1546 }, { "epoch": 0.4462714553584307, "grad_norm": 13.128230455047806, "learning_rate": 3.3856065846764174e-07, "logits/chosen": 3.1875, "logits/rejected": 3.1875, "logps/chosen": -1688.0, "logps/rejected": -1568.0, "loss": 0.6219, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.205078125, "rewards/margins": 0.09375, "rewards/rejected": 0.111328125, "step": 1547 }, { "epoch": 0.4465599307659022, "grad_norm": 14.52973155880915, "learning_rate": 3.3832513173712895e-07, "logits/chosen": 3.09375, "logits/rejected": 3.0625, "logps/chosen": -1352.0, "logps/rejected": -1360.0, "loss": 0.7164, "loss/demonstration_loss": -2736.0, "loss/preference_loss": -2736.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.333984375, "rewards/margins": 0.013916015625, "rewards/rejected": 0.3203125, "step": 1548 }, { "epoch": 0.44684840617337374, "grad_norm": 12.76137699543709, "learning_rate": 3.3808951539728145e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1840.0, "logps/rejected": -1728.0, "loss": 0.6718, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.310546875, "rewards/margins": 0.07275390625, "rewards/rejected": 0.2373046875, "step": 1549 }, { "epoch": 0.44713688158084525, "grad_norm": 11.938625019606807, "learning_rate": 3.378538096871412e-07, "logits/chosen": 3.21875, "logits/rejected": 3.25, "logps/chosen": -1912.0, "logps/rejected": -1832.0, "loss": 0.7285, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.341796875, "rewards/margins": -0.0712890625, "rewards/rejected": 0.412109375, "step": 1550 }, { "epoch": 0.44742535698831676, "grad_norm": 11.123998625204196, "learning_rate": 3.376180148458412e-07, "logits/chosen": 3.3125, "logits/rejected": 3.328125, "logps/chosen": -1408.0, "logps/rejected": -1568.0, "loss": 0.6926, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.328125, "rewards/margins": -0.002777099609375, "rewards/rejected": 0.330078125, "step": 1551 }, { "epoch": 0.44771383239578827, "grad_norm": 12.054509010670344, "learning_rate": 3.373821311126044e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1512.0, "logps/rejected": -1656.0, "loss": 0.697, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2265625, "rewards/margins": -0.025146484375, "rewards/rejected": 0.251953125, "step": 1552 }, { "epoch": 0.4480023078032598, "grad_norm": 10.209600410213321, "learning_rate": 3.371461587267444e-07, "logits/chosen": 3.3125, "logits/rejected": 3.296875, "logps/chosen": -1560.0, "logps/rejected": -1360.0, "loss": 0.6876, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.31640625, "rewards/margins": 0.00555419921875, "rewards/rejected": 0.310546875, "step": 1553 }, { "epoch": 0.4482907832107313, "grad_norm": 10.701349061954332, "learning_rate": 3.3691009792766424e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1760.0, "logps/rejected": -1720.0, "loss": 0.6697, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.294921875, "rewards/margins": 0.032470703125, "rewards/rejected": 0.263671875, "step": 1554 }, { "epoch": 0.4485792586182028, "grad_norm": 10.878644934593197, "learning_rate": 3.3667394895485705e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1472.0, "logps/rejected": -1344.0, "loss": 0.6806, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1953125, "rewards/margins": 0.1025390625, "rewards/rejected": 0.09228515625, "step": 1555 }, { "epoch": 0.4488677340256743, "grad_norm": 12.740973489305533, "learning_rate": 3.364377120479054e-07, "logits/chosen": 3.25, "logits/rejected": 3.296875, "logps/chosen": -1640.0, "logps/rejected": -1616.0, "loss": 0.6921, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.1220703125, "rewards/margins": -0.0732421875, "rewards/rejected": 0.1953125, "step": 1556 }, { "epoch": 0.4491562094331458, "grad_norm": 12.020944587129142, "learning_rate": 3.3620138744648076e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1488.0, "logps/rejected": -1544.0, "loss": 0.7266, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.337890625, "rewards/margins": 0.02001953125, "rewards/rejected": 0.318359375, "step": 1557 }, { "epoch": 0.44944468484061734, "grad_norm": 12.29827772532482, "learning_rate": 3.3596497539034396e-07, "logits/chosen": 3.171875, "logits/rejected": 3.125, "logps/chosen": -1648.0, "logps/rejected": -1792.0, "loss": 0.7012, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.3359375, "rewards/margins": -0.0174560546875, "rewards/rejected": 0.353515625, "step": 1558 }, { "epoch": 0.44973316024808885, "grad_norm": 11.45808879883158, "learning_rate": 3.3572847611934417e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1600.0, "logps/rejected": -1520.0, "loss": 0.6841, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2255859375, "rewards/margins": 0.07275390625, "rewards/rejected": 0.15234375, "step": 1559 }, { "epoch": 0.45002163565556036, "grad_norm": 10.428768429102382, "learning_rate": 3.354918898734194e-07, "logits/chosen": 3.15625, "logits/rejected": 3.03125, "logps/chosen": -1768.0, "logps/rejected": -1544.0, "loss": 0.6547, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.2373046875, "rewards/margins": 0.1982421875, "rewards/rejected": 0.038330078125, "step": 1560 }, { "epoch": 0.45031011106303187, "grad_norm": 10.80151748863185, "learning_rate": 3.352552168925957e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1944.0, "logps/rejected": -1880.0, "loss": 0.6698, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.314453125, "rewards/margins": 0.056396484375, "rewards/rejected": 0.2578125, "step": 1561 }, { "epoch": 0.4505985864705034, "grad_norm": 11.242299107912803, "learning_rate": 3.350184574169872e-07, "logits/chosen": 3.0625, "logits/rejected": 3.15625, "logps/chosen": -1384.0, "logps/rejected": -1488.0, "loss": 0.6974, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2912.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3203125, "rewards/margins": -0.054931640625, "rewards/rejected": 0.375, "step": 1562 }, { "epoch": 0.4508870618779749, "grad_norm": 10.54453354195842, "learning_rate": 3.347816116867956e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1920.0, "logps/rejected": -1584.0, "loss": 0.6585, "loss/demonstration_loss": -3552.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.443359375, "rewards/margins": 0.1689453125, "rewards/rejected": 0.2734375, "step": 1563 }, { "epoch": 0.4511755372854464, "grad_norm": 9.739913471552216, "learning_rate": 3.345446799423103e-07, "logits/chosen": 3.265625, "logits/rejected": 3.25, "logps/chosen": -1232.0, "logps/rejected": -1496.0, "loss": 0.688, "loss/demonstration_loss": -2752.0, "loss/preference_loss": -2752.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1591796875, "rewards/margins": 0.01177978515625, "rewards/rejected": 0.1474609375, "step": 1564 }, { "epoch": 0.4514640126929179, "grad_norm": 10.979060066881837, "learning_rate": 3.343076624239081e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1712.0, "logps/rejected": -1432.0, "loss": 0.6838, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.201171875, "rewards/margins": 0.056640625, "rewards/rejected": 0.14453125, "step": 1565 }, { "epoch": 0.4517524881003894, "grad_norm": 10.107511809314284, "learning_rate": 3.3407055937205233e-07, "logits/chosen": 3.015625, "logits/rejected": 3.078125, "logps/chosen": -1600.0, "logps/rejected": -1440.0, "loss": 0.6306, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.26953125, "rewards/margins": 0.15234375, "rewards/rejected": 0.11767578125, "step": 1566 }, { "epoch": 0.45204096350786094, "grad_norm": 11.976752168481113, "learning_rate": 3.338333710272936e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -1944.0, "logps/rejected": -1984.0, "loss": 0.6985, "loss/demonstration_loss": -3968.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.3359375, "rewards/margins": 0.06689453125, "rewards/rejected": 0.26953125, "step": 1567 }, { "epoch": 0.45232943891533245, "grad_norm": 9.589232610036449, "learning_rate": 3.3359609763026875e-07, "logits/chosen": 3.234375, "logits/rejected": 3.203125, "logps/chosen": -1592.0, "logps/rejected": -1672.0, "loss": 0.6697, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3046875, "rewards/margins": 0.041259765625, "rewards/rejected": 0.263671875, "step": 1568 }, { "epoch": 0.45261791432280396, "grad_norm": 10.016207608726702, "learning_rate": 3.333587394217011e-07, "logits/chosen": 3.328125, "logits/rejected": 3.28125, "logps/chosen": -2016.0, "logps/rejected": -1920.0, "loss": 0.6058, "loss/demonstration_loss": -3984.0, "loss/preference_loss": -3952.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.54296875, "rewards/margins": 0.29296875, "rewards/rejected": 0.251953125, "step": 1569 }, { "epoch": 0.45290638973027547, "grad_norm": 11.002271865363085, "learning_rate": 3.3312129664239995e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1376.0, "logps/rejected": -1400.0, "loss": 0.6397, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.421875, "rewards/margins": 0.2333984375, "rewards/rejected": 0.189453125, "step": 1570 }, { "epoch": 0.453194865137747, "grad_norm": 11.563546539471192, "learning_rate": 3.328837695332603e-07, "logits/chosen": 3.203125, "logits/rejected": 3.265625, "logps/chosen": -1184.0, "logps/rejected": -1200.0, "loss": 0.7157, "loss/demonstration_loss": -2400.0, "loss/preference_loss": -2400.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.205078125, "rewards/margins": -0.0235595703125, "rewards/rejected": 0.228515625, "step": 1571 }, { "epoch": 0.4534833405452185, "grad_norm": 11.417359228722788, "learning_rate": 3.326461583352628e-07, "logits/chosen": 3.265625, "logits/rejected": 3.3125, "logps/chosen": -1624.0, "logps/rejected": -1400.0, "loss": 0.6878, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.35546875, "rewards/margins": -0.000152587890625, "rewards/rejected": 0.357421875, "step": 1572 }, { "epoch": 0.45377181595269006, "grad_norm": 10.926266677011677, "learning_rate": 3.3240846328947344e-07, "logits/chosen": 3.140625, "logits/rejected": 3.1875, "logps/chosen": -1536.0, "logps/rejected": -1512.0, "loss": 0.6918, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.06640625, "rewards/margins": -0.0172119140625, "rewards/rejected": 0.083984375, "step": 1573 }, { "epoch": 0.45406029136016157, "grad_norm": 12.408531519132385, "learning_rate": 3.3217068463704314e-07, "logits/chosen": 3.375, "logits/rejected": 3.296875, "logps/chosen": -1792.0, "logps/rejected": -1888.0, "loss": 0.6757, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.50390625, "rewards/margins": 0.09228515625, "rewards/rejected": 0.41015625, "step": 1574 }, { "epoch": 0.4543487667676331, "grad_norm": 11.68798859693865, "learning_rate": 3.319328226192078e-07, "logits/chosen": 3.390625, "logits/rejected": 3.34375, "logps/chosen": -1808.0, "logps/rejected": -1816.0, "loss": 0.6769, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.271484375, "rewards/margins": 0.050048828125, "rewards/rejected": 0.2216796875, "step": 1575 }, { "epoch": 0.4546372421751046, "grad_norm": 11.959906105865793, "learning_rate": 3.316948774772878e-07, "logits/chosen": 3.265625, "logits/rejected": 3.28125, "logps/chosen": -2160.0, "logps/rejected": -1968.0, "loss": 0.7026, "loss/demonstration_loss": -4160.0, "loss/preference_loss": -4160.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.255859375, "rewards/margins": 0.014404296875, "rewards/rejected": 0.2412109375, "step": 1576 }, { "epoch": 0.4549257175825761, "grad_norm": 11.432737253268336, "learning_rate": 3.314568494526879e-07, "logits/chosen": 3.21875, "logits/rejected": 3.203125, "logps/chosen": -1488.0, "logps/rejected": -1656.0, "loss": 0.6793, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.357421875, "rewards/margins": 0.007110595703125, "rewards/rejected": 0.349609375, "step": 1577 }, { "epoch": 0.4552141929900476, "grad_norm": 9.636465383121207, "learning_rate": 3.31218738786897e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1528.0, "logps/rejected": -1456.0, "loss": 0.6393, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3984375, "rewards/margins": 0.130859375, "rewards/rejected": 0.267578125, "step": 1578 }, { "epoch": 0.45550266839751913, "grad_norm": 10.285673423955991, "learning_rate": 3.309805457214877e-07, "logits/chosen": 3.359375, "logits/rejected": 3.34375, "logps/chosen": -1824.0, "logps/rejected": -1752.0, "loss": 0.6726, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3359375, "rewards/margins": 0.047607421875, "rewards/rejected": 0.287109375, "step": 1579 }, { "epoch": 0.45579114380499064, "grad_norm": 10.672935570587352, "learning_rate": 3.3074227049811624e-07, "logits/chosen": 3.3125, "logits/rejected": 3.328125, "logps/chosen": -1336.0, "logps/rejected": -1464.0, "loss": 0.636, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2800.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.201171875, "rewards/margins": 0.1201171875, "rewards/rejected": 0.0810546875, "step": 1580 }, { "epoch": 0.45607961921246215, "grad_norm": 12.585785571947707, "learning_rate": 3.305039133585223e-07, "logits/chosen": 3.203125, "logits/rejected": 3.3125, "logps/chosen": -1464.0, "logps/rejected": -1400.0, "loss": 0.7053, "loss/demonstration_loss": -2896.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.353515625, "rewards/margins": -0.0263671875, "rewards/rejected": 0.37890625, "step": 1581 }, { "epoch": 0.45636809461993366, "grad_norm": 12.024795240532654, "learning_rate": 3.3026547454452863e-07, "logits/chosen": 3.28125, "logits/rejected": 3.296875, "logps/chosen": -1520.0, "logps/rejected": -1640.0, "loss": 0.6483, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.171875, "rewards/margins": 0.04150390625, "rewards/rejected": 0.130859375, "step": 1582 }, { "epoch": 0.4566565700274052, "grad_norm": 10.264610433567263, "learning_rate": 3.3002695429804084e-07, "logits/chosen": 3.3125, "logits/rejected": 3.296875, "logps/chosen": -1280.0, "logps/rejected": -1504.0, "loss": 0.6642, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2138671875, "rewards/margins": -0.0118408203125, "rewards/rejected": 0.2255859375, "step": 1583 }, { "epoch": 0.4569450454348767, "grad_norm": 11.29637641607261, "learning_rate": 3.2978835286104705e-07, "logits/chosen": 3.234375, "logits/rejected": 3.296875, "logps/chosen": -1840.0, "logps/rejected": -1832.0, "loss": 0.7186, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.255859375, "rewards/margins": -0.08056640625, "rewards/rejected": 0.337890625, "step": 1584 }, { "epoch": 0.4572335208423482, "grad_norm": 9.280602330638628, "learning_rate": 3.295496704756179e-07, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -1552.0, "logps/rejected": -1384.0, "loss": 0.6304, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2960.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.39453125, "rewards/margins": 0.18359375, "rewards/rejected": 0.2109375, "step": 1585 }, { "epoch": 0.4575219962498197, "grad_norm": 9.715913642587658, "learning_rate": 3.2931090738390597e-07, "logits/chosen": 3.265625, "logits/rejected": 3.3125, "logps/chosen": -1792.0, "logps/rejected": -1936.0, "loss": 0.715, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.291015625, "rewards/margins": -0.0167236328125, "rewards/rejected": 0.30859375, "step": 1586 }, { "epoch": 0.4578104716572912, "grad_norm": 11.433723127124694, "learning_rate": 3.2907206382814606e-07, "logits/chosen": 3.21875, "logits/rejected": 3.203125, "logps/chosen": -1608.0, "logps/rejected": -1656.0, "loss": 0.6518, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.40625, "rewards/margins": 0.189453125, "rewards/rejected": 0.2177734375, "step": 1587 }, { "epoch": 0.45809894706476273, "grad_norm": 11.419950109332657, "learning_rate": 3.2883314005065434e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1840.0, "logps/rejected": -1680.0, "loss": 0.6088, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.45703125, "rewards/margins": 0.2451171875, "rewards/rejected": 0.2109375, "step": 1588 }, { "epoch": 0.45838742247223424, "grad_norm": 11.81995880819628, "learning_rate": 3.285941362938284e-07, "logits/chosen": 3.3125, "logits/rejected": 3.265625, "logps/chosen": -1992.0, "logps/rejected": -2096.0, "loss": 0.7213, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.482421875, "rewards/margins": -0.01904296875, "rewards/rejected": 0.5, "step": 1589 }, { "epoch": 0.45867589787970575, "grad_norm": 12.531602892650216, "learning_rate": 3.283550528001469e-07, "logits/chosen": 3.09375, "logits/rejected": 3.109375, "logps/chosen": -1928.0, "logps/rejected": -2048.0, "loss": 0.7113, "loss/demonstration_loss": -4016.0, "loss/preference_loss": -4000.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.40234375, "rewards/margins": 0.07421875, "rewards/rejected": 0.328125, "step": 1590 }, { "epoch": 0.45896437328717726, "grad_norm": 12.219974697480648, "learning_rate": 3.2811588981216946e-07, "logits/chosen": 3.265625, "logits/rejected": 3.296875, "logps/chosen": -1880.0, "logps/rejected": -1680.0, "loss": 0.6763, "loss/demonstration_loss": -3600.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.353515625, "rewards/margins": 0.11572265625, "rewards/rejected": 0.23828125, "step": 1591 }, { "epoch": 0.4592528486946488, "grad_norm": 10.91595781896608, "learning_rate": 3.2787664757253663e-07, "logits/chosen": 3.296875, "logits/rejected": 3.25, "logps/chosen": -1864.0, "logps/rejected": -1856.0, "loss": 0.7209, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.275390625, "rewards/margins": -0.05322265625, "rewards/rejected": 0.328125, "step": 1592 }, { "epoch": 0.4595413241021203, "grad_norm": 11.285074798067944, "learning_rate": 3.2763732632396885e-07, "logits/chosen": 3.0, "logits/rejected": 3.21875, "logps/chosen": -1416.0, "logps/rejected": -1400.0, "loss": 0.6818, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.3515625, "rewards/margins": 0.0849609375, "rewards/rejected": 0.267578125, "step": 1593 }, { "epoch": 0.4598297995095918, "grad_norm": 10.475028053202847, "learning_rate": 3.273979263092671e-07, "logits/chosen": 3.328125, "logits/rejected": 3.375, "logps/chosen": -2064.0, "logps/rejected": -2040.0, "loss": 0.6253, "loss/demonstration_loss": -4160.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.53125, "rewards/margins": 0.26171875, "rewards/rejected": 0.26953125, "step": 1594 }, { "epoch": 0.4601182749170633, "grad_norm": 10.283963465561667, "learning_rate": 3.271584477713121e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1768.0, "logps/rejected": -1712.0, "loss": 0.6874, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.26953125, "rewards/margins": 0.025634765625, "rewards/rejected": 0.244140625, "step": 1595 }, { "epoch": 0.4604067503245348, "grad_norm": 9.422505031334358, "learning_rate": 3.269188909530644e-07, "logits/chosen": 3.078125, "logits/rejected": 3.125, "logps/chosen": -1608.0, "logps/rejected": -1656.0, "loss": 0.6855, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.3515625, "rewards/margins": -0.0216064453125, "rewards/rejected": 0.373046875, "step": 1596 }, { "epoch": 0.46069522573200633, "grad_norm": 16.83808828034605, "learning_rate": 3.266792560975638e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -1760.0, "logps/rejected": -1888.0, "loss": 0.7069, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.40625, "rewards/margins": 0.07763671875, "rewards/rejected": 0.328125, "step": 1597 }, { "epoch": 0.46098370113947784, "grad_norm": 14.722103705466585, "learning_rate": 3.264395434479292e-07, "logits/chosen": 3.28125, "logits/rejected": 3.34375, "logps/chosen": -1512.0, "logps/rejected": -1600.0, "loss": 0.6779, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.302734375, "rewards/margins": 0.11865234375, "rewards/rejected": 0.18359375, "step": 1598 }, { "epoch": 0.46127217654694935, "grad_norm": 10.824814018495962, "learning_rate": 3.2619975324735866e-07, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -1680.0, "logps/rejected": -1536.0, "loss": 0.7293, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.2412109375, "rewards/margins": -0.0400390625, "rewards/rejected": 0.28125, "step": 1599 }, { "epoch": 0.46156065195442086, "grad_norm": 10.191871572579466, "learning_rate": 3.259598857391289e-07, "logits/chosen": 3.34375, "logits/rejected": 3.3125, "logps/chosen": -1552.0, "logps/rejected": -1568.0, "loss": 0.6488, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.279296875, "rewards/margins": 0.023681640625, "rewards/rejected": 0.255859375, "step": 1600 }, { "epoch": 0.4618491273618924, "grad_norm": 12.877389668105883, "learning_rate": 3.2571994116659474e-07, "logits/chosen": 3.125, "logits/rejected": 3.1875, "logps/chosen": -1808.0, "logps/rejected": -1664.0, "loss": 0.6634, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.228515625, "rewards/margins": 0.076171875, "rewards/rejected": 0.15234375, "step": 1601 }, { "epoch": 0.4621376027693639, "grad_norm": 10.438705605147105, "learning_rate": 3.254799197731896e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1648.0, "logps/rejected": -1960.0, "loss": 0.6796, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4765625, "rewards/margins": 0.08740234375, "rewards/rejected": 0.388671875, "step": 1602 }, { "epoch": 0.4624260781768354, "grad_norm": 10.56010210180927, "learning_rate": 3.2523982180242465e-07, "logits/chosen": 3.1875, "logits/rejected": 3.125, "logps/chosen": -1528.0, "logps/rejected": -1568.0, "loss": 0.6579, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.263671875, "rewards/margins": 0.1494140625, "rewards/rejected": 0.11328125, "step": 1603 }, { "epoch": 0.46271455358430696, "grad_norm": 10.218838138049144, "learning_rate": 3.249996474978887e-07, "logits/chosen": 3.234375, "logits/rejected": 3.171875, "logps/chosen": -1784.0, "logps/rejected": -1552.0, "loss": 0.651, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.41796875, "rewards/margins": 0.1201171875, "rewards/rejected": 0.298828125, "step": 1604 }, { "epoch": 0.4630030289917785, "grad_norm": 11.209920137305854, "learning_rate": 3.2475939710324817e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1728.0, "logps/rejected": -1568.0, "loss": 0.7146, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1708984375, "rewards/margins": -0.150390625, "rewards/rejected": 0.3203125, "step": 1605 }, { "epoch": 0.46329150439925, "grad_norm": 11.03767058470553, "learning_rate": 3.245190708622465e-07, "logits/chosen": 3.25, "logits/rejected": 3.28125, "logps/chosen": -1920.0, "logps/rejected": -1728.0, "loss": 0.705, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.291015625, "rewards/margins": 0.03369140625, "rewards/rejected": 0.2578125, "step": 1606 }, { "epoch": 0.4635799798067215, "grad_norm": 10.86809960140214, "learning_rate": 3.242786690187042e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -1784.0, "logps/rejected": -1824.0, "loss": 0.667, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.36328125, "rewards/margins": 0.16796875, "rewards/rejected": 0.1953125, "step": 1607 }, { "epoch": 0.463868455214193, "grad_norm": 12.081922729437892, "learning_rate": 3.2403819181651836e-07, "logits/chosen": 3.15625, "logits/rejected": 3.125, "logps/chosen": -1464.0, "logps/rejected": -1552.0, "loss": 0.6693, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3046875, "rewards/margins": 0.08544921875, "rewards/rejected": 0.21875, "step": 1608 }, { "epoch": 0.4641569306216645, "grad_norm": 13.26160229986464, "learning_rate": 3.237976394996626e-07, "logits/chosen": 3.40625, "logits/rejected": 3.390625, "logps/chosen": -1976.0, "logps/rejected": -2112.0, "loss": 0.7524, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4128.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.26953125, "rewards/margins": -0.2431640625, "rewards/rejected": 0.51171875, "step": 1609 }, { "epoch": 0.46444540602913603, "grad_norm": 9.047943325793094, "learning_rate": 3.235570123121869e-07, "logits/chosen": 3.234375, "logits/rejected": 3.265625, "logps/chosen": -1568.0, "logps/rejected": -1344.0, "loss": 0.6666, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41015625, "rewards/margins": 0.11865234375, "rewards/rejected": 0.291015625, "step": 1610 }, { "epoch": 0.46473388143660754, "grad_norm": 10.39249120187221, "learning_rate": 3.233163104982169e-07, "logits/chosen": 3.140625, "logits/rejected": 3.046875, "logps/chosen": -1720.0, "logps/rejected": -1592.0, "loss": 0.686, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.259765625, "rewards/margins": 0.00299072265625, "rewards/rejected": 0.255859375, "step": 1611 }, { "epoch": 0.46502235684407905, "grad_norm": 9.829661040315031, "learning_rate": 3.2307553430195407e-07, "logits/chosen": 3.3125, "logits/rejected": 3.296875, "logps/chosen": -1760.0, "logps/rejected": -1824.0, "loss": 0.6541, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.455078125, "rewards/margins": 0.11376953125, "rewards/rejected": 0.341796875, "step": 1612 }, { "epoch": 0.46531083225155057, "grad_norm": 11.087936566447548, "learning_rate": 3.2283468396767546e-07, "logits/chosen": 3.0625, "logits/rejected": 3.09375, "logps/chosen": -2064.0, "logps/rejected": -2096.0, "loss": 0.6747, "loss/demonstration_loss": -4224.0, "loss/preference_loss": -4224.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.61328125, "rewards/margins": 0.06640625, "rewards/rejected": 0.546875, "step": 1613 }, { "epoch": 0.4655993076590221, "grad_norm": 11.602640017393142, "learning_rate": 3.225937597397332e-07, "logits/chosen": 3.265625, "logits/rejected": 3.25, "logps/chosen": -1568.0, "logps/rejected": -1464.0, "loss": 0.6688, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.361328125, "rewards/margins": 0.05615234375, "rewards/rejected": 0.3046875, "step": 1614 }, { "epoch": 0.4658877830664936, "grad_norm": 12.708516687313352, "learning_rate": 3.223527618625545e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1680.0, "logps/rejected": -1880.0, "loss": 0.7216, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3584.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.263671875, "rewards/margins": -0.027099609375, "rewards/rejected": 0.2890625, "step": 1615 }, { "epoch": 0.4661762584739651, "grad_norm": 11.10808862709176, "learning_rate": 3.221116905806412e-07, "logits/chosen": 3.1875, "logits/rejected": 3.21875, "logps/chosen": -1680.0, "logps/rejected": -1552.0, "loss": 0.7016, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.376953125, "rewards/margins": -0.07275390625, "rewards/rejected": 0.44921875, "step": 1616 }, { "epoch": 0.4664647338814366, "grad_norm": 10.951397953440221, "learning_rate": 3.218705461385695e-07, "logits/chosen": 3.171875, "logits/rejected": 3.0625, "logps/chosen": -1736.0, "logps/rejected": -1672.0, "loss": 0.6367, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.30078125, "rewards/margins": 0.09814453125, "rewards/rejected": 0.203125, "step": 1617 }, { "epoch": 0.4667532092889081, "grad_norm": 10.638182654596397, "learning_rate": 3.2162932878099026e-07, "logits/chosen": 3.0625, "logits/rejected": 3.0625, "logps/chosen": -1320.0, "logps/rejected": -1288.0, "loss": 0.7032, "loss/demonstration_loss": -2624.0, "loss/preference_loss": -2624.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2421875, "rewards/margins": 0.031982421875, "rewards/rejected": 0.2099609375, "step": 1618 }, { "epoch": 0.46704168469637963, "grad_norm": 11.251332516602643, "learning_rate": 3.213880387526277e-07, "logits/chosen": 3.171875, "logits/rejected": 3.140625, "logps/chosen": -1704.0, "logps/rejected": -1560.0, "loss": 0.6891, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.1875, "rewards/margins": -0.01220703125, "rewards/rejected": 0.2001953125, "step": 1619 }, { "epoch": 0.46733016010385114, "grad_norm": 10.402063040743373, "learning_rate": 3.2114667629828027e-07, "logits/chosen": 3.15625, "logits/rejected": 3.09375, "logps/chosen": -1712.0, "logps/rejected": -1680.0, "loss": 0.6674, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.369140625, "rewards/margins": 0.11328125, "rewards/rejected": 0.255859375, "step": 1620 }, { "epoch": 0.46761863551132266, "grad_norm": 10.309833023481726, "learning_rate": 3.209052416628196e-07, "logits/chosen": 3.265625, "logits/rejected": 3.25, "logps/chosen": -1784.0, "logps/rejected": -1520.0, "loss": 0.6745, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.390625, "rewards/margins": 0.0064697265625, "rewards/rejected": 0.3828125, "step": 1621 }, { "epoch": 0.46790711091879417, "grad_norm": 11.626064321456356, "learning_rate": 3.206637350911908e-07, "logits/chosen": 3.203125, "logits/rejected": 3.21875, "logps/chosen": -1240.0, "logps/rejected": -1224.0, "loss": 0.6689, "loss/demonstration_loss": -2496.0, "loss/preference_loss": -2480.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.337890625, "rewards/margins": 0.0810546875, "rewards/rejected": 0.2578125, "step": 1622 }, { "epoch": 0.4681955863262657, "grad_norm": 12.740876088796279, "learning_rate": 3.204221568284117e-07, "logits/chosen": 3.171875, "logits/rejected": 3.21875, "logps/chosen": -1896.0, "logps/rejected": -1832.0, "loss": 0.6684, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.40625, "rewards/margins": 0.04248046875, "rewards/rejected": 0.36328125, "step": 1623 }, { "epoch": 0.4684840617337372, "grad_norm": 11.254569496823319, "learning_rate": 3.2018050711957314e-07, "logits/chosen": 3.265625, "logits/rejected": 3.390625, "logps/chosen": -1776.0, "logps/rejected": -1600.0, "loss": 0.6892, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.287109375, "rewards/margins": -0.0244140625, "rewards/rejected": 0.310546875, "step": 1624 }, { "epoch": 0.4687725371412087, "grad_norm": 9.583885925615416, "learning_rate": 3.199387862098381e-07, "logits/chosen": 3.171875, "logits/rejected": 3.15625, "logps/chosen": -1352.0, "logps/rejected": -1288.0, "loss": 0.6734, "loss/demonstration_loss": -2672.0, "loss/preference_loss": -2672.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.275390625, "rewards/margins": 0.017578125, "rewards/rejected": 0.2578125, "step": 1625 }, { "epoch": 0.4690610125486802, "grad_norm": 11.396783106626067, "learning_rate": 3.1969699434444207e-07, "logits/chosen": 3.421875, "logits/rejected": 3.46875, "logps/chosen": -1776.0, "logps/rejected": -1600.0, "loss": 0.7299, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.345703125, "rewards/margins": -0.046875, "rewards/rejected": 0.392578125, "step": 1626 }, { "epoch": 0.4693494879561517, "grad_norm": 10.785681936126883, "learning_rate": 3.1945513176869256e-07, "logits/chosen": 3.265625, "logits/rejected": 3.203125, "logps/chosen": -1664.0, "logps/rejected": -1800.0, "loss": 0.6849, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.1875, "rewards/chosen": 0.279296875, "rewards/margins": -0.0238037109375, "rewards/rejected": 0.302734375, "step": 1627 }, { "epoch": 0.46963796336362323, "grad_norm": 11.481099538840834, "learning_rate": 3.1921319872796856e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1920.0, "logps/rejected": -1856.0, "loss": 0.7278, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.373046875, "rewards/margins": -0.020263671875, "rewards/rejected": 0.392578125, "step": 1628 }, { "epoch": 0.46992643877109475, "grad_norm": 12.93254296511721, "learning_rate": 3.189711954677208e-07, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -1408.0, "logps/rejected": -1336.0, "loss": 0.744, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.380859375, "rewards/margins": -0.03369140625, "rewards/rejected": 0.416015625, "step": 1629 }, { "epoch": 0.47021491417856626, "grad_norm": 11.876540826926718, "learning_rate": 3.187291222334709e-07, "logits/chosen": 3.296875, "logits/rejected": 3.265625, "logps/chosen": -1640.0, "logps/rejected": -1504.0, "loss": 0.7214, "loss/demonstration_loss": -3168.0, "loss/preference_loss": -3168.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.298828125, "rewards/margins": 0.0057373046875, "rewards/rejected": 0.29296875, "step": 1630 }, { "epoch": 0.47050338958603777, "grad_norm": 9.881219982676935, "learning_rate": 3.184869792708121e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -2040.0, "logps/rejected": -1808.0, "loss": 0.6335, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3888.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.484375, "rewards/margins": 0.095703125, "rewards/rejected": 0.388671875, "step": 1631 }, { "epoch": 0.4707918649935093, "grad_norm": 11.306274851210203, "learning_rate": 3.182447668254077e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -1968.0, "logps/rejected": -1920.0, "loss": 0.7064, "loss/demonstration_loss": -3936.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.498046875, "rewards/margins": -0.05712890625, "rewards/rejected": 0.5546875, "step": 1632 }, { "epoch": 0.4710803404009808, "grad_norm": 12.687967811554584, "learning_rate": 3.1800248514299195e-07, "logits/chosen": 3.375, "logits/rejected": 3.34375, "logps/chosen": -1816.0, "logps/rejected": -1768.0, "loss": 0.693, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.41015625, "rewards/margins": 0.0322265625, "rewards/rejected": 0.376953125, "step": 1633 }, { "epoch": 0.47136881580845236, "grad_norm": 11.49439956266737, "learning_rate": 3.177601344693692e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1672.0, "logps/rejected": -1696.0, "loss": 0.6761, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.37890625, "rewards/margins": 0.010986328125, "rewards/rejected": 0.3671875, "step": 1634 }, { "epoch": 0.47165729121592387, "grad_norm": 10.277180864267585, "learning_rate": 3.1751771505041357e-07, "logits/chosen": 3.109375, "logits/rejected": 3.171875, "logps/chosen": -1696.0, "logps/rejected": -1640.0, "loss": 0.6911, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.283203125, "rewards/margins": 0.027587890625, "rewards/rejected": 0.255859375, "step": 1635 }, { "epoch": 0.4719457666233954, "grad_norm": 12.082963955240855, "learning_rate": 3.172752271320693e-07, "logits/chosen": 3.28125, "logits/rejected": 3.28125, "logps/chosen": -1768.0, "logps/rejected": -1552.0, "loss": 0.685, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.44140625, "rewards/margins": 0.1689453125, "rewards/rejected": 0.2734375, "step": 1636 }, { "epoch": 0.4722342420308669, "grad_norm": 10.910851654540808, "learning_rate": 3.170326709603501e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1240.0, "logps/rejected": -1352.0, "loss": 0.6743, "loss/demonstration_loss": -2624.0, "loss/preference_loss": -2624.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.36328125, "rewards/margins": -0.00250244140625, "rewards/rejected": 0.365234375, "step": 1637 }, { "epoch": 0.4725227174383384, "grad_norm": 10.133684060220697, "learning_rate": 3.1679004678133853e-07, "logits/chosen": 3.34375, "logits/rejected": 3.390625, "logps/chosen": -1440.0, "logps/rejected": -1576.0, "loss": 0.6431, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.431640625, "rewards/margins": 0.107421875, "rewards/rejected": 0.32421875, "step": 1638 }, { "epoch": 0.4728111928458099, "grad_norm": 11.754098116177238, "learning_rate": 3.165473548411864e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1720.0, "logps/rejected": -1640.0, "loss": 0.6565, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.51171875, "rewards/margins": 0.173828125, "rewards/rejected": 0.33984375, "step": 1639 }, { "epoch": 0.4730996682532814, "grad_norm": 10.141994952867801, "learning_rate": 3.163045953861145e-07, "logits/chosen": 3.234375, "logits/rejected": 3.28125, "logps/chosen": -1744.0, "logps/rejected": -1680.0, "loss": 0.6511, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4453125, "rewards/margins": 0.07275390625, "rewards/rejected": 0.373046875, "step": 1640 }, { "epoch": 0.47338814366075294, "grad_norm": 13.657853005883569, "learning_rate": 3.160617686624117e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1928.0, "logps/rejected": -1880.0, "loss": 0.7189, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.416015625, "rewards/margins": -0.05712890625, "rewards/rejected": 0.47265625, "step": 1641 }, { "epoch": 0.47367661906822445, "grad_norm": 11.197384594235942, "learning_rate": 3.158188749164354e-07, "logits/chosen": 3.359375, "logits/rejected": 3.359375, "logps/chosen": -1856.0, "logps/rejected": -1584.0, "loss": 0.629, "loss/demonstration_loss": -3472.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3515625, "rewards/margins": 0.177734375, "rewards/rejected": 0.173828125, "step": 1642 }, { "epoch": 0.47396509447569596, "grad_norm": 9.925114677252418, "learning_rate": 3.155759143946108e-07, "logits/chosen": 3.25, "logits/rejected": 3.1875, "logps/chosen": -1224.0, "logps/rejected": -1280.0, "loss": 0.6697, "loss/demonstration_loss": -2544.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.291015625, "rewards/margins": 0.025390625, "rewards/rejected": 0.265625, "step": 1643 }, { "epoch": 0.47425356988316747, "grad_norm": 9.946484403005293, "learning_rate": 3.15332887343431e-07, "logits/chosen": 3.265625, "logits/rejected": 3.28125, "logps/chosen": -2192.0, "logps/rejected": -1872.0, "loss": 0.6608, "loss/demonstration_loss": -4128.0, "loss/preference_loss": -4096.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5234375, "rewards/margins": 0.1201171875, "rewards/rejected": 0.40234375, "step": 1644 }, { "epoch": 0.474542045290639, "grad_norm": 10.887422199369505, "learning_rate": 3.1508979400945664e-07, "logits/chosen": 3.296875, "logits/rejected": 3.265625, "logps/chosen": -2192.0, "logps/rejected": -1872.0, "loss": 0.654, "loss/demonstration_loss": -4096.0, "loss/preference_loss": -4096.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.435546875, "rewards/margins": 0.002044677734375, "rewards/rejected": 0.43359375, "step": 1645 }, { "epoch": 0.4748305206981105, "grad_norm": 10.964583662633919, "learning_rate": 3.148466346393154e-07, "logits/chosen": 3.171875, "logits/rejected": 3.234375, "logps/chosen": -1928.0, "logps/rejected": -1896.0, "loss": 0.6478, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.36328125, "rewards/margins": 0.1025390625, "rewards/rejected": 0.259765625, "step": 1646 }, { "epoch": 0.475118996105582, "grad_norm": 11.193614909120155, "learning_rate": 3.1460340947970197e-07, "logits/chosen": 3.375, "logits/rejected": 3.3125, "logps/chosen": -1752.0, "logps/rejected": -1632.0, "loss": 0.7308, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.162109375, "rewards/margins": -0.08056640625, "rewards/rejected": 0.2431640625, "step": 1647 }, { "epoch": 0.4754074715130535, "grad_norm": 18.495126359423217, "learning_rate": 3.14360118777378e-07, "logits/chosen": 3.203125, "logits/rejected": 3.203125, "logps/chosen": -1720.0, "logps/rejected": -1464.0, "loss": 0.6885, "loss/demonstration_loss": -3216.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.439453125, "rewards/margins": 0.0247802734375, "rewards/rejected": 0.4140625, "step": 1648 }, { "epoch": 0.475695946920525, "grad_norm": 9.766135893823739, "learning_rate": 3.141167627791716e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1680.0, "logps/rejected": -1512.0, "loss": 0.6561, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.365234375, "rewards/margins": 0.1435546875, "rewards/rejected": 0.220703125, "step": 1649 }, { "epoch": 0.47598442232799654, "grad_norm": 10.691923087368378, "learning_rate": 3.138733417319769e-07, "logits/chosen": 3.21875, "logits/rejected": 3.25, "logps/chosen": -1784.0, "logps/rejected": -1600.0, "loss": 0.6689, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.38671875, "rewards/margins": 0.10302734375, "rewards/rejected": 0.28515625, "step": 1650 }, { "epoch": 0.47627289773546805, "grad_norm": 10.911471294544677, "learning_rate": 3.1362985588275427e-07, "logits/chosen": 3.296875, "logits/rejected": 3.265625, "logps/chosen": -1584.0, "logps/rejected": -1520.0, "loss": 0.6914, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2431640625, "rewards/margins": 0.027099609375, "rewards/rejected": 0.2158203125, "step": 1651 }, { "epoch": 0.47656137314293956, "grad_norm": 10.623893711203577, "learning_rate": 3.1338630547852954e-07, "logits/chosen": 3.296875, "logits/rejected": 3.203125, "logps/chosen": -1712.0, "logps/rejected": -1544.0, "loss": 0.6503, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.33984375, "rewards/margins": 0.11083984375, "rewards/rejected": 0.228515625, "step": 1652 }, { "epoch": 0.47684984855041107, "grad_norm": 11.739311315205153, "learning_rate": 3.131426907663944e-07, "logits/chosen": 3.375, "logits/rejected": 3.390625, "logps/chosen": -1840.0, "logps/rejected": -1440.0, "loss": 0.7048, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.263671875, "rewards/margins": 0.005126953125, "rewards/rejected": 0.259765625, "step": 1653 }, { "epoch": 0.4771383239578826, "grad_norm": 11.126984397384586, "learning_rate": 3.1289901199350555e-07, "logits/chosen": 3.34375, "logits/rejected": 3.34375, "logps/chosen": -1648.0, "logps/rejected": -1632.0, "loss": 0.6675, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.33203125, "rewards/margins": 0.004241943359375, "rewards/rejected": 0.328125, "step": 1654 }, { "epoch": 0.4774267993653541, "grad_norm": 13.13897902520666, "learning_rate": 3.126552694070847e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -1944.0, "logps/rejected": -1712.0, "loss": 0.6359, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5078125, "rewards/margins": 0.1318359375, "rewards/rejected": 0.376953125, "step": 1655 }, { "epoch": 0.4777152747728256, "grad_norm": 10.450774644529604, "learning_rate": 3.1241146325441835e-07, "logits/chosen": 3.296875, "logits/rejected": 3.34375, "logps/chosen": -1920.0, "logps/rejected": -1680.0, "loss": 0.633, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3648.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.65625, "rewards/margins": 0.1796875, "rewards/rejected": 0.474609375, "step": 1656 }, { "epoch": 0.4780037501802971, "grad_norm": 11.431990694580966, "learning_rate": 3.121675937828575e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1152.0, "logps/rejected": -1248.0, "loss": 0.7245, "loss/demonstration_loss": -2432.0, "loss/preference_loss": -2432.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2392578125, "rewards/margins": -0.08447265625, "rewards/rejected": 0.32421875, "step": 1657 }, { "epoch": 0.4782922255877686, "grad_norm": 10.545083225769096, "learning_rate": 3.1192366123981726e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1568.0, "logps/rejected": -1472.0, "loss": 0.7066, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.25, "rewards/margins": 0.041259765625, "rewards/rejected": 0.208984375, "step": 1658 }, { "epoch": 0.47858070099524014, "grad_norm": 10.097535202465012, "learning_rate": 3.11679665872777e-07, "logits/chosen": 3.25, "logits/rejected": 3.265625, "logps/chosen": -1200.0, "logps/rejected": -1208.0, "loss": 0.6983, "loss/demonstration_loss": -2432.0, "loss/preference_loss": -2432.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2890625, "rewards/margins": 0.07666015625, "rewards/rejected": 0.2109375, "step": 1659 }, { "epoch": 0.47886917640271165, "grad_norm": 12.37592120192126, "learning_rate": 3.1143560792927946e-07, "logits/chosen": 3.359375, "logits/rejected": 3.328125, "logps/chosen": -1968.0, "logps/rejected": -1696.0, "loss": 0.6381, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3203125, "rewards/margins": 0.08984375, "rewards/rejected": 0.2314453125, "step": 1660 }, { "epoch": 0.47915765181018316, "grad_norm": 11.474424649725128, "learning_rate": 3.111914876569312e-07, "logits/chosen": 3.265625, "logits/rejected": 3.21875, "logps/chosen": -1584.0, "logps/rejected": -1632.0, "loss": 0.6705, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.498046875, "rewards/margins": 0.09765625, "rewards/rejected": 0.400390625, "step": 1661 }, { "epoch": 0.47944612721765467, "grad_norm": 11.838169836819477, "learning_rate": 3.1094730530340183e-07, "logits/chosen": 3.234375, "logits/rejected": 3.296875, "logps/chosen": -1712.0, "logps/rejected": -1656.0, "loss": 0.7224, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4296875, "rewards/margins": 0.036376953125, "rewards/rejected": 0.392578125, "step": 1662 }, { "epoch": 0.4797346026251262, "grad_norm": 11.965439163826261, "learning_rate": 3.10703061116424e-07, "logits/chosen": 3.234375, "logits/rejected": 3.28125, "logps/chosen": -1584.0, "logps/rejected": -1520.0, "loss": 0.6555, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2451171875, "rewards/margins": 0.16015625, "rewards/rejected": 0.0849609375, "step": 1663 }, { "epoch": 0.4800230780325977, "grad_norm": 11.668302841820907, "learning_rate": 3.104587553437932e-07, "logits/chosen": 3.140625, "logits/rejected": 3.015625, "logps/chosen": -1896.0, "logps/rejected": -1840.0, "loss": 0.6543, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.375, "rewards/margins": 0.07470703125, "rewards/rejected": 0.30078125, "step": 1664 }, { "epoch": 0.48031155344006926, "grad_norm": 9.225169460851358, "learning_rate": 3.10214388233367e-07, "logits/chosen": 3.359375, "logits/rejected": 3.375, "logps/chosen": -1576.0, "logps/rejected": -1456.0, "loss": 0.6348, "loss/demonstration_loss": -3072.0, "loss/preference_loss": -3056.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4765625, "rewards/margins": 0.1875, "rewards/rejected": 0.2890625, "step": 1665 }, { "epoch": 0.48060002884754077, "grad_norm": 9.79909332658279, "learning_rate": 3.0996996003306576e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1464.0, "logps/rejected": -1552.0, "loss": 0.6735, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.27734375, "rewards/margins": 0.11328125, "rewards/rejected": 0.1640625, "step": 1666 }, { "epoch": 0.4808885042550123, "grad_norm": 11.849989859238182, "learning_rate": 3.0972547099087136e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -1368.0, "logps/rejected": -1400.0, "loss": 0.7144, "loss/demonstration_loss": -2800.0, "loss/preference_loss": -2784.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.326171875, "rewards/margins": 0.06640625, "rewards/rejected": 0.259765625, "step": 1667 }, { "epoch": 0.4811769796624838, "grad_norm": 10.39264686463552, "learning_rate": 3.0948092135482776e-07, "logits/chosen": 3.28125, "logits/rejected": 3.296875, "logps/chosen": -1904.0, "logps/rejected": -1728.0, "loss": 0.6535, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3680.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.46875, "rewards/margins": 0.0810546875, "rewards/rejected": 0.388671875, "step": 1668 }, { "epoch": 0.4814654550699553, "grad_norm": 11.520226026464945, "learning_rate": 3.092363113730401e-07, "logits/chosen": 3.28125, "logits/rejected": 3.203125, "logps/chosen": -1408.0, "logps/rejected": -1408.0, "loss": 0.6998, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.337890625, "rewards/margins": 0.02001953125, "rewards/rejected": 0.318359375, "step": 1669 }, { "epoch": 0.4817539304774268, "grad_norm": 10.93738670374896, "learning_rate": 3.0899164129367483e-07, "logits/chosen": 3.296875, "logits/rejected": 3.34375, "logps/chosen": -1528.0, "logps/rejected": -1568.0, "loss": 0.6617, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.37890625, "rewards/margins": 0.03369140625, "rewards/rejected": 0.345703125, "step": 1670 }, { "epoch": 0.48204240588489833, "grad_norm": 13.380987602000694, "learning_rate": 3.087469113649596e-07, "logits/chosen": 3.3125, "logits/rejected": 3.296875, "logps/chosen": -1864.0, "logps/rejected": -1792.0, "loss": 0.675, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.44140625, "rewards/margins": -0.00689697265625, "rewards/rejected": 0.447265625, "step": 1671 }, { "epoch": 0.48233088129236984, "grad_norm": 10.06951658831439, "learning_rate": 3.085021218351824e-07, "logits/chosen": 3.40625, "logits/rejected": 3.359375, "logps/chosen": -1736.0, "logps/rejected": -1840.0, "loss": 0.639, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3828125, "rewards/margins": 0.04248046875, "rewards/rejected": 0.341796875, "step": 1672 }, { "epoch": 0.48261935669984135, "grad_norm": 13.251292234310187, "learning_rate": 3.08257272952692e-07, "logits/chosen": 3.140625, "logits/rejected": 3.21875, "logps/chosen": -1600.0, "logps/rejected": -1224.0, "loss": 0.6576, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2490234375, "rewards/margins": 0.003875732421875, "rewards/rejected": 0.2451171875, "step": 1673 }, { "epoch": 0.48290783210731286, "grad_norm": 12.711856301089107, "learning_rate": 3.080123649658971e-07, "logits/chosen": 3.234375, "logits/rejected": 3.328125, "logps/chosen": -2064.0, "logps/rejected": -1992.0, "loss": 0.6868, "loss/demonstration_loss": -4096.0, "loss/preference_loss": -4096.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.53125, "rewards/margins": 0.01043701171875, "rewards/rejected": 0.51953125, "step": 1674 }, { "epoch": 0.4831963075147844, "grad_norm": 10.121013474621607, "learning_rate": 3.077673981232667e-07, "logits/chosen": 3.3125, "logits/rejected": 3.375, "logps/chosen": -1344.0, "logps/rejected": -1464.0, "loss": 0.6498, "loss/demonstration_loss": -2848.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.318359375, "rewards/margins": 0.00750732421875, "rewards/rejected": 0.310546875, "step": 1675 }, { "epoch": 0.4834847829222559, "grad_norm": 11.519979402165841, "learning_rate": 3.0752237267332927e-07, "logits/chosen": 3.1875, "logits/rejected": 3.203125, "logps/chosen": -1424.0, "logps/rejected": -1656.0, "loss": 0.7018, "loss/demonstration_loss": -3104.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.298828125, "rewards/margins": -0.043212890625, "rewards/rejected": 0.34375, "step": 1676 }, { "epoch": 0.4837732583297274, "grad_norm": 10.042373408926226, "learning_rate": 3.072772888646728e-07, "logits/chosen": 3.140625, "logits/rejected": 3.171875, "logps/chosen": -1600.0, "logps/rejected": -1584.0, "loss": 0.6439, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.470703125, "rewards/margins": 0.08837890625, "rewards/rejected": 0.380859375, "step": 1677 }, { "epoch": 0.4840617337371989, "grad_norm": 10.796593422601202, "learning_rate": 3.0703214694594455e-07, "logits/chosen": 3.296875, "logits/rejected": 3.359375, "logps/chosen": -1424.0, "logps/rejected": -1768.0, "loss": 0.6744, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.447265625, "rewards/margins": 0.09033203125, "rewards/rejected": 0.357421875, "step": 1678 }, { "epoch": 0.4843502091446704, "grad_norm": 12.583742560916544, "learning_rate": 3.0678694716585053e-07, "logits/chosen": 3.40625, "logits/rejected": 3.453125, "logps/chosen": -1448.0, "logps/rejected": -1472.0, "loss": 0.6856, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.337890625, "rewards/margins": 0.09228515625, "rewards/rejected": 0.2451171875, "step": 1679 }, { "epoch": 0.48463868455214193, "grad_norm": 9.818383899862901, "learning_rate": 3.0654168977315577e-07, "logits/chosen": 3.265625, "logits/rejected": 3.28125, "logps/chosen": -1224.0, "logps/rejected": -1328.0, "loss": 0.665, "loss/demonstration_loss": -2592.0, "loss/preference_loss": -2592.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.345703125, "rewards/margins": 0.02880859375, "rewards/rejected": 0.31640625, "step": 1680 }, { "epoch": 0.48492715995961344, "grad_norm": 10.785864429127946, "learning_rate": 3.062963750166835e-07, "logits/chosen": 3.296875, "logits/rejected": 3.21875, "logps/chosen": -1680.0, "logps/rejected": -1600.0, "loss": 0.6201, "loss/demonstration_loss": -3328.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.55078125, "rewards/margins": 0.154296875, "rewards/rejected": 0.396484375, "step": 1681 }, { "epoch": 0.48521563536708495, "grad_norm": 9.874442072546318, "learning_rate": 3.0605100314531523e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -2080.0, "logps/rejected": -1808.0, "loss": 0.647, "loss/demonstration_loss": -3952.0, "loss/preference_loss": -3936.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.55078125, "rewards/margins": 0.16796875, "rewards/rejected": 0.3828125, "step": 1682 }, { "epoch": 0.48550411077455646, "grad_norm": 12.190108625830177, "learning_rate": 3.058055744079904e-07, "logits/chosen": 3.28125, "logits/rejected": 3.328125, "logps/chosen": -1728.0, "logps/rejected": -1656.0, "loss": 0.6505, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.384765625, "rewards/margins": 0.09912109375, "rewards/rejected": 0.287109375, "step": 1683 }, { "epoch": 0.485792586182028, "grad_norm": 13.855554880694768, "learning_rate": 3.055600890537063e-07, "logits/chosen": 3.109375, "logits/rejected": 3.0625, "logps/chosen": -1656.0, "logps/rejected": -1816.0, "loss": 0.7065, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.337890625, "rewards/margins": 0.03759765625, "rewards/rejected": 0.298828125, "step": 1684 }, { "epoch": 0.4860810615894995, "grad_norm": 10.471673671181692, "learning_rate": 3.053145473315173e-07, "logits/chosen": 3.328125, "logits/rejected": 3.25, "logps/chosen": -1664.0, "logps/rejected": -1632.0, "loss": 0.6221, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3344.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.5859375, "rewards/margins": 0.2001953125, "rewards/rejected": 0.384765625, "step": 1685 }, { "epoch": 0.486369536996971, "grad_norm": 12.613366625243815, "learning_rate": 3.050689494905354e-07, "logits/chosen": 3.1875, "logits/rejected": 3.171875, "logps/chosen": -1384.0, "logps/rejected": -1184.0, "loss": 0.6603, "loss/demonstration_loss": -2608.0, "loss/preference_loss": -2592.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.271484375, "rewards/margins": 0.1435546875, "rewards/rejected": 0.1279296875, "step": 1686 }, { "epoch": 0.4866580124044425, "grad_norm": 10.522916954187854, "learning_rate": 3.04823295779929e-07, "logits/chosen": 3.15625, "logits/rejected": 3.203125, "logps/chosen": -1936.0, "logps/rejected": -1896.0, "loss": 0.6184, "loss/demonstration_loss": -3888.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.5859375, "rewards/margins": 0.271484375, "rewards/rejected": 0.314453125, "step": 1687 }, { "epoch": 0.486946487811914, "grad_norm": 10.62874471888353, "learning_rate": 3.045775864489238e-07, "logits/chosen": 3.203125, "logits/rejected": 3.234375, "logps/chosen": -1616.0, "logps/rejected": -1720.0, "loss": 0.6863, "loss/demonstration_loss": -3376.0, "loss/preference_loss": -3376.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.421875, "rewards/margins": 0.019287109375, "rewards/rejected": 0.40234375, "step": 1688 }, { "epoch": 0.48723496321938553, "grad_norm": 11.97141886346288, "learning_rate": 3.043318217468015e-07, "logits/chosen": 3.3125, "logits/rejected": 3.34375, "logps/chosen": -2064.0, "logps/rejected": -1904.0, "loss": 0.6973, "loss/demonstration_loss": -4016.0, "loss/preference_loss": -4000.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.62109375, "rewards/margins": 0.146484375, "rewards/rejected": 0.474609375, "step": 1689 }, { "epoch": 0.48752343862685704, "grad_norm": 10.095586672460136, "learning_rate": 3.0408600192290006e-07, "logits/chosen": 3.15625, "logits/rejected": 3.171875, "logps/chosen": -1776.0, "logps/rejected": -1360.0, "loss": 0.6269, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.53515625, "rewards/margins": 0.310546875, "rewards/rejected": 0.22265625, "step": 1690 }, { "epoch": 0.48781191403432855, "grad_norm": 10.3762084448092, "learning_rate": 3.038401272266135e-07, "logits/chosen": 3.265625, "logits/rejected": 3.21875, "logps/chosen": -1176.0, "logps/rejected": -936.0, "loss": 0.6903, "loss/demonstration_loss": -2144.0, "loss/preference_loss": -2128.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.251953125, "rewards/margins": 0.061767578125, "rewards/rejected": 0.189453125, "step": 1691 }, { "epoch": 0.48810038944180006, "grad_norm": 10.97064109696163, "learning_rate": 3.035941979073913e-07, "logits/chosen": 3.296875, "logits/rejected": 3.203125, "logps/chosen": -1736.0, "logps/rejected": -1728.0, "loss": 0.6725, "loss/demonstration_loss": -3520.0, "loss/preference_loss": -3504.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.470703125, "rewards/margins": 0.0791015625, "rewards/rejected": 0.390625, "step": 1692 }, { "epoch": 0.4883888648492716, "grad_norm": 11.350142017783533, "learning_rate": 3.0334821421473853e-07, "logits/chosen": 3.3125, "logits/rejected": 3.359375, "logps/chosen": -1880.0, "logps/rejected": -1928.0, "loss": 0.6491, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3856.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.490234375, "rewards/margins": 0.0284423828125, "rewards/rejected": 0.4609375, "step": 1693 }, { "epoch": 0.4886773402567431, "grad_norm": 20.307940854580874, "learning_rate": 3.031021763982154e-07, "logits/chosen": 3.296875, "logits/rejected": 3.1875, "logps/chosen": -1496.0, "logps/rejected": -1496.0, "loss": 0.6575, "loss/demonstration_loss": -3040.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.453125, "rewards/margins": 0.0849609375, "rewards/rejected": 0.3671875, "step": 1694 }, { "epoch": 0.4889658156642146, "grad_norm": 9.481432164936566, "learning_rate": 3.028560847074369e-07, "logits/chosen": 3.390625, "logits/rejected": 3.40625, "logps/chosen": -1640.0, "logps/rejected": -1512.0, "loss": 0.6657, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3184.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.330078125, "rewards/margins": 0.0142822265625, "rewards/rejected": 0.31640625, "step": 1695 }, { "epoch": 0.48925429107168616, "grad_norm": 12.702576381337801, "learning_rate": 3.026099393920728e-07, "logits/chosen": 3.0625, "logits/rejected": 3.15625, "logps/chosen": -1632.0, "logps/rejected": -1328.0, "loss": 0.665, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.349609375, "rewards/margins": 0.1015625, "rewards/rejected": 0.2490234375, "step": 1696 }, { "epoch": 0.4895427664791577, "grad_norm": 11.805345781642751, "learning_rate": 3.023637407018473e-07, "logits/chosen": 3.203125, "logits/rejected": 3.203125, "logps/chosen": -1720.0, "logps/rejected": -1520.0, "loss": 0.6561, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.50390625, "rewards/margins": 0.1826171875, "rewards/rejected": 0.322265625, "step": 1697 }, { "epoch": 0.4898312418866292, "grad_norm": 10.213244821480894, "learning_rate": 3.0211748888653857e-07, "logits/chosen": 3.375, "logits/rejected": 3.296875, "logps/chosen": -1544.0, "logps/rejected": -1464.0, "loss": 0.6718, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3024.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.431640625, "rewards/margins": 0.1591796875, "rewards/rejected": 0.2734375, "step": 1698 }, { "epoch": 0.4901197172941007, "grad_norm": 9.940280439380725, "learning_rate": 3.0187118419597896e-07, "logits/chosen": 3.265625, "logits/rejected": 3.203125, "logps/chosen": -1992.0, "logps/rejected": -1752.0, "loss": 0.6473, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.55078125, "rewards/margins": 0.10498046875, "rewards/rejected": 0.4453125, "step": 1699 }, { "epoch": 0.4904081927015722, "grad_norm": 10.083668236499864, "learning_rate": 3.0162482688005427e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1432.0, "logps/rejected": -1240.0, "loss": 0.6586, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2688.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.1689453125, "rewards/margins": -0.0150146484375, "rewards/rejected": 0.18359375, "step": 1700 }, { "epoch": 0.4906966681090437, "grad_norm": 11.415420947992045, "learning_rate": 3.0137841718870347e-07, "logits/chosen": 3.28125, "logits/rejected": 3.296875, "logps/chosen": -1680.0, "logps/rejected": -1688.0, "loss": 0.6451, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.53515625, "rewards/margins": 0.10546875, "rewards/rejected": 0.4296875, "step": 1701 }, { "epoch": 0.49098514351651523, "grad_norm": 11.657889584744469, "learning_rate": 3.0113195537191935e-07, "logits/chosen": 3.328125, "logits/rejected": 3.328125, "logps/chosen": -1256.0, "logps/rejected": -1256.0, "loss": 0.7116, "loss/demonstration_loss": -2528.0, "loss/preference_loss": -2528.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.2177734375, "rewards/margins": -0.056640625, "rewards/rejected": 0.2734375, "step": 1702 }, { "epoch": 0.49127361892398674, "grad_norm": 12.261114573786756, "learning_rate": 3.00885441679747e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -2080.0, "logps/rejected": -2160.0, "loss": 0.7263, "loss/demonstration_loss": -4256.0, "loss/preference_loss": -4288.0, "rewards/accuracies": 0.125, "rewards/chosen": 0.40625, "rewards/margins": -0.1064453125, "rewards/rejected": 0.51171875, "step": 1703 }, { "epoch": 0.49156209433145825, "grad_norm": 11.410358699500748, "learning_rate": 3.006388763622841e-07, "logits/chosen": 3.296875, "logits/rejected": 3.3125, "logps/chosen": -1296.0, "logps/rejected": -1328.0, "loss": 0.6862, "loss/demonstration_loss": -2656.0, "loss/preference_loss": -2656.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.322265625, "rewards/margins": 0.09814453125, "rewards/rejected": 0.224609375, "step": 1704 }, { "epoch": 0.49185056973892977, "grad_norm": 10.83339660277847, "learning_rate": 3.003922596696811e-07, "logits/chosen": 3.140625, "logits/rejected": 3.203125, "logps/chosen": -1320.0, "logps/rejected": -1344.0, "loss": 0.7155, "loss/demonstration_loss": -2688.0, "loss/preference_loss": -2688.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2578125, "rewards/margins": -0.0274658203125, "rewards/rejected": 0.28515625, "step": 1705 }, { "epoch": 0.4921390451464013, "grad_norm": 12.484747357776277, "learning_rate": 3.001455918521403e-07, "logits/chosen": 3.1875, "logits/rejected": 3.234375, "logps/chosen": -1328.0, "logps/rejected": -1288.0, "loss": 0.7074, "loss/demonstration_loss": -2640.0, "loss/preference_loss": -2640.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.2890625, "rewards/margins": 0.047607421875, "rewards/rejected": 0.240234375, "step": 1706 }, { "epoch": 0.4924275205538728, "grad_norm": 9.695931056236464, "learning_rate": 2.9989887315991603e-07, "logits/chosen": 3.3125, "logits/rejected": 3.28125, "logps/chosen": -1768.0, "logps/rejected": -1480.0, "loss": 0.6388, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.470703125, "rewards/margins": 0.1806640625, "rewards/rejected": 0.2890625, "step": 1707 }, { "epoch": 0.4927159959613443, "grad_norm": 9.86618962519078, "learning_rate": 2.996521038433141e-07, "logits/chosen": 3.125, "logits/rejected": 3.234375, "logps/chosen": -1856.0, "logps/rejected": -1856.0, "loss": 0.6492, "loss/demonstration_loss": -3760.0, "loss/preference_loss": -3760.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.435546875, "rewards/margins": 0.0927734375, "rewards/rejected": 0.341796875, "step": 1708 }, { "epoch": 0.4930044713688158, "grad_norm": 12.423198378005038, "learning_rate": 2.9940528415269166e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1816.0, "logps/rejected": -1896.0, "loss": 0.6849, "loss/demonstration_loss": -3776.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.6171875, "rewards/margins": 0.0908203125, "rewards/rejected": 0.52734375, "step": 1709 }, { "epoch": 0.4932929467762873, "grad_norm": 11.756586119945142, "learning_rate": 2.991584143384571e-07, "logits/chosen": 3.265625, "logits/rejected": 3.328125, "logps/chosen": -1448.0, "logps/rejected": -1512.0, "loss": 0.6779, "loss/demonstration_loss": -2992.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.291015625, "rewards/margins": 0.062255859375, "rewards/rejected": 0.2294921875, "step": 1710 }, { "epoch": 0.49358142218375883, "grad_norm": 10.252668862959137, "learning_rate": 2.9891149465106964e-07, "logits/chosen": 3.265625, "logits/rejected": 3.25, "logps/chosen": -1488.0, "logps/rejected": -1512.0, "loss": 0.6693, "loss/demonstration_loss": -3056.0, "loss/preference_loss": -3040.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.484375, "rewards/margins": 0.0224609375, "rewards/rejected": 0.4609375, "step": 1711 }, { "epoch": 0.49386989759123034, "grad_norm": 8.84332308906608, "learning_rate": 2.986645253410389e-07, "logits/chosen": 3.359375, "logits/rejected": 3.34375, "logps/chosen": -1736.0, "logps/rejected": -1672.0, "loss": 0.6516, "loss/demonstration_loss": -3440.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.353515625, "rewards/margins": -0.033447265625, "rewards/rejected": 0.38671875, "step": 1712 }, { "epoch": 0.49415837299870186, "grad_norm": 11.644937668776777, "learning_rate": 2.9841750665892525e-07, "logits/chosen": 3.234375, "logits/rejected": 3.21875, "logps/chosen": -1952.0, "logps/rejected": -1864.0, "loss": 0.6519, "loss/demonstration_loss": -3856.0, "loss/preference_loss": -3840.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5078125, "rewards/margins": 0.1962890625, "rewards/rejected": 0.3125, "step": 1713 }, { "epoch": 0.49444684840617337, "grad_norm": 11.404532680943579, "learning_rate": 2.9817043885533866e-07, "logits/chosen": 3.421875, "logits/rejected": 3.4375, "logps/chosen": -1656.0, "logps/rejected": -1704.0, "loss": 0.6696, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.412109375, "rewards/margins": 0.021484375, "rewards/rejected": 0.390625, "step": 1714 }, { "epoch": 0.4947353238136449, "grad_norm": 10.293634058233197, "learning_rate": 2.9792332218093925e-07, "logits/chosen": 3.21875, "logits/rejected": 3.203125, "logps/chosen": -1608.0, "logps/rejected": -1464.0, "loss": 0.666, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3104.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.359375, "rewards/margins": 0.05810546875, "rewards/rejected": 0.30078125, "step": 1715 }, { "epoch": 0.4950237992211164, "grad_norm": 12.789946193674773, "learning_rate": 2.976761568864367e-07, "logits/chosen": 3.21875, "logits/rejected": 3.265625, "logps/chosen": -1824.0, "logps/rejected": -1784.0, "loss": 0.7097, "loss/demonstration_loss": -3664.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.5234375, "rewards/margins": -0.038818359375, "rewards/rejected": 0.5625, "step": 1716 }, { "epoch": 0.4953122746285879, "grad_norm": 9.59985322023126, "learning_rate": 2.9742894322258995e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1416.0, "logps/rejected": -1336.0, "loss": 0.6609, "loss/demonstration_loss": -2784.0, "loss/preference_loss": -2768.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.29296875, "rewards/margins": 0.0869140625, "rewards/rejected": 0.2060546875, "step": 1717 }, { "epoch": 0.4956007500360594, "grad_norm": 10.900744999863498, "learning_rate": 2.9718168144020697e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -1720.0, "logps/rejected": -1512.0, "loss": 0.642, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.43359375, "rewards/margins": 0.08740234375, "rewards/rejected": 0.345703125, "step": 1718 }, { "epoch": 0.4958892254435309, "grad_norm": 11.799864498094603, "learning_rate": 2.9693437179014465e-07, "logits/chosen": 3.203125, "logits/rejected": 3.265625, "logps/chosen": -1752.0, "logps/rejected": -1656.0, "loss": 0.6484, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3440.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.5546875, "rewards/margins": 0.11767578125, "rewards/rejected": 0.435546875, "step": 1719 }, { "epoch": 0.49617770085100243, "grad_norm": 10.742482625477354, "learning_rate": 2.9668701452330835e-07, "logits/chosen": 3.15625, "logits/rejected": 3.15625, "logps/chosen": -1720.0, "logps/rejected": -1736.0, "loss": 0.6896, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.42578125, "rewards/margins": 0.08935546875, "rewards/rejected": 0.3359375, "step": 1720 }, { "epoch": 0.49646617625847395, "grad_norm": 11.530089102220035, "learning_rate": 2.9643960989065185e-07, "logits/chosen": 3.34375, "logits/rejected": 3.25, "logps/chosen": -1728.0, "logps/rejected": -1688.0, "loss": 0.67, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3456.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.4296875, "rewards/margins": 0.1279296875, "rewards/rejected": 0.30078125, "step": 1721 }, { "epoch": 0.49675465166594546, "grad_norm": 12.19537685761578, "learning_rate": 2.96192158143177e-07, "logits/chosen": 3.25, "logits/rejected": 3.265625, "logps/chosen": -1592.0, "logps/rejected": -1312.0, "loss": 0.672, "loss/demonstration_loss": -2944.0, "loss/preference_loss": -2928.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3984375, "rewards/margins": 0.15625, "rewards/rejected": 0.2412109375, "step": 1722 }, { "epoch": 0.49704312707341697, "grad_norm": 11.424301235682192, "learning_rate": 2.9594465953193304e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1632.0, "logps/rejected": -1584.0, "loss": 0.6848, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.423828125, "rewards/margins": 0.0140380859375, "rewards/rejected": 0.41015625, "step": 1723 }, { "epoch": 0.4973316024808885, "grad_norm": 12.854738111464284, "learning_rate": 2.956971143080175e-07, "logits/chosen": 3.046875, "logits/rejected": 3.109375, "logps/chosen": -1696.0, "logps/rejected": -1536.0, "loss": 0.6635, "loss/demonstration_loss": -3296.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.51171875, "rewards/margins": 0.15234375, "rewards/rejected": 0.359375, "step": 1724 }, { "epoch": 0.49762007788836, "grad_norm": 9.387828390750895, "learning_rate": 2.954495227225745e-07, "logits/chosen": 3.28125, "logits/rejected": 3.234375, "logps/chosen": -1864.0, "logps/rejected": -1832.0, "loss": 0.6413, "loss/demonstration_loss": -3728.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.359375, "rewards/margins": 0.0537109375, "rewards/rejected": 0.3046875, "step": 1725 }, { "epoch": 0.49790855329583156, "grad_norm": 10.28469694324359, "learning_rate": 2.952018850267957e-07, "logits/chosen": 3.21875, "logits/rejected": 3.28125, "logps/chosen": -1904.0, "logps/rejected": -1728.0, "loss": 0.6509, "loss/demonstration_loss": -3696.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.62109375, "rewards/margins": 0.298828125, "rewards/rejected": 0.3203125, "step": 1726 }, { "epoch": 0.49819702870330307, "grad_norm": 10.355016485892893, "learning_rate": 2.949542014719191e-07, "logits/chosen": 3.234375, "logits/rejected": 3.234375, "logps/chosen": -1384.0, "logps/rejected": -1136.0, "loss": 0.6547, "loss/demonstration_loss": -2560.0, "loss/preference_loss": -2544.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.328125, "rewards/margins": 0.10791015625, "rewards/rejected": 0.2197265625, "step": 1727 }, { "epoch": 0.4984855041107746, "grad_norm": 10.940245488522729, "learning_rate": 2.947064723092296e-07, "logits/chosen": 3.375, "logits/rejected": 3.390625, "logps/chosen": -1480.0, "logps/rejected": -1688.0, "loss": 0.6548, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.3515625, "rewards/margins": -0.07080078125, "rewards/rejected": 0.421875, "step": 1728 }, { "epoch": 0.4987739795182461, "grad_norm": 10.422880718733358, "learning_rate": 2.9445869779005817e-07, "logits/chosen": 3.125, "logits/rejected": 3.109375, "logps/chosen": -1576.0, "logps/rejected": -1296.0, "loss": 0.6312, "loss/demonstration_loss": -2912.0, "loss/preference_loss": -2896.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.390625, "rewards/margins": 0.1884765625, "rewards/rejected": 0.2041015625, "step": 1729 }, { "epoch": 0.4990624549257176, "grad_norm": 9.65906454017761, "learning_rate": 2.9421087816578186e-07, "logits/chosen": 3.28125, "logits/rejected": 3.296875, "logps/chosen": -1616.0, "logps/rejected": -1352.0, "loss": 0.6443, "loss/demonstration_loss": -3008.0, "loss/preference_loss": -2992.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.404296875, "rewards/margins": 0.1591796875, "rewards/rejected": 0.244140625, "step": 1730 }, { "epoch": 0.4993509303331891, "grad_norm": 11.671909107525298, "learning_rate": 2.9396301368782346e-07, "logits/chosen": 3.296875, "logits/rejected": 3.203125, "logps/chosen": -1408.0, "logps/rejected": -1424.0, "loss": 0.6738, "loss/demonstration_loss": -2864.0, "loss/preference_loss": -2848.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.251953125, "rewards/margins": 0.0263671875, "rewards/rejected": 0.224609375, "step": 1731 }, { "epoch": 0.4996394057406606, "grad_norm": 10.127893448593674, "learning_rate": 2.937151046076512e-07, "logits/chosen": 3.34375, "logits/rejected": 3.3125, "logps/chosen": -1512.0, "logps/rejected": -1584.0, "loss": 0.6533, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.375, "rewards/margins": 0.1328125, "rewards/rejected": 0.2421875, "step": 1732 }, { "epoch": 0.49992788114813214, "grad_norm": 10.544998510943747, "learning_rate": 2.934671511767788e-07, "logits/chosen": 3.34375, "logits/rejected": 3.328125, "logps/chosen": -1840.0, "logps/rejected": -1512.0, "loss": 0.6683, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.359375, "rewards/margins": 0.01104736328125, "rewards/rejected": 0.34765625, "step": 1733 }, { "epoch": 0.5002163565556036, "grad_norm": 11.867311749742239, "learning_rate": 2.9321915364676463e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -1832.0, "logps/rejected": -1616.0, "loss": 0.6476, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.435546875, "rewards/margins": 0.1201171875, "rewards/rejected": 0.31640625, "step": 1734 }, { "epoch": 0.5005048319630752, "grad_norm": 11.009077027857101, "learning_rate": 2.929711122692122e-07, "logits/chosen": 3.15625, "logits/rejected": 3.109375, "logps/chosen": -2096.0, "logps/rejected": -1888.0, "loss": 0.6307, "loss/demonstration_loss": -4032.0, "loss/preference_loss": -4016.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.466796875, "rewards/margins": 0.125, "rewards/rejected": 0.341796875, "step": 1735 }, { "epoch": 0.5007933073705466, "grad_norm": 10.418677852444347, "learning_rate": 2.92723027295769e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -2064.0, "logps/rejected": -2096.0, "loss": 0.6851, "loss/demonstration_loss": -4224.0, "loss/preference_loss": -4192.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.53125, "rewards/margins": 0.1650390625, "rewards/rejected": 0.3671875, "step": 1736 }, { "epoch": 0.5010817827780182, "grad_norm": 11.831636208493132, "learning_rate": 2.9247489897812723e-07, "logits/chosen": 3.265625, "logits/rejected": 3.265625, "logps/chosen": -1496.0, "logps/rejected": -1624.0, "loss": 0.67, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3136.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.255859375, "rewards/margins": 0.0517578125, "rewards/rejected": 0.2041015625, "step": 1737 }, { "epoch": 0.5013702581854896, "grad_norm": 10.127416356434798, "learning_rate": 2.922267275680228e-07, "logits/chosen": 3.34375, "logits/rejected": 3.28125, "logps/chosen": -1576.0, "logps/rejected": -1680.0, "loss": 0.6618, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3312.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.53125, "rewards/margins": 0.0240478515625, "rewards/rejected": 0.5078125, "step": 1738 }, { "epoch": 0.5016587335929612, "grad_norm": 9.891721305961726, "learning_rate": 2.9197851331723544e-07, "logits/chosen": 3.3125, "logits/rejected": 3.3125, "logps/chosen": -1736.0, "logps/rejected": -1800.0, "loss": 0.6852, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3568.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2421875, "rewards/margins": -0.0159912109375, "rewards/rejected": 0.2578125, "step": 1739 }, { "epoch": 0.5019472090004327, "grad_norm": 10.342976397999225, "learning_rate": 2.9173025647758836e-07, "logits/chosen": 3.296875, "logits/rejected": 3.21875, "logps/chosen": -1696.0, "logps/rejected": -1896.0, "loss": 0.6598, "loss/demonstration_loss": -3632.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.451171875, "rewards/margins": 0.146484375, "rewards/rejected": 0.3046875, "step": 1740 }, { "epoch": 0.5022356844079042, "grad_norm": 10.974733975622495, "learning_rate": 2.914819573009478e-07, "logits/chosen": 3.203125, "logits/rejected": 3.1875, "logps/chosen": -1616.0, "logps/rejected": -1528.0, "loss": 0.6645, "loss/demonstration_loss": -3184.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.33984375, "rewards/margins": 0.1845703125, "rewards/rejected": 0.1552734375, "step": 1741 }, { "epoch": 0.5025241598153758, "grad_norm": 10.179767107236293, "learning_rate": 2.912336160392231e-07, "logits/chosen": 3.28125, "logits/rejected": 3.328125, "logps/chosen": -1448.0, "logps/rejected": -1352.0, "loss": 0.6683, "loss/demonstration_loss": -2816.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.2451171875, "rewards/margins": 0.11083984375, "rewards/rejected": 0.1337890625, "step": 1742 }, { "epoch": 0.5028126352228472, "grad_norm": 11.017607509733278, "learning_rate": 2.909852329443665e-07, "logits/chosen": 3.265625, "logits/rejected": 3.28125, "logps/chosen": -1656.0, "logps/rejected": -1672.0, "loss": 0.733, "loss/demonstration_loss": -3360.0, "loss/preference_loss": -3360.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.341796875, "rewards/margins": -0.031005859375, "rewards/rejected": 0.373046875, "step": 1743 }, { "epoch": 0.5031011106303188, "grad_norm": 9.580508703581222, "learning_rate": 2.9073680826837216e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -1920.0, "logps/rejected": -1608.0, "loss": 0.6513, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3552.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.388671875, "rewards/margins": 0.1708984375, "rewards/rejected": 0.21875, "step": 1744 }, { "epoch": 0.5033895860377903, "grad_norm": 10.27492287968011, "learning_rate": 2.9048834226327687e-07, "logits/chosen": 3.375, "logits/rejected": 3.3125, "logps/chosen": -1496.0, "logps/rejected": -1744.0, "loss": 0.6995, "loss/demonstration_loss": -3280.0, "loss/preference_loss": -3280.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.390625, "rewards/margins": 0.030517578125, "rewards/rejected": 0.359375, "step": 1745 }, { "epoch": 0.5036780614452618, "grad_norm": 10.737938929214332, "learning_rate": 2.902398351811592e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -2368.0, "logps/rejected": -2048.0, "loss": 0.6541, "loss/demonstration_loss": -4448.0, "loss/preference_loss": -4448.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5390625, "rewards/margins": 0.1181640625, "rewards/rejected": 0.41796875, "step": 1746 }, { "epoch": 0.5039665368527333, "grad_norm": 10.369215253405205, "learning_rate": 2.8999128727413933e-07, "logits/chosen": 3.046875, "logits/rejected": 2.96875, "logps/chosen": -1368.0, "logps/rejected": -1344.0, "loss": 0.6983, "loss/demonstration_loss": -2752.0, "loss/preference_loss": -2752.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.3203125, "rewards/margins": 0.01422119140625, "rewards/rejected": 0.306640625, "step": 1747 }, { "epoch": 0.5042550122602049, "grad_norm": 10.498277552345048, "learning_rate": 2.8974269879437915e-07, "logits/chosen": 3.34375, "logits/rejected": 3.34375, "logps/chosen": -1872.0, "logps/rejected": -1880.0, "loss": 0.6699, "loss/demonstration_loss": -3792.0, "loss/preference_loss": -3792.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.462890625, "rewards/margins": 0.041015625, "rewards/rejected": 0.421875, "step": 1748 }, { "epoch": 0.5045434876676763, "grad_norm": 12.688366685296309, "learning_rate": 2.8949406999408117e-07, "logits/chosen": 3.1875, "logits/rejected": 3.109375, "logps/chosen": -1720.0, "logps/rejected": -1456.0, "loss": 0.7061, "loss/demonstration_loss": -3200.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.341796875, "rewards/margins": 0.10791015625, "rewards/rejected": 0.234375, "step": 1749 }, { "epoch": 0.5048319630751479, "grad_norm": 11.348897766516595, "learning_rate": 2.8924540112548933e-07, "logits/chosen": 3.1875, "logits/rejected": 3.28125, "logps/chosen": -1888.0, "logps/rejected": -1744.0, "loss": 0.6845, "loss/demonstration_loss": -3680.0, "loss/preference_loss": -3664.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.40234375, "rewards/margins": -0.0240478515625, "rewards/rejected": 0.42578125, "step": 1750 }, { "epoch": 0.5051204384826193, "grad_norm": 11.201366737806685, "learning_rate": 2.8899669244088803e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1640.0, "logps/rejected": -1560.0, "loss": 0.6846, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.408203125, "rewards/margins": 0.02099609375, "rewards/rejected": 0.38671875, "step": 1751 }, { "epoch": 0.5054089138900909, "grad_norm": 9.549822380818073, "learning_rate": 2.88747944192602e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1824.0, "logps/rejected": -1680.0, "loss": 0.642, "loss/demonstration_loss": -3568.0, "loss/preference_loss": -3536.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.58984375, "rewards/margins": 0.2177734375, "rewards/rejected": 0.37109375, "step": 1752 }, { "epoch": 0.5056973892975624, "grad_norm": 13.444438510722783, "learning_rate": 2.8849915663299606e-07, "logits/chosen": 3.265625, "logits/rejected": 3.203125, "logps/chosen": -1648.0, "logps/rejected": -1584.0, "loss": 0.6857, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.34765625, "rewards/margins": 0.162109375, "rewards/rejected": 0.1865234375, "step": 1753 }, { "epoch": 0.5059858647050339, "grad_norm": 12.888674000666649, "learning_rate": 2.882503300144752e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1736.0, "logps/rejected": -1848.0, "loss": 0.7273, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.322265625, "rewards/margins": -0.0634765625, "rewards/rejected": 0.384765625, "step": 1754 }, { "epoch": 0.5062743401125054, "grad_norm": 12.400425308277164, "learning_rate": 2.880014645894837e-07, "logits/chosen": 3.28125, "logits/rejected": 3.1875, "logps/chosen": -1904.0, "logps/rejected": -1776.0, "loss": 0.6421, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3696.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.3046875, "rewards/margins": 0.123046875, "rewards/rejected": 0.181640625, "step": 1755 }, { "epoch": 0.506562815519977, "grad_norm": 9.901752448052402, "learning_rate": 2.8775256061050555e-07, "logits/chosen": 3.328125, "logits/rejected": 3.40625, "logps/chosen": -1480.0, "logps/rejected": -1304.0, "loss": 0.6498, "loss/demonstration_loss": -2832.0, "loss/preference_loss": -2816.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.408203125, "rewards/margins": 0.13671875, "rewards/rejected": 0.271484375, "step": 1756 }, { "epoch": 0.5068512909274484, "grad_norm": 11.889209988164124, "learning_rate": 2.8750361833006354e-07, "logits/chosen": 3.1875, "logits/rejected": 3.140625, "logps/chosen": -1544.0, "logps/rejected": -1384.0, "loss": 0.6734, "loss/demonstration_loss": -2976.0, "loss/preference_loss": -2976.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.412109375, "rewards/margins": 0.00347900390625, "rewards/rejected": 0.408203125, "step": 1757 }, { "epoch": 0.50713976633492, "grad_norm": 10.833362685525985, "learning_rate": 2.8725463800071937e-07, "logits/chosen": 3.234375, "logits/rejected": 3.1875, "logps/chosen": -1544.0, "logps/rejected": -1656.0, "loss": 0.678, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.31640625, "rewards/margins": 0.00830078125, "rewards/rejected": 0.30859375, "step": 1758 }, { "epoch": 0.5074282417423914, "grad_norm": 11.889963462855883, "learning_rate": 2.8700561987507357e-07, "logits/chosen": 3.265625, "logits/rejected": 3.25, "logps/chosen": -1832.0, "logps/rejected": -1272.0, "loss": 0.6644, "loss/demonstration_loss": -3136.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.310546875, "rewards/margins": 0.0703125, "rewards/rejected": 0.240234375, "step": 1759 }, { "epoch": 0.507716717149863, "grad_norm": 12.335351247616256, "learning_rate": 2.867565642057648e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1976.0, "logps/rejected": -1896.0, "loss": 0.6631, "loss/demonstration_loss": -3904.0, "loss/preference_loss": -3904.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.404296875, "rewards/margins": 0.060302734375, "rewards/rejected": 0.345703125, "step": 1760 }, { "epoch": 0.5080051925573345, "grad_norm": 12.615211501644168, "learning_rate": 2.865074712454698e-07, "logits/chosen": 3.203125, "logits/rejected": 3.265625, "logps/chosen": -1752.0, "logps/rejected": -1624.0, "loss": 0.7446, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3408.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.333984375, "rewards/margins": -0.0771484375, "rewards/rejected": 0.41015625, "step": 1761 }, { "epoch": 0.508293667964806, "grad_norm": 11.030303696586278, "learning_rate": 2.8625834124690337e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -1600.0, "logps/rejected": -1744.0, "loss": 0.6954, "loss/demonstration_loss": -3392.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.33203125, "rewards/margins": -0.0306396484375, "rewards/rejected": 0.361328125, "step": 1762 }, { "epoch": 0.5085821433722775, "grad_norm": 10.830900033664998, "learning_rate": 2.860091744628175e-07, "logits/chosen": 3.09375, "logits/rejected": 3.046875, "logps/chosen": -1080.0, "logps/rejected": -1368.0, "loss": 0.6865, "loss/demonstration_loss": -2480.0, "loss/preference_loss": -2464.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.267578125, "rewards/margins": 0.0289306640625, "rewards/rejected": 0.23828125, "step": 1763 }, { "epoch": 0.508870618779749, "grad_norm": 11.75584625699594, "learning_rate": 2.857599711460021e-07, "logits/chosen": 3.171875, "logits/rejected": 3.21875, "logps/chosen": -1752.0, "logps/rejected": -1824.0, "loss": 0.68, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3616.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.375, "rewards/margins": 0.078125, "rewards/rejected": 0.296875, "step": 1764 }, { "epoch": 0.5091590941872205, "grad_norm": 11.037678554057901, "learning_rate": 2.8551073154928353e-07, "logits/chosen": 3.140625, "logits/rejected": 3.125, "logps/chosen": -1968.0, "logps/rejected": -1680.0, "loss": 0.7048, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.57421875, "rewards/margins": -0.01708984375, "rewards/rejected": 0.59375, "step": 1765 }, { "epoch": 0.5094475695946921, "grad_norm": 13.128162958752643, "learning_rate": 2.852614559255251e-07, "logits/chosen": 3.078125, "logits/rejected": 3.078125, "logps/chosen": -1432.0, "logps/rejected": -1440.0, "loss": 0.6617, "loss/demonstration_loss": -2880.0, "loss/preference_loss": -2880.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2265625, "rewards/margins": 0.08251953125, "rewards/rejected": 0.1435546875, "step": 1766 }, { "epoch": 0.5097360450021635, "grad_norm": 12.149654655111377, "learning_rate": 2.85012144527627e-07, "logits/chosen": 3.21875, "logits/rejected": 3.234375, "logps/chosen": -1096.0, "logps/rejected": -1040.0, "loss": 0.6613, "loss/demonstration_loss": -2160.0, "loss/preference_loss": -2160.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.185546875, "rewards/margins": 0.03173828125, "rewards/rejected": 0.154296875, "step": 1767 }, { "epoch": 0.5100245204096351, "grad_norm": 11.01578755937033, "learning_rate": 2.847627976085254e-07, "logits/chosen": 3.21875, "logits/rejected": 3.265625, "logps/chosen": -1784.0, "logps/rejected": -1392.0, "loss": 0.6149, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3200.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.48828125, "rewards/margins": 0.30859375, "rewards/rejected": 0.1787109375, "step": 1768 }, { "epoch": 0.5103129958171065, "grad_norm": 12.07039681947874, "learning_rate": 2.8451341542119264e-07, "logits/chosen": 3.203125, "logits/rejected": 3.171875, "logps/chosen": -1824.0, "logps/rejected": -1736.0, "loss": 0.6987, "loss/demonstration_loss": -3584.0, "loss/preference_loss": -3600.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.267578125, "rewards/margins": -0.07861328125, "rewards/rejected": 0.345703125, "step": 1769 }, { "epoch": 0.5106014712245781, "grad_norm": 9.261033049972546, "learning_rate": 2.842639982186367e-07, "logits/chosen": 3.328125, "logits/rejected": 3.3125, "logps/chosen": -1800.0, "logps/rejected": -1568.0, "loss": 0.6302, "loss/demonstration_loss": -3408.0, "loss/preference_loss": -3392.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.408203125, "rewards/margins": 0.166015625, "rewards/rejected": 0.2421875, "step": 1770 }, { "epoch": 0.5108899466320496, "grad_norm": 11.028187169667907, "learning_rate": 2.840145462539013e-07, "logits/chosen": 3.25, "logits/rejected": 3.25, "logps/chosen": -1632.0, "logps/rejected": -1584.0, "loss": 0.6897, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.2734375, "rewards/margins": -0.0025634765625, "rewards/rejected": 0.27734375, "step": 1771 }, { "epoch": 0.5111784220395211, "grad_norm": 10.958856507788164, "learning_rate": 2.8376505978006523e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1760.0, "logps/rejected": -1440.0, "loss": 0.6868, "loss/demonstration_loss": -3248.0, "loss/preference_loss": -3232.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.39453125, "rewards/margins": 0.06982421875, "rewards/rejected": 0.326171875, "step": 1772 }, { "epoch": 0.5114668974469927, "grad_norm": 10.271647073437943, "learning_rate": 2.835155390502424e-07, "logits/chosen": 3.265625, "logits/rejected": 3.28125, "logps/chosen": -1888.0, "logps/rejected": -1544.0, "loss": 0.6309, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.6484375, "rewards/margins": 0.169921875, "rewards/rejected": 0.4765625, "step": 1773 }, { "epoch": 0.5117553728544642, "grad_norm": 10.07837943509352, "learning_rate": 2.832659843175814e-07, "logits/chosen": 3.234375, "logits/rejected": 3.171875, "logps/chosen": -1624.0, "logps/rejected": -1600.0, "loss": 0.6793, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3264.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.4296875, "rewards/margins": 0.042236328125, "rewards/rejected": 0.38671875, "step": 1774 }, { "epoch": 0.5120438482619357, "grad_norm": 10.2426390943358, "learning_rate": 2.830163958352655e-07, "logits/chosen": 3.21875, "logits/rejected": 3.203125, "logps/chosen": -1760.0, "logps/rejected": -1696.0, "loss": 0.6788, "loss/demonstration_loss": -3488.0, "loss/preference_loss": -3488.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.345703125, "rewards/margins": 0.009765625, "rewards/rejected": 0.3359375, "step": 1775 }, { "epoch": 0.5123323236694072, "grad_norm": 12.045472262208655, "learning_rate": 2.827667738565119e-07, "logits/chosen": 3.171875, "logits/rejected": 3.109375, "logps/chosen": -1256.0, "logps/rejected": -1320.0, "loss": 0.6523, "loss/demonstration_loss": -2592.0, "loss/preference_loss": -2592.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.142578125, "rewards/margins": 0.042724609375, "rewards/rejected": 0.09912109375, "step": 1776 }, { "epoch": 0.5126207990768787, "grad_norm": 9.512392921154305, "learning_rate": 2.8251711863457204e-07, "logits/chosen": 3.296875, "logits/rejected": 3.15625, "logps/chosen": -1608.0, "logps/rejected": -1664.0, "loss": 0.6118, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.48828125, "rewards/margins": 0.2109375, "rewards/rejected": 0.27734375, "step": 1777 }, { "epoch": 0.5129092744843502, "grad_norm": 12.276800496727052, "learning_rate": 2.8226743042273106e-07, "logits/chosen": 3.296875, "logits/rejected": 3.359375, "logps/chosen": -2064.0, "logps/rejected": -1680.0, "loss": 0.6464, "loss/demonstration_loss": -3808.0, "loss/preference_loss": -3776.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.55078125, "rewards/margins": 0.212890625, "rewards/rejected": 0.337890625, "step": 1778 }, { "epoch": 0.5131977498918218, "grad_norm": 11.193187245544724, "learning_rate": 2.8201770947430746e-07, "logits/chosen": 3.15625, "logits/rejected": 3.078125, "logps/chosen": -1496.0, "logps/rejected": -1616.0, "loss": 0.7102, "loss/demonstration_loss": -3152.0, "loss/preference_loss": -3152.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.361328125, "rewards/margins": -0.022705078125, "rewards/rejected": 0.384765625, "step": 1779 }, { "epoch": 0.5134862252992932, "grad_norm": 11.263666515842987, "learning_rate": 2.817679560426529e-07, "logits/chosen": 3.359375, "logits/rejected": 3.359375, "logps/chosen": -1824.0, "logps/rejected": -1760.0, "loss": 0.7045, "loss/demonstration_loss": -3616.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.2890625, "rewards/margins": -0.109375, "rewards/rejected": 0.3984375, "step": 1780 }, { "epoch": 0.5137747007067648, "grad_norm": 12.047409658844018, "learning_rate": 2.8151817038115225e-07, "logits/chosen": 3.375, "logits/rejected": 3.421875, "logps/chosen": -1832.0, "logps/rejected": -1600.0, "loss": 0.7298, "loss/demonstration_loss": -3456.0, "loss/preference_loss": -3472.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.322265625, "rewards/margins": -0.09033203125, "rewards/rejected": 0.412109375, "step": 1781 }, { "epoch": 0.5140631761142362, "grad_norm": 12.049594595867552, "learning_rate": 2.8126835274322285e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1536.0, "logps/rejected": -1552.0, "loss": 0.7147, "loss/demonstration_loss": -3120.0, "loss/preference_loss": -3120.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.3125, "rewards/margins": -0.06103515625, "rewards/rejected": 0.373046875, "step": 1782 }, { "epoch": 0.5143516515217078, "grad_norm": 10.544502936629046, "learning_rate": 2.810185033823147e-07, "logits/chosen": 3.3125, "logits/rejected": 3.28125, "logps/chosen": -2272.0, "logps/rejected": -2064.0, "loss": 0.5775, "loss/demonstration_loss": -4384.0, "loss/preference_loss": -4352.0, "rewards/accuracies": 0.5625, "rewards/chosen": 0.60546875, "rewards/margins": 0.26171875, "rewards/rejected": 0.341796875, "step": 1783 }, { "epoch": 0.5146401269291793, "grad_norm": 12.577301818836316, "learning_rate": 2.807686225519097e-07, "logits/chosen": 3.40625, "logits/rejected": 3.265625, "logps/chosen": -1528.0, "logps/rejected": -1512.0, "loss": 0.6682, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.388671875, "rewards/margins": 0.12158203125, "rewards/rejected": 0.267578125, "step": 1784 }, { "epoch": 0.5149286023366508, "grad_norm": 10.639348773105537, "learning_rate": 2.805187105055217e-07, "logits/chosen": 3.21875, "logits/rejected": 3.1875, "logps/chosen": -1568.0, "logps/rejected": -1616.0, "loss": 0.6508, "loss/demonstration_loss": -3232.0, "loss/preference_loss": -3216.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.423828125, "rewards/margins": 0.12890625, "rewards/rejected": 0.294921875, "step": 1785 }, { "epoch": 0.5152170777441223, "grad_norm": 8.79555986209923, "learning_rate": 2.8026876749669666e-07, "logits/chosen": 3.25, "logits/rejected": 3.21875, "logps/chosen": -1528.0, "logps/rejected": -1440.0, "loss": 0.6092, "loss/demonstration_loss": -3024.0, "loss/preference_loss": -3008.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.5625, "rewards/margins": 0.298828125, "rewards/rejected": 0.263671875, "step": 1786 }, { "epoch": 0.5155055531515939, "grad_norm": 12.385555017777516, "learning_rate": 2.8001879377901144e-07, "logits/chosen": 3.328125, "logits/rejected": 3.1875, "logps/chosen": -1880.0, "logps/rejected": -1896.0, "loss": 0.6561, "loss/demonstration_loss": -3824.0, "loss/preference_loss": -3808.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.48046875, "rewards/margins": 0.053955078125, "rewards/rejected": 0.42578125, "step": 1787 }, { "epoch": 0.5157940285590653, "grad_norm": 10.924212942877688, "learning_rate": 2.7976878960607423e-07, "logits/chosen": 3.21875, "logits/rejected": 3.21875, "logps/chosen": -1664.0, "logps/rejected": -2032.0, "loss": 0.6226, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3728.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.412109375, "rewards/margins": 0.09130859375, "rewards/rejected": 0.3203125, "step": 1788 }, { "epoch": 0.5160825039665369, "grad_norm": 10.78593814369221, "learning_rate": 2.795187552315242e-07, "logits/chosen": 3.375, "logits/rejected": 3.375, "logps/chosen": -1696.0, "logps/rejected": -1592.0, "loss": 0.7046, "loss/demonstration_loss": -3344.0, "loss/preference_loss": -3328.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.56640625, "rewards/margins": 0.04296875, "rewards/rejected": 0.5234375, "step": 1789 }, { "epoch": 0.5163709793740083, "grad_norm": 11.279712481239342, "learning_rate": 2.792686909090311e-07, "logits/chosen": 3.21875, "logits/rejected": 3.140625, "logps/chosen": -1528.0, "logps/rejected": -1872.0, "loss": 0.707, "loss/demonstration_loss": -3424.0, "loss/preference_loss": -3424.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.1650390625, "rewards/margins": -0.0546875, "rewards/rejected": 0.2197265625, "step": 1790 }, { "epoch": 0.5166594547814799, "grad_norm": 11.547203408410557, "learning_rate": 2.790185968922951e-07, "logits/chosen": 3.328125, "logits/rejected": 3.3125, "logps/chosen": -1584.0, "logps/rejected": -1688.0, "loss": 0.658, "loss/demonstration_loss": -3312.0, "loss/preference_loss": -3296.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.310546875, "rewards/margins": 0.1162109375, "rewards/rejected": 0.1943359375, "step": 1791 }, { "epoch": 0.5169479301889514, "grad_norm": 9.674800434602586, "learning_rate": 2.787684734350464e-07, "logits/chosen": 3.234375, "logits/rejected": 3.25, "logps/chosen": -1544.0, "logps/rejected": -1512.0, "loss": 0.6423, "loss/demonstration_loss": -3088.0, "loss/preference_loss": -3072.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.296875, "rewards/margins": 0.12890625, "rewards/rejected": 0.16796875, "step": 1792 }, { "epoch": 0.5172364055964229, "grad_norm": 11.004983692149857, "learning_rate": 2.785183207910451e-07, "logits/chosen": 3.234375, "logits/rejected": 3.28125, "logps/chosen": -1856.0, "logps/rejected": -1744.0, "loss": 0.6205, "loss/demonstration_loss": -3648.0, "loss/preference_loss": -3632.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.49609375, "rewards/margins": 0.109375, "rewards/rejected": 0.38671875, "step": 1793 }, { "epoch": 0.5175248810038944, "grad_norm": 11.036381043107324, "learning_rate": 2.78268139214081e-07, "logits/chosen": 3.140625, "logits/rejected": 3.140625, "logps/chosen": -1552.0, "logps/rejected": -1656.0, "loss": 0.6823, "loss/demonstration_loss": -3264.0, "loss/preference_loss": -3248.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.412109375, "rewards/margins": 0.03857421875, "rewards/rejected": 0.373046875, "step": 1794 }, { "epoch": 0.5178133564113659, "grad_norm": 11.74127916604716, "learning_rate": 2.7801792895797314e-07, "logits/chosen": 3.25, "logits/rejected": 3.234375, "logps/chosen": -2144.0, "logps/rejected": -2016.0, "loss": 0.6805, "loss/demonstration_loss": -4192.0, "loss/preference_loss": -4192.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.443359375, "rewards/margins": 0.087890625, "rewards/rejected": 0.35546875, "step": 1795 }, { "epoch": 0.5181018318188374, "grad_norm": 11.510896381212797, "learning_rate": 2.777676902765697e-07, "logits/chosen": 3.296875, "logits/rejected": 3.296875, "logps/chosen": -1984.0, "logps/rejected": -1728.0, "loss": 0.6583, "loss/demonstration_loss": -3744.0, "loss/preference_loss": -3744.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.37109375, "rewards/margins": 0.047119140625, "rewards/rejected": 0.32421875, "step": 1796 }, { "epoch": 0.518390307226309, "grad_norm": 11.714203265773671, "learning_rate": 2.7751742342374785e-07, "logits/chosen": 3.28125, "logits/rejected": 3.296875, "logps/chosen": -1688.0, "logps/rejected": -1776.0, "loss": 0.6896, "loss/demonstration_loss": -3504.0, "loss/preference_loss": -3520.0, "rewards/accuracies": 0.25, "rewards/chosen": 0.48828125, "rewards/margins": -0.01953125, "rewards/rejected": 0.5078125, "step": 1797 }, { "epoch": 0.5186787826337804, "grad_norm": 11.394969558183211, "learning_rate": 2.7726712865341297e-07, "logits/chosen": 3.0625, "logits/rejected": 3.234375, "logps/chosen": -1488.0, "logps/rejected": -1448.0, "loss": 0.6825, "loss/demonstration_loss": -2960.0, "loss/preference_loss": -2944.0, "rewards/accuracies": 0.5, "rewards/chosen": 0.171875, "rewards/margins": 0.025390625, "rewards/rejected": 0.146484375, "step": 1798 }, { "epoch": 0.518967258041252, "grad_norm": 10.172716302831192, "learning_rate": 2.770168062194991e-07, "logits/chosen": 3.234375, "logits/rejected": 3.359375, "logps/chosen": -1256.0, "logps/rejected": -1144.0, "loss": 0.6469, "loss/demonstration_loss": -2416.0, "loss/preference_loss": -2416.0, "rewards/accuracies": 0.375, "rewards/chosen": 0.1689453125, "rewards/margins": 0.0234375, "rewards/rejected": 0.1455078125, "step": 1799 }, { "epoch": 0.5192557334487234, "grad_norm": 11.763509760059897, "learning_rate": 2.767664563759683e-07, "logits/chosen": 3.109375, "logits/rejected": 3.09375, "logps/chosen": -1848.0, "logps/rejected": -1832.0, "loss": 0.6771, "loss/demonstration_loss": -3712.0, "loss/preference_loss": -3712.0, "rewards/accuracies": 0.4375, "rewards/chosen": 0.390625, "rewards/margins": 0.068359375, "rewards/rejected": 0.322265625, "step": 1800 }, { "epoch": 0.5192557334487234, "step": 1800, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 1.1312, "train_samples_per_second": 35360.967, "train_steps_per_second": 1105.03 } ], "logging_steps": 1, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }