diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,119149 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.979883566776183, + "eval_steps": 1000, + "global_step": 8500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.2900763358778623e-09, + "logits/chosen": -3.078238010406494, + "logits/rejected": -3.036238670349121, + "logps/chosen": -119.78216552734375, + "logps/rejected": -81.49699401855469, + "loss": 0.6813, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07474613189697266, + "rewards/margins": 0.049993135035037994, + "rewards/rejected": -0.12473927438259125, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.580152671755725e-09, + "logits/chosen": -3.19315242767334, + "logits/rejected": -3.0488533973693848, + "logps/chosen": -379.8439636230469, + "logps/rejected": -246.86712646484375, + "loss": 0.7741, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.25631868839263916, + "rewards/margins": -0.1461872160434723, + "rewards/rejected": -0.11013145744800568, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 6.870229007633587e-09, + "logits/chosen": -3.103341579437256, + "logits/rejected": -3.2139434814453125, + "logps/chosen": -155.2957763671875, + "logps/rejected": -280.8891296386719, + "loss": 0.5763, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11268434673547745, + "rewards/margins": 0.2866156995296478, + "rewards/rejected": -0.17393136024475098, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 9.16030534351145e-09, + "logits/chosen": -2.8749852180480957, + "logits/rejected": -2.8633487224578857, + "logps/chosen": -414.8719787597656, + "logps/rejected": -280.2159118652344, + "loss": 0.9265, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3218037188053131, + "rewards/margins": -0.36112719774246216, + "rewards/rejected": 0.03932347521185875, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 1.1450381679389314e-08, + "logits/chosen": -3.214733839035034, + "logits/rejected": -3.014448642730713, + "logps/chosen": -287.48492431640625, + "logps/rejected": -319.5427551269531, + "loss": 0.634, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0066321855410933495, + "rewards/margins": 0.18307408690452576, + "rewards/rejected": -0.17644190788269043, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 1.3740458015267175e-08, + "logits/chosen": -3.198916435241699, + "logits/rejected": -3.1198673248291016, + "logps/chosen": -306.1859436035156, + "logps/rejected": -256.5517578125, + "loss": 0.7132, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3292255699634552, + "rewards/margins": 0.04297979548573494, + "rewards/rejected": -0.37220534682273865, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 1.6030534351145036e-08, + "logits/chosen": -3.576998472213745, + "logits/rejected": -3.4146039485931396, + "logps/chosen": -218.9892578125, + "logps/rejected": -149.21798706054688, + "loss": 0.8367, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24793888628482819, + "rewards/margins": -0.1352853775024414, + "rewards/rejected": -0.11265352368354797, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 1.83206106870229e-08, + "logits/chosen": -3.1488728523254395, + "logits/rejected": -3.140510082244873, + "logps/chosen": -531.25732421875, + "logps/rejected": -244.979736328125, + "loss": 0.852, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.35448914766311646, + "rewards/margins": -0.252580851316452, + "rewards/rejected": -0.10190828144550323, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 2.0610687022900764e-08, + "logits/chosen": -3.243004083633423, + "logits/rejected": -2.971198558807373, + "logps/chosen": -338.7559509277344, + "logps/rejected": -160.9521484375, + "loss": 0.8119, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.358298122882843, + "rewards/margins": -0.13543759286403656, + "rewards/rejected": -0.22286053001880646, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 2.2900763358778627e-08, + "logits/chosen": -3.717552661895752, + "logits/rejected": -3.6545305252075195, + "logps/chosen": -293.1756591796875, + "logps/rejected": -308.94097900390625, + "loss": 0.5919, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.016703132539987564, + "rewards/margins": 0.2437538057565689, + "rewards/rejected": -0.260456919670105, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 2.5190839694656487e-08, + "logits/chosen": -3.0810492038726807, + "logits/rejected": -2.7138595581054688, + "logps/chosen": -140.66273498535156, + "logps/rejected": -198.98141479492188, + "loss": 0.7935, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24244041740894318, + "rewards/margins": -0.13574858009815216, + "rewards/rejected": -0.10669183731079102, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 2.748091603053435e-08, + "logits/chosen": -2.6906967163085938, + "logits/rejected": -2.969830274581909, + "logps/chosen": -533.2620239257812, + "logps/rejected": -321.76251220703125, + "loss": 0.914, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6085913777351379, + "rewards/margins": -0.23153522610664368, + "rewards/rejected": -0.3770561218261719, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 2.9770992366412212e-08, + "logits/chosen": -3.072509765625, + "logits/rejected": -3.102613687515259, + "logps/chosen": -334.9183349609375, + "logps/rejected": -248.54986572265625, + "loss": 0.679, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20921511948108673, + "rewards/margins": 0.10817017406225204, + "rewards/rejected": -0.3173852860927582, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 3.206106870229007e-08, + "logits/chosen": -3.446181297302246, + "logits/rejected": -3.629345417022705, + "logps/chosen": -188.58091735839844, + "logps/rejected": -214.6560821533203, + "loss": 0.6999, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14689147472381592, + "rewards/margins": 0.1304509937763214, + "rewards/rejected": -0.27734246850013733, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 3.435114503816794e-08, + "logits/chosen": -3.1664552688598633, + "logits/rejected": -3.3170418739318848, + "logps/chosen": -202.69989013671875, + "logps/rejected": -311.98406982421875, + "loss": 0.6956, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19405591487884521, + "rewards/margins": 0.052658870816230774, + "rewards/rejected": -0.246714785695076, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 3.66412213740458e-08, + "logits/chosen": -3.3414244651794434, + "logits/rejected": -3.961564302444458, + "logps/chosen": -99.53945922851562, + "logps/rejected": -190.3000946044922, + "loss": 0.5005, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06142204999923706, + "rewards/margins": 0.6696228981018066, + "rewards/rejected": -0.7310448884963989, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 3.893129770992366e-08, + "logits/chosen": -3.0124399662017822, + "logits/rejected": -2.840730667114258, + "logps/chosen": -269.3916015625, + "logps/rejected": -235.80328369140625, + "loss": 0.8463, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4099459648132324, + "rewards/margins": -0.1655757576227188, + "rewards/rejected": -0.24437019228935242, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 4.122137404580153e-08, + "logits/chosen": -3.5484561920166016, + "logits/rejected": -3.308903932571411, + "logps/chosen": -267.91192626953125, + "logps/rejected": -269.2580871582031, + "loss": 0.8622, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.38084864616394043, + "rewards/margins": -0.15083739161491394, + "rewards/rejected": -0.2300112396478653, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 4.351145038167938e-08, + "logits/chosen": -2.7539050579071045, + "logits/rejected": -2.8171746730804443, + "logps/chosen": -444.2049865722656, + "logps/rejected": -528.9166259765625, + "loss": 0.7051, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11224745959043503, + "rewards/margins": 0.049409493803977966, + "rewards/rejected": -0.1616569459438324, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 4.5801526717557254e-08, + "logits/chosen": -3.1177096366882324, + "logits/rejected": -3.0110886096954346, + "logps/chosen": -105.60438537597656, + "logps/rejected": -201.22738647460938, + "loss": 0.7114, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17709673941135406, + "rewards/margins": -0.0056163109838962555, + "rewards/rejected": -0.1714804470539093, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 4.809160305343511e-08, + "logits/chosen": -3.28956937789917, + "logits/rejected": -3.1621954441070557, + "logps/chosen": -218.82611083984375, + "logps/rejected": -224.52459716796875, + "loss": 0.7087, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22030237317085266, + "rewards/margins": 0.13378384709358215, + "rewards/rejected": -0.3540862202644348, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 5.038167938931297e-08, + "logits/chosen": -3.541567087173462, + "logits/rejected": -3.389125108718872, + "logps/chosen": -234.33401489257812, + "logps/rejected": -265.4866638183594, + "loss": 1.0767, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.44112852215766907, + "rewards/margins": -0.5136497616767883, + "rewards/rejected": 0.07252121716737747, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 5.267175572519083e-08, + "logits/chosen": -3.6236681938171387, + "logits/rejected": -3.337611675262451, + "logps/chosen": -161.0250244140625, + "logps/rejected": -152.80413818359375, + "loss": 0.8573, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.43034422397613525, + "rewards/margins": -0.14431187510490417, + "rewards/rejected": -0.28603237867355347, + "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 5.49618320610687e-08, + "logits/chosen": -3.0478150844573975, + "logits/rejected": -2.997501850128174, + "logps/chosen": -262.8673095703125, + "logps/rejected": -198.7688751220703, + "loss": 0.7549, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3380405902862549, + "rewards/margins": -0.09912433475255966, + "rewards/rejected": -0.23891624808311462, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 5.7251908396946565e-08, + "logits/chosen": -2.9718570709228516, + "logits/rejected": -2.6679067611694336, + "logps/chosen": -173.62368774414062, + "logps/rejected": -180.52847290039062, + "loss": 0.8252, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6487332582473755, + "rewards/margins": -0.1877424120903015, + "rewards/rejected": -0.4609908163547516, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 5.9541984732824424e-08, + "logits/chosen": -3.1790597438812256, + "logits/rejected": -3.1685009002685547, + "logps/chosen": -451.2270812988281, + "logps/rejected": -257.05303955078125, + "loss": 0.9584, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4195936322212219, + "rewards/margins": -0.36516913771629333, + "rewards/rejected": -0.054424479603767395, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 6.183206106870229e-08, + "logits/chosen": -3.7453653812408447, + "logits/rejected": -3.687110662460327, + "logps/chosen": -293.18316650390625, + "logps/rejected": -310.10235595703125, + "loss": 0.7963, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6200013756752014, + "rewards/margins": -0.025363504886627197, + "rewards/rejected": -0.5946378707885742, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 6.412213740458014e-08, + "logits/chosen": -3.2591514587402344, + "logits/rejected": -3.551267147064209, + "logps/chosen": -105.18576049804688, + "logps/rejected": -138.39620971679688, + "loss": 0.6627, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13792753219604492, + "rewards/margins": 0.1355496644973755, + "rewards/rejected": -0.2734771966934204, + "step": 28 + }, + { + "epoch": 0.0, + "learning_rate": 6.641221374045801e-08, + "logits/chosen": -2.7191622257232666, + "logits/rejected": -3.007643222808838, + "logps/chosen": -506.18853759765625, + "logps/rejected": -316.33709716796875, + "loss": 0.9049, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6219179630279541, + "rewards/margins": -0.28037935495376587, + "rewards/rejected": -0.34153860807418823, + "step": 29 + }, + { + "epoch": 0.0, + "learning_rate": 6.870229007633587e-08, + "logits/chosen": -3.357670545578003, + "logits/rejected": -3.2435083389282227, + "logps/chosen": -334.81182861328125, + "logps/rejected": -338.9449157714844, + "loss": 0.9027, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5266609191894531, + "rewards/margins": -0.25207364559173584, + "rewards/rejected": -0.2745872735977173, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 7.099236641221374e-08, + "logits/chosen": -2.9724888801574707, + "logits/rejected": -3.1793935298919678, + "logps/chosen": -194.36685180664062, + "logps/rejected": -197.75302124023438, + "loss": 0.6695, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14510780572891235, + "rewards/margins": 0.10041507333517075, + "rewards/rejected": -0.2455228865146637, + "step": 31 + }, + { + "epoch": 0.0, + "learning_rate": 7.32824427480916e-08, + "logits/chosen": -3.429110527038574, + "logits/rejected": -3.6405317783355713, + "logps/chosen": -302.10711669921875, + "logps/rejected": -260.0133361816406, + "loss": 0.6717, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23328939080238342, + "rewards/margins": 0.17789646983146667, + "rewards/rejected": -0.4111858606338501, + "step": 32 + }, + { + "epoch": 0.0, + "learning_rate": 7.557251908396946e-08, + "logits/chosen": -2.4055440425872803, + "logits/rejected": -2.447580337524414, + "logps/chosen": -274.1697998046875, + "logps/rejected": -203.53143310546875, + "loss": 0.6863, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.032405950129032135, + "rewards/margins": 0.028429530560970306, + "rewards/rejected": -0.06083548069000244, + "step": 33 + }, + { + "epoch": 0.0, + "learning_rate": 7.786259541984733e-08, + "logits/chosen": -2.4708945751190186, + "logits/rejected": -2.684777021408081, + "logps/chosen": -301.1230773925781, + "logps/rejected": -209.44749450683594, + "loss": 0.9305, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5941293239593506, + "rewards/margins": -0.2963210642337799, + "rewards/rejected": -0.2978082299232483, + "step": 34 + }, + { + "epoch": 0.0, + "learning_rate": 8.015267175572519e-08, + "logits/chosen": -3.1385161876678467, + "logits/rejected": -3.1987080574035645, + "logps/chosen": -319.5728759765625, + "logps/rejected": -289.68719482421875, + "loss": 0.9848, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.709468424320221, + "rewards/margins": -0.4550679922103882, + "rewards/rejected": -0.25440046191215515, + "step": 35 + }, + { + "epoch": 0.0, + "learning_rate": 8.244274809160306e-08, + "logits/chosen": -3.3285205364227295, + "logits/rejected": -3.2466812133789062, + "logps/chosen": -153.22515869140625, + "logps/rejected": -148.2000274658203, + "loss": 0.8992, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5575298070907593, + "rewards/margins": -0.28044992685317993, + "rewards/rejected": -0.27707988023757935, + "step": 36 + }, + { + "epoch": 0.0, + "learning_rate": 8.473282442748092e-08, + "logits/chosen": -2.8870813846588135, + "logits/rejected": -3.0496702194213867, + "logps/chosen": -301.7391052246094, + "logps/rejected": -248.8736572265625, + "loss": 0.6407, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1857985556125641, + "rewards/margins": 0.19316615164279938, + "rewards/rejected": -0.37896469235420227, + "step": 37 + }, + { + "epoch": 0.0, + "learning_rate": 8.702290076335876e-08, + "logits/chosen": -2.786118507385254, + "logits/rejected": -2.9129960536956787, + "logps/chosen": -311.86468505859375, + "logps/rejected": -313.30462646484375, + "loss": 0.6125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07735371589660645, + "rewards/margins": 0.21646477282047272, + "rewards/rejected": -0.29381847381591797, + "step": 38 + }, + { + "epoch": 0.0, + "learning_rate": 8.931297709923663e-08, + "logits/chosen": -2.8042986392974854, + "logits/rejected": -2.8723156452178955, + "logps/chosen": -321.5715637207031, + "logps/rejected": -320.4477233886719, + "loss": 0.6451, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01878397725522518, + "rewards/margins": 0.12471916526556015, + "rewards/rejected": -0.14350314438343048, + "step": 39 + }, + { + "epoch": 0.0, + "learning_rate": 9.160305343511451e-08, + "logits/chosen": -3.1336617469787598, + "logits/rejected": -2.9629650115966797, + "logps/chosen": -503.5635070800781, + "logps/rejected": -265.04559326171875, + "loss": 0.8828, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.55012047290802, + "rewards/margins": -0.273624449968338, + "rewards/rejected": -0.2764959931373596, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 9.389312977099237e-08, + "logits/chosen": -2.956836700439453, + "logits/rejected": -2.9640579223632812, + "logps/chosen": -179.7181396484375, + "logps/rejected": -188.00875854492188, + "loss": 0.5287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.000301264226436615, + "rewards/margins": 0.44891369342803955, + "rewards/rejected": -0.44861239194869995, + "step": 41 + }, + { + "epoch": 0.0, + "learning_rate": 9.618320610687021e-08, + "logits/chosen": -3.740957736968994, + "logits/rejected": -3.6143572330474854, + "logps/chosen": -294.339599609375, + "logps/rejected": -267.4578857421875, + "loss": 0.7226, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12384767830371857, + "rewards/margins": 0.0343015231192112, + "rewards/rejected": -0.15814919769763947, + "step": 42 + }, + { + "epoch": 0.0, + "learning_rate": 9.847328244274808e-08, + "logits/chosen": -3.323808431625366, + "logits/rejected": -3.402543067932129, + "logps/chosen": -290.0013732910156, + "logps/rejected": -249.97146606445312, + "loss": 0.5933, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0443418025970459, + "rewards/margins": 0.22824504971504211, + "rewards/rejected": -0.18390324711799622, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 1.0076335877862595e-07, + "logits/chosen": -3.105375289916992, + "logits/rejected": -3.2476706504821777, + "logps/chosen": -180.52053833007812, + "logps/rejected": -168.6074676513672, + "loss": 0.6973, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30576640367507935, + "rewards/margins": 0.009885972365736961, + "rewards/rejected": -0.31565237045288086, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 1.0305343511450381e-07, + "logits/chosen": -3.436368465423584, + "logits/rejected": -3.6154918670654297, + "logps/chosen": -190.18783569335938, + "logps/rejected": -251.2783660888672, + "loss": 0.6095, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2333325445652008, + "rewards/margins": 0.27039211988449097, + "rewards/rejected": -0.5037246346473694, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 1.0534351145038167e-07, + "logits/chosen": -3.218198776245117, + "logits/rejected": -3.23685884475708, + "logps/chosen": -222.12570190429688, + "logps/rejected": -240.62774658203125, + "loss": 0.7216, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2075226604938507, + "rewards/margins": -0.04449383169412613, + "rewards/rejected": -0.1630288064479828, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 1.0763358778625953e-07, + "logits/chosen": -3.4242541790008545, + "logits/rejected": -3.4930057525634766, + "logps/chosen": -446.5312805175781, + "logps/rejected": -243.98568725585938, + "loss": 0.7223, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1561458706855774, + "rewards/margins": -0.028374865651130676, + "rewards/rejected": -0.12777099013328552, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 1.099236641221374e-07, + "logits/chosen": -3.3152806758880615, + "logits/rejected": -3.234919786453247, + "logps/chosen": -198.54061889648438, + "logps/rejected": -158.94789123535156, + "loss": 0.8807, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4782709777355194, + "rewards/margins": -0.3029758036136627, + "rewards/rejected": -0.1752951741218567, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 1.1221374045801526e-07, + "logits/chosen": -3.510044574737549, + "logits/rejected": -3.3472442626953125, + "logps/chosen": -252.0076904296875, + "logps/rejected": -188.69393920898438, + "loss": 0.7912, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2720061242580414, + "rewards/margins": -0.1420605331659317, + "rewards/rejected": -0.1299455612897873, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 1.1450381679389313e-07, + "logits/chosen": -2.311176061630249, + "logits/rejected": -2.34525990486145, + "logps/chosen": -348.6552734375, + "logps/rejected": -330.98394775390625, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20904523134231567, + "rewards/margins": 0.12434081733226776, + "rewards/rejected": -0.33338606357574463, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 1.1679389312977098e-07, + "logits/chosen": -2.6295225620269775, + "logits/rejected": -2.6080965995788574, + "logps/chosen": -166.45623779296875, + "logps/rejected": -237.57711791992188, + "loss": 0.4807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19380536675453186, + "rewards/margins": 0.9976029396057129, + "rewards/rejected": -1.1914082765579224, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 1.1908396946564885e-07, + "logits/chosen": -3.254615068435669, + "logits/rejected": -3.429771900177002, + "logps/chosen": -275.1093444824219, + "logps/rejected": -261.70440673828125, + "loss": 0.6478, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32171791791915894, + "rewards/margins": 0.18373233079910278, + "rewards/rejected": -0.5054502487182617, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 1.2137404580152673e-07, + "logits/chosen": -2.9028823375701904, + "logits/rejected": -2.97601318359375, + "logps/chosen": -273.58074951171875, + "logps/rejected": -264.04998779296875, + "loss": 0.6445, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10398735851049423, + "rewards/margins": 0.13454608619213104, + "rewards/rejected": -0.23853343725204468, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 1.2366412213740458e-07, + "logits/chosen": -3.6550729274749756, + "logits/rejected": -3.7607603073120117, + "logps/chosen": -283.7383117675781, + "logps/rejected": -102.44178771972656, + "loss": 0.847, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22564618289470673, + "rewards/margins": -0.20799270272254944, + "rewards/rejected": -0.017653487622737885, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 1.2595419847328243e-07, + "logits/chosen": -3.3850491046905518, + "logits/rejected": -3.008723258972168, + "logps/chosen": -225.89337158203125, + "logps/rejected": -225.0657196044922, + "loss": 0.9198, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3809661865234375, + "rewards/margins": -0.34977200627326965, + "rewards/rejected": -0.031194206327199936, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 1.2824427480916029e-07, + "logits/chosen": -3.1954517364501953, + "logits/rejected": -3.2309398651123047, + "logps/chosen": -250.4422149658203, + "logps/rejected": -288.80108642578125, + "loss": 0.6738, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0924837589263916, + "rewards/margins": 0.08327709138393402, + "rewards/rejected": -0.17576085031032562, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 1.3053435114503817e-07, + "logits/chosen": -3.1056759357452393, + "logits/rejected": -3.3055460453033447, + "logps/chosen": -302.9123840332031, + "logps/rejected": -377.08990478515625, + "loss": 0.6432, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1020759865641594, + "rewards/margins": 0.17593111097812653, + "rewards/rejected": -0.2780070900917053, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 1.3282442748091602e-07, + "logits/chosen": -3.499239921569824, + "logits/rejected": -3.706509590148926, + "logps/chosen": -210.54481506347656, + "logps/rejected": -177.09408569335938, + "loss": 0.7282, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13845963776111603, + "rewards/margins": -0.05474615469574928, + "rewards/rejected": -0.08371348679065704, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 1.3511450381679387e-07, + "logits/chosen": -3.2795305252075195, + "logits/rejected": -3.40950345993042, + "logps/chosen": -227.6250457763672, + "logps/rejected": -253.87281799316406, + "loss": 0.6866, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01637895405292511, + "rewards/margins": 0.05221076309680939, + "rewards/rejected": -0.0685897171497345, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 1.3740458015267175e-07, + "logits/chosen": -2.739415168762207, + "logits/rejected": -2.7073135375976562, + "logps/chosen": -263.2015075683594, + "logps/rejected": -264.9258728027344, + "loss": 0.612, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0466853603720665, + "rewards/margins": 0.2469082474708557, + "rewards/rejected": -0.2935935854911804, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 1.396946564885496e-07, + "logits/chosen": -3.107130765914917, + "logits/rejected": -3.1482627391815186, + "logps/chosen": -232.05810546875, + "logps/rejected": -217.86846923828125, + "loss": 0.64, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35781604051589966, + "rewards/margins": 0.1471811830997467, + "rewards/rejected": -0.504997193813324, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 1.4198473282442748e-07, + "logits/chosen": -3.232652187347412, + "logits/rejected": -3.2418293952941895, + "logps/chosen": -293.41375732421875, + "logps/rejected": -176.07763671875, + "loss": 0.8971, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5810433626174927, + "rewards/margins": -0.1955834925174713, + "rewards/rejected": -0.3854598104953766, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 1.4427480916030533e-07, + "logits/chosen": -3.355353355407715, + "logits/rejected": -2.914226531982422, + "logps/chosen": -312.8883361816406, + "logps/rejected": -245.61172485351562, + "loss": 0.7297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30815935134887695, + "rewards/margins": 0.03318149596452713, + "rewards/rejected": -0.3413408398628235, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 1.465648854961832e-07, + "logits/chosen": -3.356411933898926, + "logits/rejected": -3.5593082904815674, + "logps/chosen": -190.41754150390625, + "logps/rejected": -216.968017578125, + "loss": 0.5685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08559704571962357, + "rewards/margins": 0.34894007444381714, + "rewards/rejected": -0.4345371127128601, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 1.4885496183206107e-07, + "logits/chosen": -3.1831443309783936, + "logits/rejected": -3.4159247875213623, + "logps/chosen": -246.69711303710938, + "logps/rejected": -378.2087097167969, + "loss": 0.6722, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21070018410682678, + "rewards/margins": 0.09936267137527466, + "rewards/rejected": -0.31006285548210144, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 1.5114503816793892e-07, + "logits/chosen": -3.176586389541626, + "logits/rejected": -3.2331390380859375, + "logps/chosen": -145.62741088867188, + "logps/rejected": -201.29031372070312, + "loss": 0.6863, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00719447061419487, + "rewards/margins": 0.04938926920294762, + "rewards/rejected": -0.05658373981714249, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 1.5343511450381677e-07, + "logits/chosen": -3.2608749866485596, + "logits/rejected": -3.1006011962890625, + "logps/chosen": -295.36663818359375, + "logps/rejected": -251.03538513183594, + "loss": 0.7539, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24826902151107788, + "rewards/margins": -0.10409437119960785, + "rewards/rejected": -0.14417466521263123, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 1.5572519083969465e-07, + "logits/chosen": -2.817625045776367, + "logits/rejected": -2.641526460647583, + "logps/chosen": -312.7576599121094, + "logps/rejected": -238.22512817382812, + "loss": 0.6838, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16015835106372833, + "rewards/margins": 0.04708592966198921, + "rewards/rejected": -0.20724429190158844, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 1.580152671755725e-07, + "logits/chosen": -2.7484593391418457, + "logits/rejected": -2.7156190872192383, + "logps/chosen": -355.67657470703125, + "logps/rejected": -314.1708679199219, + "loss": 0.6291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.046055540442466736, + "rewards/margins": 0.16284514963626862, + "rewards/rejected": -0.20890067517757416, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 1.6030534351145038e-07, + "logits/chosen": -3.5092968940734863, + "logits/rejected": -3.4158108234405518, + "logps/chosen": -410.8740234375, + "logps/rejected": -239.82080078125, + "loss": 0.8419, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.26288700103759766, + "rewards/margins": -0.22411823272705078, + "rewards/rejected": -0.038768768310546875, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 1.6259541984732824e-07, + "logits/chosen": -3.689443588256836, + "logits/rejected": -3.8457741737365723, + "logps/chosen": -201.53549194335938, + "logps/rejected": -196.2528839111328, + "loss": 0.7281, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5380469560623169, + "rewards/margins": 0.03280482441186905, + "rewards/rejected": -0.5708518028259277, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 1.6488549618320612e-07, + "logits/chosen": -3.6252548694610596, + "logits/rejected": -3.703477621078491, + "logps/chosen": -186.8706817626953, + "logps/rejected": -206.57540893554688, + "loss": 0.7224, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2830352783203125, + "rewards/margins": 0.10716373473405838, + "rewards/rejected": -0.39019903540611267, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 1.6717557251908397e-07, + "logits/chosen": -3.7435011863708496, + "logits/rejected": -3.344494104385376, + "logps/chosen": -408.44549560546875, + "logps/rejected": -195.21826171875, + "loss": 0.6231, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12293081730604172, + "rewards/margins": 0.21756964921951294, + "rewards/rejected": -0.34050044417381287, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 1.6946564885496185e-07, + "logits/chosen": -3.2026569843292236, + "logits/rejected": -3.049459934234619, + "logps/chosen": -168.80950927734375, + "logps/rejected": -109.94509887695312, + "loss": 0.8584, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.355167031288147, + "rewards/margins": -0.14064092934131622, + "rewards/rejected": -0.21452608704566956, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 1.7175572519083967e-07, + "logits/chosen": -3.384188652038574, + "logits/rejected": -3.38889479637146, + "logps/chosen": -136.00726318359375, + "logps/rejected": -167.09767150878906, + "loss": 0.585, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.009287472814321518, + "rewards/margins": 0.2863180935382843, + "rewards/rejected": -0.2956055700778961, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 1.7404580152671753e-07, + "logits/chosen": -3.215665340423584, + "logits/rejected": -3.5976943969726562, + "logps/chosen": -208.8179473876953, + "logps/rejected": -237.8899688720703, + "loss": 0.5989, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22458000481128693, + "rewards/margins": 0.2855520248413086, + "rewards/rejected": -0.5101320743560791, + "step": 76 + }, + { + "epoch": 0.01, + "learning_rate": 1.763358778625954e-07, + "logits/chosen": -3.6791999340057373, + "logits/rejected": -3.5767288208007812, + "logps/chosen": -192.86703491210938, + "logps/rejected": -144.51394653320312, + "loss": 0.6796, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1616715043783188, + "rewards/margins": 0.09672967344522476, + "rewards/rejected": -0.25840115547180176, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 1.7862595419847326e-07, + "logits/chosen": -3.0993099212646484, + "logits/rejected": -3.1729578971862793, + "logps/chosen": -133.24313354492188, + "logps/rejected": -268.46734619140625, + "loss": 0.7985, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3121618330478668, + "rewards/margins": -0.14207322895526886, + "rewards/rejected": -0.17008860409259796, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 1.8091603053435114e-07, + "logits/chosen": -2.547306537628174, + "logits/rejected": -2.672288179397583, + "logps/chosen": -510.82281494140625, + "logps/rejected": -329.476806640625, + "loss": 0.8966, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.27125856280326843, + "rewards/margins": -0.32742053270339966, + "rewards/rejected": 0.05616197735071182, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 1.8320610687022902e-07, + "logits/chosen": -3.897907257080078, + "logits/rejected": -3.639875888824463, + "logps/chosen": -283.5416259765625, + "logps/rejected": -196.81964111328125, + "loss": 0.6515, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.23461467027664185, + "rewards/margins": 0.17349721491336823, + "rewards/rejected": -0.4081118404865265, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 1.8549618320610687e-07, + "logits/chosen": -2.970639228820801, + "logits/rejected": -2.8881149291992188, + "logps/chosen": -190.8037109375, + "logps/rejected": -248.2021026611328, + "loss": 1.0786, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.45137572288513184, + "rewards/margins": -0.5815427303314209, + "rewards/rejected": 0.13016700744628906, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 1.8778625954198475e-07, + "logits/chosen": -2.622469425201416, + "logits/rejected": -2.6842331886291504, + "logps/chosen": -524.6024169921875, + "logps/rejected": -383.9125671386719, + "loss": 0.6993, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3260946273803711, + "rewards/margins": 0.0016674511134624481, + "rewards/rejected": -0.32776206731796265, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 1.9007633587786258e-07, + "logits/chosen": -2.657881259918213, + "logits/rejected": -2.80505108833313, + "logps/chosen": -194.5184326171875, + "logps/rejected": -249.5345458984375, + "loss": 0.5936, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01869310811161995, + "rewards/margins": 0.2896742522716522, + "rewards/rejected": -0.3083673417568207, + "step": 83 + }, + { + "epoch": 0.01, + "learning_rate": 1.9236641221374043e-07, + "logits/chosen": -3.2683839797973633, + "logits/rejected": -3.5612690448760986, + "logps/chosen": -90.61796569824219, + "logps/rejected": -216.69204711914062, + "loss": 0.513, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2816338539123535, + "rewards/margins": 0.6866769194602966, + "rewards/rejected": -0.4050430357456207, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 1.946564885496183e-07, + "logits/chosen": -3.343080520629883, + "logits/rejected": -3.335479497909546, + "logps/chosen": -188.54107666015625, + "logps/rejected": -179.77496337890625, + "loss": 0.669, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28755441308021545, + "rewards/margins": 0.09143888205289841, + "rewards/rejected": -0.37899330258369446, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 1.9694656488549616e-07, + "logits/chosen": -3.7428441047668457, + "logits/rejected": -3.585681438446045, + "logps/chosen": -316.42376708984375, + "logps/rejected": -277.25347900390625, + "loss": 0.8771, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.40692636370658875, + "rewards/margins": 0.04885232448577881, + "rewards/rejected": -0.45577865839004517, + "step": 86 + }, + { + "epoch": 0.01, + "learning_rate": 1.9923664122137404e-07, + "logits/chosen": -3.7199385166168213, + "logits/rejected": -3.6908345222473145, + "logps/chosen": -286.1842346191406, + "logps/rejected": -298.1600646972656, + "loss": 0.7075, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3951358497142792, + "rewards/margins": 0.15575389564037323, + "rewards/rejected": -0.5508897304534912, + "step": 87 + }, + { + "epoch": 0.01, + "learning_rate": 2.015267175572519e-07, + "logits/chosen": -3.4288408756256104, + "logits/rejected": -3.425314426422119, + "logps/chosen": -249.10635375976562, + "logps/rejected": -215.15084838867188, + "loss": 0.7498, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3369584381580353, + "rewards/margins": -0.057854074984788895, + "rewards/rejected": -0.2791043817996979, + "step": 88 + }, + { + "epoch": 0.01, + "learning_rate": 2.0381679389312977e-07, + "logits/chosen": -3.7518229484558105, + "logits/rejected": -3.4919657707214355, + "logps/chosen": -363.00244140625, + "logps/rejected": -267.1479187011719, + "loss": 0.9747, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6541010141372681, + "rewards/margins": -0.43367499113082886, + "rewards/rejected": -0.2204260379076004, + "step": 89 + }, + { + "epoch": 0.01, + "learning_rate": 2.0610687022900762e-07, + "logits/chosen": -2.840808629989624, + "logits/rejected": -2.660957098007202, + "logps/chosen": -231.73004150390625, + "logps/rejected": -251.68307495117188, + "loss": 1.2391, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.837734043598175, + "rewards/margins": -0.6147798299789429, + "rewards/rejected": -0.22295422852039337, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 2.083969465648855e-07, + "logits/chosen": -2.8722660541534424, + "logits/rejected": -3.3833670616149902, + "logps/chosen": -108.17529296875, + "logps/rejected": -186.573486328125, + "loss": 0.532, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03559593856334686, + "rewards/margins": 0.44083544611930847, + "rewards/rejected": -0.47643136978149414, + "step": 91 + }, + { + "epoch": 0.01, + "learning_rate": 2.1068702290076333e-07, + "logits/chosen": -2.808424711227417, + "logits/rejected": -2.9115793704986572, + "logps/chosen": -279.8440856933594, + "logps/rejected": -182.3898468017578, + "loss": 0.7081, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20777331292629242, + "rewards/margins": 0.06929311901330948, + "rewards/rejected": -0.2770664393901825, + "step": 92 + }, + { + "epoch": 0.01, + "learning_rate": 2.129770992366412e-07, + "logits/chosen": -3.868560314178467, + "logits/rejected": -3.631479263305664, + "logps/chosen": -395.3164367675781, + "logps/rejected": -268.59808349609375, + "loss": 0.6555, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10652284324169159, + "rewards/margins": 0.22148486971855164, + "rewards/rejected": -0.32800769805908203, + "step": 93 + }, + { + "epoch": 0.01, + "learning_rate": 2.1526717557251906e-07, + "logits/chosen": -2.4958600997924805, + "logits/rejected": -2.588447332382202, + "logps/chosen": -165.06683349609375, + "logps/rejected": -252.86849975585938, + "loss": 0.8546, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3297153413295746, + "rewards/margins": -0.17414125800132751, + "rewards/rejected": -0.15557409822940826, + "step": 94 + }, + { + "epoch": 0.01, + "learning_rate": 2.1755725190839694e-07, + "logits/chosen": -4.081074237823486, + "logits/rejected": -3.84214448928833, + "logps/chosen": -268.2448425292969, + "logps/rejected": -181.27870178222656, + "loss": 0.6577, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19969063997268677, + "rewards/margins": 0.13628439605236053, + "rewards/rejected": -0.3359750211238861, + "step": 95 + }, + { + "epoch": 0.01, + "learning_rate": 2.198473282442748e-07, + "logits/chosen": -2.5362308025360107, + "logits/rejected": -2.412184715270996, + "logps/chosen": -210.59732055664062, + "logps/rejected": -203.92547607421875, + "loss": 0.5965, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.029003728181123734, + "rewards/margins": 0.2204502522945404, + "rewards/rejected": -0.24945397675037384, + "step": 96 + }, + { + "epoch": 0.01, + "learning_rate": 2.2213740458015267e-07, + "logits/chosen": -2.621870994567871, + "logits/rejected": -2.678809642791748, + "logps/chosen": -270.0916442871094, + "logps/rejected": -223.74954223632812, + "loss": 0.8207, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21331731975078583, + "rewards/margins": -0.17596152424812317, + "rewards/rejected": -0.037355780601501465, + "step": 97 + }, + { + "epoch": 0.01, + "learning_rate": 2.2442748091603053e-07, + "logits/chosen": -3.2101831436157227, + "logits/rejected": -3.0311570167541504, + "logps/chosen": -204.61627197265625, + "logps/rejected": -240.28762817382812, + "loss": 0.6831, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21028414368629456, + "rewards/margins": 0.12435102462768555, + "rewards/rejected": -0.3346351683139801, + "step": 98 + }, + { + "epoch": 0.01, + "learning_rate": 2.267175572519084e-07, + "logits/chosen": -2.986084461212158, + "logits/rejected": -3.260254383087158, + "logps/chosen": -232.95578002929688, + "logps/rejected": -339.60125732421875, + "loss": 0.6849, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29341188073158264, + "rewards/margins": 0.08152984082698822, + "rewards/rejected": -0.37494170665740967, + "step": 99 + }, + { + "epoch": 0.01, + "learning_rate": 2.2900763358778626e-07, + "logits/chosen": -3.2261757850646973, + "logits/rejected": -3.233630418777466, + "logps/chosen": -133.8319091796875, + "logps/rejected": -266.5990295410156, + "loss": 0.6538, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2386825531721115, + "rewards/margins": 0.14204590022563934, + "rewards/rejected": -0.38072848320007324, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 2.3129770992366408e-07, + "logits/chosen": -2.8640241622924805, + "logits/rejected": -3.1998090744018555, + "logps/chosen": -154.29061889648438, + "logps/rejected": -190.56939697265625, + "loss": 0.664, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45760613679885864, + "rewards/margins": 0.39487671852111816, + "rewards/rejected": -0.852482795715332, + "step": 101 + }, + { + "epoch": 0.01, + "learning_rate": 2.3358778625954196e-07, + "logits/chosen": -3.082688808441162, + "logits/rejected": -3.0252885818481445, + "logps/chosen": -237.9062042236328, + "logps/rejected": -252.95895385742188, + "loss": 0.6791, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11365075409412384, + "rewards/margins": 0.0526578426361084, + "rewards/rejected": -0.16630861163139343, + "step": 102 + }, + { + "epoch": 0.01, + "learning_rate": 2.3587786259541982e-07, + "logits/chosen": -2.893190860748291, + "logits/rejected": -2.84110951423645, + "logps/chosen": -242.72274780273438, + "logps/rejected": -234.75003051757812, + "loss": 0.4816, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07537122070789337, + "rewards/margins": 0.6416997313499451, + "rewards/rejected": -0.7170709371566772, + "step": 103 + }, + { + "epoch": 0.01, + "learning_rate": 2.381679389312977e-07, + "logits/chosen": -2.6073660850524902, + "logits/rejected": -2.7198150157928467, + "logps/chosen": -468.8730773925781, + "logps/rejected": -289.12579345703125, + "loss": 0.7131, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37968218326568604, + "rewards/margins": 0.021421197801828384, + "rewards/rejected": -0.4011034369468689, + "step": 104 + }, + { + "epoch": 0.01, + "learning_rate": 2.4045801526717555e-07, + "logits/chosen": -2.5145530700683594, + "logits/rejected": -2.4582180976867676, + "logps/chosen": -374.68780517578125, + "logps/rejected": -169.569091796875, + "loss": 0.9596, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.36867406964302063, + "rewards/margins": -0.38278090953826904, + "rewards/rejected": 0.01410684734582901, + "step": 105 + }, + { + "epoch": 0.01, + "learning_rate": 2.4274809160305345e-07, + "logits/chosen": -2.3901045322418213, + "logits/rejected": -2.6872684955596924, + "logps/chosen": -507.0616455078125, + "logps/rejected": -330.80035400390625, + "loss": 0.7278, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.35140591859817505, + "rewards/margins": 0.08202869445085526, + "rewards/rejected": -0.4334346055984497, + "step": 106 + }, + { + "epoch": 0.01, + "learning_rate": 2.450381679389313e-07, + "logits/chosen": -3.057403326034546, + "logits/rejected": -3.0108768939971924, + "logps/chosen": -148.05653381347656, + "logps/rejected": -149.05780029296875, + "loss": 0.6064, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01555030420422554, + "rewards/margins": 0.20563119649887085, + "rewards/rejected": -0.19008088111877441, + "step": 107 + }, + { + "epoch": 0.01, + "learning_rate": 2.4732824427480916e-07, + "logits/chosen": -2.9712204933166504, + "logits/rejected": -3.286478042602539, + "logps/chosen": -191.14633178710938, + "logps/rejected": -256.52520751953125, + "loss": 0.7175, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06530895829200745, + "rewards/margins": -0.015452280640602112, + "rewards/rejected": -0.04985666275024414, + "step": 108 + }, + { + "epoch": 0.01, + "learning_rate": 2.49618320610687e-07, + "logits/chosen": -3.298278331756592, + "logits/rejected": -3.1989502906799316, + "logps/chosen": -280.6126708984375, + "logps/rejected": -299.59027099609375, + "loss": 0.7241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13395948708057404, + "rewards/margins": 0.010253429412841797, + "rewards/rejected": -0.14421290159225464, + "step": 109 + }, + { + "epoch": 0.01, + "learning_rate": 2.5190839694656487e-07, + "logits/chosen": -3.0720670223236084, + "logits/rejected": -3.1476120948791504, + "logps/chosen": -265.6000061035156, + "logps/rejected": -212.43777465820312, + "loss": 0.6345, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23672738671302795, + "rewards/margins": 0.293150931596756, + "rewards/rejected": -0.5298783183097839, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 2.541984732824427e-07, + "logits/chosen": -3.446840524673462, + "logits/rejected": -3.202186346054077, + "logps/chosen": -222.1020965576172, + "logps/rejected": -199.55259704589844, + "loss": 0.6789, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1506931185722351, + "rewards/margins": 0.06943853199481964, + "rewards/rejected": -0.22013165056705475, + "step": 111 + }, + { + "epoch": 0.01, + "learning_rate": 2.5648854961832057e-07, + "logits/chosen": -3.349372625350952, + "logits/rejected": -3.331946849822998, + "logps/chosen": -184.6533660888672, + "logps/rejected": -186.05755615234375, + "loss": 0.5072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0090029276907444, + "rewards/margins": 0.45447224378585815, + "rewards/rejected": -0.46347516775131226, + "step": 112 + }, + { + "epoch": 0.01, + "learning_rate": 2.587786259541985e-07, + "logits/chosen": -3.6324164867401123, + "logits/rejected": -3.5397114753723145, + "logps/chosen": -242.62844848632812, + "logps/rejected": -219.70465087890625, + "loss": 0.676, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2702448070049286, + "rewards/margins": 0.21820223331451416, + "rewards/rejected": -0.48844704031944275, + "step": 113 + }, + { + "epoch": 0.01, + "learning_rate": 2.6106870229007633e-07, + "logits/chosen": -2.633464813232422, + "logits/rejected": -2.981417655944824, + "logps/chosen": -439.77032470703125, + "logps/rejected": -337.91033935546875, + "loss": 0.7016, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.32548338174819946, + "rewards/margins": 0.012251023203134537, + "rewards/rejected": -0.3377344012260437, + "step": 114 + }, + { + "epoch": 0.01, + "learning_rate": 2.633587786259542e-07, + "logits/chosen": -3.2973833084106445, + "logits/rejected": -3.400735378265381, + "logps/chosen": -138.05523681640625, + "logps/rejected": -180.0774383544922, + "loss": 0.6534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04633617028594017, + "rewards/margins": 0.21817225217819214, + "rewards/rejected": -0.26450836658477783, + "step": 115 + }, + { + "epoch": 0.01, + "learning_rate": 2.6564885496183204e-07, + "logits/chosen": -3.3439953327178955, + "logits/rejected": -3.0374372005462646, + "logps/chosen": -260.5849609375, + "logps/rejected": -185.37100219726562, + "loss": 0.8746, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5902585387229919, + "rewards/margins": -0.2743756175041199, + "rewards/rejected": -0.31588292121887207, + "step": 116 + }, + { + "epoch": 0.01, + "learning_rate": 2.6793893129770994e-07, + "logits/chosen": -3.410749912261963, + "logits/rejected": -3.595094680786133, + "logps/chosen": -108.80453491210938, + "logps/rejected": -123.71438598632812, + "loss": 0.6974, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03910769149661064, + "rewards/margins": 0.10077085345983505, + "rewards/rejected": -0.1398785263299942, + "step": 117 + }, + { + "epoch": 0.01, + "learning_rate": 2.7022900763358774e-07, + "logits/chosen": -3.1846394538879395, + "logits/rejected": -3.3248443603515625, + "logps/chosen": -129.10328674316406, + "logps/rejected": -176.6233367919922, + "loss": 0.6864, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2899637222290039, + "rewards/margins": 0.16040734946727753, + "rewards/rejected": -0.45037105679512024, + "step": 118 + }, + { + "epoch": 0.01, + "learning_rate": 2.7251908396946565e-07, + "logits/chosen": -3.4267988204956055, + "logits/rejected": -3.3199591636657715, + "logps/chosen": -207.3040771484375, + "logps/rejected": -240.88040161132812, + "loss": 0.6593, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06342926621437073, + "rewards/margins": 0.19744788110256195, + "rewards/rejected": -0.2608771324157715, + "step": 119 + }, + { + "epoch": 0.01, + "learning_rate": 2.748091603053435e-07, + "logits/chosen": -2.868412494659424, + "logits/rejected": -2.5516269207000732, + "logps/chosen": -210.47901916503906, + "logps/rejected": -158.34957885742188, + "loss": 0.7778, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.354963481426239, + "rewards/margins": -0.040900036692619324, + "rewards/rejected": -0.31406348943710327, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 2.7709923664122135e-07, + "logits/chosen": -3.559480905532837, + "logits/rejected": -3.4399425983428955, + "logps/chosen": -209.5469970703125, + "logps/rejected": -328.025146484375, + "loss": 0.7457, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.024084679782390594, + "rewards/margins": 0.13470521569252014, + "rewards/rejected": -0.11062048375606537, + "step": 121 + }, + { + "epoch": 0.01, + "learning_rate": 2.793893129770992e-07, + "logits/chosen": -3.115969657897949, + "logits/rejected": -3.3250746726989746, + "logps/chosen": -136.64122009277344, + "logps/rejected": -129.7142333984375, + "loss": 0.7404, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10612975060939789, + "rewards/margins": -0.06728742271661758, + "rewards/rejected": -0.03884231299161911, + "step": 122 + }, + { + "epoch": 0.01, + "learning_rate": 2.816793893129771e-07, + "logits/chosen": -2.689939260482788, + "logits/rejected": -2.5061988830566406, + "logps/chosen": -154.2698211669922, + "logps/rejected": -311.65301513671875, + "loss": 0.7122, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03599357604980469, + "rewards/margins": -0.006336741149425507, + "rewards/rejected": -0.029656827449798584, + "step": 123 + }, + { + "epoch": 0.01, + "learning_rate": 2.8396946564885496e-07, + "logits/chosen": -2.684889793395996, + "logits/rejected": -2.8223021030426025, + "logps/chosen": -223.34930419921875, + "logps/rejected": -322.0392150878906, + "loss": 0.7694, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25060153007507324, + "rewards/margins": -0.10317506641149521, + "rewards/rejected": -0.14742647111415863, + "step": 124 + }, + { + "epoch": 0.01, + "learning_rate": 2.862595419847328e-07, + "logits/chosen": -2.6810455322265625, + "logits/rejected": -2.955416202545166, + "logps/chosen": -180.22251892089844, + "logps/rejected": -207.12493896484375, + "loss": 0.8187, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3565467596054077, + "rewards/margins": -0.14117440581321716, + "rewards/rejected": -0.21537233889102936, + "step": 125 + }, + { + "epoch": 0.01, + "learning_rate": 2.8854961832061067e-07, + "logits/chosen": -2.4012675285339355, + "logits/rejected": -2.5587844848632812, + "logps/chosen": -276.8446960449219, + "logps/rejected": -217.76889038085938, + "loss": 0.7352, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.19575592875480652, + "rewards/margins": -0.04734290391206741, + "rewards/rejected": -0.1484130471944809, + "step": 126 + }, + { + "epoch": 0.01, + "learning_rate": 2.908396946564885e-07, + "logits/chosen": -3.5057852268218994, + "logits/rejected": -3.3522205352783203, + "logps/chosen": -171.24147033691406, + "logps/rejected": -155.68870544433594, + "loss": 0.746, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1005580872297287, + "rewards/margins": 0.0946294516324997, + "rewards/rejected": -0.1951875239610672, + "step": 127 + }, + { + "epoch": 0.01, + "learning_rate": 2.931297709923664e-07, + "logits/chosen": -3.698072671890259, + "logits/rejected": -3.657116651535034, + "logps/chosen": -301.0848388671875, + "logps/rejected": -367.31622314453125, + "loss": 0.4862, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06704981625080109, + "rewards/margins": 0.8873828649520874, + "rewards/rejected": -0.8203331232070923, + "step": 128 + }, + { + "epoch": 0.01, + "learning_rate": 2.9541984732824423e-07, + "logits/chosen": -2.964540481567383, + "logits/rejected": -2.9919073581695557, + "logps/chosen": -231.81051635742188, + "logps/rejected": -252.814453125, + "loss": 0.7622, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2602808475494385, + "rewards/margins": 0.28699302673339844, + "rewards/rejected": -0.5472738146781921, + "step": 129 + }, + { + "epoch": 0.01, + "learning_rate": 2.9770992366412213e-07, + "logits/chosen": -3.481682538986206, + "logits/rejected": -3.518353223800659, + "logps/chosen": -190.08633422851562, + "logps/rejected": -166.47813415527344, + "loss": 0.5496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13021288812160492, + "rewards/margins": 0.5465818047523499, + "rewards/rejected": -0.6767946481704712, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 3e-07, + "logits/chosen": -2.5496842861175537, + "logits/rejected": -2.475698709487915, + "logps/chosen": -268.1327209472656, + "logps/rejected": -337.38543701171875, + "loss": 0.6545, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.058471158146858215, + "rewards/margins": 0.12006109207868576, + "rewards/rejected": -0.17853224277496338, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 2.9996488353037574e-07, + "logits/chosen": -3.3209218978881836, + "logits/rejected": -3.0210037231445312, + "logps/chosen": -268.4331359863281, + "logps/rejected": -175.0969696044922, + "loss": 0.822, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38932472467422485, + "rewards/margins": -0.17867454886436462, + "rewards/rejected": -0.21065017580986023, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 2.999297670607515e-07, + "logits/chosen": -2.925034999847412, + "logits/rejected": -3.2774596214294434, + "logps/chosen": -110.07563781738281, + "logps/rejected": -284.17333984375, + "loss": 0.4082, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1488136649131775, + "rewards/margins": 0.8118383884429932, + "rewards/rejected": -0.6630247235298157, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 2.9989465059112725e-07, + "logits/chosen": -3.530514717102051, + "logits/rejected": -3.268393039703369, + "logps/chosen": -346.12030029296875, + "logps/rejected": -265.051025390625, + "loss": 0.5096, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.016817137598991394, + "rewards/margins": 0.44441425800323486, + "rewards/rejected": -0.46123138070106506, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 2.9985953412150295e-07, + "logits/chosen": -3.0413150787353516, + "logits/rejected": -3.0458922386169434, + "logps/chosen": -438.7294921875, + "logps/rejected": -348.6278076171875, + "loss": 0.7099, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14035530388355255, + "rewards/margins": 0.023443520069122314, + "rewards/rejected": -0.16379882395267487, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 2.998244176518787e-07, + "logits/chosen": -2.143547534942627, + "logits/rejected": -2.2817349433898926, + "logps/chosen": -479.7779541015625, + "logps/rejected": -485.11834716796875, + "loss": 0.7278, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.17917653918266296, + "rewards/margins": 0.040828123688697815, + "rewards/rejected": -0.22000466287136078, + "step": 136 + }, + { + "epoch": 0.02, + "learning_rate": 2.9978930118225446e-07, + "logits/chosen": -3.4842469692230225, + "logits/rejected": -3.4347009658813477, + "logps/chosen": -408.99639892578125, + "logps/rejected": -387.94085693359375, + "loss": 0.7987, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37270495295524597, + "rewards/margins": 0.004420414566993713, + "rewards/rejected": -0.3771253526210785, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 2.997541847126302e-07, + "logits/chosen": -2.9967212677001953, + "logits/rejected": -2.8189125061035156, + "logps/chosen": -138.25588989257812, + "logps/rejected": -242.4408721923828, + "loss": 0.6451, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2035536766052246, + "rewards/margins": 0.24642862379550934, + "rewards/rejected": -0.44998228549957275, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 2.9971906824300596e-07, + "logits/chosen": -3.15767765045166, + "logits/rejected": -2.7366042137145996, + "logps/chosen": -343.4460754394531, + "logps/rejected": -171.1099853515625, + "loss": 0.6915, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23290085792541504, + "rewards/margins": 0.053667664527893066, + "rewards/rejected": -0.2865685224533081, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 2.996839517733817e-07, + "logits/chosen": -2.623671054840088, + "logits/rejected": -2.845733165740967, + "logps/chosen": -302.492919921875, + "logps/rejected": -320.3753662109375, + "loss": 0.5069, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12305383384227753, + "rewards/margins": 0.5965790152549744, + "rewards/rejected": -0.7196328043937683, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 2.996488353037574e-07, + "logits/chosen": -3.285778284072876, + "logits/rejected": -2.98270583152771, + "logps/chosen": -287.9363708496094, + "logps/rejected": -228.86444091796875, + "loss": 0.7223, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22906330227851868, + "rewards/margins": -0.010491617023944855, + "rewards/rejected": -0.2185717076063156, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 2.996137188341332e-07, + "logits/chosen": -3.3022141456604004, + "logits/rejected": -3.2671098709106445, + "logps/chosen": -481.6185302734375, + "logps/rejected": -277.2169189453125, + "loss": 0.6677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21497273445129395, + "rewards/margins": 0.12409339845180511, + "rewards/rejected": -0.33906614780426025, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 2.9957860236450893e-07, + "logits/chosen": -3.365952491760254, + "logits/rejected": -3.150855779647827, + "logps/chosen": -415.9041748046875, + "logps/rejected": -344.7142333984375, + "loss": 0.5989, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24185144901275635, + "rewards/margins": 0.3635188639163971, + "rewards/rejected": -0.6053703427314758, + "step": 143 + }, + { + "epoch": 0.02, + "learning_rate": 2.995434858948847e-07, + "logits/chosen": -3.2255189418792725, + "logits/rejected": -3.396691083908081, + "logps/chosen": -189.93124389648438, + "logps/rejected": -199.94618225097656, + "loss": 0.5606, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.045084331184625626, + "rewards/margins": 0.6578375101089478, + "rewards/rejected": -0.61275315284729, + "step": 144 + }, + { + "epoch": 0.02, + "learning_rate": 2.9950836942526043e-07, + "logits/chosen": -2.579957962036133, + "logits/rejected": -2.634092330932617, + "logps/chosen": -385.6375427246094, + "logps/rejected": -341.6743469238281, + "loss": 0.5896, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04191902279853821, + "rewards/margins": 0.2284519076347351, + "rewards/rejected": -0.1865328848361969, + "step": 145 + }, + { + "epoch": 0.02, + "learning_rate": 2.994732529556362e-07, + "logits/chosen": -3.058870553970337, + "logits/rejected": -3.120302438735962, + "logps/chosen": -279.54766845703125, + "logps/rejected": -271.9564514160156, + "loss": 0.6933, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28896427154541016, + "rewards/margins": 0.07057391107082367, + "rewards/rejected": -0.359538197517395, + "step": 146 + }, + { + "epoch": 0.02, + "learning_rate": 2.9943813648601194e-07, + "logits/chosen": -2.701864719390869, + "logits/rejected": -2.6773757934570312, + "logps/chosen": -287.5202941894531, + "logps/rejected": -180.57945251464844, + "loss": 0.756, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4529024064540863, + "rewards/margins": 0.08259911835193634, + "rewards/rejected": -0.5355014801025391, + "step": 147 + }, + { + "epoch": 0.02, + "learning_rate": 2.994030200163877e-07, + "logits/chosen": -3.3984477519989014, + "logits/rejected": -3.2735342979431152, + "logps/chosen": -209.30294799804688, + "logps/rejected": -250.28787231445312, + "loss": 0.7034, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2591944932937622, + "rewards/margins": 0.18426847457885742, + "rewards/rejected": -0.44346296787261963, + "step": 148 + }, + { + "epoch": 0.02, + "learning_rate": 2.993679035467634e-07, + "logits/chosen": -3.5456769466400146, + "logits/rejected": -3.5502374172210693, + "logps/chosen": -228.46807861328125, + "logps/rejected": -195.2511749267578, + "loss": 0.6286, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33787602186203003, + "rewards/margins": 0.42107442021369934, + "rewards/rejected": -0.7589504718780518, + "step": 149 + }, + { + "epoch": 0.02, + "learning_rate": 2.9933278707713915e-07, + "logits/chosen": -4.046043872833252, + "logits/rejected": -3.7114052772521973, + "logps/chosen": -249.51197814941406, + "logps/rejected": -138.1561279296875, + "loss": 0.5391, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08359494060277939, + "rewards/margins": 0.51688551902771, + "rewards/rejected": -0.6004804372787476, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 2.992976706075149e-07, + "logits/chosen": -3.3404526710510254, + "logits/rejected": -3.208498001098633, + "logps/chosen": -243.32574462890625, + "logps/rejected": -231.23641967773438, + "loss": 0.4954, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10667084157466888, + "rewards/margins": 0.6393780708312988, + "rewards/rejected": -0.7460489273071289, + "step": 151 + }, + { + "epoch": 0.02, + "learning_rate": 2.9926255413789066e-07, + "logits/chosen": -2.9796719551086426, + "logits/rejected": -2.851384401321411, + "logps/chosen": -198.4134521484375, + "logps/rejected": -189.6981201171875, + "loss": 0.802, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24949032068252563, + "rewards/margins": -0.08734472095966339, + "rewards/rejected": -0.16214559972286224, + "step": 152 + }, + { + "epoch": 0.02, + "learning_rate": 2.992274376682664e-07, + "logits/chosen": -3.3185057640075684, + "logits/rejected": -3.5422091484069824, + "logps/chosen": -340.01898193359375, + "logps/rejected": -222.56600952148438, + "loss": 0.6495, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2793940603733063, + "rewards/margins": 0.3967933654785156, + "rewards/rejected": -0.6761875152587891, + "step": 153 + }, + { + "epoch": 0.02, + "learning_rate": 2.991923211986421e-07, + "logits/chosen": -2.9152514934539795, + "logits/rejected": -2.8761844635009766, + "logps/chosen": -149.1962432861328, + "logps/rejected": -339.2282409667969, + "loss": 0.7615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28893014788627625, + "rewards/margins": 0.13765370845794678, + "rewards/rejected": -0.426583856344223, + "step": 154 + }, + { + "epoch": 0.02, + "learning_rate": 2.991572047290179e-07, + "logits/chosen": -2.5073742866516113, + "logits/rejected": -2.8334412574768066, + "logps/chosen": -396.4125061035156, + "logps/rejected": -323.62701416015625, + "loss": 0.4225, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34479600191116333, + "rewards/margins": 0.8884700536727905, + "rewards/rejected": -0.543674111366272, + "step": 155 + }, + { + "epoch": 0.02, + "learning_rate": 2.991220882593937e-07, + "logits/chosen": -3.331512451171875, + "logits/rejected": -3.384878635406494, + "logps/chosen": -124.75603485107422, + "logps/rejected": -154.62310791015625, + "loss": 0.594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10098839551210403, + "rewards/margins": 0.2894188463687897, + "rewards/rejected": -0.3904072642326355, + "step": 156 + }, + { + "epoch": 0.02, + "learning_rate": 2.990869717897694e-07, + "logits/chosen": -3.347975730895996, + "logits/rejected": -3.1921215057373047, + "logps/chosen": -257.6455993652344, + "logps/rejected": -203.8451690673828, + "loss": 0.5977, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07259725034236908, + "rewards/margins": 0.32492148876190186, + "rewards/rejected": -0.39751872420310974, + "step": 157 + }, + { + "epoch": 0.02, + "learning_rate": 2.9905185532014513e-07, + "logits/chosen": -3.3827998638153076, + "logits/rejected": -3.5492677688598633, + "logps/chosen": -328.07012939453125, + "logps/rejected": -292.88995361328125, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28669944405555725, + "rewards/margins": 0.6278131008148193, + "rewards/rejected": -0.9145126342773438, + "step": 158 + }, + { + "epoch": 0.02, + "learning_rate": 2.990167388505209e-07, + "logits/chosen": -3.4158194065093994, + "logits/rejected": -3.3485889434814453, + "logps/chosen": -183.23655700683594, + "logps/rejected": -205.1849822998047, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4549804925918579, + "rewards/margins": 0.5344358682632446, + "rewards/rejected": -0.9894163608551025, + "step": 159 + }, + { + "epoch": 0.02, + "learning_rate": 2.9898162238089664e-07, + "logits/chosen": -2.8854222297668457, + "logits/rejected": -2.7342538833618164, + "logps/chosen": -434.2591552734375, + "logps/rejected": -436.9725341796875, + "loss": 0.5369, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12497800588607788, + "rewards/margins": 0.5749577283859253, + "rewards/rejected": -0.6999356746673584, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 2.989465059112724e-07, + "logits/chosen": -3.999845504760742, + "logits/rejected": -3.616804838180542, + "logps/chosen": -329.3415222167969, + "logps/rejected": -229.0399932861328, + "loss": 0.5533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10923422873020172, + "rewards/margins": 0.5819433927536011, + "rewards/rejected": -0.6911776661872864, + "step": 161 + }, + { + "epoch": 0.02, + "learning_rate": 2.989113894416481e-07, + "logits/chosen": -3.4872474670410156, + "logits/rejected": -3.2849693298339844, + "logps/chosen": -377.61309814453125, + "logps/rejected": -399.02423095703125, + "loss": 0.7418, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.36903852224349976, + "rewards/margins": 0.02642093598842621, + "rewards/rejected": -0.3954594135284424, + "step": 162 + }, + { + "epoch": 0.02, + "learning_rate": 2.9887627297202385e-07, + "logits/chosen": -3.478287696838379, + "logits/rejected": -3.205300807952881, + "logps/chosen": -224.24118041992188, + "logps/rejected": -94.39404296875, + "loss": 0.6099, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0231582410633564, + "rewards/margins": 0.26386886835098267, + "rewards/rejected": -0.24071064591407776, + "step": 163 + }, + { + "epoch": 0.02, + "learning_rate": 2.9884115650239965e-07, + "logits/chosen": -2.5553925037384033, + "logits/rejected": -2.506513833999634, + "logps/chosen": -499.73187255859375, + "logps/rejected": -322.5975036621094, + "loss": 0.954, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7257604598999023, + "rewards/margins": -0.0986221432685852, + "rewards/rejected": -0.6271383166313171, + "step": 164 + }, + { + "epoch": 0.02, + "learning_rate": 2.9880604003277535e-07, + "logits/chosen": -2.6206936836242676, + "logits/rejected": -2.769092559814453, + "logps/chosen": -230.21005249023438, + "logps/rejected": -201.02163696289062, + "loss": 0.7004, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2516683340072632, + "rewards/margins": 0.08148515224456787, + "rewards/rejected": -0.33315351605415344, + "step": 165 + }, + { + "epoch": 0.02, + "learning_rate": 2.987709235631511e-07, + "logits/chosen": -3.632220506668091, + "logits/rejected": -3.4407827854156494, + "logps/chosen": -433.216064453125, + "logps/rejected": -258.58099365234375, + "loss": 0.5777, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.026065528392791748, + "rewards/margins": 0.3912695646286011, + "rewards/rejected": -0.4173351526260376, + "step": 166 + }, + { + "epoch": 0.02, + "learning_rate": 2.9873580709352686e-07, + "logits/chosen": -2.7458789348602295, + "logits/rejected": -2.957353115081787, + "logps/chosen": -162.34446716308594, + "logps/rejected": -347.2967224121094, + "loss": 0.4975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05099650099873543, + "rewards/margins": 0.6179355382919312, + "rewards/rejected": -0.6689320206642151, + "step": 167 + }, + { + "epoch": 0.02, + "learning_rate": 2.987006906239026e-07, + "logits/chosen": -2.4787118434906006, + "logits/rejected": -2.534574508666992, + "logps/chosen": -278.05120849609375, + "logps/rejected": -202.2365264892578, + "loss": 0.7327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29210782051086426, + "rewards/margins": -0.017836904153227806, + "rewards/rejected": -0.2742709219455719, + "step": 168 + }, + { + "epoch": 0.02, + "learning_rate": 2.9866557415427837e-07, + "logits/chosen": -2.821833610534668, + "logits/rejected": -3.0649807453155518, + "logps/chosen": -202.0626220703125, + "logps/rejected": -204.60374450683594, + "loss": 0.7851, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2640414834022522, + "rewards/margins": -0.12227870523929596, + "rewards/rejected": -0.14176276326179504, + "step": 169 + }, + { + "epoch": 0.02, + "learning_rate": 2.9863045768465407e-07, + "logits/chosen": -3.2463502883911133, + "logits/rejected": -3.2087459564208984, + "logps/chosen": -230.4561767578125, + "logps/rejected": -272.46490478515625, + "loss": 0.7687, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20005351305007935, + "rewards/margins": 0.015266001224517822, + "rewards/rejected": -0.21531949937343597, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 2.985953412150298e-07, + "logits/chosen": -3.313397169113159, + "logits/rejected": -3.26961088180542, + "logps/chosen": -233.5687713623047, + "logps/rejected": -195.5608673095703, + "loss": 0.742, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1908952295780182, + "rewards/margins": -0.012923330068588257, + "rewards/rejected": -0.17797189950942993, + "step": 171 + }, + { + "epoch": 0.02, + "learning_rate": 2.985602247454056e-07, + "logits/chosen": -3.9305286407470703, + "logits/rejected": -3.6726322174072266, + "logps/chosen": -334.8080749511719, + "logps/rejected": -171.98944091796875, + "loss": 0.6186, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0617518424987793, + "rewards/margins": 0.2917241156101227, + "rewards/rejected": -0.22997227311134338, + "step": 172 + }, + { + "epoch": 0.02, + "learning_rate": 2.9852510827578133e-07, + "logits/chosen": -2.7501490116119385, + "logits/rejected": -3.198286771774292, + "logps/chosen": -223.817138671875, + "logps/rejected": -332.61468505859375, + "loss": 0.5727, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42387324571609497, + "rewards/margins": 0.6796536445617676, + "rewards/rejected": -1.1035268306732178, + "step": 173 + }, + { + "epoch": 0.02, + "learning_rate": 2.984899918061571e-07, + "logits/chosen": -3.8350906372070312, + "logits/rejected": -3.5324692726135254, + "logps/chosen": -193.56277465820312, + "logps/rejected": -148.30596923828125, + "loss": 0.4862, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2093673050403595, + "rewards/margins": 0.7078065872192383, + "rewards/rejected": -0.498439222574234, + "step": 174 + }, + { + "epoch": 0.02, + "learning_rate": 2.984548753365328e-07, + "logits/chosen": -2.9773130416870117, + "logits/rejected": -3.1218841075897217, + "logps/chosen": -195.6688690185547, + "logps/rejected": -195.92657470703125, + "loss": 0.7823, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5283998847007751, + "rewards/margins": -0.10887815058231354, + "rewards/rejected": -0.4195217490196228, + "step": 175 + }, + { + "epoch": 0.02, + "learning_rate": 2.984197588669086e-07, + "logits/chosen": -2.9050984382629395, + "logits/rejected": -3.1372880935668945, + "logps/chosen": -170.55767822265625, + "logps/rejected": -185.3782958984375, + "loss": 0.5555, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06616852432489395, + "rewards/margins": 0.45835667848587036, + "rewards/rejected": -0.3921881914138794, + "step": 176 + }, + { + "epoch": 0.02, + "learning_rate": 2.9838464239728435e-07, + "logits/chosen": -3.0023117065429688, + "logits/rejected": -3.1521849632263184, + "logps/chosen": -385.71356201171875, + "logps/rejected": -226.0330352783203, + "loss": 0.5664, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3155870735645294, + "rewards/margins": 0.32975566387176514, + "rewards/rejected": -0.014168601483106613, + "step": 177 + }, + { + "epoch": 0.02, + "learning_rate": 2.9834952592766005e-07, + "logits/chosen": -2.600144863128662, + "logits/rejected": -2.405885934829712, + "logps/chosen": -137.55184936523438, + "logps/rejected": -156.10321044921875, + "loss": 0.7071, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19570621848106384, + "rewards/margins": 0.07889263331890106, + "rewards/rejected": -0.2745988667011261, + "step": 178 + }, + { + "epoch": 0.02, + "learning_rate": 2.983144094580358e-07, + "logits/chosen": -3.4724783897399902, + "logits/rejected": -3.27103853225708, + "logps/chosen": -340.7546081542969, + "logps/rejected": -267.78375244140625, + "loss": 0.3968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.017360538244247437, + "rewards/margins": 0.9971700310707092, + "rewards/rejected": -1.0145305395126343, + "step": 179 + }, + { + "epoch": 0.02, + "learning_rate": 2.9827929298841155e-07, + "logits/chosen": -2.8327319622039795, + "logits/rejected": -2.883878707885742, + "logps/chosen": -225.36587524414062, + "logps/rejected": -267.1601257324219, + "loss": 0.5921, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1742541491985321, + "rewards/margins": 0.49065008759498596, + "rewards/rejected": -0.6649041771888733, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 2.982441765187873e-07, + "logits/chosen": -2.311443567276001, + "logits/rejected": -2.3403432369232178, + "logps/chosen": -291.8216247558594, + "logps/rejected": -233.33709716796875, + "loss": 0.6999, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2790372371673584, + "rewards/margins": 0.11873116344213486, + "rewards/rejected": -0.39776840806007385, + "step": 181 + }, + { + "epoch": 0.02, + "learning_rate": 2.9820906004916306e-07, + "logits/chosen": -2.5439298152923584, + "logits/rejected": -2.7971789836883545, + "logps/chosen": -379.8512878417969, + "logps/rejected": -301.1816711425781, + "loss": 0.4132, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0018820781260728836, + "rewards/margins": 0.8097530603408813, + "rewards/rejected": -0.8078709840774536, + "step": 182 + }, + { + "epoch": 0.02, + "learning_rate": 2.9817394357953876e-07, + "logits/chosen": -2.828860282897949, + "logits/rejected": -2.9380154609680176, + "logps/chosen": -253.701416015625, + "logps/rejected": -313.2247314453125, + "loss": 0.6251, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17963212728500366, + "rewards/margins": 0.28871774673461914, + "rewards/rejected": -0.4683498740196228, + "step": 183 + }, + { + "epoch": 0.02, + "learning_rate": 2.981388271099145e-07, + "logits/chosen": -2.8582992553710938, + "logits/rejected": -3.032472610473633, + "logps/chosen": -471.1768493652344, + "logps/rejected": -300.2481994628906, + "loss": 0.3948, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03532904386520386, + "rewards/margins": 0.9726289510726929, + "rewards/rejected": -0.937299907207489, + "step": 184 + }, + { + "epoch": 0.02, + "learning_rate": 2.9810371064029027e-07, + "logits/chosen": -3.062253952026367, + "logits/rejected": -2.9814083576202393, + "logps/chosen": -287.1173095703125, + "logps/rejected": -239.99176025390625, + "loss": 0.5691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12003950774669647, + "rewards/margins": 0.42535489797592163, + "rewards/rejected": -0.5453944206237793, + "step": 185 + }, + { + "epoch": 0.02, + "learning_rate": 2.98068594170666e-07, + "logits/chosen": -3.4551150798797607, + "logits/rejected": -3.4497079849243164, + "logps/chosen": -192.80047607421875, + "logps/rejected": -197.1517333984375, + "loss": 0.4895, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10274100303649902, + "rewards/margins": 0.589533805847168, + "rewards/rejected": -0.692274808883667, + "step": 186 + }, + { + "epoch": 0.02, + "learning_rate": 2.980334777010418e-07, + "logits/chosen": -2.9222331047058105, + "logits/rejected": -3.0306241512298584, + "logps/chosen": -264.43621826171875, + "logps/rejected": -289.7286071777344, + "loss": 0.5074, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.022851087152957916, + "rewards/margins": 0.7150350213050842, + "rewards/rejected": -0.7378861308097839, + "step": 187 + }, + { + "epoch": 0.02, + "learning_rate": 2.979983612314175e-07, + "logits/chosen": -3.7061309814453125, + "logits/rejected": -3.685464382171631, + "logps/chosen": -194.2506866455078, + "logps/rejected": -299.09600830078125, + "loss": 0.5612, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08315838873386383, + "rewards/margins": 0.5760486125946045, + "rewards/rejected": -0.6592070460319519, + "step": 188 + }, + { + "epoch": 0.02, + "learning_rate": 2.979632447617933e-07, + "logits/chosen": -3.250340700149536, + "logits/rejected": -3.4634101390838623, + "logps/chosen": -240.18511962890625, + "logps/rejected": -204.5301971435547, + "loss": 0.5157, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2084040641784668, + "rewards/margins": 0.5127589106559753, + "rewards/rejected": -0.30435487627983093, + "step": 189 + }, + { + "epoch": 0.02, + "learning_rate": 2.9792812829216904e-07, + "logits/chosen": -2.8980226516723633, + "logits/rejected": -2.909475803375244, + "logps/chosen": -265.8323669433594, + "logps/rejected": -297.021728515625, + "loss": 0.5685, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22862929105758667, + "rewards/margins": 0.45408862829208374, + "rewards/rejected": -0.6827179193496704, + "step": 190 + }, + { + "epoch": 0.02, + "learning_rate": 2.9789301182254474e-07, + "logits/chosen": -3.1988024711608887, + "logits/rejected": -3.325883626937866, + "logps/chosen": -218.533203125, + "logps/rejected": -245.3336944580078, + "loss": 0.6795, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5096397995948792, + "rewards/margins": 0.07023922353982925, + "rewards/rejected": -0.5798790454864502, + "step": 191 + }, + { + "epoch": 0.02, + "learning_rate": 2.978578953529205e-07, + "logits/chosen": -3.0551133155822754, + "logits/rejected": -3.1477432250976562, + "logps/chosen": -233.41189575195312, + "logps/rejected": -214.26553344726562, + "loss": 0.5762, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10441388189792633, + "rewards/margins": 0.514127790927887, + "rewards/rejected": -0.6185417175292969, + "step": 192 + }, + { + "epoch": 0.02, + "learning_rate": 2.9782277888329625e-07, + "logits/chosen": -3.3801629543304443, + "logits/rejected": -3.3786168098449707, + "logps/chosen": -371.3843688964844, + "logps/rejected": -311.36669921875, + "loss": 0.5325, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10817471146583557, + "rewards/margins": 0.5224473476409912, + "rewards/rejected": -0.414272665977478, + "step": 193 + }, + { + "epoch": 0.02, + "learning_rate": 2.97787662413672e-07, + "logits/chosen": -2.8085408210754395, + "logits/rejected": -2.8878421783447266, + "logps/chosen": -248.19271850585938, + "logps/rejected": -379.5545959472656, + "loss": 0.4467, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06882905215024948, + "rewards/margins": 0.9077163934707642, + "rewards/rejected": -0.8388873338699341, + "step": 194 + }, + { + "epoch": 0.02, + "learning_rate": 2.9775254594404776e-07, + "logits/chosen": -3.327256679534912, + "logits/rejected": -3.314408779144287, + "logps/chosen": -310.48468017578125, + "logps/rejected": -299.07159423828125, + "loss": 0.4931, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35162973403930664, + "rewards/margins": 0.9348543882369995, + "rewards/rejected": -1.2864842414855957, + "step": 195 + }, + { + "epoch": 0.02, + "learning_rate": 2.9771742947442346e-07, + "logits/chosen": -3.902977466583252, + "logits/rejected": -3.9194223880767822, + "logps/chosen": -159.94650268554688, + "logps/rejected": -148.90464782714844, + "loss": 0.5699, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20235267281532288, + "rewards/margins": 0.5211620330810547, + "rewards/rejected": -0.7235147356987, + "step": 196 + }, + { + "epoch": 0.02, + "learning_rate": 2.976823130047992e-07, + "logits/chosen": -2.1873645782470703, + "logits/rejected": -2.54728102684021, + "logps/chosen": -439.40771484375, + "logps/rejected": -402.89215087890625, + "loss": 0.8621, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06044436991214752, + "rewards/margins": -0.16255342960357666, + "rewards/rejected": 0.10210905969142914, + "step": 197 + }, + { + "epoch": 0.02, + "learning_rate": 2.97647196535175e-07, + "logits/chosen": -2.756392478942871, + "logits/rejected": -2.571483612060547, + "logps/chosen": -199.18685913085938, + "logps/rejected": -262.017578125, + "loss": 0.8194, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36614125967025757, + "rewards/margins": -0.0005308240652084351, + "rewards/rejected": -0.36561042070388794, + "step": 198 + }, + { + "epoch": 0.02, + "learning_rate": 2.976120800655507e-07, + "logits/chosen": -2.8051319122314453, + "logits/rejected": -3.2325615882873535, + "logps/chosen": -197.07574462890625, + "logps/rejected": -246.64891052246094, + "loss": 0.3894, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3187968134880066, + "rewards/margins": 1.0560178756713867, + "rewards/rejected": -0.7372210025787354, + "step": 199 + }, + { + "epoch": 0.02, + "learning_rate": 2.9757696359592647e-07, + "logits/chosen": -3.391042470932007, + "logits/rejected": -3.23837947845459, + "logps/chosen": -282.110595703125, + "logps/rejected": -233.5883331298828, + "loss": 0.6438, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47287696599960327, + "rewards/margins": 0.2671605050563812, + "rewards/rejected": -0.7400375008583069, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 2.9754184712630223e-07, + "logits/chosen": -3.0555269718170166, + "logits/rejected": -2.8955140113830566, + "logps/chosen": -215.63636779785156, + "logps/rejected": -219.0470428466797, + "loss": 0.3911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08939912170171738, + "rewards/margins": 0.9251919984817505, + "rewards/rejected": -1.014591097831726, + "step": 201 + }, + { + "epoch": 0.02, + "learning_rate": 2.97506730656678e-07, + "logits/chosen": -3.0892515182495117, + "logits/rejected": -3.2398135662078857, + "logps/chosen": -260.987060546875, + "logps/rejected": -316.5274353027344, + "loss": 0.5312, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.023985620588064194, + "rewards/margins": 0.5565420985221863, + "rewards/rejected": -0.580527663230896, + "step": 202 + }, + { + "epoch": 0.02, + "learning_rate": 2.9747161418705373e-07, + "logits/chosen": -2.6565470695495605, + "logits/rejected": -2.48661208152771, + "logps/chosen": -158.07730102539062, + "logps/rejected": -314.88397216796875, + "loss": 0.4125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.015331745147705078, + "rewards/margins": 0.9793165922164917, + "rewards/rejected": -0.994648277759552, + "step": 203 + }, + { + "epoch": 0.02, + "learning_rate": 2.9743649771742944e-07, + "logits/chosen": -3.140512466430664, + "logits/rejected": -3.50887393951416, + "logps/chosen": -116.6439208984375, + "logps/rejected": -185.6629638671875, + "loss": 0.4969, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09205470234155655, + "rewards/margins": 0.7980520129203796, + "rewards/rejected": -0.7059973478317261, + "step": 204 + }, + { + "epoch": 0.02, + "learning_rate": 2.974013812478052e-07, + "logits/chosen": -2.906808853149414, + "logits/rejected": -3.275259256362915, + "logps/chosen": -138.61817932128906, + "logps/rejected": -273.44415283203125, + "loss": 0.3595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.023972608149051666, + "rewards/margins": 1.4868533611297607, + "rewards/rejected": -1.5108258724212646, + "step": 205 + }, + { + "epoch": 0.02, + "learning_rate": 2.9736626477818094e-07, + "logits/chosen": -2.807100296020508, + "logits/rejected": -2.839050769805908, + "logps/chosen": -307.84112548828125, + "logps/rejected": -267.107666015625, + "loss": 0.6733, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42347633838653564, + "rewards/margins": 0.1767052412033081, + "rewards/rejected": -0.6001815795898438, + "step": 206 + }, + { + "epoch": 0.02, + "learning_rate": 2.973311483085567e-07, + "logits/chosen": -2.9292707443237305, + "logits/rejected": -2.70388126373291, + "logps/chosen": -258.6240234375, + "logps/rejected": -243.3085479736328, + "loss": 0.5161, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16219741106033325, + "rewards/margins": 0.488385945558548, + "rewards/rejected": -0.3261885344982147, + "step": 207 + }, + { + "epoch": 0.02, + "learning_rate": 2.9729603183893245e-07, + "logits/chosen": -3.6700987815856934, + "logits/rejected": -3.612215042114258, + "logps/chosen": -294.9308776855469, + "logps/rejected": -310.5212707519531, + "loss": 0.6661, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5405645966529846, + "rewards/margins": 0.21268552541732788, + "rewards/rejected": -0.7532501220703125, + "step": 208 + }, + { + "epoch": 0.02, + "learning_rate": 2.972609153693082e-07, + "logits/chosen": -3.4860682487487793, + "logits/rejected": -3.761573553085327, + "logps/chosen": -219.30694580078125, + "logps/rejected": -250.24530029296875, + "loss": 0.4722, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.038902655243873596, + "rewards/margins": 0.6632366180419922, + "rewards/rejected": -0.7021392583847046, + "step": 209 + }, + { + "epoch": 0.02, + "learning_rate": 2.9722579889968396e-07, + "logits/chosen": -2.762735605239868, + "logits/rejected": -2.594841480255127, + "logps/chosen": -255.82757568359375, + "logps/rejected": -148.57679748535156, + "loss": 0.5113, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11014002561569214, + "rewards/margins": 0.6784398555755615, + "rewards/rejected": -0.5682997703552246, + "step": 210 + }, + { + "epoch": 0.02, + "learning_rate": 2.971906824300597e-07, + "logits/chosen": -3.3610377311706543, + "logits/rejected": -3.216486692428589, + "logps/chosen": -199.43492126464844, + "logps/rejected": -169.95294189453125, + "loss": 0.4486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14022159576416016, + "rewards/margins": 0.6015288233757019, + "rewards/rejected": -0.46130722761154175, + "step": 211 + }, + { + "epoch": 0.02, + "learning_rate": 2.971555659604354e-07, + "logits/chosen": -3.5222673416137695, + "logits/rejected": -3.2268295288085938, + "logps/chosen": -161.68572998046875, + "logps/rejected": -110.05447387695312, + "loss": 0.8698, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5529178380966187, + "rewards/margins": -0.2131844162940979, + "rewards/rejected": -0.33973339200019836, + "step": 212 + }, + { + "epoch": 0.02, + "learning_rate": 2.9712044949081117e-07, + "logits/chosen": -3.728032350540161, + "logits/rejected": -3.4043331146240234, + "logps/chosen": -208.7862091064453, + "logps/rejected": -179.4081268310547, + "loss": 0.6808, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28625184297561646, + "rewards/margins": 0.17475757002830505, + "rewards/rejected": -0.4610094428062439, + "step": 213 + }, + { + "epoch": 0.02, + "learning_rate": 2.970853330211869e-07, + "logits/chosen": -2.6386070251464844, + "logits/rejected": -2.7980525493621826, + "logps/chosen": -516.9013671875, + "logps/rejected": -268.7263488769531, + "loss": 0.7721, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15232467651367188, + "rewards/margins": -0.07989473640918732, + "rewards/rejected": -0.07242995500564575, + "step": 214 + }, + { + "epoch": 0.02, + "learning_rate": 2.970502165515627e-07, + "logits/chosen": -3.597278118133545, + "logits/rejected": -3.415316343307495, + "logps/chosen": -290.1488037109375, + "logps/rejected": -283.62860107421875, + "loss": 0.5235, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08039014041423798, + "rewards/margins": 0.5221853256225586, + "rewards/rejected": -0.602575421333313, + "step": 215 + }, + { + "epoch": 0.02, + "learning_rate": 2.9701510008193843e-07, + "logits/chosen": -3.453481912612915, + "logits/rejected": -3.2548611164093018, + "logps/chosen": -236.84219360351562, + "logps/rejected": -185.232421875, + "loss": 0.6122, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0571928545832634, + "rewards/margins": 0.2992919683456421, + "rewards/rejected": -0.3564848303794861, + "step": 216 + }, + { + "epoch": 0.03, + "learning_rate": 2.969799836123142e-07, + "logits/chosen": -3.678865432739258, + "logits/rejected": -3.0691254138946533, + "logps/chosen": -281.11480712890625, + "logps/rejected": -298.4317932128906, + "loss": 0.5655, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.013596925884485245, + "rewards/margins": 0.5251922607421875, + "rewards/rejected": -0.5387891530990601, + "step": 217 + }, + { + "epoch": 0.03, + "learning_rate": 2.969448671426899e-07, + "logits/chosen": -2.94370698928833, + "logits/rejected": -3.2599921226501465, + "logps/chosen": -271.5096740722656, + "logps/rejected": -298.1865539550781, + "loss": 0.9749, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.46265023946762085, + "rewards/margins": 0.10639116168022156, + "rewards/rejected": -0.5690414905548096, + "step": 218 + }, + { + "epoch": 0.03, + "learning_rate": 2.9690975067306564e-07, + "logits/chosen": -2.6451072692871094, + "logits/rejected": -2.4324116706848145, + "logps/chosen": -210.8350067138672, + "logps/rejected": -322.6939697265625, + "loss": 0.6167, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2769347131252289, + "rewards/margins": 0.44237735867500305, + "rewards/rejected": -0.7193120718002319, + "step": 219 + }, + { + "epoch": 0.03, + "learning_rate": 2.968746342034414e-07, + "logits/chosen": -3.206939458847046, + "logits/rejected": -3.2500791549682617, + "logps/chosen": -226.1312255859375, + "logps/rejected": -276.7279357910156, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2716180384159088, + "rewards/margins": 0.07935592532157898, + "rewards/rejected": -0.3509739637374878, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 2.9683951773381715e-07, + "logits/chosen": -3.516022205352783, + "logits/rejected": -3.3892829418182373, + "logps/chosen": -245.95211791992188, + "logps/rejected": -190.95635986328125, + "loss": 0.5184, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09167724847793579, + "rewards/margins": 0.4101697504520416, + "rewards/rejected": -0.31849250197410583, + "step": 221 + }, + { + "epoch": 0.03, + "learning_rate": 2.968044012641929e-07, + "logits/chosen": -2.7190985679626465, + "logits/rejected": -2.912688732147217, + "logps/chosen": -76.05853271484375, + "logps/rejected": -322.3135070800781, + "loss": 0.6346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.153318852186203, + "rewards/margins": 0.22015169262886047, + "rewards/rejected": -0.3734705448150635, + "step": 222 + }, + { + "epoch": 0.03, + "learning_rate": 2.9676928479456865e-07, + "logits/chosen": -2.8298537731170654, + "logits/rejected": -2.6934783458709717, + "logps/chosen": -300.29364013671875, + "logps/rejected": -308.415771484375, + "loss": 0.5984, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28014540672302246, + "rewards/margins": 0.6264234781265259, + "rewards/rejected": -0.9065688848495483, + "step": 223 + }, + { + "epoch": 0.03, + "learning_rate": 2.967341683249444e-07, + "logits/chosen": -3.0922107696533203, + "logits/rejected": -2.8710861206054688, + "logps/chosen": -326.006103515625, + "logps/rejected": -281.7949523925781, + "loss": 0.6966, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4827280044555664, + "rewards/margins": 0.3587973713874817, + "rewards/rejected": -0.8415253162384033, + "step": 224 + }, + { + "epoch": 0.03, + "learning_rate": 2.9669905185532016e-07, + "logits/chosen": -2.3920328617095947, + "logits/rejected": -2.381572723388672, + "logps/chosen": -320.2467346191406, + "logps/rejected": -273.5317077636719, + "loss": 0.8224, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3464587330818176, + "rewards/margins": -0.1277146190404892, + "rewards/rejected": -0.21874408423900604, + "step": 225 + }, + { + "epoch": 0.03, + "learning_rate": 2.9666393538569586e-07, + "logits/chosen": -3.5929512977600098, + "logits/rejected": -3.31587815284729, + "logps/chosen": -214.7141876220703, + "logps/rejected": -184.9407196044922, + "loss": 0.6134, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2481451779603958, + "rewards/margins": 0.21901190280914307, + "rewards/rejected": -0.4671570658683777, + "step": 226 + }, + { + "epoch": 0.03, + "learning_rate": 2.966288189160716e-07, + "logits/chosen": -2.9478559494018555, + "logits/rejected": -2.8069493770599365, + "logps/chosen": -203.62388610839844, + "logps/rejected": -189.1769256591797, + "loss": 0.5301, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07000663876533508, + "rewards/margins": 0.5580227971076965, + "rewards/rejected": -0.6280293464660645, + "step": 227 + }, + { + "epoch": 0.03, + "learning_rate": 2.9659370244644737e-07, + "logits/chosen": -2.190747022628784, + "logits/rejected": -2.487834930419922, + "logps/chosen": -381.3501281738281, + "logps/rejected": -243.27737426757812, + "loss": 0.4856, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.004550546407699585, + "rewards/margins": 0.6609116792678833, + "rewards/rejected": -0.6563611030578613, + "step": 228 + }, + { + "epoch": 0.03, + "learning_rate": 2.965585859768231e-07, + "logits/chosen": -3.680727481842041, + "logits/rejected": -3.666208267211914, + "logps/chosen": -222.2845916748047, + "logps/rejected": -314.9975891113281, + "loss": 0.4674, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030049512162804604, + "rewards/margins": 0.9947059154510498, + "rewards/rejected": -1.0247553586959839, + "step": 229 + }, + { + "epoch": 0.03, + "learning_rate": 2.965234695071989e-07, + "logits/chosen": -3.4629735946655273, + "logits/rejected": -3.31046724319458, + "logps/chosen": -221.97340393066406, + "logps/rejected": -126.62712097167969, + "loss": 0.8937, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2501615285873413, + "rewards/margins": 0.16573986411094666, + "rewards/rejected": -0.4159013032913208, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 2.964883530375746e-07, + "logits/chosen": -3.419600009918213, + "logits/rejected": -3.4507951736450195, + "logps/chosen": -229.01889038085938, + "logps/rejected": -358.0796203613281, + "loss": 0.4654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3408559262752533, + "rewards/margins": 0.7648007273674011, + "rewards/rejected": -1.105656623840332, + "step": 231 + }, + { + "epoch": 0.03, + "learning_rate": 2.964532365679504e-07, + "logits/chosen": -3.765226364135742, + "logits/rejected": -3.8680765628814697, + "logps/chosen": -188.7469482421875, + "logps/rejected": -184.66305541992188, + "loss": 0.5588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1464422643184662, + "rewards/margins": 0.5799602270126343, + "rewards/rejected": -0.7264024615287781, + "step": 232 + }, + { + "epoch": 0.03, + "learning_rate": 2.964181200983261e-07, + "logits/chosen": -2.745063304901123, + "logits/rejected": -2.8669588565826416, + "logps/chosen": -286.19873046875, + "logps/rejected": -265.67486572265625, + "loss": 0.5542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05369701236486435, + "rewards/margins": 0.5675129890441895, + "rewards/rejected": -0.6212100386619568, + "step": 233 + }, + { + "epoch": 0.03, + "learning_rate": 2.9638300362870184e-07, + "logits/chosen": -2.836422920227051, + "logits/rejected": -2.7854535579681396, + "logps/chosen": -287.45306396484375, + "logps/rejected": -154.11834716796875, + "loss": 0.5566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1475120484828949, + "rewards/margins": 0.548670768737793, + "rewards/rejected": -0.6961827874183655, + "step": 234 + }, + { + "epoch": 0.03, + "learning_rate": 2.963478871590776e-07, + "logits/chosen": -3.0314126014709473, + "logits/rejected": -3.105581283569336, + "logps/chosen": -291.4547424316406, + "logps/rejected": -380.5617370605469, + "loss": 0.2724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11533575505018234, + "rewards/margins": 1.307447075843811, + "rewards/rejected": -1.1921114921569824, + "step": 235 + }, + { + "epoch": 0.03, + "learning_rate": 2.9631277068945335e-07, + "logits/chosen": -3.2220299243927, + "logits/rejected": -3.0955076217651367, + "logps/chosen": -371.5953063964844, + "logps/rejected": -283.3895568847656, + "loss": 0.5466, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1008606031537056, + "rewards/margins": 0.5603925585746765, + "rewards/rejected": -0.6612531542778015, + "step": 236 + }, + { + "epoch": 0.03, + "learning_rate": 2.962776542198291e-07, + "logits/chosen": -3.0689451694488525, + "logits/rejected": -3.151905059814453, + "logps/chosen": -162.16529846191406, + "logps/rejected": -269.0153503417969, + "loss": 0.5308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14437344670295715, + "rewards/margins": 0.4573960304260254, + "rewards/rejected": -0.6017694473266602, + "step": 237 + }, + { + "epoch": 0.03, + "learning_rate": 2.9624253775020485e-07, + "logits/chosen": -2.5645570755004883, + "logits/rejected": -2.456944465637207, + "logps/chosen": -430.4282531738281, + "logps/rejected": -340.0616760253906, + "loss": 0.5597, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2560270428657532, + "rewards/margins": 0.7348439693450928, + "rewards/rejected": -0.9908709526062012, + "step": 238 + }, + { + "epoch": 0.03, + "learning_rate": 2.9620742128058056e-07, + "logits/chosen": -2.7330174446105957, + "logits/rejected": -2.7327356338500977, + "logps/chosen": -277.7727355957031, + "logps/rejected": -228.38668823242188, + "loss": 0.5331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19206827878952026, + "rewards/margins": 0.5544301271438599, + "rewards/rejected": -0.7464984655380249, + "step": 239 + }, + { + "epoch": 0.03, + "learning_rate": 2.961723048109563e-07, + "logits/chosen": -3.7633213996887207, + "logits/rejected": -3.566074848175049, + "logps/chosen": -141.76890563964844, + "logps/rejected": -142.35098266601562, + "loss": 0.4026, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10821930319070816, + "rewards/margins": 1.1136753559112549, + "rewards/rejected": -1.0054560899734497, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 2.9613718834133206e-07, + "logits/chosen": -2.6251418590545654, + "logits/rejected": -2.9427804946899414, + "logps/chosen": -120.33463287353516, + "logps/rejected": -202.0577392578125, + "loss": 0.3456, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10063973069190979, + "rewards/margins": 1.3071494102478027, + "rewards/rejected": -1.2065095901489258, + "step": 241 + }, + { + "epoch": 0.03, + "learning_rate": 2.961020718717078e-07, + "logits/chosen": -3.6507034301757812, + "logits/rejected": -3.8930370807647705, + "logps/chosen": -165.0177001953125, + "logps/rejected": -206.48675537109375, + "loss": 0.7507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04302286356687546, + "rewards/margins": 0.16279834508895874, + "rewards/rejected": -0.2058212161064148, + "step": 242 + }, + { + "epoch": 0.03, + "learning_rate": 2.9606695540208357e-07, + "logits/chosen": -2.7475037574768066, + "logits/rejected": -3.154788017272949, + "logps/chosen": -240.9046630859375, + "logps/rejected": -202.08717346191406, + "loss": 0.5142, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00800538994371891, + "rewards/margins": 0.7327769994735718, + "rewards/rejected": -0.7247716188430786, + "step": 243 + }, + { + "epoch": 0.03, + "learning_rate": 2.960318389324593e-07, + "logits/chosen": -3.4889817237854004, + "logits/rejected": -3.0416648387908936, + "logps/chosen": -241.79881286621094, + "logps/rejected": -251.40850830078125, + "loss": 0.602, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06766597926616669, + "rewards/margins": 0.6569793224334717, + "rewards/rejected": -0.5893133282661438, + "step": 244 + }, + { + "epoch": 0.03, + "learning_rate": 2.959967224628351e-07, + "logits/chosen": -3.3415560722351074, + "logits/rejected": -3.5458202362060547, + "logps/chosen": -147.89974975585938, + "logps/rejected": -322.64581298828125, + "loss": 0.4864, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1522575318813324, + "rewards/margins": 1.0821325778961182, + "rewards/rejected": -1.234390139579773, + "step": 245 + }, + { + "epoch": 0.03, + "learning_rate": 2.9596160599321083e-07, + "logits/chosen": -3.1987807750701904, + "logits/rejected": -3.247431993484497, + "logps/chosen": -250.9003448486328, + "logps/rejected": -149.986328125, + "loss": 0.5994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14529022574424744, + "rewards/margins": 0.3438277840614319, + "rewards/rejected": -0.48911798000335693, + "step": 246 + }, + { + "epoch": 0.03, + "learning_rate": 2.9592648952358653e-07, + "logits/chosen": -3.712801933288574, + "logits/rejected": -3.555534839630127, + "logps/chosen": -321.03778076171875, + "logps/rejected": -269.14007568359375, + "loss": 0.4639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.012854758650064468, + "rewards/margins": 0.7215206027030945, + "rewards/rejected": -0.7343753576278687, + "step": 247 + }, + { + "epoch": 0.03, + "learning_rate": 2.958913730539623e-07, + "logits/chosen": -3.397170066833496, + "logits/rejected": -3.4199612140655518, + "logps/chosen": -244.46722412109375, + "logps/rejected": -202.02520751953125, + "loss": 0.4384, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12344011664390564, + "rewards/margins": 0.9416974186897278, + "rewards/rejected": -0.8182573318481445, + "step": 248 + }, + { + "epoch": 0.03, + "learning_rate": 2.9585625658433804e-07, + "logits/chosen": -2.8129043579101562, + "logits/rejected": -2.795145034790039, + "logps/chosen": -252.59750366210938, + "logps/rejected": -142.37274169921875, + "loss": 0.6231, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17783230543136597, + "rewards/margins": 0.5210087299346924, + "rewards/rejected": -0.6988410949707031, + "step": 249 + }, + { + "epoch": 0.03, + "learning_rate": 2.958211401147138e-07, + "logits/chosen": -3.1448240280151367, + "logits/rejected": -2.9438672065734863, + "logps/chosen": -246.2015380859375, + "logps/rejected": -216.67286682128906, + "loss": 0.5658, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19408829510211945, + "rewards/margins": 0.41872453689575195, + "rewards/rejected": -0.6128128170967102, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 2.9578602364508955e-07, + "logits/chosen": -3.0558786392211914, + "logits/rejected": -3.134983539581299, + "logps/chosen": -293.5311584472656, + "logps/rejected": -334.5091552734375, + "loss": 0.4784, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09639740735292435, + "rewards/margins": 0.9320396780967712, + "rewards/rejected": -1.0284371376037598, + "step": 251 + }, + { + "epoch": 0.03, + "learning_rate": 2.9575090717546525e-07, + "logits/chosen": -3.5703725814819336, + "logits/rejected": -3.4591760635375977, + "logps/chosen": -122.27105712890625, + "logps/rejected": -138.0686798095703, + "loss": 0.6985, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2735036313533783, + "rewards/margins": 0.22693270444869995, + "rewards/rejected": -0.5004363656044006, + "step": 252 + }, + { + "epoch": 0.03, + "learning_rate": 2.95715790705841e-07, + "logits/chosen": -2.582799196243286, + "logits/rejected": -2.5798473358154297, + "logps/chosen": -367.116943359375, + "logps/rejected": -163.09207153320312, + "loss": 0.5897, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.043112900108098984, + "rewards/margins": 0.2766725420951843, + "rewards/rejected": -0.23355963826179504, + "step": 253 + }, + { + "epoch": 0.03, + "learning_rate": 2.956806742362168e-07, + "logits/chosen": -2.7647547721862793, + "logits/rejected": -2.7235262393951416, + "logps/chosen": -255.77725219726562, + "logps/rejected": -248.46710205078125, + "loss": 0.6137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09696606546640396, + "rewards/margins": 0.21835319697856903, + "rewards/rejected": -0.3153192698955536, + "step": 254 + }, + { + "epoch": 0.03, + "learning_rate": 2.956455577665925e-07, + "logits/chosen": -2.8253111839294434, + "logits/rejected": -2.7937302589416504, + "logps/chosen": -269.11822509765625, + "logps/rejected": -147.9305419921875, + "loss": 0.735, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3757493793964386, + "rewards/margins": 0.13268440961837769, + "rewards/rejected": -0.5084337592124939, + "step": 255 + }, + { + "epoch": 0.03, + "learning_rate": 2.9561044129696827e-07, + "logits/chosen": -3.5680689811706543, + "logits/rejected": -3.8017170429229736, + "logps/chosen": -184.30157470703125, + "logps/rejected": -181.49844360351562, + "loss": 0.5662, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34342139959335327, + "rewards/margins": 0.41154569387435913, + "rewards/rejected": -0.7549671530723572, + "step": 256 + }, + { + "epoch": 0.03, + "learning_rate": 2.95575324827344e-07, + "logits/chosen": -3.413520336151123, + "logits/rejected": -3.2705631256103516, + "logps/chosen": -207.03585815429688, + "logps/rejected": -187.05044555664062, + "loss": 0.4434, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.044650763273239136, + "rewards/margins": 0.9544289112091064, + "rewards/rejected": -0.999079704284668, + "step": 257 + }, + { + "epoch": 0.03, + "learning_rate": 2.9554020835771977e-07, + "logits/chosen": -3.250988721847534, + "logits/rejected": -3.1080923080444336, + "logps/chosen": -311.73333740234375, + "logps/rejected": -207.7476806640625, + "loss": 1.0542, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6775745153427124, + "rewards/margins": -0.48238229751586914, + "rewards/rejected": -0.19519217312335968, + "step": 258 + }, + { + "epoch": 0.03, + "learning_rate": 2.9550509188809553e-07, + "logits/chosen": -3.295384407043457, + "logits/rejected": -3.2088217735290527, + "logps/chosen": -212.302490234375, + "logps/rejected": -306.0133056640625, + "loss": 0.4493, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10247643291950226, + "rewards/margins": 0.7426782250404358, + "rewards/rejected": -0.8451547622680664, + "step": 259 + }, + { + "epoch": 0.03, + "learning_rate": 2.9546997541847123e-07, + "logits/chosen": -2.9110910892486572, + "logits/rejected": -2.7901899814605713, + "logps/chosen": -161.24423217773438, + "logps/rejected": -165.93568420410156, + "loss": 0.5942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27035072445869446, + "rewards/margins": 0.29355430603027344, + "rewards/rejected": -0.5639050602912903, + "step": 260 + }, + { + "epoch": 0.03, + "learning_rate": 2.95434858948847e-07, + "logits/chosen": -3.393010377883911, + "logits/rejected": -3.228618621826172, + "logps/chosen": -278.3468322753906, + "logps/rejected": -240.8148651123047, + "loss": 0.7107, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.024721860885620117, + "rewards/margins": 0.11118073761463165, + "rewards/rejected": -0.13590261340141296, + "step": 261 + }, + { + "epoch": 0.03, + "learning_rate": 2.9539974247922274e-07, + "logits/chosen": -2.8101284503936768, + "logits/rejected": -2.7222795486450195, + "logps/chosen": -408.3419189453125, + "logps/rejected": -332.5548400878906, + "loss": 0.5974, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15421709418296814, + "rewards/margins": 0.5535743236541748, + "rewards/rejected": -0.7077913880348206, + "step": 262 + }, + { + "epoch": 0.03, + "learning_rate": 2.953646260095985e-07, + "logits/chosen": -2.37119197845459, + "logits/rejected": -2.436436176300049, + "logps/chosen": -217.45526123046875, + "logps/rejected": -155.53355407714844, + "loss": 0.6469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2927638292312622, + "rewards/margins": 0.38705724477767944, + "rewards/rejected": -0.6798210740089417, + "step": 263 + }, + { + "epoch": 0.03, + "learning_rate": 2.9532950953997424e-07, + "logits/chosen": -3.383761405944824, + "logits/rejected": -3.105156183242798, + "logps/chosen": -288.82489013671875, + "logps/rejected": -234.01803588867188, + "loss": 0.5456, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03502259403467178, + "rewards/margins": 0.36700737476348877, + "rewards/rejected": -0.3319847881793976, + "step": 264 + }, + { + "epoch": 0.03, + "learning_rate": 2.9529439307034994e-07, + "logits/chosen": -3.2609729766845703, + "logits/rejected": -3.3870296478271484, + "logps/chosen": -366.5753479003906, + "logps/rejected": -280.2196044921875, + "loss": 0.3967, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04168878495693207, + "rewards/margins": 1.0339088439941406, + "rewards/rejected": -0.9922200441360474, + "step": 265 + }, + { + "epoch": 0.03, + "learning_rate": 2.9525927660072575e-07, + "logits/chosen": -3.5743567943573, + "logits/rejected": -3.2901411056518555, + "logps/chosen": -223.4453125, + "logps/rejected": -137.41665649414062, + "loss": 0.7586, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08028890192508698, + "rewards/margins": -0.00903475284576416, + "rewards/rejected": -0.07125413417816162, + "step": 266 + }, + { + "epoch": 0.03, + "learning_rate": 2.952241601311015e-07, + "logits/chosen": -2.895266056060791, + "logits/rejected": -2.9548327922821045, + "logps/chosen": -129.94833374023438, + "logps/rejected": -185.19534301757812, + "loss": 0.6044, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03316054493188858, + "rewards/margins": 0.39170870184898376, + "rewards/rejected": -0.3585481643676758, + "step": 267 + }, + { + "epoch": 0.03, + "learning_rate": 2.951890436614772e-07, + "logits/chosen": -3.622300148010254, + "logits/rejected": -3.056347370147705, + "logps/chosen": -250.09535217285156, + "logps/rejected": -190.8685760498047, + "loss": 0.6703, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08594925701618195, + "rewards/margins": 0.30523163080215454, + "rewards/rejected": -0.3911808431148529, + "step": 268 + }, + { + "epoch": 0.03, + "learning_rate": 2.9515392719185296e-07, + "logits/chosen": -3.10042667388916, + "logits/rejected": -3.348539113998413, + "logps/chosen": -220.8554229736328, + "logps/rejected": -203.28074645996094, + "loss": 0.5053, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19195130467414856, + "rewards/margins": 0.6746183633804321, + "rewards/rejected": -0.8665696382522583, + "step": 269 + }, + { + "epoch": 0.03, + "learning_rate": 2.951188107222287e-07, + "logits/chosen": -3.3677988052368164, + "logits/rejected": -3.060849189758301, + "logps/chosen": -468.64080810546875, + "logps/rejected": -307.90435791015625, + "loss": 0.6562, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.163176029920578, + "rewards/margins": 0.42041224241256714, + "rewards/rejected": -0.5835882425308228, + "step": 270 + }, + { + "epoch": 0.03, + "learning_rate": 2.9508369425260447e-07, + "logits/chosen": -2.292534351348877, + "logits/rejected": -2.2191662788391113, + "logps/chosen": -198.69781494140625, + "logps/rejected": -153.9535369873047, + "loss": 0.5588, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09385165572166443, + "rewards/margins": 0.4356576204299927, + "rewards/rejected": -0.5295092463493347, + "step": 271 + }, + { + "epoch": 0.03, + "learning_rate": 2.950485777829802e-07, + "logits/chosen": -3.2396583557128906, + "logits/rejected": -2.7925634384155273, + "logps/chosen": -393.10662841796875, + "logps/rejected": -274.460205078125, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06138180196285248, + "rewards/margins": 0.39145928621292114, + "rewards/rejected": -0.4528411030769348, + "step": 272 + }, + { + "epoch": 0.03, + "learning_rate": 2.950134613133559e-07, + "logits/chosen": -2.5421390533447266, + "logits/rejected": -2.56845760345459, + "logps/chosen": -414.1586608886719, + "logps/rejected": -324.009765625, + "loss": 0.6283, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2678390443325043, + "rewards/margins": 0.5445848703384399, + "rewards/rejected": -0.2767457664012909, + "step": 273 + }, + { + "epoch": 0.03, + "learning_rate": 2.949783448437317e-07, + "logits/chosen": -3.293621301651001, + "logits/rejected": -3.4014596939086914, + "logps/chosen": -178.57855224609375, + "logps/rejected": -322.4431457519531, + "loss": 0.5331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27491435408592224, + "rewards/margins": 0.49620288610458374, + "rewards/rejected": -0.7711172103881836, + "step": 274 + }, + { + "epoch": 0.03, + "learning_rate": 2.9494322837410743e-07, + "logits/chosen": -3.5934128761291504, + "logits/rejected": -3.473421096801758, + "logps/chosen": -316.4269104003906, + "logps/rejected": -280.4856872558594, + "loss": 0.3596, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33110618591308594, + "rewards/margins": 1.189288854598999, + "rewards/rejected": -0.8581825494766235, + "step": 275 + }, + { + "epoch": 0.03, + "learning_rate": 2.949081119044832e-07, + "logits/chosen": -2.942293643951416, + "logits/rejected": -3.1980695724487305, + "logps/chosen": -222.44879150390625, + "logps/rejected": -168.8373565673828, + "loss": 0.6689, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.11472505331039429, + "rewards/margins": 0.28952834010124207, + "rewards/rejected": -0.40425336360931396, + "step": 276 + }, + { + "epoch": 0.03, + "learning_rate": 2.9487299543485894e-07, + "logits/chosen": -2.7216310501098633, + "logits/rejected": -2.735112428665161, + "logps/chosen": -142.8868865966797, + "logps/rejected": -118.58480834960938, + "loss": 0.6078, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04224151372909546, + "rewards/margins": 0.29635509848594666, + "rewards/rejected": -0.3385966122150421, + "step": 277 + }, + { + "epoch": 0.03, + "learning_rate": 2.948378789652347e-07, + "logits/chosen": -3.320821762084961, + "logits/rejected": -3.3334197998046875, + "logps/chosen": -331.442138671875, + "logps/rejected": -326.5345153808594, + "loss": 0.474, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3174463212490082, + "rewards/margins": 0.6842853426933289, + "rewards/rejected": -1.0017316341400146, + "step": 278 + }, + { + "epoch": 0.03, + "learning_rate": 2.9480276249561045e-07, + "logits/chosen": -3.4243664741516113, + "logits/rejected": -3.8387043476104736, + "logps/chosen": -209.17385864257812, + "logps/rejected": -207.16128540039062, + "loss": 0.3675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11109502613544464, + "rewards/margins": 1.3444573879241943, + "rewards/rejected": -1.2333624362945557, + "step": 279 + }, + { + "epoch": 0.03, + "learning_rate": 2.947676460259862e-07, + "logits/chosen": -2.8244261741638184, + "logits/rejected": -2.9401025772094727, + "logps/chosen": -322.2203674316406, + "logps/rejected": -391.6444091796875, + "loss": 0.8416, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0325528085231781, + "rewards/margins": -0.13652373850345612, + "rewards/rejected": 0.16907654702663422, + "step": 280 + }, + { + "epoch": 0.03, + "learning_rate": 2.947325295563619e-07, + "logits/chosen": -3.6474616527557373, + "logits/rejected": -3.429649591445923, + "logps/chosen": -279.5596008300781, + "logps/rejected": -244.8258514404297, + "loss": 0.4327, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45788782835006714, + "rewards/margins": 0.9634177684783936, + "rewards/rejected": -1.4213056564331055, + "step": 281 + }, + { + "epoch": 0.03, + "learning_rate": 2.9469741308673765e-07, + "logits/chosen": -3.269453763961792, + "logits/rejected": -3.0245208740234375, + "logps/chosen": -336.4051513671875, + "logps/rejected": -291.3902282714844, + "loss": 0.5383, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1263527274131775, + "rewards/margins": 0.4369955062866211, + "rewards/rejected": -0.5633482933044434, + "step": 282 + }, + { + "epoch": 0.03, + "learning_rate": 2.946622966171134e-07, + "logits/chosen": -3.0807693004608154, + "logits/rejected": -2.7006919384002686, + "logps/chosen": -320.9117431640625, + "logps/rejected": -237.8446044921875, + "loss": 0.7154, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49351128935813904, + "rewards/margins": 0.21444934606552124, + "rewards/rejected": -0.7079607248306274, + "step": 283 + }, + { + "epoch": 0.03, + "learning_rate": 2.9462718014748916e-07, + "logits/chosen": -2.948262929916382, + "logits/rejected": -2.9350359439849854, + "logps/chosen": -274.36187744140625, + "logps/rejected": -205.1109619140625, + "loss": 0.6456, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08139395713806152, + "rewards/margins": 0.1307264119386673, + "rewards/rejected": -0.21212036907672882, + "step": 284 + }, + { + "epoch": 0.03, + "learning_rate": 2.945920636778649e-07, + "logits/chosen": -2.964962959289551, + "logits/rejected": -3.2351560592651367, + "logps/chosen": -338.36151123046875, + "logps/rejected": -358.1169738769531, + "loss": 0.2838, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11050411313772202, + "rewards/margins": 2.0304903984069824, + "rewards/rejected": -1.9199864864349365, + "step": 285 + }, + { + "epoch": 0.03, + "learning_rate": 2.945569472082406e-07, + "logits/chosen": -2.6364738941192627, + "logits/rejected": -2.820420265197754, + "logps/chosen": -173.23806762695312, + "logps/rejected": -286.0772399902344, + "loss": 0.4242, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009502798318862915, + "rewards/margins": 1.4162652492523193, + "rewards/rejected": -1.4067624807357788, + "step": 286 + }, + { + "epoch": 0.03, + "learning_rate": 2.9452183073861637e-07, + "logits/chosen": -3.712928533554077, + "logits/rejected": -4.027872085571289, + "logps/chosen": -141.1028594970703, + "logps/rejected": -178.35043334960938, + "loss": 0.386, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03404662758111954, + "rewards/margins": 1.340185284614563, + "rewards/rejected": -1.3061386346817017, + "step": 287 + }, + { + "epoch": 0.03, + "learning_rate": 2.944867142689922e-07, + "logits/chosen": -3.055030107498169, + "logits/rejected": -3.342327356338501, + "logps/chosen": -349.2940979003906, + "logps/rejected": -236.1305389404297, + "loss": 0.8511, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5700069665908813, + "rewards/margins": -0.04341600835323334, + "rewards/rejected": -0.5265909433364868, + "step": 288 + }, + { + "epoch": 0.03, + "learning_rate": 2.944515977993679e-07, + "logits/chosen": -2.088204860687256, + "logits/rejected": -2.1083574295043945, + "logps/chosen": -406.4343566894531, + "logps/rejected": -307.288818359375, + "loss": 0.7923, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28288477659225464, + "rewards/margins": 0.10361167788505554, + "rewards/rejected": -0.3864964544773102, + "step": 289 + }, + { + "epoch": 0.03, + "learning_rate": 2.9441648132974363e-07, + "logits/chosen": -2.992600917816162, + "logits/rejected": -2.660426139831543, + "logps/chosen": -137.6371307373047, + "logps/rejected": -177.1961669921875, + "loss": 0.553, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11843032389879227, + "rewards/margins": 0.3880849778652191, + "rewards/rejected": -0.5065153241157532, + "step": 290 + }, + { + "epoch": 0.03, + "learning_rate": 2.943813648601194e-07, + "logits/chosen": -3.703796863555908, + "logits/rejected": -3.789332389831543, + "logps/chosen": -237.91632080078125, + "logps/rejected": -285.07269287109375, + "loss": 0.4759, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44285842776298523, + "rewards/margins": 1.0705028772354126, + "rewards/rejected": -1.5133612155914307, + "step": 291 + }, + { + "epoch": 0.03, + "learning_rate": 2.9434624839049514e-07, + "logits/chosen": -2.7813186645507812, + "logits/rejected": -2.623178720474243, + "logps/chosen": -104.82647705078125, + "logps/rejected": -214.0926971435547, + "loss": 0.6244, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11224200576543808, + "rewards/margins": 0.2823832631111145, + "rewards/rejected": -0.17014124989509583, + "step": 292 + }, + { + "epoch": 0.03, + "learning_rate": 2.943111319208709e-07, + "logits/chosen": -3.278107166290283, + "logits/rejected": -3.2751479148864746, + "logps/chosen": -279.2469177246094, + "logps/rejected": -304.2403564453125, + "loss": 0.8096, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2041158527135849, + "rewards/margins": 0.1692911684513092, + "rewards/rejected": -0.3734070062637329, + "step": 293 + }, + { + "epoch": 0.03, + "learning_rate": 2.942760154512466e-07, + "logits/chosen": -2.3685712814331055, + "logits/rejected": -2.8315694332122803, + "logps/chosen": -353.8135986328125, + "logps/rejected": -248.48870849609375, + "loss": 0.671, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.025336403399705887, + "rewards/margins": 0.2184801697731018, + "rewards/rejected": -0.2438165694475174, + "step": 294 + }, + { + "epoch": 0.03, + "learning_rate": 2.9424089898162235e-07, + "logits/chosen": -3.0615806579589844, + "logits/rejected": -2.8228201866149902, + "logps/chosen": -172.1629638671875, + "logps/rejected": -169.157958984375, + "loss": 0.5479, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15541484951972961, + "rewards/margins": 0.8552181124687195, + "rewards/rejected": -0.6998032331466675, + "step": 295 + }, + { + "epoch": 0.03, + "learning_rate": 2.942057825119981e-07, + "logits/chosen": -2.9466476440429688, + "logits/rejected": -3.334681510925293, + "logps/chosen": -284.5035095214844, + "logps/rejected": -378.8334045410156, + "loss": 0.4928, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09600576758384705, + "rewards/margins": 0.6931609511375427, + "rewards/rejected": -0.7891668081283569, + "step": 296 + }, + { + "epoch": 0.03, + "learning_rate": 2.9417066604237386e-07, + "logits/chosen": -3.3863070011138916, + "logits/rejected": -3.148373603820801, + "logps/chosen": -362.2552795410156, + "logps/rejected": -257.8871154785156, + "loss": 0.4848, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4009194076061249, + "rewards/margins": 0.7932018041610718, + "rewards/rejected": -1.1941211223602295, + "step": 297 + }, + { + "epoch": 0.03, + "learning_rate": 2.941355495727496e-07, + "logits/chosen": -2.9037632942199707, + "logits/rejected": -2.832989454269409, + "logps/chosen": -132.77593994140625, + "logps/rejected": -139.24984741210938, + "loss": 0.5631, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.016427859663963318, + "rewards/margins": 0.5653521418571472, + "rewards/rejected": -0.5489243268966675, + "step": 298 + }, + { + "epoch": 0.03, + "learning_rate": 2.9410043310312536e-07, + "logits/chosen": -3.2077412605285645, + "logits/rejected": -3.107977867126465, + "logps/chosen": -345.524658203125, + "logps/rejected": -291.3341979980469, + "loss": 0.3868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014534756541252136, + "rewards/margins": 1.1889867782592773, + "rewards/rejected": -1.203521490097046, + "step": 299 + }, + { + "epoch": 0.03, + "learning_rate": 2.940653166335011e-07, + "logits/chosen": -3.352492094039917, + "logits/rejected": -3.109666347503662, + "logps/chosen": -362.8140563964844, + "logps/rejected": -252.06201171875, + "loss": 0.5785, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18650507926940918, + "rewards/margins": 0.3064579665660858, + "rewards/rejected": -0.4929630160331726, + "step": 300 + }, + { + "epoch": 0.03, + "learning_rate": 2.9403020016387687e-07, + "logits/chosen": -2.6634249687194824, + "logits/rejected": -2.7344226837158203, + "logps/chosen": -356.1751403808594, + "logps/rejected": -317.7835388183594, + "loss": 0.4404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09449952095746994, + "rewards/margins": 0.855010986328125, + "rewards/rejected": -0.9495104551315308, + "step": 301 + }, + { + "epoch": 0.03, + "learning_rate": 2.9399508369425257e-07, + "logits/chosen": -2.7083141803741455, + "logits/rejected": -3.0024375915527344, + "logps/chosen": -274.20294189453125, + "logps/rejected": -202.3379364013672, + "loss": 0.6559, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01950082927942276, + "rewards/margins": 0.9511275887489319, + "rewards/rejected": -0.9316267371177673, + "step": 302 + }, + { + "epoch": 0.03, + "learning_rate": 2.939599672246283e-07, + "logits/chosen": -2.9776711463928223, + "logits/rejected": -2.837388277053833, + "logps/chosen": -258.4771423339844, + "logps/rejected": -188.95675659179688, + "loss": 0.5883, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0939856544137001, + "rewards/margins": 0.25986289978027344, + "rewards/rejected": -0.16587725281715393, + "step": 303 + }, + { + "epoch": 0.04, + "learning_rate": 2.939248507550041e-07, + "logits/chosen": -2.441681385040283, + "logits/rejected": -2.4628002643585205, + "logps/chosen": -259.7264404296875, + "logps/rejected": -250.77215576171875, + "loss": 0.6129, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22008131444454193, + "rewards/margins": 0.3451223075389862, + "rewards/rejected": -0.5652036070823669, + "step": 304 + }, + { + "epoch": 0.04, + "learning_rate": 2.9388973428537983e-07, + "logits/chosen": -3.7031519412994385, + "logits/rejected": -3.6959173679351807, + "logps/chosen": -177.1024169921875, + "logps/rejected": -128.32034301757812, + "loss": 1.1963, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0959892272949219, + "rewards/margins": -0.4990222156047821, + "rewards/rejected": -0.5969669818878174, + "step": 305 + }, + { + "epoch": 0.04, + "learning_rate": 2.938546178157556e-07, + "logits/chosen": -3.327065944671631, + "logits/rejected": -3.6325559616088867, + "logps/chosen": -123.60502624511719, + "logps/rejected": -213.4351806640625, + "loss": 0.2347, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05670592561364174, + "rewards/margins": 1.4688411951065063, + "rewards/rejected": -1.4121352434158325, + "step": 306 + }, + { + "epoch": 0.04, + "learning_rate": 2.9381950134613134e-07, + "logits/chosen": -2.4592673778533936, + "logits/rejected": -2.374279499053955, + "logps/chosen": -344.15069580078125, + "logps/rejected": -383.67718505859375, + "loss": 0.4861, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1637369990348816, + "rewards/margins": 0.6602519154548645, + "rewards/rejected": -0.8239889144897461, + "step": 307 + }, + { + "epoch": 0.04, + "learning_rate": 2.9378438487650704e-07, + "logits/chosen": -2.758707046508789, + "logits/rejected": -2.822021245956421, + "logps/chosen": -153.11996459960938, + "logps/rejected": -148.84185791015625, + "loss": 0.6439, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009745601564645767, + "rewards/margins": 0.306857168674469, + "rewards/rejected": -0.31660276651382446, + "step": 308 + }, + { + "epoch": 0.04, + "learning_rate": 2.937492684068828e-07, + "logits/chosen": -2.9479808807373047, + "logits/rejected": -2.8663699626922607, + "logps/chosen": -219.19906616210938, + "logps/rejected": -187.76983642578125, + "loss": 0.4953, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1511968970298767, + "rewards/margins": 0.6809184551239014, + "rewards/rejected": -0.8321153521537781, + "step": 309 + }, + { + "epoch": 0.04, + "learning_rate": 2.9371415193725855e-07, + "logits/chosen": -3.3738842010498047, + "logits/rejected": -3.1222164630889893, + "logps/chosen": -193.66009521484375, + "logps/rejected": -175.98611450195312, + "loss": 0.6653, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3349013924598694, + "rewards/margins": 0.16812512278556824, + "rewards/rejected": -0.5030264854431152, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 2.936790354676343e-07, + "logits/chosen": -3.5526282787323, + "logits/rejected": -3.445509195327759, + "logps/chosen": -123.62091064453125, + "logps/rejected": -180.59898376464844, + "loss": 0.8308, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4425463080406189, + "rewards/margins": 0.6086779236793518, + "rewards/rejected": -1.0512242317199707, + "step": 311 + }, + { + "epoch": 0.04, + "learning_rate": 2.9364391899801006e-07, + "logits/chosen": -3.2245140075683594, + "logits/rejected": -3.179724931716919, + "logps/chosen": -460.0415344238281, + "logps/rejected": -438.45245361328125, + "loss": 0.8796, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3963492810726166, + "rewards/margins": 0.3340432047843933, + "rewards/rejected": -0.7303924560546875, + "step": 312 + }, + { + "epoch": 0.04, + "learning_rate": 2.936088025283858e-07, + "logits/chosen": -3.1400339603424072, + "logits/rejected": -3.1425466537475586, + "logps/chosen": -239.12672424316406, + "logps/rejected": -219.86468505859375, + "loss": 0.4588, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13392484188079834, + "rewards/margins": 0.8361534476280212, + "rewards/rejected": -0.9700783491134644, + "step": 313 + }, + { + "epoch": 0.04, + "learning_rate": 2.9357368605876157e-07, + "logits/chosen": -2.6414456367492676, + "logits/rejected": -2.6109519004821777, + "logps/chosen": -253.28993225097656, + "logps/rejected": -312.7982482910156, + "loss": 0.4628, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2580501437187195, + "rewards/margins": 0.7300293445587158, + "rewards/rejected": -0.4719792306423187, + "step": 314 + }, + { + "epoch": 0.04, + "learning_rate": 2.935385695891373e-07, + "logits/chosen": -3.2817306518554688, + "logits/rejected": -3.5374622344970703, + "logps/chosen": -157.21253967285156, + "logps/rejected": -307.54412841796875, + "loss": 0.3245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08705174922943115, + "rewards/margins": 1.1963064670562744, + "rewards/rejected": -1.1092548370361328, + "step": 315 + }, + { + "epoch": 0.04, + "learning_rate": 2.93503453119513e-07, + "logits/chosen": -3.5193703174591064, + "logits/rejected": -3.158482551574707, + "logps/chosen": -292.23297119140625, + "logps/rejected": -132.8345184326172, + "loss": 0.6632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08253279328346252, + "rewards/margins": 0.4941173791885376, + "rewards/rejected": -0.5766501426696777, + "step": 316 + }, + { + "epoch": 0.04, + "learning_rate": 2.934683366498888e-07, + "logits/chosen": -2.743800640106201, + "logits/rejected": -3.1043996810913086, + "logps/chosen": -217.6492919921875, + "logps/rejected": -310.0658874511719, + "loss": 0.4752, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18808509409427643, + "rewards/margins": 0.8832048773765564, + "rewards/rejected": -1.0712898969650269, + "step": 317 + }, + { + "epoch": 0.04, + "learning_rate": 2.9343322018026453e-07, + "logits/chosen": -2.9949402809143066, + "logits/rejected": -3.3514065742492676, + "logps/chosen": -113.5266342163086, + "logps/rejected": -246.15701293945312, + "loss": 0.3837, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2799297869205475, + "rewards/margins": 1.240370512008667, + "rewards/rejected": -0.9604406952857971, + "step": 318 + }, + { + "epoch": 0.04, + "learning_rate": 2.933981037106403e-07, + "logits/chosen": -2.973745346069336, + "logits/rejected": -3.2470593452453613, + "logps/chosen": -384.97125244140625, + "logps/rejected": -464.354248046875, + "loss": 0.3641, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.058614302426576614, + "rewards/margins": 1.1149569749832153, + "rewards/rejected": -1.0563428401947021, + "step": 319 + }, + { + "epoch": 0.04, + "learning_rate": 2.9336298724101604e-07, + "logits/chosen": -3.272603988647461, + "logits/rejected": -3.3869433403015137, + "logps/chosen": -194.43154907226562, + "logps/rejected": -215.8647003173828, + "loss": 0.888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3865167498588562, + "rewards/margins": -0.10005658864974976, + "rewards/rejected": -0.2864602208137512, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 2.9332787077139174e-07, + "logits/chosen": -3.1690454483032227, + "logits/rejected": -3.3065826892852783, + "logps/chosen": -237.34722900390625, + "logps/rejected": -217.1185302734375, + "loss": 0.5791, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2768930494785309, + "rewards/margins": 0.6962430477142334, + "rewards/rejected": -0.9731361269950867, + "step": 321 + }, + { + "epoch": 0.04, + "learning_rate": 2.9329275430176754e-07, + "logits/chosen": -2.8331990242004395, + "logits/rejected": -3.181321144104004, + "logps/chosen": -310.63226318359375, + "logps/rejected": -353.94482421875, + "loss": 0.3286, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16542020440101624, + "rewards/margins": 1.615039348602295, + "rewards/rejected": -1.449619174003601, + "step": 322 + }, + { + "epoch": 0.04, + "learning_rate": 2.9325763783214324e-07, + "logits/chosen": -3.0022006034851074, + "logits/rejected": -3.0283021926879883, + "logps/chosen": -318.5064697265625, + "logps/rejected": -284.5988464355469, + "loss": 0.3804, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1324634701013565, + "rewards/margins": 1.1459918022155762, + "rewards/rejected": -1.0135283470153809, + "step": 323 + }, + { + "epoch": 0.04, + "learning_rate": 2.93222521362519e-07, + "logits/chosen": -2.5371994972229004, + "logits/rejected": -2.9142589569091797, + "logps/chosen": -197.29006958007812, + "logps/rejected": -157.74923706054688, + "loss": 0.7642, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08228358626365662, + "rewards/margins": 0.06503608822822571, + "rewards/rejected": 0.017247512936592102, + "step": 324 + }, + { + "epoch": 0.04, + "learning_rate": 2.9318740489289475e-07, + "logits/chosen": -3.585228443145752, + "logits/rejected": -3.0260109901428223, + "logps/chosen": -302.4771728515625, + "logps/rejected": -254.6261749267578, + "loss": 0.7624, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40179863572120667, + "rewards/margins": 0.38462257385253906, + "rewards/rejected": -0.7864211797714233, + "step": 325 + }, + { + "epoch": 0.04, + "learning_rate": 2.931522884232705e-07, + "logits/chosen": -3.8819375038146973, + "logits/rejected": -3.7814290523529053, + "logps/chosen": -371.606689453125, + "logps/rejected": -302.6039733886719, + "loss": 0.311, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41315630078315735, + "rewards/margins": 2.109727382659912, + "rewards/rejected": -1.6965712308883667, + "step": 326 + }, + { + "epoch": 0.04, + "learning_rate": 2.9311717195364626e-07, + "logits/chosen": -2.563169002532959, + "logits/rejected": -2.597715377807617, + "logps/chosen": -199.35897827148438, + "logps/rejected": -175.29971313476562, + "loss": 0.6096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01657724380493164, + "rewards/margins": 0.35831472277641296, + "rewards/rejected": -0.374891996383667, + "step": 327 + }, + { + "epoch": 0.04, + "learning_rate": 2.93082055484022e-07, + "logits/chosen": -2.8802742958068848, + "logits/rejected": -3.2123565673828125, + "logps/chosen": -223.31463623046875, + "logps/rejected": -155.9239501953125, + "loss": 0.5391, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2656763195991516, + "rewards/margins": 0.6253472566604614, + "rewards/rejected": -0.3596709370613098, + "step": 328 + }, + { + "epoch": 0.04, + "learning_rate": 2.930469390143977e-07, + "logits/chosen": -3.3330438137054443, + "logits/rejected": -3.0752158164978027, + "logps/chosen": -232.72360229492188, + "logps/rejected": -206.1331024169922, + "loss": 1.0731, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4577289819717407, + "rewards/margins": -0.13812890648841858, + "rewards/rejected": -0.31960004568099976, + "step": 329 + }, + { + "epoch": 0.04, + "learning_rate": 2.9301182254477347e-07, + "logits/chosen": -3.23341703414917, + "logits/rejected": -3.2092065811157227, + "logps/chosen": -385.8639831542969, + "logps/rejected": -205.04293823242188, + "loss": 0.6108, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08751187473535538, + "rewards/margins": 0.5633958578109741, + "rewards/rejected": -0.6509077548980713, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 2.929767060751492e-07, + "logits/chosen": -3.0862948894500732, + "logits/rejected": -3.3389623165130615, + "logps/chosen": -256.924560546875, + "logps/rejected": -216.75375366210938, + "loss": 0.5746, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1574631929397583, + "rewards/margins": 0.3690336346626282, + "rewards/rejected": -0.5264968872070312, + "step": 331 + }, + { + "epoch": 0.04, + "learning_rate": 2.92941589605525e-07, + "logits/chosen": -3.5357768535614014, + "logits/rejected": -3.564411163330078, + "logps/chosen": -222.9327392578125, + "logps/rejected": -239.39614868164062, + "loss": 0.5261, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.033870432525873184, + "rewards/margins": 0.4337180554866791, + "rewards/rejected": -0.3998476266860962, + "step": 332 + }, + { + "epoch": 0.04, + "learning_rate": 2.9290647313590073e-07, + "logits/chosen": -2.92166805267334, + "logits/rejected": -2.8673064708709717, + "logps/chosen": -154.33761596679688, + "logps/rejected": -151.41603088378906, + "loss": 0.6343, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.17431049048900604, + "rewards/margins": 0.2438768744468689, + "rewards/rejected": -0.41818737983703613, + "step": 333 + }, + { + "epoch": 0.04, + "learning_rate": 2.928713566662765e-07, + "logits/chosen": -2.942204236984253, + "logits/rejected": -2.9921658039093018, + "logps/chosen": -134.6187286376953, + "logps/rejected": -168.8784942626953, + "loss": 0.539, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11122285574674606, + "rewards/margins": 0.698149561882019, + "rewards/rejected": -0.5869267582893372, + "step": 334 + }, + { + "epoch": 0.04, + "learning_rate": 2.9283624019665224e-07, + "logits/chosen": -3.1781651973724365, + "logits/rejected": -3.4687681198120117, + "logps/chosen": -269.4075012207031, + "logps/rejected": -220.25637817382812, + "loss": 0.5999, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27838900685310364, + "rewards/margins": 0.41326144337654114, + "rewards/rejected": -0.6916504502296448, + "step": 335 + }, + { + "epoch": 0.04, + "learning_rate": 2.92801123727028e-07, + "logits/chosen": -3.096864700317383, + "logits/rejected": -3.032203197479248, + "logps/chosen": -232.36148071289062, + "logps/rejected": -293.11767578125, + "loss": 0.3994, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10393879562616348, + "rewards/margins": 1.0696824789047241, + "rewards/rejected": -0.9657436609268188, + "step": 336 + }, + { + "epoch": 0.04, + "learning_rate": 2.927660072574037e-07, + "logits/chosen": -2.461975574493408, + "logits/rejected": -2.61027193069458, + "logps/chosen": -292.6427307128906, + "logps/rejected": -256.63531494140625, + "loss": 0.4423, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2760024070739746, + "rewards/margins": 0.7060391902923584, + "rewards/rejected": -0.4300367832183838, + "step": 337 + }, + { + "epoch": 0.04, + "learning_rate": 2.9273089078777945e-07, + "logits/chosen": -2.866392135620117, + "logits/rejected": -2.8920254707336426, + "logps/chosen": -339.0001220703125, + "logps/rejected": -342.10589599609375, + "loss": 0.5154, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14709237217903137, + "rewards/margins": 0.6183605194091797, + "rewards/rejected": -0.4712681174278259, + "step": 338 + }, + { + "epoch": 0.04, + "learning_rate": 2.926957743181552e-07, + "logits/chosen": -3.3417139053344727, + "logits/rejected": -3.105760097503662, + "logps/chosen": -403.8670654296875, + "logps/rejected": -291.6964111328125, + "loss": 0.3191, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17001502215862274, + "rewards/margins": 1.4750672578811646, + "rewards/rejected": -1.3050522804260254, + "step": 339 + }, + { + "epoch": 0.04, + "learning_rate": 2.9266065784853095e-07, + "logits/chosen": -3.247652769088745, + "logits/rejected": -3.1947481632232666, + "logps/chosen": -252.56202697753906, + "logps/rejected": -177.67776489257812, + "loss": 0.7384, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3070858418941498, + "rewards/margins": -0.05510035157203674, + "rewards/rejected": -0.25198546051979065, + "step": 340 + }, + { + "epoch": 0.04, + "learning_rate": 2.926255413789067e-07, + "logits/chosen": -3.2172679901123047, + "logits/rejected": -3.0673675537109375, + "logps/chosen": -333.4479675292969, + "logps/rejected": -195.57716369628906, + "loss": 0.7629, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3991571366786957, + "rewards/margins": 0.2618064284324646, + "rewards/rejected": -0.6609635353088379, + "step": 341 + }, + { + "epoch": 0.04, + "learning_rate": 2.925904249092824e-07, + "logits/chosen": -2.3871593475341797, + "logits/rejected": -2.2753219604492188, + "logps/chosen": -235.65164184570312, + "logps/rejected": -251.76060485839844, + "loss": 0.5954, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07159613817930222, + "rewards/margins": 0.6409983038902283, + "rewards/rejected": -0.5694020986557007, + "step": 342 + }, + { + "epoch": 0.04, + "learning_rate": 2.9255530843965816e-07, + "logits/chosen": -2.857883930206299, + "logits/rejected": -2.715831756591797, + "logps/chosen": -177.47262573242188, + "logps/rejected": -256.6138916015625, + "loss": 0.5161, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012427326291799545, + "rewards/margins": 0.903405487537384, + "rewards/rejected": -0.9158328175544739, + "step": 343 + }, + { + "epoch": 0.04, + "learning_rate": 2.9252019197003397e-07, + "logits/chosen": -3.578307628631592, + "logits/rejected": -3.40179443359375, + "logps/chosen": -352.6904296875, + "logps/rejected": -267.75433349609375, + "loss": 0.3813, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.032896608114242554, + "rewards/margins": 1.2026045322418213, + "rewards/rejected": -1.1697078943252563, + "step": 344 + }, + { + "epoch": 0.04, + "learning_rate": 2.9248507550040967e-07, + "logits/chosen": -3.159625291824341, + "logits/rejected": -2.9712705612182617, + "logps/chosen": -348.6385803222656, + "logps/rejected": -334.8789367675781, + "loss": 0.6667, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41008687019348145, + "rewards/margins": 0.26367512345314026, + "rewards/rejected": -0.6737619638442993, + "step": 345 + }, + { + "epoch": 0.04, + "learning_rate": 2.924499590307854e-07, + "logits/chosen": -3.3857009410858154, + "logits/rejected": -3.3708343505859375, + "logps/chosen": -460.67333984375, + "logps/rejected": -311.1851806640625, + "loss": 0.5569, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07098651677370071, + "rewards/margins": 0.44480839371681213, + "rewards/rejected": -0.3738219141960144, + "step": 346 + }, + { + "epoch": 0.04, + "learning_rate": 2.924148425611612e-07, + "logits/chosen": -3.080620527267456, + "logits/rejected": -2.8345484733581543, + "logps/chosen": -203.33914184570312, + "logps/rejected": -228.475830078125, + "loss": 0.6748, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16161608695983887, + "rewards/margins": 0.23351091146469116, + "rewards/rejected": -0.39512699842453003, + "step": 347 + }, + { + "epoch": 0.04, + "learning_rate": 2.9237972609153693e-07, + "logits/chosen": -2.760406017303467, + "logits/rejected": -2.6120376586914062, + "logps/chosen": -194.50914001464844, + "logps/rejected": -211.8148956298828, + "loss": 0.5653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15548710525035858, + "rewards/margins": 0.4855707287788391, + "rewards/rejected": -0.6410577893257141, + "step": 348 + }, + { + "epoch": 0.04, + "learning_rate": 2.923446096219127e-07, + "logits/chosen": -3.844106912612915, + "logits/rejected": -3.606504440307617, + "logps/chosen": -288.6073913574219, + "logps/rejected": -164.28311157226562, + "loss": 0.5344, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09621953964233398, + "rewards/margins": 0.638676106929779, + "rewards/rejected": -0.5424565672874451, + "step": 349 + }, + { + "epoch": 0.04, + "learning_rate": 2.923094931522884e-07, + "logits/chosen": -3.577012062072754, + "logits/rejected": -3.207430601119995, + "logps/chosen": -405.03460693359375, + "logps/rejected": -258.4103698730469, + "loss": 0.7757, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5443931221961975, + "rewards/margins": 0.13326036930084229, + "rewards/rejected": -0.677653431892395, + "step": 350 + }, + { + "epoch": 0.04, + "learning_rate": 2.9227437668266414e-07, + "logits/chosen": -3.124886989593506, + "logits/rejected": -3.310025215148926, + "logps/chosen": -461.0048828125, + "logps/rejected": -202.38067626953125, + "loss": 0.5395, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.037668608129024506, + "rewards/margins": 0.6605635285377502, + "rewards/rejected": -0.6228950023651123, + "step": 351 + }, + { + "epoch": 0.04, + "learning_rate": 2.922392602130399e-07, + "logits/chosen": -3.377054452896118, + "logits/rejected": -3.477799892425537, + "logps/chosen": -201.81796264648438, + "logps/rejected": -220.58404541015625, + "loss": 0.4115, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025179080665111542, + "rewards/margins": 1.3641031980514526, + "rewards/rejected": -1.3389240503311157, + "step": 352 + }, + { + "epoch": 0.04, + "learning_rate": 2.9220414374341565e-07, + "logits/chosen": -3.078449010848999, + "logits/rejected": -2.9485878944396973, + "logps/chosen": -204.41680908203125, + "logps/rejected": -214.7860107421875, + "loss": 0.5579, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08494075387716293, + "rewards/margins": 0.5809195041656494, + "rewards/rejected": -0.6658602952957153, + "step": 353 + }, + { + "epoch": 0.04, + "learning_rate": 2.921690272737914e-07, + "logits/chosen": -3.243419647216797, + "logits/rejected": -3.274054527282715, + "logps/chosen": -169.3918914794922, + "logps/rejected": -187.13702392578125, + "loss": 0.4455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22083498537540436, + "rewards/margins": 0.7169981598854065, + "rewards/rejected": -0.9378331899642944, + "step": 354 + }, + { + "epoch": 0.04, + "learning_rate": 2.921339108041671e-07, + "logits/chosen": -3.3823444843292236, + "logits/rejected": -3.5137081146240234, + "logps/chosen": -221.2010040283203, + "logps/rejected": -229.50128173828125, + "loss": 0.7663, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24071770906448364, + "rewards/margins": 0.40651610493659973, + "rewards/rejected": -0.647233784198761, + "step": 355 + }, + { + "epoch": 0.04, + "learning_rate": 2.920987943345429e-07, + "logits/chosen": -3.289555311203003, + "logits/rejected": -3.1732425689697266, + "logps/chosen": -132.65480041503906, + "logps/rejected": -264.6292724609375, + "loss": 0.3568, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07612219452857971, + "rewards/margins": 1.1061369180679321, + "rewards/rejected": -1.0300146341323853, + "step": 356 + }, + { + "epoch": 0.04, + "learning_rate": 2.9206367786491866e-07, + "logits/chosen": -3.3427963256835938, + "logits/rejected": -3.0530571937561035, + "logps/chosen": -316.4560852050781, + "logps/rejected": -277.9018249511719, + "loss": 0.6342, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07268981635570526, + "rewards/margins": 0.29800957441329956, + "rewards/rejected": -0.37069937586784363, + "step": 357 + }, + { + "epoch": 0.04, + "learning_rate": 2.9202856139529436e-07, + "logits/chosen": -3.4742090702056885, + "logits/rejected": -3.632537364959717, + "logps/chosen": -139.5057373046875, + "logps/rejected": -161.68316650390625, + "loss": 0.5034, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22021833062171936, + "rewards/margins": 0.7397069931030273, + "rewards/rejected": -0.9599252939224243, + "step": 358 + }, + { + "epoch": 0.04, + "learning_rate": 2.919934449256701e-07, + "logits/chosen": -2.5446934700012207, + "logits/rejected": -2.332984685897827, + "logps/chosen": -389.1212463378906, + "logps/rejected": -364.28302001953125, + "loss": 0.302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3573048710823059, + "rewards/margins": 1.635549783706665, + "rewards/rejected": -1.278244972229004, + "step": 359 + }, + { + "epoch": 0.04, + "learning_rate": 2.9195832845604587e-07, + "logits/chosen": -3.7903034687042236, + "logits/rejected": -3.8577733039855957, + "logps/chosen": -373.8998718261719, + "logps/rejected": -301.3363037109375, + "loss": 0.5067, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1959686279296875, + "rewards/margins": 0.7763450741767883, + "rewards/rejected": -0.9723137617111206, + "step": 360 + }, + { + "epoch": 0.04, + "learning_rate": 2.919232119864216e-07, + "logits/chosen": -2.3711633682250977, + "logits/rejected": -2.368687868118286, + "logps/chosen": -531.5543212890625, + "logps/rejected": -358.6279602050781, + "loss": 0.5601, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21405896544456482, + "rewards/margins": 0.38602346181869507, + "rewards/rejected": -0.6000823974609375, + "step": 361 + }, + { + "epoch": 0.04, + "learning_rate": 2.918880955167974e-07, + "logits/chosen": -3.5251173973083496, + "logits/rejected": -3.3067636489868164, + "logps/chosen": -172.3519287109375, + "logps/rejected": -227.97776794433594, + "loss": 0.7153, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42045795917510986, + "rewards/margins": 0.45866432785987854, + "rewards/rejected": -0.879122257232666, + "step": 362 + }, + { + "epoch": 0.04, + "learning_rate": 2.918529790471731e-07, + "logits/chosen": -2.5726752281188965, + "logits/rejected": -2.690509557723999, + "logps/chosen": -284.4325866699219, + "logps/rejected": -220.40345764160156, + "loss": 0.5036, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11083866655826569, + "rewards/margins": 0.5766516923904419, + "rewards/rejected": -0.4658130705356598, + "step": 363 + }, + { + "epoch": 0.04, + "learning_rate": 2.9181786257754883e-07, + "logits/chosen": -2.8000705242156982, + "logits/rejected": -3.038846254348755, + "logps/chosen": -389.9476623535156, + "logps/rejected": -282.806640625, + "loss": 0.412, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.004527613520622253, + "rewards/margins": 0.9298328161239624, + "rewards/rejected": -0.925305187702179, + "step": 364 + }, + { + "epoch": 0.04, + "learning_rate": 2.9178274610792464e-07, + "logits/chosen": -2.6487743854522705, + "logits/rejected": -2.543379306793213, + "logps/chosen": -179.88192749023438, + "logps/rejected": -157.98568725585938, + "loss": 0.7515, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06328187882900238, + "rewards/margins": -0.03620663285255432, + "rewards/rejected": -0.027075227349996567, + "step": 365 + }, + { + "epoch": 0.04, + "learning_rate": 2.9174762963830034e-07, + "logits/chosen": -2.91072416305542, + "logits/rejected": -2.879549980163574, + "logps/chosen": -298.82421875, + "logps/rejected": -288.4239501953125, + "loss": 0.5432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29703912138938904, + "rewards/margins": 0.5171013474464417, + "rewards/rejected": -0.8141404390335083, + "step": 366 + }, + { + "epoch": 0.04, + "learning_rate": 2.917125131686761e-07, + "logits/chosen": -2.7178382873535156, + "logits/rejected": -2.916861057281494, + "logps/chosen": -306.9468688964844, + "logps/rejected": -274.46038818359375, + "loss": 0.5149, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25449931621551514, + "rewards/margins": 0.41928577423095703, + "rewards/rejected": -0.1647864282131195, + "step": 367 + }, + { + "epoch": 0.04, + "learning_rate": 2.9167739669905185e-07, + "logits/chosen": -2.8936867713928223, + "logits/rejected": -2.91463565826416, + "logps/chosen": -209.67076110839844, + "logps/rejected": -203.77996826171875, + "loss": 0.5007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.320660263299942, + "rewards/margins": 0.5525835156440735, + "rewards/rejected": -0.8732438087463379, + "step": 368 + }, + { + "epoch": 0.04, + "learning_rate": 2.916422802294276e-07, + "logits/chosen": -3.2346396446228027, + "logits/rejected": -3.1034178733825684, + "logps/chosen": -275.64398193359375, + "logps/rejected": -348.3328552246094, + "loss": 0.4424, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2597840428352356, + "rewards/margins": 0.8506712913513184, + "rewards/rejected": -0.590887188911438, + "step": 369 + }, + { + "epoch": 0.04, + "learning_rate": 2.9160716375980336e-07, + "logits/chosen": -3.795283317565918, + "logits/rejected": -3.494300365447998, + "logps/chosen": -302.0039978027344, + "logps/rejected": -218.86351013183594, + "loss": 0.7233, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3841859698295593, + "rewards/margins": 0.42547863721847534, + "rewards/rejected": -0.8096646070480347, + "step": 370 + }, + { + "epoch": 0.04, + "learning_rate": 2.9157204729017906e-07, + "logits/chosen": -2.6999711990356445, + "logits/rejected": -3.0116143226623535, + "logps/chosen": -181.87298583984375, + "logps/rejected": -226.5128631591797, + "loss": 0.3867, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10978382080793381, + "rewards/margins": 0.9570813179016113, + "rewards/rejected": -1.066865086555481, + "step": 371 + }, + { + "epoch": 0.04, + "learning_rate": 2.915369308205548e-07, + "logits/chosen": -2.8200936317443848, + "logits/rejected": -2.8534929752349854, + "logps/chosen": -332.4095458984375, + "logps/rejected": -368.05645751953125, + "loss": 0.5555, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07119980454444885, + "rewards/margins": 0.6547142267227173, + "rewards/rejected": -0.7259140014648438, + "step": 372 + }, + { + "epoch": 0.04, + "learning_rate": 2.9150181435093057e-07, + "logits/chosen": -2.7920491695404053, + "logits/rejected": -2.6613986492156982, + "logps/chosen": -183.70977783203125, + "logps/rejected": -357.7785949707031, + "loss": 0.4618, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13575685024261475, + "rewards/margins": 0.8295263051986694, + "rewards/rejected": -0.6937694549560547, + "step": 373 + }, + { + "epoch": 0.04, + "learning_rate": 2.914666978813063e-07, + "logits/chosen": -2.8138973712921143, + "logits/rejected": -2.895423412322998, + "logps/chosen": -274.2352294921875, + "logps/rejected": -232.97042846679688, + "loss": 0.5258, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15725061297416687, + "rewards/margins": 0.6544475555419922, + "rewards/rejected": -0.4971969723701477, + "step": 374 + }, + { + "epoch": 0.04, + "learning_rate": 2.914315814116821e-07, + "logits/chosen": -3.503725290298462, + "logits/rejected": -3.6358609199523926, + "logps/chosen": -236.18182373046875, + "logps/rejected": -246.78253173828125, + "loss": 0.4831, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05912458151578903, + "rewards/margins": 0.8933229446411133, + "rewards/rejected": -0.9524475336074829, + "step": 375 + }, + { + "epoch": 0.04, + "learning_rate": 2.913964649420578e-07, + "logits/chosen": -3.4336366653442383, + "logits/rejected": -3.3956403732299805, + "logps/chosen": -417.1386413574219, + "logps/rejected": -328.87054443359375, + "loss": 0.385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12791182100772858, + "rewards/margins": 0.9718395471572876, + "rewards/rejected": -0.8439276814460754, + "step": 376 + }, + { + "epoch": 0.04, + "learning_rate": 2.9136134847243353e-07, + "logits/chosen": -3.1968047618865967, + "logits/rejected": -3.259679079055786, + "logps/chosen": -182.16749572753906, + "logps/rejected": -164.5848388671875, + "loss": 0.7117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05847674608230591, + "rewards/margins": 0.059625424444675446, + "rewards/rejected": -0.11810217797756195, + "step": 377 + }, + { + "epoch": 0.04, + "learning_rate": 2.9132623200280934e-07, + "logits/chosen": -2.565866231918335, + "logits/rejected": -2.755876302719116, + "logps/chosen": -429.2481994628906, + "logps/rejected": -308.80364990234375, + "loss": 0.3832, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3412594795227051, + "rewards/margins": 1.153958797454834, + "rewards/rejected": -0.8126993179321289, + "step": 378 + }, + { + "epoch": 0.04, + "learning_rate": 2.9129111553318504e-07, + "logits/chosen": -2.2944064140319824, + "logits/rejected": -2.381772756576538, + "logps/chosen": -295.29412841796875, + "logps/rejected": -339.4980163574219, + "loss": 0.8135, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3978363573551178, + "rewards/margins": -0.06217159330844879, + "rewards/rejected": -0.3356647491455078, + "step": 379 + }, + { + "epoch": 0.04, + "learning_rate": 2.912559990635608e-07, + "logits/chosen": -2.888493537902832, + "logits/rejected": -2.8989343643188477, + "logps/chosen": -201.9176788330078, + "logps/rejected": -308.35211181640625, + "loss": 0.5701, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16608989238739014, + "rewards/margins": 0.47484633326530457, + "rewards/rejected": -0.30875641107559204, + "step": 380 + }, + { + "epoch": 0.04, + "learning_rate": 2.9122088259393654e-07, + "logits/chosen": -3.1463208198547363, + "logits/rejected": -3.0799243450164795, + "logps/chosen": -157.54348754882812, + "logps/rejected": -202.6800537109375, + "loss": 0.6652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4167552590370178, + "rewards/margins": 0.3437081575393677, + "rewards/rejected": -0.7604634761810303, + "step": 381 + }, + { + "epoch": 0.04, + "learning_rate": 2.911857661243123e-07, + "logits/chosen": -3.436253547668457, + "logits/rejected": -3.355146884918213, + "logps/chosen": -207.0049285888672, + "logps/rejected": -244.63433837890625, + "loss": 0.3903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18851184844970703, + "rewards/margins": 1.5959644317626953, + "rewards/rejected": -1.7844762802124023, + "step": 382 + }, + { + "epoch": 0.04, + "learning_rate": 2.9115064965468805e-07, + "logits/chosen": -3.5673012733459473, + "logits/rejected": -3.224808692932129, + "logps/chosen": -281.155517578125, + "logps/rejected": -214.9539794921875, + "loss": 0.581, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10351411998271942, + "rewards/margins": 0.37552356719970703, + "rewards/rejected": -0.47903767228126526, + "step": 383 + }, + { + "epoch": 0.04, + "learning_rate": 2.9111553318506375e-07, + "logits/chosen": -2.507352352142334, + "logits/rejected": -2.582430839538574, + "logps/chosen": -398.6578674316406, + "logps/rejected": -409.44476318359375, + "loss": 0.4568, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1252782791852951, + "rewards/margins": 1.063739538192749, + "rewards/rejected": -0.9384613037109375, + "step": 384 + }, + { + "epoch": 0.04, + "learning_rate": 2.910804167154395e-07, + "logits/chosen": -2.8345272541046143, + "logits/rejected": -3.0449700355529785, + "logps/chosen": -283.5218505859375, + "logps/rejected": -263.2257995605469, + "loss": 0.6351, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06995394825935364, + "rewards/margins": 0.15204322338104248, + "rewards/rejected": -0.08208927512168884, + "step": 385 + }, + { + "epoch": 0.04, + "learning_rate": 2.9104530024581526e-07, + "logits/chosen": -2.6289825439453125, + "logits/rejected": -2.3539299964904785, + "logps/chosen": -276.59149169921875, + "logps/rejected": -134.65951538085938, + "loss": 0.6826, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18538042902946472, + "rewards/margins": 0.29335954785346985, + "rewards/rejected": -0.10797910392284393, + "step": 386 + }, + { + "epoch": 0.04, + "learning_rate": 2.91010183776191e-07, + "logits/chosen": -2.7838029861450195, + "logits/rejected": -2.8201916217803955, + "logps/chosen": -176.77011108398438, + "logps/rejected": -294.69232177734375, + "loss": 0.3345, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13179786503314972, + "rewards/margins": 1.55593740940094, + "rewards/rejected": -1.4241396188735962, + "step": 387 + }, + { + "epoch": 0.04, + "learning_rate": 2.9097506730656677e-07, + "logits/chosen": -3.1612627506256104, + "logits/rejected": -3.084688663482666, + "logps/chosen": -357.0299377441406, + "logps/rejected": -191.01991271972656, + "loss": 0.5834, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10176057368516922, + "rewards/margins": 0.2801131308078766, + "rewards/rejected": -0.3818736970424652, + "step": 388 + }, + { + "epoch": 0.04, + "learning_rate": 2.909399508369425e-07, + "logits/chosen": -3.6793336868286133, + "logits/rejected": -3.6861114501953125, + "logps/chosen": -211.15023803710938, + "logps/rejected": -233.69847106933594, + "loss": 0.6845, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.148991197347641, + "rewards/margins": 0.3549796938896179, + "rewards/rejected": -0.5039708614349365, + "step": 389 + }, + { + "epoch": 0.04, + "learning_rate": 2.909048343673183e-07, + "logits/chosen": -3.301929235458374, + "logits/rejected": -3.20796275138855, + "logps/chosen": -192.09274291992188, + "logps/rejected": -276.6363830566406, + "loss": 0.6737, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17153462767601013, + "rewards/margins": 0.20755133032798767, + "rewards/rejected": -0.3790859878063202, + "step": 390 + }, + { + "epoch": 0.05, + "learning_rate": 2.9086971789769403e-07, + "logits/chosen": -2.750441551208496, + "logits/rejected": -2.878647804260254, + "logps/chosen": -289.1087646484375, + "logps/rejected": -254.92759704589844, + "loss": 0.4784, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03340554237365723, + "rewards/margins": 1.09812331199646, + "rewards/rejected": -1.1315288543701172, + "step": 391 + }, + { + "epoch": 0.05, + "learning_rate": 2.9083460142806973e-07, + "logits/chosen": -3.4067602157592773, + "logits/rejected": -3.2986998558044434, + "logps/chosen": -318.7747802734375, + "logps/rejected": -295.73565673828125, + "loss": 0.5936, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47447454929351807, + "rewards/margins": 0.43653565645217896, + "rewards/rejected": -0.911010205745697, + "step": 392 + }, + { + "epoch": 0.05, + "learning_rate": 2.907994849584455e-07, + "logits/chosen": -3.5612130165100098, + "logits/rejected": -3.571927070617676, + "logps/chosen": -220.2811279296875, + "logps/rejected": -253.1396484375, + "loss": 0.4306, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21127064526081085, + "rewards/margins": 1.0022602081298828, + "rewards/rejected": -0.7909895181655884, + "step": 393 + }, + { + "epoch": 0.05, + "learning_rate": 2.9076436848882124e-07, + "logits/chosen": -2.8417909145355225, + "logits/rejected": -2.7929465770721436, + "logps/chosen": -73.14635467529297, + "logps/rejected": -93.77976989746094, + "loss": 0.5909, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2049848735332489, + "rewards/margins": 0.4261936545372009, + "rewards/rejected": -0.22120878100395203, + "step": 394 + }, + { + "epoch": 0.05, + "learning_rate": 2.90729252019197e-07, + "logits/chosen": -3.475584030151367, + "logits/rejected": -3.029663562774658, + "logps/chosen": -192.2339324951172, + "logps/rejected": -219.62197875976562, + "loss": 0.6004, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26847776770591736, + "rewards/margins": 0.8090030550956726, + "rewards/rejected": -1.0774807929992676, + "step": 395 + }, + { + "epoch": 0.05, + "learning_rate": 2.9069413554957275e-07, + "logits/chosen": -3.025455951690674, + "logits/rejected": -2.979712963104248, + "logps/chosen": -227.0043487548828, + "logps/rejected": -263.66632080078125, + "loss": 0.5774, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20933493971824646, + "rewards/margins": 0.36125776171684265, + "rewards/rejected": -0.5705927014350891, + "step": 396 + }, + { + "epoch": 0.05, + "learning_rate": 2.906590190799485e-07, + "logits/chosen": -3.744821071624756, + "logits/rejected": -4.103786468505859, + "logps/chosen": -135.1412811279297, + "logps/rejected": -191.207763671875, + "loss": 0.462, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15522538125514984, + "rewards/margins": 0.7042402029037476, + "rewards/rejected": -0.8594655990600586, + "step": 397 + }, + { + "epoch": 0.05, + "learning_rate": 2.906239026103242e-07, + "logits/chosen": -3.234471082687378, + "logits/rejected": -3.2841789722442627, + "logps/chosen": -265.807861328125, + "logps/rejected": -220.61085510253906, + "loss": 0.6239, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.11368418484926224, + "rewards/margins": 0.6250712871551514, + "rewards/rejected": -0.5113871097564697, + "step": 398 + }, + { + "epoch": 0.05, + "learning_rate": 2.905887861407e-07, + "logits/chosen": -2.9094643592834473, + "logits/rejected": -3.1217610836029053, + "logps/chosen": -253.38064575195312, + "logps/rejected": -321.3524169921875, + "loss": 0.6663, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1420055478811264, + "rewards/margins": 0.3870762884616852, + "rewards/rejected": -0.5290818214416504, + "step": 399 + }, + { + "epoch": 0.05, + "learning_rate": 2.905536696710757e-07, + "logits/chosen": -2.2691457271575928, + "logits/rejected": -2.476783037185669, + "logps/chosen": -314.0921325683594, + "logps/rejected": -231.2151336669922, + "loss": 0.728, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17998142540454865, + "rewards/margins": 0.1350838989019394, + "rewards/rejected": -0.31506532430648804, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 2.9051855320145146e-07, + "logits/chosen": -3.514326572418213, + "logits/rejected": -3.358855724334717, + "logps/chosen": -347.7041015625, + "logps/rejected": -287.153076171875, + "loss": 0.3041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10987290740013123, + "rewards/margins": 1.2267987728118896, + "rewards/rejected": -1.116925835609436, + "step": 401 + }, + { + "epoch": 0.05, + "learning_rate": 2.904834367318272e-07, + "logits/chosen": -3.029038190841675, + "logits/rejected": -3.052837371826172, + "logps/chosen": -172.21746826171875, + "logps/rejected": -153.2972869873047, + "loss": 0.764, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2717152535915375, + "rewards/margins": -0.08527862280607224, + "rewards/rejected": -0.18643663823604584, + "step": 402 + }, + { + "epoch": 0.05, + "learning_rate": 2.9044832026220297e-07, + "logits/chosen": -3.0615785121917725, + "logits/rejected": -3.109618663787842, + "logps/chosen": -175.44223022460938, + "logps/rejected": -245.84722900390625, + "loss": 0.5672, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17322120070457458, + "rewards/margins": 0.8645422458648682, + "rewards/rejected": -0.6913211345672607, + "step": 403 + }, + { + "epoch": 0.05, + "learning_rate": 2.904132037925787e-07, + "logits/chosen": -3.5077638626098633, + "logits/rejected": -3.146940231323242, + "logps/chosen": -317.1956787109375, + "logps/rejected": -176.7849578857422, + "loss": 0.3104, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2571291923522949, + "rewards/margins": 1.1085937023162842, + "rewards/rejected": -0.8514645099639893, + "step": 404 + }, + { + "epoch": 0.05, + "learning_rate": 2.903780873229545e-07, + "logits/chosen": -3.766819953918457, + "logits/rejected": -3.6343953609466553, + "logps/chosen": -375.5865783691406, + "logps/rejected": -191.43911743164062, + "loss": 0.3751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11596641689538956, + "rewards/margins": 1.363093614578247, + "rewards/rejected": -1.4790599346160889, + "step": 405 + }, + { + "epoch": 0.05, + "learning_rate": 2.903429708533302e-07, + "logits/chosen": -2.3069400787353516, + "logits/rejected": -2.37493634223938, + "logps/chosen": -233.0413818359375, + "logps/rejected": -187.205322265625, + "loss": 0.5951, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021038254722952843, + "rewards/margins": 0.49389857053756714, + "rewards/rejected": -0.5149368047714233, + "step": 406 + }, + { + "epoch": 0.05, + "learning_rate": 2.9030785438370593e-07, + "logits/chosen": -3.210749626159668, + "logits/rejected": -3.606515884399414, + "logps/chosen": -127.40187072753906, + "logps/rejected": -176.50196838378906, + "loss": 0.546, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005973998457193375, + "rewards/margins": 0.5208548903465271, + "rewards/rejected": -0.514880895614624, + "step": 407 + }, + { + "epoch": 0.05, + "learning_rate": 2.902727379140817e-07, + "logits/chosen": -3.272876501083374, + "logits/rejected": -3.469956874847412, + "logps/chosen": -198.27767944335938, + "logps/rejected": -197.46701049804688, + "loss": 0.4784, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10195407271385193, + "rewards/margins": 0.6237679719924927, + "rewards/rejected": -0.7257220149040222, + "step": 408 + }, + { + "epoch": 0.05, + "learning_rate": 2.9023762144445744e-07, + "logits/chosen": -3.4800057411193848, + "logits/rejected": -3.683103561401367, + "logps/chosen": -237.06924438476562, + "logps/rejected": -269.08050537109375, + "loss": 0.4867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04493027180433273, + "rewards/margins": 0.7818341255187988, + "rewards/rejected": -0.8267643451690674, + "step": 409 + }, + { + "epoch": 0.05, + "learning_rate": 2.902025049748332e-07, + "logits/chosen": -3.3969452381134033, + "logits/rejected": -3.2532665729522705, + "logps/chosen": -349.688232421875, + "logps/rejected": -176.83021545410156, + "loss": 0.6842, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3673829436302185, + "rewards/margins": 0.11574295163154602, + "rewards/rejected": -0.48312586545944214, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 2.901673885052089e-07, + "logits/chosen": -2.668060302734375, + "logits/rejected": -2.755876064300537, + "logps/chosen": -275.920654296875, + "logps/rejected": -223.3485870361328, + "loss": 0.4811, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018153982236981392, + "rewards/margins": 0.6309922337532043, + "rewards/rejected": -0.6128382682800293, + "step": 411 + }, + { + "epoch": 0.05, + "learning_rate": 2.901322720355847e-07, + "logits/chosen": -2.6855573654174805, + "logits/rejected": -2.578275203704834, + "logps/chosen": -350.96002197265625, + "logps/rejected": -258.6368103027344, + "loss": 0.4263, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19422507286071777, + "rewards/margins": 0.7949941158294678, + "rewards/rejected": -0.60076904296875, + "step": 412 + }, + { + "epoch": 0.05, + "learning_rate": 2.9009715556596046e-07, + "logits/chosen": -2.865835189819336, + "logits/rejected": -2.7778525352478027, + "logps/chosen": -240.86866760253906, + "logps/rejected": -297.23870849609375, + "loss": 0.4525, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22952251136302948, + "rewards/margins": 0.7440728545188904, + "rewards/rejected": -0.5145503878593445, + "step": 413 + }, + { + "epoch": 0.05, + "learning_rate": 2.9006203909633616e-07, + "logits/chosen": -2.6799004077911377, + "logits/rejected": -2.5991106033325195, + "logps/chosen": -380.75042724609375, + "logps/rejected": -232.90187072753906, + "loss": 0.4246, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2754175066947937, + "rewards/margins": 0.7328726053237915, + "rewards/rejected": -0.4574550986289978, + "step": 414 + }, + { + "epoch": 0.05, + "learning_rate": 2.900269226267119e-07, + "logits/chosen": -3.2311782836914062, + "logits/rejected": -3.590186834335327, + "logps/chosen": -219.71585083007812, + "logps/rejected": -264.0310363769531, + "loss": 0.2875, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27128270268440247, + "rewards/margins": 1.4048019647598267, + "rewards/rejected": -1.1335194110870361, + "step": 415 + }, + { + "epoch": 0.05, + "learning_rate": 2.8999180615708766e-07, + "logits/chosen": -3.3896942138671875, + "logits/rejected": -3.7524070739746094, + "logps/chosen": -74.85578918457031, + "logps/rejected": -152.80722045898438, + "loss": 0.5737, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04244138300418854, + "rewards/margins": 0.539861798286438, + "rewards/rejected": -0.49742045998573303, + "step": 416 + }, + { + "epoch": 0.05, + "learning_rate": 2.899566896874634e-07, + "logits/chosen": -3.9499166011810303, + "logits/rejected": -3.4246063232421875, + "logps/chosen": -419.5428466796875, + "logps/rejected": -234.18382263183594, + "loss": 0.465, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12455032020807266, + "rewards/margins": 0.9598172307014465, + "rewards/rejected": -1.0843676328659058, + "step": 417 + }, + { + "epoch": 0.05, + "learning_rate": 2.8992157321783917e-07, + "logits/chosen": -2.850705623626709, + "logits/rejected": -2.933830738067627, + "logps/chosen": -317.0789794921875, + "logps/rejected": -246.9700927734375, + "loss": 0.4698, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1677778661251068, + "rewards/margins": 0.6241986751556396, + "rewards/rejected": -0.45642077922821045, + "step": 418 + }, + { + "epoch": 0.05, + "learning_rate": 2.8988645674821487e-07, + "logits/chosen": -2.9594523906707764, + "logits/rejected": -2.848459005355835, + "logps/chosen": -356.214599609375, + "logps/rejected": -246.29466247558594, + "loss": 0.4705, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0382872074842453, + "rewards/margins": 0.6664814949035645, + "rewards/rejected": -0.628194272518158, + "step": 419 + }, + { + "epoch": 0.05, + "learning_rate": 2.8985134027859063e-07, + "logits/chosen": -2.657510280609131, + "logits/rejected": -2.9935638904571533, + "logps/chosen": -147.80377197265625, + "logps/rejected": -262.690185546875, + "loss": 0.6125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4110000431537628, + "rewards/margins": 0.2875770032405853, + "rewards/rejected": -0.6985770463943481, + "step": 420 + }, + { + "epoch": 0.05, + "learning_rate": 2.898162238089664e-07, + "logits/chosen": -3.2642645835876465, + "logits/rejected": -3.039949893951416, + "logps/chosen": -450.27838134765625, + "logps/rejected": -256.2910461425781, + "loss": 0.6544, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.032554350793361664, + "rewards/margins": 0.5248735547065735, + "rewards/rejected": -0.5574278831481934, + "step": 421 + }, + { + "epoch": 0.05, + "learning_rate": 2.8978110733934213e-07, + "logits/chosen": -2.406627893447876, + "logits/rejected": -2.6548261642456055, + "logps/chosen": -133.6346435546875, + "logps/rejected": -142.57772827148438, + "loss": 0.695, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24219754338264465, + "rewards/margins": 0.5127408504486084, + "rewards/rejected": -0.7549383640289307, + "step": 422 + }, + { + "epoch": 0.05, + "learning_rate": 2.897459908697179e-07, + "logits/chosen": -2.6494946479797363, + "logits/rejected": -2.6429615020751953, + "logps/chosen": -501.8548889160156, + "logps/rejected": -324.11907958984375, + "loss": 0.477, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027265071868896484, + "rewards/margins": 0.7837241291999817, + "rewards/rejected": -0.7564589977264404, + "step": 423 + }, + { + "epoch": 0.05, + "learning_rate": 2.8971087440009364e-07, + "logits/chosen": -2.8341546058654785, + "logits/rejected": -3.268275022506714, + "logps/chosen": -296.7918701171875, + "logps/rejected": -225.91140747070312, + "loss": 0.4892, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18758806586265564, + "rewards/margins": 1.2077127695083618, + "rewards/rejected": -1.0201246738433838, + "step": 424 + }, + { + "epoch": 0.05, + "learning_rate": 2.896757579304694e-07, + "logits/chosen": -2.8102188110351562, + "logits/rejected": -3.1215076446533203, + "logps/chosen": -279.5031433105469, + "logps/rejected": -247.31483459472656, + "loss": 0.4934, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1715620905160904, + "rewards/margins": 1.052005410194397, + "rewards/rejected": -0.8804432153701782, + "step": 425 + }, + { + "epoch": 0.05, + "learning_rate": 2.8964064146084515e-07, + "logits/chosen": -2.8398847579956055, + "logits/rejected": -2.6468255519866943, + "logps/chosen": -257.25830078125, + "logps/rejected": -268.04266357421875, + "loss": 0.5065, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03891228884458542, + "rewards/margins": 0.7750474214553833, + "rewards/rejected": -0.7361351251602173, + "step": 426 + }, + { + "epoch": 0.05, + "learning_rate": 2.8960552499122085e-07, + "logits/chosen": -2.5891106128692627, + "logits/rejected": -2.562657117843628, + "logps/chosen": -351.991943359375, + "logps/rejected": -154.7635040283203, + "loss": 0.9136, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37533143162727356, + "rewards/margins": -0.1489536166191101, + "rewards/rejected": -0.22637782990932465, + "step": 427 + }, + { + "epoch": 0.05, + "learning_rate": 2.895704085215966e-07, + "logits/chosen": -3.0758864879608154, + "logits/rejected": -3.2650647163391113, + "logps/chosen": -310.06512451171875, + "logps/rejected": -270.84613037109375, + "loss": 0.622, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24239929020404816, + "rewards/margins": 0.4100295901298523, + "rewards/rejected": -0.6524288654327393, + "step": 428 + }, + { + "epoch": 0.05, + "learning_rate": 2.8953529205197236e-07, + "logits/chosen": -3.0198135375976562, + "logits/rejected": -3.286731719970703, + "logps/chosen": -395.9908447265625, + "logps/rejected": -352.3685302734375, + "loss": 0.4475, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08217925578355789, + "rewards/margins": 1.4642651081085205, + "rewards/rejected": -1.546444296836853, + "step": 429 + }, + { + "epoch": 0.05, + "learning_rate": 2.895001755823481e-07, + "logits/chosen": -3.727076530456543, + "logits/rejected": -3.518991470336914, + "logps/chosen": -258.24822998046875, + "logps/rejected": -165.10745239257812, + "loss": 0.5922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03867786377668381, + "rewards/margins": 0.500251829624176, + "rewards/rejected": -0.5389297008514404, + "step": 430 + }, + { + "epoch": 0.05, + "learning_rate": 2.8946505911272387e-07, + "logits/chosen": -2.4576897621154785, + "logits/rejected": -2.303232192993164, + "logps/chosen": -346.661376953125, + "logps/rejected": -254.64068603515625, + "loss": 0.4757, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09284898638725281, + "rewards/margins": 0.8508399128913879, + "rewards/rejected": -0.7579909563064575, + "step": 431 + }, + { + "epoch": 0.05, + "learning_rate": 2.8942994264309957e-07, + "logits/chosen": -3.332794666290283, + "logits/rejected": -3.2342536449432373, + "logps/chosen": -187.4974365234375, + "logps/rejected": -211.42694091796875, + "loss": 0.3537, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22367991507053375, + "rewards/margins": 1.3280267715454102, + "rewards/rejected": -1.1043466329574585, + "step": 432 + }, + { + "epoch": 0.05, + "learning_rate": 2.893948261734754e-07, + "logits/chosen": -3.6305017471313477, + "logits/rejected": -3.377901077270508, + "logps/chosen": -239.5826416015625, + "logps/rejected": -153.404541015625, + "loss": 0.5964, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4212096333503723, + "rewards/margins": 0.5863312482833862, + "rewards/rejected": -1.0075409412384033, + "step": 433 + }, + { + "epoch": 0.05, + "learning_rate": 2.8935970970385113e-07, + "logits/chosen": -2.8274264335632324, + "logits/rejected": -3.0231239795684814, + "logps/chosen": -121.36795043945312, + "logps/rejected": -150.42874145507812, + "loss": 0.4203, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22381076216697693, + "rewards/margins": 0.8497442007064819, + "rewards/rejected": -0.6259334087371826, + "step": 434 + }, + { + "epoch": 0.05, + "learning_rate": 2.8932459323422683e-07, + "logits/chosen": -3.2575225830078125, + "logits/rejected": -3.0279829502105713, + "logps/chosen": -239.0912628173828, + "logps/rejected": -156.88623046875, + "loss": 0.7652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3596770167350769, + "rewards/margins": 0.052212025970220566, + "rewards/rejected": -0.4118890166282654, + "step": 435 + }, + { + "epoch": 0.05, + "learning_rate": 2.892894767646026e-07, + "logits/chosen": -3.0100252628326416, + "logits/rejected": -3.157221555709839, + "logps/chosen": -329.5337829589844, + "logps/rejected": -218.9036865234375, + "loss": 0.5606, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0907401591539383, + "rewards/margins": 0.6632818579673767, + "rewards/rejected": -0.7540220022201538, + "step": 436 + }, + { + "epoch": 0.05, + "learning_rate": 2.8925436029497834e-07, + "logits/chosen": -2.998896360397339, + "logits/rejected": -2.773792266845703, + "logps/chosen": -307.9561767578125, + "logps/rejected": -194.65496826171875, + "loss": 0.8778, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4579678475856781, + "rewards/margins": -0.12219326198101044, + "rewards/rejected": -0.33577460050582886, + "step": 437 + }, + { + "epoch": 0.05, + "learning_rate": 2.892192438253541e-07, + "logits/chosen": -2.887543201446533, + "logits/rejected": -2.7657833099365234, + "logps/chosen": -234.41851806640625, + "logps/rejected": -256.7139892578125, + "loss": 0.8364, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2570115923881531, + "rewards/margins": 0.5620640516281128, + "rewards/rejected": -0.8190757036209106, + "step": 438 + }, + { + "epoch": 0.05, + "learning_rate": 2.8918412735572984e-07, + "logits/chosen": -2.729084014892578, + "logits/rejected": -2.7433485984802246, + "logps/chosen": -117.47038269042969, + "logps/rejected": -230.05661010742188, + "loss": 0.5783, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.194748193025589, + "rewards/margins": 0.6182531118392944, + "rewards/rejected": -0.42350485920906067, + "step": 439 + }, + { + "epoch": 0.05, + "learning_rate": 2.8914901088610555e-07, + "logits/chosen": -4.01908540725708, + "logits/rejected": -4.057231903076172, + "logps/chosen": -319.78985595703125, + "logps/rejected": -238.83909606933594, + "loss": 0.5, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.362660676240921, + "rewards/margins": 1.0223525762557983, + "rewards/rejected": -1.3850133419036865, + "step": 440 + }, + { + "epoch": 0.05, + "learning_rate": 2.891138944164813e-07, + "logits/chosen": -3.1168556213378906, + "logits/rejected": -3.4155185222625732, + "logps/chosen": -65.08560180664062, + "logps/rejected": -252.98086547851562, + "loss": 0.4027, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0746397003531456, + "rewards/margins": 1.4310288429260254, + "rewards/rejected": -1.356389045715332, + "step": 441 + }, + { + "epoch": 0.05, + "learning_rate": 2.8907877794685705e-07, + "logits/chosen": -2.983868360519409, + "logits/rejected": -3.1380386352539062, + "logps/chosen": -178.57437133789062, + "logps/rejected": -239.548095703125, + "loss": 0.4207, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1182938814163208, + "rewards/margins": 1.0859735012054443, + "rewards/rejected": -0.9676795601844788, + "step": 442 + }, + { + "epoch": 0.05, + "learning_rate": 2.890436614772328e-07, + "logits/chosen": -3.280564069747925, + "logits/rejected": -2.9529080390930176, + "logps/chosen": -280.44000244140625, + "logps/rejected": -160.4457550048828, + "loss": 0.6225, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1865798830986023, + "rewards/margins": 0.62520432472229, + "rewards/rejected": -0.8117842078208923, + "step": 443 + }, + { + "epoch": 0.05, + "learning_rate": 2.8900854500760856e-07, + "logits/chosen": -3.452803611755371, + "logits/rejected": -3.4232821464538574, + "logps/chosen": -445.9339599609375, + "logps/rejected": -183.57379150390625, + "loss": 0.5432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06135844439268112, + "rewards/margins": 0.416906476020813, + "rewards/rejected": -0.4782648980617523, + "step": 444 + }, + { + "epoch": 0.05, + "learning_rate": 2.8897342853798426e-07, + "logits/chosen": -2.914935350418091, + "logits/rejected": -2.862522840499878, + "logps/chosen": -239.44798278808594, + "logps/rejected": -186.4224395751953, + "loss": 0.4462, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2786865234375, + "rewards/margins": 0.7166086435317993, + "rewards/rejected": -0.9952951669692993, + "step": 445 + }, + { + "epoch": 0.05, + "learning_rate": 2.8893831206836007e-07, + "logits/chosen": -3.292107582092285, + "logits/rejected": -3.110042095184326, + "logps/chosen": -227.95521545410156, + "logps/rejected": -254.63314819335938, + "loss": 0.5106, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44962915778160095, + "rewards/margins": 0.9801754355430603, + "rewards/rejected": -1.4298045635223389, + "step": 446 + }, + { + "epoch": 0.05, + "learning_rate": 2.889031955987358e-07, + "logits/chosen": -2.576687812805176, + "logits/rejected": -2.4736924171447754, + "logps/chosen": -399.8274230957031, + "logps/rejected": -401.70947265625, + "loss": 0.5143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04490023851394653, + "rewards/margins": 0.5428970456123352, + "rewards/rejected": -0.5877972841262817, + "step": 447 + }, + { + "epoch": 0.05, + "learning_rate": 2.888680791291115e-07, + "logits/chosen": -3.1054911613464355, + "logits/rejected": -2.8871450424194336, + "logps/chosen": -402.8165283203125, + "logps/rejected": -269.0899963378906, + "loss": 0.5287, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02926502376794815, + "rewards/margins": 0.8205345273017883, + "rewards/rejected": -0.8497995734214783, + "step": 448 + }, + { + "epoch": 0.05, + "learning_rate": 2.888329626594873e-07, + "logits/chosen": -3.2220678329467773, + "logits/rejected": -2.916114091873169, + "logps/chosen": -328.8043518066406, + "logps/rejected": -163.7899932861328, + "loss": 0.4446, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25431814789772034, + "rewards/margins": 0.7400802373886108, + "rewards/rejected": -0.4857621192932129, + "step": 449 + }, + { + "epoch": 0.05, + "learning_rate": 2.8879784618986303e-07, + "logits/chosen": -3.117187976837158, + "logits/rejected": -3.060683012008667, + "logps/chosen": -420.4107666015625, + "logps/rejected": -281.23809814453125, + "loss": 0.4774, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1414695680141449, + "rewards/margins": 1.1993730068206787, + "rewards/rejected": -1.0579032897949219, + "step": 450 + }, + { + "epoch": 0.05, + "learning_rate": 2.887627297202388e-07, + "logits/chosen": -3.2127084732055664, + "logits/rejected": -3.2453420162200928, + "logps/chosen": -305.41766357421875, + "logps/rejected": -272.09527587890625, + "loss": 0.5381, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04543359577655792, + "rewards/margins": 0.8024728298187256, + "rewards/rejected": -0.7570393085479736, + "step": 451 + }, + { + "epoch": 0.05, + "learning_rate": 2.8872761325061454e-07, + "logits/chosen": -3.012664556503296, + "logits/rejected": -3.139662742614746, + "logps/chosen": -331.30810546875, + "logps/rejected": -239.49822998046875, + "loss": 0.4507, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01095910556614399, + "rewards/margins": 1.2323007583618164, + "rewards/rejected": -1.221341609954834, + "step": 452 + }, + { + "epoch": 0.05, + "learning_rate": 2.8869249678099024e-07, + "logits/chosen": -2.527912139892578, + "logits/rejected": -2.838423252105713, + "logps/chosen": -86.77651977539062, + "logps/rejected": -169.56405639648438, + "loss": 0.747, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1546945571899414, + "rewards/margins": 0.3063422441482544, + "rewards/rejected": -0.4610367715358734, + "step": 453 + }, + { + "epoch": 0.05, + "learning_rate": 2.88657380311366e-07, + "logits/chosen": -2.7566442489624023, + "logits/rejected": -2.975444793701172, + "logps/chosen": -237.46656799316406, + "logps/rejected": -174.21327209472656, + "loss": 0.6195, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16275857388973236, + "rewards/margins": 0.29780369997024536, + "rewards/rejected": -0.4605622887611389, + "step": 454 + }, + { + "epoch": 0.05, + "learning_rate": 2.886222638417418e-07, + "logits/chosen": -2.4827194213867188, + "logits/rejected": -2.5032083988189697, + "logps/chosen": -139.6429901123047, + "logps/rejected": -260.80902099609375, + "loss": 0.5104, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16889016330242157, + "rewards/margins": 0.7242395281791687, + "rewards/rejected": -0.8931295871734619, + "step": 455 + }, + { + "epoch": 0.05, + "learning_rate": 2.885871473721175e-07, + "logits/chosen": -3.0726680755615234, + "logits/rejected": -2.7118992805480957, + "logps/chosen": -210.5627899169922, + "logps/rejected": -260.4034423828125, + "loss": 0.4671, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3428042531013489, + "rewards/margins": 1.0605111122131348, + "rewards/rejected": -0.7177067995071411, + "step": 456 + }, + { + "epoch": 0.05, + "learning_rate": 2.8855203090249325e-07, + "logits/chosen": -2.5457630157470703, + "logits/rejected": -2.798626661300659, + "logps/chosen": -589.6298828125, + "logps/rejected": -407.348388671875, + "loss": 0.3411, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1833634376525879, + "rewards/margins": 0.9715942144393921, + "rewards/rejected": -0.7882307171821594, + "step": 457 + }, + { + "epoch": 0.05, + "learning_rate": 2.88516914432869e-07, + "logits/chosen": -2.9093854427337646, + "logits/rejected": -2.866203546524048, + "logps/chosen": -151.8306884765625, + "logps/rejected": -220.07723999023438, + "loss": 0.44, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02912278287112713, + "rewards/margins": 1.054234504699707, + "rewards/rejected": -1.0251119136810303, + "step": 458 + }, + { + "epoch": 0.05, + "learning_rate": 2.8848179796324476e-07, + "logits/chosen": -3.245063543319702, + "logits/rejected": -3.2865824699401855, + "logps/chosen": -169.97947692871094, + "logps/rejected": -205.06236267089844, + "loss": 0.4376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20643648505210876, + "rewards/margins": 0.8170823454856873, + "rewards/rejected": -1.0235188007354736, + "step": 459 + }, + { + "epoch": 0.05, + "learning_rate": 2.884466814936205e-07, + "logits/chosen": -3.3763699531555176, + "logits/rejected": -3.4078330993652344, + "logps/chosen": -224.6304473876953, + "logps/rejected": -199.60533142089844, + "loss": 0.6509, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20913316309452057, + "rewards/margins": 0.22670747339725494, + "rewards/rejected": -0.4358406662940979, + "step": 460 + }, + { + "epoch": 0.05, + "learning_rate": 2.884115650239962e-07, + "logits/chosen": -3.5697882175445557, + "logits/rejected": -3.449800968170166, + "logps/chosen": -329.3570556640625, + "logps/rejected": -279.0393371582031, + "loss": 0.4055, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.007454387843608856, + "rewards/margins": 1.5074009895324707, + "rewards/rejected": -1.4999465942382812, + "step": 461 + }, + { + "epoch": 0.05, + "learning_rate": 2.8837644855437197e-07, + "logits/chosen": -3.077784538269043, + "logits/rejected": -3.2358601093292236, + "logps/chosen": -402.5267639160156, + "logps/rejected": -298.3506774902344, + "loss": 0.3972, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17414353787899017, + "rewards/margins": 1.239652395248413, + "rewards/rejected": -1.0655088424682617, + "step": 462 + }, + { + "epoch": 0.05, + "learning_rate": 2.883413320847477e-07, + "logits/chosen": -3.308037042617798, + "logits/rejected": -3.1235296726226807, + "logps/chosen": -244.96963500976562, + "logps/rejected": -194.09091186523438, + "loss": 0.5236, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.013576053082942963, + "rewards/margins": 0.47806257009506226, + "rewards/rejected": -0.4916386306285858, + "step": 463 + }, + { + "epoch": 0.05, + "learning_rate": 2.883062156151235e-07, + "logits/chosen": -3.2055907249450684, + "logits/rejected": -3.123772144317627, + "logps/chosen": -234.43942260742188, + "logps/rejected": -292.5718688964844, + "loss": 0.3388, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14657142758369446, + "rewards/margins": 1.463843822479248, + "rewards/rejected": -1.6104153394699097, + "step": 464 + }, + { + "epoch": 0.05, + "learning_rate": 2.8827109914549923e-07, + "logits/chosen": -2.8594017028808594, + "logits/rejected": -2.804487705230713, + "logps/chosen": -263.6805725097656, + "logps/rejected": -368.3585510253906, + "loss": 0.2849, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27003681659698486, + "rewards/margins": 1.2551006078720093, + "rewards/rejected": -0.9850637912750244, + "step": 465 + }, + { + "epoch": 0.05, + "learning_rate": 2.8823598267587493e-07, + "logits/chosen": -2.5701279640197754, + "logits/rejected": -2.8582043647766113, + "logps/chosen": -138.20327758789062, + "logps/rejected": -173.23362731933594, + "loss": 0.6408, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4296302795410156, + "rewards/margins": 0.6491732001304626, + "rewards/rejected": -1.0788034200668335, + "step": 466 + }, + { + "epoch": 0.05, + "learning_rate": 2.8820086620625074e-07, + "logits/chosen": -3.6446046829223633, + "logits/rejected": -3.315652847290039, + "logps/chosen": -245.41732788085938, + "logps/rejected": -236.41683959960938, + "loss": 0.7151, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44958096742630005, + "rewards/margins": 0.35197514295578003, + "rewards/rejected": -0.8015561103820801, + "step": 467 + }, + { + "epoch": 0.05, + "learning_rate": 2.881657497366265e-07, + "logits/chosen": -3.2038822174072266, + "logits/rejected": -2.8926000595092773, + "logps/chosen": -456.45208740234375, + "logps/rejected": -239.56100463867188, + "loss": 0.7909, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19957531988620758, + "rewards/margins": 0.5295236706733704, + "rewards/rejected": -0.7290990352630615, + "step": 468 + }, + { + "epoch": 0.05, + "learning_rate": 2.881306332670022e-07, + "logits/chosen": -3.096151351928711, + "logits/rejected": -2.846104621887207, + "logps/chosen": -128.83131408691406, + "logps/rejected": -205.9220428466797, + "loss": 0.4925, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26024311780929565, + "rewards/margins": 0.8299098610877991, + "rewards/rejected": -1.0901530981063843, + "step": 469 + }, + { + "epoch": 0.05, + "learning_rate": 2.8809551679737795e-07, + "logits/chosen": -3.233553886413574, + "logits/rejected": -3.096565008163452, + "logps/chosen": -69.12287902832031, + "logps/rejected": -119.07392883300781, + "loss": 0.5257, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18879368901252747, + "rewards/margins": 0.5164449214935303, + "rewards/rejected": -0.3276512324810028, + "step": 470 + }, + { + "epoch": 0.05, + "learning_rate": 2.880604003277537e-07, + "logits/chosen": -3.042024612426758, + "logits/rejected": -3.27488374710083, + "logps/chosen": -328.2177734375, + "logps/rejected": -225.53134155273438, + "loss": 0.3151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4705287218093872, + "rewards/margins": 2.0412919521331787, + "rewards/rejected": -1.570763111114502, + "step": 471 + }, + { + "epoch": 0.05, + "learning_rate": 2.8802528385812946e-07, + "logits/chosen": -3.3470211029052734, + "logits/rejected": -3.0811100006103516, + "logps/chosen": -418.20745849609375, + "logps/rejected": -254.3585205078125, + "loss": 0.5836, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1571069061756134, + "rewards/margins": 0.4423336386680603, + "rewards/rejected": -0.5994405746459961, + "step": 472 + }, + { + "epoch": 0.05, + "learning_rate": 2.879901673885052e-07, + "logits/chosen": -3.5778961181640625, + "logits/rejected": -3.027993679046631, + "logps/chosen": -388.94073486328125, + "logps/rejected": -134.12550354003906, + "loss": 0.5148, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0047159940004348755, + "rewards/margins": 0.7017173767089844, + "rewards/rejected": -0.6970013380050659, + "step": 473 + }, + { + "epoch": 0.05, + "learning_rate": 2.879550509188809e-07, + "logits/chosen": -2.821014881134033, + "logits/rejected": -2.579470634460449, + "logps/chosen": -397.657470703125, + "logps/rejected": -238.9468536376953, + "loss": 0.6333, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.039274368435144424, + "rewards/margins": 0.24432459473609924, + "rewards/rejected": -0.2050502449274063, + "step": 474 + }, + { + "epoch": 0.05, + "learning_rate": 2.8791993444925667e-07, + "logits/chosen": -2.7070438861846924, + "logits/rejected": -2.735952854156494, + "logps/chosen": -226.82904052734375, + "logps/rejected": -345.65618896484375, + "loss": 0.4714, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021621137857437134, + "rewards/margins": 1.3002173900604248, + "rewards/rejected": -1.27859628200531, + "step": 475 + }, + { + "epoch": 0.05, + "learning_rate": 2.878848179796324e-07, + "logits/chosen": -3.484994649887085, + "logits/rejected": -3.8121988773345947, + "logps/chosen": -136.73118591308594, + "logps/rejected": -263.3206481933594, + "loss": 0.4021, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025934603065252304, + "rewards/margins": 1.8335849046707153, + "rewards/rejected": -1.8076503276824951, + "step": 476 + }, + { + "epoch": 0.05, + "learning_rate": 2.8784970151000817e-07, + "logits/chosen": -3.5027527809143066, + "logits/rejected": -3.657472848892212, + "logps/chosen": -206.80128479003906, + "logps/rejected": -266.017822265625, + "loss": 0.4624, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11314955353736877, + "rewards/margins": 1.2086315155029297, + "rewards/rejected": -1.321781039237976, + "step": 477 + }, + { + "epoch": 0.06, + "learning_rate": 2.8781458504038393e-07, + "logits/chosen": -3.0798563957214355, + "logits/rejected": -2.9016366004943848, + "logps/chosen": -360.1195373535156, + "logps/rejected": -251.15463256835938, + "loss": 0.7105, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11658275872468948, + "rewards/margins": 0.058493778109550476, + "rewards/rejected": -0.17507654428482056, + "step": 478 + }, + { + "epoch": 0.06, + "learning_rate": 2.877794685707597e-07, + "logits/chosen": -3.3471498489379883, + "logits/rejected": -3.452831268310547, + "logps/chosen": -219.1038818359375, + "logps/rejected": -281.0732116699219, + "loss": 0.3734, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.014966841787099838, + "rewards/margins": 1.28567636013031, + "rewards/rejected": -1.2707096338272095, + "step": 479 + }, + { + "epoch": 0.06, + "learning_rate": 2.8774435210113543e-07, + "logits/chosen": -3.4063801765441895, + "logits/rejected": -3.2551231384277344, + "logps/chosen": -273.7037353515625, + "logps/rejected": -306.4983825683594, + "loss": 0.4655, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13756990432739258, + "rewards/margins": 1.3235504627227783, + "rewards/rejected": -1.461120367050171, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 2.877092356315112e-07, + "logits/chosen": -2.888484239578247, + "logits/rejected": -2.6733169555664062, + "logps/chosen": -214.738525390625, + "logps/rejected": -278.33245849609375, + "loss": 0.4701, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1592889428138733, + "rewards/margins": 0.799102246761322, + "rewards/rejected": -0.9583912491798401, + "step": 481 + }, + { + "epoch": 0.06, + "learning_rate": 2.876741191618869e-07, + "logits/chosen": -3.345571756362915, + "logits/rejected": -3.554896593093872, + "logps/chosen": -97.7536392211914, + "logps/rejected": -173.86065673828125, + "loss": 0.4443, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.037883177399635315, + "rewards/margins": 1.0296493768692017, + "rewards/rejected": -0.9917662143707275, + "step": 482 + }, + { + "epoch": 0.06, + "learning_rate": 2.8763900269226264e-07, + "logits/chosen": -3.767484664916992, + "logits/rejected": -3.597215175628662, + "logps/chosen": -231.55938720703125, + "logps/rejected": -170.04779052734375, + "loss": 0.671, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39047712087631226, + "rewards/margins": 0.10898162424564362, + "rewards/rejected": -0.4994587302207947, + "step": 483 + }, + { + "epoch": 0.06, + "learning_rate": 2.876038862226384e-07, + "logits/chosen": -3.1447672843933105, + "logits/rejected": -3.406238317489624, + "logps/chosen": -217.88648986816406, + "logps/rejected": -283.2392272949219, + "loss": 0.308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10717424750328064, + "rewards/margins": 1.4090369939804077, + "rewards/rejected": -1.5162112712860107, + "step": 484 + }, + { + "epoch": 0.06, + "learning_rate": 2.8756876975301415e-07, + "logits/chosen": -3.735593795776367, + "logits/rejected": -3.6610612869262695, + "logps/chosen": -208.30088806152344, + "logps/rejected": -181.7802734375, + "loss": 0.5141, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10227297246456146, + "rewards/margins": 0.6313957571983337, + "rewards/rejected": -0.7336687445640564, + "step": 485 + }, + { + "epoch": 0.06, + "learning_rate": 2.875336532833899e-07, + "logits/chosen": -3.4746227264404297, + "logits/rejected": -3.8396947383880615, + "logps/chosen": -166.04409790039062, + "logps/rejected": -246.57850646972656, + "loss": 0.4345, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3677878975868225, + "rewards/margins": 1.6241183280944824, + "rewards/rejected": -1.9919061660766602, + "step": 486 + }, + { + "epoch": 0.06, + "learning_rate": 2.8749853681376566e-07, + "logits/chosen": -2.9436943531036377, + "logits/rejected": -3.0604281425476074, + "logps/chosen": -207.33624267578125, + "logps/rejected": -228.96389770507812, + "loss": 0.3463, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.004196275025606155, + "rewards/margins": 1.4612349271774292, + "rewards/rejected": -1.4654312133789062, + "step": 487 + }, + { + "epoch": 0.06, + "learning_rate": 2.8746342034414136e-07, + "logits/chosen": -3.0453977584838867, + "logits/rejected": -3.0727570056915283, + "logps/chosen": -314.0736389160156, + "logps/rejected": -272.1146240234375, + "loss": 0.4917, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19971731305122375, + "rewards/margins": 1.133684515953064, + "rewards/rejected": -1.3334019184112549, + "step": 488 + }, + { + "epoch": 0.06, + "learning_rate": 2.8742830387451717e-07, + "logits/chosen": -2.878746509552002, + "logits/rejected": -2.851273536682129, + "logps/chosen": -367.4239501953125, + "logps/rejected": -384.2108459472656, + "loss": 0.3097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1495692878961563, + "rewards/margins": 1.4559946060180664, + "rewards/rejected": -1.3064253330230713, + "step": 489 + }, + { + "epoch": 0.06, + "learning_rate": 2.8739318740489287e-07, + "logits/chosen": -1.9737764596939087, + "logits/rejected": -2.142622470855713, + "logps/chosen": -312.28680419921875, + "logps/rejected": -245.7939453125, + "loss": 0.4212, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08468474447727203, + "rewards/margins": 0.9715059995651245, + "rewards/rejected": -0.8868212699890137, + "step": 490 + }, + { + "epoch": 0.06, + "learning_rate": 2.873580709352686e-07, + "logits/chosen": -3.0486574172973633, + "logits/rejected": -2.7361245155334473, + "logps/chosen": -192.8045654296875, + "logps/rejected": -210.4586181640625, + "loss": 0.9373, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7772664427757263, + "rewards/margins": -0.05737540125846863, + "rewards/rejected": -0.7198910117149353, + "step": 491 + }, + { + "epoch": 0.06, + "learning_rate": 2.873229544656444e-07, + "logits/chosen": -2.6409361362457275, + "logits/rejected": -2.6498703956604004, + "logps/chosen": -144.90940856933594, + "logps/rejected": -239.55465698242188, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.034305423498153687, + "rewards/margins": 0.7900944352149963, + "rewards/rejected": -0.755789041519165, + "step": 492 + }, + { + "epoch": 0.06, + "learning_rate": 2.8728783799602013e-07, + "logits/chosen": -3.17602801322937, + "logits/rejected": -3.413388252258301, + "logps/chosen": -202.27537536621094, + "logps/rejected": -281.008544921875, + "loss": 0.3236, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2670978307723999, + "rewards/margins": 1.3742003440856934, + "rewards/rejected": -1.1071025133132935, + "step": 493 + }, + { + "epoch": 0.06, + "learning_rate": 2.872527215263959e-07, + "logits/chosen": -3.530827522277832, + "logits/rejected": -3.375345230102539, + "logps/chosen": -304.483642578125, + "logps/rejected": -319.90087890625, + "loss": 0.5451, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22065424919128418, + "rewards/margins": 0.6409857273101807, + "rewards/rejected": -0.8616399765014648, + "step": 494 + }, + { + "epoch": 0.06, + "learning_rate": 2.8721760505677164e-07, + "logits/chosen": -3.3144350051879883, + "logits/rejected": -3.124290704727173, + "logps/chosen": -162.87513732910156, + "logps/rejected": -182.17413330078125, + "loss": 0.5875, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21477529406547546, + "rewards/margins": 0.6383110880851746, + "rewards/rejected": -0.4235357642173767, + "step": 495 + }, + { + "epoch": 0.06, + "learning_rate": 2.8718248858714734e-07, + "logits/chosen": -3.4169209003448486, + "logits/rejected": -3.2275640964508057, + "logps/chosen": -135.8197784423828, + "logps/rejected": -142.04396057128906, + "loss": 0.5799, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016947001218795776, + "rewards/margins": 0.509728729724884, + "rewards/rejected": -0.5266757011413574, + "step": 496 + }, + { + "epoch": 0.06, + "learning_rate": 2.871473721175231e-07, + "logits/chosen": -3.227363109588623, + "logits/rejected": -3.611828088760376, + "logps/chosen": -159.49276733398438, + "logps/rejected": -256.2635803222656, + "loss": 0.4354, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09774680435657501, + "rewards/margins": 1.3314770460128784, + "rewards/rejected": -1.2337303161621094, + "step": 497 + }, + { + "epoch": 0.06, + "learning_rate": 2.8711225564789885e-07, + "logits/chosen": -2.992539644241333, + "logits/rejected": -3.15537691116333, + "logps/chosen": -284.51776123046875, + "logps/rejected": -227.77810668945312, + "loss": 0.4464, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1649443805217743, + "rewards/margins": 1.209691047668457, + "rewards/rejected": -1.04474675655365, + "step": 498 + }, + { + "epoch": 0.06, + "learning_rate": 2.870771391782746e-07, + "logits/chosen": -3.039200782775879, + "logits/rejected": -2.9331090450286865, + "logps/chosen": -405.0827941894531, + "logps/rejected": -371.2900390625, + "loss": 0.6963, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39746686816215515, + "rewards/margins": 0.8559498190879822, + "rewards/rejected": -1.253416657447815, + "step": 499 + }, + { + "epoch": 0.06, + "learning_rate": 2.8704202270865035e-07, + "logits/chosen": -2.8560547828674316, + "logits/rejected": -3.090045690536499, + "logps/chosen": -227.1024169921875, + "logps/rejected": -187.82977294921875, + "loss": 0.766, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19129395484924316, + "rewards/margins": 0.482562780380249, + "rewards/rejected": -0.673856794834137, + "step": 500 + }, + { + "epoch": 0.06, + "learning_rate": 2.870069062390261e-07, + "logits/chosen": -2.5864219665527344, + "logits/rejected": -2.5727272033691406, + "logps/chosen": -302.9307861328125, + "logps/rejected": -211.59548950195312, + "loss": 0.3917, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19512784481048584, + "rewards/margins": 0.9747135043144226, + "rewards/rejected": -0.7795856595039368, + "step": 501 + }, + { + "epoch": 0.06, + "learning_rate": 2.8697178976940186e-07, + "logits/chosen": -2.7894229888916016, + "logits/rejected": -2.7230677604675293, + "logps/chosen": -189.9849853515625, + "logps/rejected": -156.8744659423828, + "loss": 0.6785, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12064714729785919, + "rewards/margins": 0.13896769285202026, + "rewards/rejected": -0.018320566043257713, + "step": 502 + }, + { + "epoch": 0.06, + "learning_rate": 2.869366732997776e-07, + "logits/chosen": -2.541473627090454, + "logits/rejected": -2.7297348976135254, + "logps/chosen": -264.6041259765625, + "logps/rejected": -324.25714111328125, + "loss": 0.41, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16816501319408417, + "rewards/margins": 1.1273730993270874, + "rewards/rejected": -0.959208071231842, + "step": 503 + }, + { + "epoch": 0.06, + "learning_rate": 2.869015568301533e-07, + "logits/chosen": -2.9365227222442627, + "logits/rejected": -3.1242902278900146, + "logps/chosen": -125.56317901611328, + "logps/rejected": -234.14224243164062, + "loss": 0.3605, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06341756880283356, + "rewards/margins": 1.1993844509124756, + "rewards/rejected": -1.262802004814148, + "step": 504 + }, + { + "epoch": 0.06, + "learning_rate": 2.8686644036052907e-07, + "logits/chosen": -2.867974281311035, + "logits/rejected": -2.9811301231384277, + "logps/chosen": -198.185791015625, + "logps/rejected": -329.2797546386719, + "loss": 0.6382, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25616970658302307, + "rewards/margins": 0.24100124835968018, + "rewards/rejected": -0.49717098474502563, + "step": 505 + }, + { + "epoch": 0.06, + "learning_rate": 2.868313238909048e-07, + "logits/chosen": -3.55409574508667, + "logits/rejected": -3.3509416580200195, + "logps/chosen": -486.7705078125, + "logps/rejected": -302.6485595703125, + "loss": 0.4174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12468338757753372, + "rewards/margins": 1.0725276470184326, + "rewards/rejected": -1.1972111463546753, + "step": 506 + }, + { + "epoch": 0.06, + "learning_rate": 2.867962074212806e-07, + "logits/chosen": -2.9114534854888916, + "logits/rejected": -2.97478985786438, + "logps/chosen": -277.025390625, + "logps/rejected": -264.81427001953125, + "loss": 0.4664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11956827342510223, + "rewards/margins": 1.0786495208740234, + "rewards/rejected": -1.1982176303863525, + "step": 507 + }, + { + "epoch": 0.06, + "learning_rate": 2.8676109095165633e-07, + "logits/chosen": -2.5504937171936035, + "logits/rejected": -2.4858078956604004, + "logps/chosen": -231.9882354736328, + "logps/rejected": -273.6965637207031, + "loss": 0.402, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19226795434951782, + "rewards/margins": 0.802055835723877, + "rewards/rejected": -0.6097878813743591, + "step": 508 + }, + { + "epoch": 0.06, + "learning_rate": 2.8672597448203203e-07, + "logits/chosen": -3.050672769546509, + "logits/rejected": -2.942763328552246, + "logps/chosen": -361.748779296875, + "logps/rejected": -303.2430725097656, + "loss": 0.5529, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1266341209411621, + "rewards/margins": 0.8433701992034912, + "rewards/rejected": -0.7167360782623291, + "step": 509 + }, + { + "epoch": 0.06, + "learning_rate": 2.866908580124078e-07, + "logits/chosen": -3.014805555343628, + "logits/rejected": -3.192039966583252, + "logps/chosen": -167.71812438964844, + "logps/rejected": -253.79908752441406, + "loss": 0.3678, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11787936091423035, + "rewards/margins": 1.4172860383987427, + "rewards/rejected": -1.535165548324585, + "step": 510 + }, + { + "epoch": 0.06, + "learning_rate": 2.866557415427836e-07, + "logits/chosen": -3.1891798973083496, + "logits/rejected": -3.3985683917999268, + "logps/chosen": -395.2935791015625, + "logps/rejected": -233.57791137695312, + "loss": 0.4813, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03097468614578247, + "rewards/margins": 0.8894357681274414, + "rewards/rejected": -0.8584611415863037, + "step": 511 + }, + { + "epoch": 0.06, + "learning_rate": 2.866206250731593e-07, + "logits/chosen": -2.7636375427246094, + "logits/rejected": -2.7309622764587402, + "logps/chosen": -188.4740447998047, + "logps/rejected": -235.02081298828125, + "loss": 0.5118, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12039600312709808, + "rewards/margins": 0.714372456073761, + "rewards/rejected": -0.8347684741020203, + "step": 512 + }, + { + "epoch": 0.06, + "learning_rate": 2.8658550860353505e-07, + "logits/chosen": -3.4364349842071533, + "logits/rejected": -3.791431188583374, + "logps/chosen": -148.3749237060547, + "logps/rejected": -172.4364013671875, + "loss": 0.6647, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47247448563575745, + "rewards/margins": 0.7560961246490479, + "rewards/rejected": -1.228570580482483, + "step": 513 + }, + { + "epoch": 0.06, + "learning_rate": 2.865503921339108e-07, + "logits/chosen": -3.0675241947174072, + "logits/rejected": -3.2901792526245117, + "logps/chosen": -213.7919464111328, + "logps/rejected": -295.42535400390625, + "loss": 0.6086, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11407937109470367, + "rewards/margins": 0.9246984720230103, + "rewards/rejected": -1.0387778282165527, + "step": 514 + }, + { + "epoch": 0.06, + "learning_rate": 2.8651527566428656e-07, + "logits/chosen": -3.1039838790893555, + "logits/rejected": -3.328001022338867, + "logps/chosen": -262.2862243652344, + "logps/rejected": -229.57106018066406, + "loss": 0.4118, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44690725207328796, + "rewards/margins": 1.019927740097046, + "rewards/rejected": -0.5730204582214355, + "step": 515 + }, + { + "epoch": 0.06, + "learning_rate": 2.864801591946623e-07, + "logits/chosen": -2.241164207458496, + "logits/rejected": -2.1829233169555664, + "logps/chosen": -220.9943389892578, + "logps/rejected": -276.4043273925781, + "loss": 0.7219, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12687012553215027, + "rewards/margins": 0.48822373151779175, + "rewards/rejected": -0.6150938868522644, + "step": 516 + }, + { + "epoch": 0.06, + "learning_rate": 2.86445042725038e-07, + "logits/chosen": -3.601105213165283, + "logits/rejected": -3.429993152618408, + "logps/chosen": -231.45730590820312, + "logps/rejected": -247.46238708496094, + "loss": 0.5065, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12420734018087387, + "rewards/margins": 0.8599831461906433, + "rewards/rejected": -0.9841905236244202, + "step": 517 + }, + { + "epoch": 0.06, + "learning_rate": 2.8640992625541376e-07, + "logits/chosen": -2.833491802215576, + "logits/rejected": -2.5983874797821045, + "logps/chosen": -291.8511657714844, + "logps/rejected": -302.2479248046875, + "loss": 0.2148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2806636691093445, + "rewards/margins": 1.7790451049804688, + "rewards/rejected": -1.4983816146850586, + "step": 518 + }, + { + "epoch": 0.06, + "learning_rate": 2.863748097857895e-07, + "logits/chosen": -2.8953664302825928, + "logits/rejected": -3.1274290084838867, + "logps/chosen": -540.9337768554688, + "logps/rejected": -346.3360290527344, + "loss": 0.4475, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1636904776096344, + "rewards/margins": 0.6888893842697144, + "rewards/rejected": -0.5251989364624023, + "step": 519 + }, + { + "epoch": 0.06, + "learning_rate": 2.8633969331616527e-07, + "logits/chosen": -3.1299099922180176, + "logits/rejected": -3.3590762615203857, + "logps/chosen": -162.1237335205078, + "logps/rejected": -263.94158935546875, + "loss": 0.3476, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23379351198673248, + "rewards/margins": 1.5890848636627197, + "rewards/rejected": -1.3552911281585693, + "step": 520 + }, + { + "epoch": 0.06, + "learning_rate": 2.86304576846541e-07, + "logits/chosen": -3.154480457305908, + "logits/rejected": -2.8855338096618652, + "logps/chosen": -227.01904296875, + "logps/rejected": -137.81228637695312, + "loss": 0.7626, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43410974740982056, + "rewards/margins": 0.21767541766166687, + "rewards/rejected": -0.651785135269165, + "step": 521 + }, + { + "epoch": 0.06, + "learning_rate": 2.862694603769167e-07, + "logits/chosen": -3.1889216899871826, + "logits/rejected": -3.2298495769500732, + "logps/chosen": -328.3163757324219, + "logps/rejected": -260.7103576660156, + "loss": 0.4064, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02139928936958313, + "rewards/margins": 0.8407351970672607, + "rewards/rejected": -0.8621345162391663, + "step": 522 + }, + { + "epoch": 0.06, + "learning_rate": 2.8623434390729253e-07, + "logits/chosen": -2.878713607788086, + "logits/rejected": -2.793660879135132, + "logps/chosen": -276.17645263671875, + "logps/rejected": -229.63475036621094, + "loss": 0.5703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.035965923219919205, + "rewards/margins": 0.6465068459510803, + "rewards/rejected": -0.6824727058410645, + "step": 523 + }, + { + "epoch": 0.06, + "learning_rate": 2.861992274376683e-07, + "logits/chosen": -3.0597634315490723, + "logits/rejected": -2.809943914413452, + "logps/chosen": -224.89942932128906, + "logps/rejected": -279.1510314941406, + "loss": 0.5128, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17489421367645264, + "rewards/margins": 0.5333244204521179, + "rewards/rejected": -0.3584301769733429, + "step": 524 + }, + { + "epoch": 0.06, + "learning_rate": 2.86164110968044e-07, + "logits/chosen": -2.7586112022399902, + "logits/rejected": -2.7360591888427734, + "logps/chosen": -281.439697265625, + "logps/rejected": -299.2528076171875, + "loss": 0.5781, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05590415000915527, + "rewards/margins": 0.2985103130340576, + "rewards/rejected": -0.24260616302490234, + "step": 525 + }, + { + "epoch": 0.06, + "learning_rate": 2.8612899449841974e-07, + "logits/chosen": -3.474358558654785, + "logits/rejected": -3.768622875213623, + "logps/chosen": -78.41242218017578, + "logps/rejected": -221.89361572265625, + "loss": 0.2613, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07666325569152832, + "rewards/margins": 2.6205077171325684, + "rewards/rejected": -2.54384446144104, + "step": 526 + }, + { + "epoch": 0.06, + "learning_rate": 2.860938780287955e-07, + "logits/chosen": -3.536370277404785, + "logits/rejected": -3.3360824584960938, + "logps/chosen": -185.61795043945312, + "logps/rejected": -164.88418579101562, + "loss": 0.704, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2838400602340698, + "rewards/margins": 0.46942323446273804, + "rewards/rejected": -0.7532632350921631, + "step": 527 + }, + { + "epoch": 0.06, + "learning_rate": 2.8605876155917125e-07, + "logits/chosen": -2.358625888824463, + "logits/rejected": -2.5217840671539307, + "logps/chosen": -320.2193298339844, + "logps/rejected": -237.72866821289062, + "loss": 0.5965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2599813640117645, + "rewards/margins": 0.35364556312561035, + "rewards/rejected": -0.6136269569396973, + "step": 528 + }, + { + "epoch": 0.06, + "learning_rate": 2.86023645089547e-07, + "logits/chosen": -3.1081275939941406, + "logits/rejected": -3.059229612350464, + "logps/chosen": -369.98455810546875, + "logps/rejected": -276.3141174316406, + "loss": 0.544, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12163963913917542, + "rewards/margins": 0.72295743227005, + "rewards/rejected": -0.8445970416069031, + "step": 529 + }, + { + "epoch": 0.06, + "learning_rate": 2.859885286199227e-07, + "logits/chosen": -3.38150691986084, + "logits/rejected": -3.1757264137268066, + "logps/chosen": -177.9127655029297, + "logps/rejected": -224.57460021972656, + "loss": 0.7702, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49800533056259155, + "rewards/margins": 0.42585068941116333, + "rewards/rejected": -0.9238559603691101, + "step": 530 + }, + { + "epoch": 0.06, + "learning_rate": 2.8595341215029846e-07, + "logits/chosen": -3.8444156646728516, + "logits/rejected": -3.6734280586242676, + "logps/chosen": -104.33226776123047, + "logps/rejected": -82.72016143798828, + "loss": 0.6482, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14879372715950012, + "rewards/margins": 0.3363971710205078, + "rewards/rejected": -0.48519089818000793, + "step": 531 + }, + { + "epoch": 0.06, + "learning_rate": 2.859182956806742e-07, + "logits/chosen": -2.9359076023101807, + "logits/rejected": -3.0449743270874023, + "logps/chosen": -228.01751708984375, + "logps/rejected": -180.79644775390625, + "loss": 0.6173, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2374809980392456, + "rewards/margins": 0.3511824905872345, + "rewards/rejected": -0.5886634588241577, + "step": 532 + }, + { + "epoch": 0.06, + "learning_rate": 2.8588317921104997e-07, + "logits/chosen": -2.995626449584961, + "logits/rejected": -2.8867599964141846, + "logps/chosen": -323.6828918457031, + "logps/rejected": -179.494873046875, + "loss": 0.352, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5366920232772827, + "rewards/margins": 1.1864476203918457, + "rewards/rejected": -0.6497554779052734, + "step": 533 + }, + { + "epoch": 0.06, + "learning_rate": 2.858480627414257e-07, + "logits/chosen": -3.669623613357544, + "logits/rejected": -3.7601470947265625, + "logps/chosen": -500.26806640625, + "logps/rejected": -318.5075988769531, + "loss": 0.4036, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4221084713935852, + "rewards/margins": 1.10366690158844, + "rewards/rejected": -1.52577543258667, + "step": 534 + }, + { + "epoch": 0.06, + "learning_rate": 2.8581294627180147e-07, + "logits/chosen": -3.1554794311523438, + "logits/rejected": -3.3461742401123047, + "logps/chosen": -462.300537109375, + "logps/rejected": -228.64552307128906, + "loss": 0.702, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20671509206295013, + "rewards/margins": 0.4693266749382019, + "rewards/rejected": -0.6760417819023132, + "step": 535 + }, + { + "epoch": 0.06, + "learning_rate": 2.8577782980217723e-07, + "logits/chosen": -2.9409327507019043, + "logits/rejected": -2.8004651069641113, + "logps/chosen": -169.707275390625, + "logps/rejected": -261.3868713378906, + "loss": 0.3896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016493968665599823, + "rewards/margins": 1.1234095096588135, + "rewards/rejected": -1.1069154739379883, + "step": 536 + }, + { + "epoch": 0.06, + "learning_rate": 2.85742713332553e-07, + "logits/chosen": -3.4502832889556885, + "logits/rejected": -3.249462604522705, + "logps/chosen": -448.35784912109375, + "logps/rejected": -448.76495361328125, + "loss": 0.4964, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04303455352783203, + "rewards/margins": 0.6656639575958252, + "rewards/rejected": -0.6226294040679932, + "step": 537 + }, + { + "epoch": 0.06, + "learning_rate": 2.857075968629287e-07, + "logits/chosen": -3.094134569168091, + "logits/rejected": -3.204838991165161, + "logps/chosen": -92.15776062011719, + "logps/rejected": -279.9194030761719, + "loss": 0.6079, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08525685966014862, + "rewards/margins": 0.5060462951660156, + "rewards/rejected": -0.4207894802093506, + "step": 538 + }, + { + "epoch": 0.06, + "learning_rate": 2.8567248039330444e-07, + "logits/chosen": -3.061634063720703, + "logits/rejected": -3.2320733070373535, + "logps/chosen": -343.7528991699219, + "logps/rejected": -294.998046875, + "loss": 0.851, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.45737898349761963, + "rewards/margins": 0.07155458629131317, + "rewards/rejected": -0.5289335250854492, + "step": 539 + }, + { + "epoch": 0.06, + "learning_rate": 2.856373639236802e-07, + "logits/chosen": -2.809377670288086, + "logits/rejected": -3.2030439376831055, + "logps/chosen": -330.2304992675781, + "logps/rejected": -213.47561645507812, + "loss": 0.5892, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0012353844940662384, + "rewards/margins": 0.47216248512268066, + "rewards/rejected": -0.4709271192550659, + "step": 540 + }, + { + "epoch": 0.06, + "learning_rate": 2.8560224745405594e-07, + "logits/chosen": -2.3284294605255127, + "logits/rejected": -2.863255023956299, + "logps/chosen": -535.4571533203125, + "logps/rejected": -307.80303955078125, + "loss": 0.5115, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3536966145038605, + "rewards/margins": 0.9567894339561462, + "rewards/rejected": -1.310486078262329, + "step": 541 + }, + { + "epoch": 0.06, + "learning_rate": 2.855671309844317e-07, + "logits/chosen": -2.8721399307250977, + "logits/rejected": -2.9318065643310547, + "logps/chosen": -305.16949462890625, + "logps/rejected": -284.3133239746094, + "loss": 0.4169, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06329341232776642, + "rewards/margins": 1.091465711593628, + "rewards/rejected": -1.154759168624878, + "step": 542 + }, + { + "epoch": 0.06, + "learning_rate": 2.855320145148074e-07, + "logits/chosen": -3.7321062088012695, + "logits/rejected": -2.9591822624206543, + "logps/chosen": -475.44415283203125, + "logps/rejected": -153.30538940429688, + "loss": 0.382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15792134404182434, + "rewards/margins": 0.8902776837348938, + "rewards/rejected": -1.048198938369751, + "step": 543 + }, + { + "epoch": 0.06, + "learning_rate": 2.8549689804518315e-07, + "logits/chosen": -2.7988762855529785, + "logits/rejected": -2.767940044403076, + "logps/chosen": -243.82025146484375, + "logps/rejected": -266.5516357421875, + "loss": 0.3299, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21011295914649963, + "rewards/margins": 1.3443522453308105, + "rewards/rejected": -1.1342393159866333, + "step": 544 + }, + { + "epoch": 0.06, + "learning_rate": 2.8546178157555896e-07, + "logits/chosen": -3.634690999984741, + "logits/rejected": -3.778970241546631, + "logps/chosen": -319.986083984375, + "logps/rejected": -306.7301025390625, + "loss": 0.337, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0906183123588562, + "rewards/margins": 1.3068578243255615, + "rewards/rejected": -1.3974761962890625, + "step": 545 + }, + { + "epoch": 0.06, + "learning_rate": 2.8542666510593466e-07, + "logits/chosen": -2.377859354019165, + "logits/rejected": -2.68601655960083, + "logps/chosen": -151.09364318847656, + "logps/rejected": -230.4820556640625, + "loss": 0.4721, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.012478187680244446, + "rewards/margins": 0.777976393699646, + "rewards/rejected": -0.790454626083374, + "step": 546 + }, + { + "epoch": 0.06, + "learning_rate": 2.853915486363104e-07, + "logits/chosen": -3.0171828269958496, + "logits/rejected": -2.935272216796875, + "logps/chosen": -210.77883911132812, + "logps/rejected": -266.1453552246094, + "loss": 0.4129, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11610323935747147, + "rewards/margins": 1.0423280000686646, + "rewards/rejected": -0.9262247085571289, + "step": 547 + }, + { + "epoch": 0.06, + "learning_rate": 2.8535643216668617e-07, + "logits/chosen": -2.657559871673584, + "logits/rejected": -2.763437032699585, + "logps/chosen": -147.03570556640625, + "logps/rejected": -237.77920532226562, + "loss": 0.4567, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39586618542671204, + "rewards/margins": 0.6470467448234558, + "rewards/rejected": -0.2511805593967438, + "step": 548 + }, + { + "epoch": 0.06, + "learning_rate": 2.853213156970619e-07, + "logits/chosen": -3.5417895317077637, + "logits/rejected": -3.018122911453247, + "logps/chosen": -303.20257568359375, + "logps/rejected": -224.96017456054688, + "loss": 0.3204, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.184012770652771, + "rewards/margins": 1.318422555923462, + "rewards/rejected": -1.134409785270691, + "step": 549 + }, + { + "epoch": 0.06, + "learning_rate": 2.852861992274377e-07, + "logits/chosen": -2.5790486335754395, + "logits/rejected": -2.585319995880127, + "logps/chosen": -255.1060791015625, + "logps/rejected": -236.447265625, + "loss": 0.7813, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5112833380699158, + "rewards/margins": -0.0824710801243782, + "rewards/rejected": -0.4288122057914734, + "step": 550 + }, + { + "epoch": 0.06, + "learning_rate": 2.852510827578134e-07, + "logits/chosen": -2.172697067260742, + "logits/rejected": -2.159735918045044, + "logps/chosen": -415.64697265625, + "logps/rejected": -255.6728973388672, + "loss": 0.6275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10474257916212082, + "rewards/margins": 0.5209324359893799, + "rewards/rejected": -0.41618984937667847, + "step": 551 + }, + { + "epoch": 0.06, + "learning_rate": 2.8521596628818913e-07, + "logits/chosen": -3.2614693641662598, + "logits/rejected": -3.245962619781494, + "logps/chosen": -244.94973754882812, + "logps/rejected": -194.36509704589844, + "loss": 0.4547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1512511819601059, + "rewards/margins": 1.0718021392822266, + "rewards/rejected": -1.223053216934204, + "step": 552 + }, + { + "epoch": 0.06, + "learning_rate": 2.851808498185649e-07, + "logits/chosen": -3.158853530883789, + "logits/rejected": -3.2057247161865234, + "logps/chosen": -360.01898193359375, + "logps/rejected": -309.0932922363281, + "loss": 0.584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16631008684635162, + "rewards/margins": 1.0384345054626465, + "rewards/rejected": -1.204744577407837, + "step": 553 + }, + { + "epoch": 0.06, + "learning_rate": 2.8514573334894064e-07, + "logits/chosen": -3.1241519451141357, + "logits/rejected": -3.2429823875427246, + "logps/chosen": -310.84832763671875, + "logps/rejected": -270.1993713378906, + "loss": 0.5723, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03085634857416153, + "rewards/margins": 0.45363301038742065, + "rewards/rejected": -0.4844893515110016, + "step": 554 + }, + { + "epoch": 0.06, + "learning_rate": 2.851106168793164e-07, + "logits/chosen": -2.5227901935577393, + "logits/rejected": -2.5692367553710938, + "logps/chosen": -154.5811004638672, + "logps/rejected": -177.42520141601562, + "loss": 0.5283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22822032868862152, + "rewards/margins": 0.5128406286239624, + "rewards/rejected": -0.7410610318183899, + "step": 555 + }, + { + "epoch": 0.06, + "learning_rate": 2.8507550040969215e-07, + "logits/chosen": -2.639172315597534, + "logits/rejected": -2.875760078430176, + "logps/chosen": -213.1607208251953, + "logps/rejected": -234.0604248046875, + "loss": 0.4478, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15493053197860718, + "rewards/margins": 0.7433271408081055, + "rewards/rejected": -0.5883966684341431, + "step": 556 + }, + { + "epoch": 0.06, + "learning_rate": 2.850403839400679e-07, + "logits/chosen": -2.6481471061706543, + "logits/rejected": -2.764463424682617, + "logps/chosen": -134.46688842773438, + "logps/rejected": -163.86424255371094, + "loss": 0.52, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42262765765190125, + "rewards/margins": 0.8071228861808777, + "rewards/rejected": -1.2297505140304565, + "step": 557 + }, + { + "epoch": 0.06, + "learning_rate": 2.8500526747044365e-07, + "logits/chosen": -3.5028622150421143, + "logits/rejected": -3.6843693256378174, + "logps/chosen": -335.29046630859375, + "logps/rejected": -299.91973876953125, + "loss": 0.6575, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43093204498291016, + "rewards/margins": 0.5989680290222168, + "rewards/rejected": -1.029900074005127, + "step": 558 + }, + { + "epoch": 0.06, + "learning_rate": 2.8497015100081935e-07, + "logits/chosen": -2.535102128982544, + "logits/rejected": -2.6182827949523926, + "logps/chosen": -164.6511688232422, + "logps/rejected": -272.6824035644531, + "loss": 0.7012, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08056869357824326, + "rewards/margins": 0.9240962266921997, + "rewards/rejected": -0.8435275554656982, + "step": 559 + }, + { + "epoch": 0.06, + "learning_rate": 2.849350345311951e-07, + "logits/chosen": -2.699617624282837, + "logits/rejected": -2.6587982177734375, + "logps/chosen": -246.6691131591797, + "logps/rejected": -273.34625244140625, + "loss": 0.5611, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10043494403362274, + "rewards/margins": 0.642559289932251, + "rewards/rejected": -0.5421243906021118, + "step": 560 + }, + { + "epoch": 0.06, + "learning_rate": 2.8489991806157086e-07, + "logits/chosen": -3.2646660804748535, + "logits/rejected": -3.3632395267486572, + "logps/chosen": -347.6244201660156, + "logps/rejected": -384.0805358886719, + "loss": 0.6342, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3999229371547699, + "rewards/margins": 0.5680125951766968, + "rewards/rejected": -0.9679355621337891, + "step": 561 + }, + { + "epoch": 0.06, + "learning_rate": 2.848648015919466e-07, + "logits/chosen": -2.572514533996582, + "logits/rejected": -2.8451180458068848, + "logps/chosen": -217.26177978515625, + "logps/rejected": -221.21568298339844, + "loss": 0.4495, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08519965410232544, + "rewards/margins": 1.5005558729171753, + "rewards/rejected": -1.5857555866241455, + "step": 562 + }, + { + "epoch": 0.06, + "learning_rate": 2.8482968512232237e-07, + "logits/chosen": -2.9453463554382324, + "logits/rejected": -3.049625873565674, + "logps/chosen": -404.310546875, + "logps/rejected": -277.104248046875, + "loss": 0.453, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24798595905303955, + "rewards/margins": 0.9206985235214233, + "rewards/rejected": -0.672712504863739, + "step": 563 + }, + { + "epoch": 0.07, + "learning_rate": 2.8479456865269807e-07, + "logits/chosen": -3.2859647274017334, + "logits/rejected": -3.3170647621154785, + "logps/chosen": -228.65225219726562, + "logps/rejected": -209.265869140625, + "loss": 0.717, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22715182602405548, + "rewards/margins": 0.8490610718727112, + "rewards/rejected": -1.0762128829956055, + "step": 564 + }, + { + "epoch": 0.07, + "learning_rate": 2.847594521830738e-07, + "logits/chosen": -3.0620079040527344, + "logits/rejected": -3.1941654682159424, + "logps/chosen": -300.74688720703125, + "logps/rejected": -348.821044921875, + "loss": 0.5206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31530195474624634, + "rewards/margins": 0.5786973834037781, + "rewards/rejected": -0.8939993381500244, + "step": 565 + }, + { + "epoch": 0.07, + "learning_rate": 2.847243357134496e-07, + "logits/chosen": -3.3365564346313477, + "logits/rejected": -3.2249889373779297, + "logps/chosen": -274.286865234375, + "logps/rejected": -180.2883758544922, + "loss": 0.7249, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3781934380531311, + "rewards/margins": 0.1618926078081131, + "rewards/rejected": -0.540086030960083, + "step": 566 + }, + { + "epoch": 0.07, + "learning_rate": 2.8468921924382533e-07, + "logits/chosen": -2.9601683616638184, + "logits/rejected": -2.6350760459899902, + "logps/chosen": -315.9713439941406, + "logps/rejected": -250.956787109375, + "loss": 0.5427, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17157602310180664, + "rewards/margins": 0.7589461803436279, + "rewards/rejected": -0.5873702168464661, + "step": 567 + }, + { + "epoch": 0.07, + "learning_rate": 2.846541027742011e-07, + "logits/chosen": -2.83595609664917, + "logits/rejected": -2.7099967002868652, + "logps/chosen": -259.77587890625, + "logps/rejected": -176.6959228515625, + "loss": 0.5304, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2183617204427719, + "rewards/margins": 0.5292070508003235, + "rewards/rejected": -0.3108453154563904, + "step": 568 + }, + { + "epoch": 0.07, + "learning_rate": 2.8461898630457684e-07, + "logits/chosen": -2.9993205070495605, + "logits/rejected": -2.9990499019622803, + "logps/chosen": -211.07571411132812, + "logps/rejected": -230.1943359375, + "loss": 0.4211, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26790815591812134, + "rewards/margins": 0.7439912557601929, + "rewards/rejected": -0.47608309984207153, + "step": 569 + }, + { + "epoch": 0.07, + "learning_rate": 2.845838698349526e-07, + "logits/chosen": -2.6326541900634766, + "logits/rejected": -2.629570960998535, + "logps/chosen": -215.5721435546875, + "logps/rejected": -264.63726806640625, + "loss": 0.4937, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17606694996356964, + "rewards/margins": 0.7536322474479675, + "rewards/rejected": -0.5775653123855591, + "step": 570 + }, + { + "epoch": 0.07, + "learning_rate": 2.8454875336532835e-07, + "logits/chosen": -2.565608501434326, + "logits/rejected": -2.7435672283172607, + "logps/chosen": -419.363037109375, + "logps/rejected": -107.31320190429688, + "loss": 1.0601, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7615485191345215, + "rewards/margins": -0.4368448257446289, + "rewards/rejected": -0.3247036933898926, + "step": 571 + }, + { + "epoch": 0.07, + "learning_rate": 2.8451363689570405e-07, + "logits/chosen": -3.5370476245880127, + "logits/rejected": -3.39475679397583, + "logps/chosen": -233.05548095703125, + "logps/rejected": -258.10009765625, + "loss": 0.7058, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44455212354660034, + "rewards/margins": 0.426350474357605, + "rewards/rejected": -0.8709025382995605, + "step": 572 + }, + { + "epoch": 0.07, + "learning_rate": 2.844785204260798e-07, + "logits/chosen": -3.0539088249206543, + "logits/rejected": -3.003988742828369, + "logps/chosen": -200.1059112548828, + "logps/rejected": -204.16909790039062, + "loss": 0.4893, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01456526666879654, + "rewards/margins": 0.5757176280021667, + "rewards/rejected": -0.5611523389816284, + "step": 573 + }, + { + "epoch": 0.07, + "learning_rate": 2.8444340395645556e-07, + "logits/chosen": -3.422623872756958, + "logits/rejected": -3.2860288619995117, + "logps/chosen": -540.5427856445312, + "logps/rejected": -292.38372802734375, + "loss": 0.2989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024834223091602325, + "rewards/margins": 1.3464725017547607, + "rewards/rejected": -1.3713066577911377, + "step": 574 + }, + { + "epoch": 0.07, + "learning_rate": 2.844082874868313e-07, + "logits/chosen": -2.7679200172424316, + "logits/rejected": -3.0395469665527344, + "logps/chosen": -379.524169921875, + "logps/rejected": -180.01467895507812, + "loss": 0.6238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31150147318840027, + "rewards/margins": 0.53651362657547, + "rewards/rejected": -0.8480151295661926, + "step": 575 + }, + { + "epoch": 0.07, + "learning_rate": 2.8437317101720706e-07, + "logits/chosen": -2.912106513977051, + "logits/rejected": -3.076009511947632, + "logps/chosen": -186.04254150390625, + "logps/rejected": -201.83538818359375, + "loss": 0.4337, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3043007254600525, + "rewards/margins": 0.9681100845336914, + "rewards/rejected": -1.2724108695983887, + "step": 576 + }, + { + "epoch": 0.07, + "learning_rate": 2.843380545475828e-07, + "logits/chosen": -2.464355230331421, + "logits/rejected": -2.833991050720215, + "logps/chosen": -392.5476379394531, + "logps/rejected": -278.966552734375, + "loss": 0.7362, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3340369462966919, + "rewards/margins": 0.13450157642364502, + "rewards/rejected": -0.4685384929180145, + "step": 577 + }, + { + "epoch": 0.07, + "learning_rate": 2.843029380779585e-07, + "logits/chosen": -3.2506563663482666, + "logits/rejected": -3.0696945190429688, + "logps/chosen": -190.8265380859375, + "logps/rejected": -128.91940307617188, + "loss": 0.6402, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.027650974690914154, + "rewards/margins": 0.15454694628715515, + "rewards/rejected": -0.1268959790468216, + "step": 578 + }, + { + "epoch": 0.07, + "learning_rate": 2.842678216083343e-07, + "logits/chosen": -3.0331614017486572, + "logits/rejected": -3.1520509719848633, + "logps/chosen": -386.54913330078125, + "logps/rejected": -261.4997863769531, + "loss": 0.4125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22909235954284668, + "rewards/margins": 1.250894546508789, + "rewards/rejected": -1.4799869060516357, + "step": 579 + }, + { + "epoch": 0.07, + "learning_rate": 2.8423270513871e-07, + "logits/chosen": -3.1338346004486084, + "logits/rejected": -3.2878732681274414, + "logps/chosen": -343.92596435546875, + "logps/rejected": -157.74884033203125, + "loss": 0.4756, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16051925718784332, + "rewards/margins": 0.911997377872467, + "rewards/rejected": -0.7514780759811401, + "step": 580 + }, + { + "epoch": 0.07, + "learning_rate": 2.841975886690858e-07, + "logits/chosen": -3.133943796157837, + "logits/rejected": -3.0379106998443604, + "logps/chosen": -131.99093627929688, + "logps/rejected": -193.8929443359375, + "loss": 0.6252, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1466299593448639, + "rewards/margins": 0.45224130153656006, + "rewards/rejected": -0.5988712906837463, + "step": 581 + }, + { + "epoch": 0.07, + "learning_rate": 2.8416247219946153e-07, + "logits/chosen": -3.051846742630005, + "logits/rejected": -2.7714009284973145, + "logps/chosen": -259.3820495605469, + "logps/rejected": -202.49575805664062, + "loss": 0.3842, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1096249595284462, + "rewards/margins": 0.9627222418785095, + "rewards/rejected": -0.8530972599983215, + "step": 582 + }, + { + "epoch": 0.07, + "learning_rate": 2.841273557298373e-07, + "logits/chosen": -2.690918445587158, + "logits/rejected": -2.485649585723877, + "logps/chosen": -263.5823974609375, + "logps/rejected": -240.57923889160156, + "loss": 0.6488, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1939014494419098, + "rewards/margins": 0.19574642181396484, + "rewards/rejected": -0.0018449574708938599, + "step": 583 + }, + { + "epoch": 0.07, + "learning_rate": 2.8409223926021304e-07, + "logits/chosen": -3.312138080596924, + "logits/rejected": -2.97157621383667, + "logps/chosen": -284.61541748046875, + "logps/rejected": -185.38809204101562, + "loss": 0.3772, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2834477722644806, + "rewards/margins": 1.2873646020889282, + "rewards/rejected": -1.0039167404174805, + "step": 584 + }, + { + "epoch": 0.07, + "learning_rate": 2.840571227905888e-07, + "logits/chosen": -2.740142345428467, + "logits/rejected": -2.71858286857605, + "logps/chosen": -219.63058471679688, + "logps/rejected": -170.3091583251953, + "loss": 0.5681, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05971723422408104, + "rewards/margins": 0.7796342968940735, + "rewards/rejected": -0.8393515348434448, + "step": 585 + }, + { + "epoch": 0.07, + "learning_rate": 2.840220063209645e-07, + "logits/chosen": -2.3721702098846436, + "logits/rejected": -2.319884777069092, + "logps/chosen": -239.2967529296875, + "logps/rejected": -312.5836181640625, + "loss": 0.5475, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05204177647829056, + "rewards/margins": 0.545594334602356, + "rewards/rejected": -0.5976361036300659, + "step": 586 + }, + { + "epoch": 0.07, + "learning_rate": 2.8398688985134025e-07, + "logits/chosen": -3.438483476638794, + "logits/rejected": -3.3418476581573486, + "logps/chosen": -288.30047607421875, + "logps/rejected": -322.61114501953125, + "loss": 0.3558, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05761921405792236, + "rewards/margins": 1.2904915809631348, + "rewards/rejected": -1.2328723669052124, + "step": 587 + }, + { + "epoch": 0.07, + "learning_rate": 2.83951773381716e-07, + "logits/chosen": -3.420330047607422, + "logits/rejected": -3.147935390472412, + "logps/chosen": -244.6639404296875, + "logps/rejected": -192.90037536621094, + "loss": 0.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43344008922576904, + "rewards/margins": 0.7481533288955688, + "rewards/rejected": -1.181593418121338, + "step": 588 + }, + { + "epoch": 0.07, + "learning_rate": 2.8391665691209176e-07, + "logits/chosen": -3.1110801696777344, + "logits/rejected": -2.820272922515869, + "logps/chosen": -364.3965759277344, + "logps/rejected": -333.96142578125, + "loss": 0.458, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2278439998626709, + "rewards/margins": 1.2228749990463257, + "rewards/rejected": -1.4507191181182861, + "step": 589 + }, + { + "epoch": 0.07, + "learning_rate": 2.838815404424675e-07, + "logits/chosen": -2.5807666778564453, + "logits/rejected": -2.401181697845459, + "logps/chosen": -222.55657958984375, + "logps/rejected": -183.37710571289062, + "loss": 0.5474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07202743738889694, + "rewards/margins": 0.4343627691268921, + "rewards/rejected": -0.5063902139663696, + "step": 590 + }, + { + "epoch": 0.07, + "learning_rate": 2.8384642397284327e-07, + "logits/chosen": -3.7373287677764893, + "logits/rejected": -3.380686044692993, + "logps/chosen": -368.79302978515625, + "logps/rejected": -349.9984130859375, + "loss": 0.5687, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07392235100269318, + "rewards/margins": 0.5646569728851318, + "rewards/rejected": -0.49073460698127747, + "step": 591 + }, + { + "epoch": 0.07, + "learning_rate": 2.83811307503219e-07, + "logits/chosen": -3.299287796020508, + "logits/rejected": -2.932203531265259, + "logps/chosen": -288.1181945800781, + "logps/rejected": -293.4860534667969, + "loss": 0.3414, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1486733853816986, + "rewards/margins": 1.4948911666870117, + "rewards/rejected": -1.3462179899215698, + "step": 592 + }, + { + "epoch": 0.07, + "learning_rate": 2.8377619103359477e-07, + "logits/chosen": -3.1409854888916016, + "logits/rejected": -3.1719956398010254, + "logps/chosen": -367.2176208496094, + "logps/rejected": -224.2695770263672, + "loss": 0.4465, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2023565024137497, + "rewards/margins": 0.7969527840614319, + "rewards/rejected": -0.5945962071418762, + "step": 593 + }, + { + "epoch": 0.07, + "learning_rate": 2.837410745639705e-07, + "logits/chosen": -3.666069746017456, + "logits/rejected": -3.3207573890686035, + "logps/chosen": -376.62158203125, + "logps/rejected": -162.41262817382812, + "loss": 0.7542, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4511701166629791, + "rewards/margins": 0.00042928755283355713, + "rewards/rejected": -0.4515994191169739, + "step": 594 + }, + { + "epoch": 0.07, + "learning_rate": 2.8370595809434623e-07, + "logits/chosen": -3.1518170833587646, + "logits/rejected": -3.2193751335144043, + "logps/chosen": -229.4091339111328, + "logps/rejected": -188.7713623046875, + "loss": 0.4416, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16101792454719543, + "rewards/margins": 1.0197718143463135, + "rewards/rejected": -1.1807897090911865, + "step": 595 + }, + { + "epoch": 0.07, + "learning_rate": 2.83670841624722e-07, + "logits/chosen": -3.2547640800476074, + "logits/rejected": -3.5978660583496094, + "logps/chosen": -145.9723663330078, + "logps/rejected": -346.95587158203125, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11018386483192444, + "rewards/margins": 1.9690461158752441, + "rewards/rejected": -1.8588621616363525, + "step": 596 + }, + { + "epoch": 0.07, + "learning_rate": 2.8363572515509774e-07, + "logits/chosen": -3.633589506149292, + "logits/rejected": -3.2893996238708496, + "logps/chosen": -191.10833740234375, + "logps/rejected": -169.20172119140625, + "loss": 0.6649, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.039100948721170425, + "rewards/margins": 0.3054118752479553, + "rewards/rejected": -0.34451279044151306, + "step": 597 + }, + { + "epoch": 0.07, + "learning_rate": 2.836006086854735e-07, + "logits/chosen": -3.276345729827881, + "logits/rejected": -3.1193535327911377, + "logps/chosen": -238.8255157470703, + "logps/rejected": -265.792724609375, + "loss": 0.2981, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.039712145924568176, + "rewards/margins": 1.7417290210723877, + "rewards/rejected": -1.702016830444336, + "step": 598 + }, + { + "epoch": 0.07, + "learning_rate": 2.835654922158492e-07, + "logits/chosen": -3.36613130569458, + "logits/rejected": -3.3966870307922363, + "logps/chosen": -248.4036865234375, + "logps/rejected": -258.8201904296875, + "loss": 0.3471, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05074310302734375, + "rewards/margins": 1.2298450469970703, + "rewards/rejected": -1.1791019439697266, + "step": 599 + }, + { + "epoch": 0.07, + "learning_rate": 2.8353037574622494e-07, + "logits/chosen": -2.8329219818115234, + "logits/rejected": -2.881606101989746, + "logps/chosen": -315.7949523925781, + "logps/rejected": -189.22109985351562, + "loss": 0.5293, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09626045823097229, + "rewards/margins": 0.9568472504615784, + "rewards/rejected": -1.053107738494873, + "step": 600 + }, + { + "epoch": 0.07, + "learning_rate": 2.8349525927660075e-07, + "logits/chosen": -3.479264259338379, + "logits/rejected": -3.2550079822540283, + "logps/chosen": -371.71820068359375, + "logps/rejected": -378.53009033203125, + "loss": 0.571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.048086121678352356, + "rewards/margins": 0.7984611392021179, + "rewards/rejected": -0.8465472459793091, + "step": 601 + }, + { + "epoch": 0.07, + "learning_rate": 2.8346014280697645e-07, + "logits/chosen": -2.9838826656341553, + "logits/rejected": -2.9615654945373535, + "logps/chosen": -151.2657012939453, + "logps/rejected": -101.64781188964844, + "loss": 0.6159, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1190408393740654, + "rewards/margins": 0.45836377143859863, + "rewards/rejected": -0.33932292461395264, + "step": 602 + }, + { + "epoch": 0.07, + "learning_rate": 2.834250263373522e-07, + "logits/chosen": -3.8918468952178955, + "logits/rejected": -3.9071686267852783, + "logps/chosen": -215.66847229003906, + "logps/rejected": -262.21661376953125, + "loss": 0.4703, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12405318021774292, + "rewards/margins": 1.2712405920028687, + "rewards/rejected": -1.3952938318252563, + "step": 603 + }, + { + "epoch": 0.07, + "learning_rate": 2.8338990986772796e-07, + "logits/chosen": -3.230825185775757, + "logits/rejected": -3.391624689102173, + "logps/chosen": -367.1190185546875, + "logps/rejected": -344.9413757324219, + "loss": 0.9151, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3344690501689911, + "rewards/margins": 0.07447004318237305, + "rewards/rejected": -0.40893906354904175, + "step": 604 + }, + { + "epoch": 0.07, + "learning_rate": 2.833547933981037e-07, + "logits/chosen": -3.498638153076172, + "logits/rejected": -3.279733180999756, + "logps/chosen": -309.2095031738281, + "logps/rejected": -318.4519958496094, + "loss": 0.3328, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18286341428756714, + "rewards/margins": 1.473036766052246, + "rewards/rejected": -1.2901732921600342, + "step": 605 + }, + { + "epoch": 0.07, + "learning_rate": 2.8331967692847947e-07, + "logits/chosen": -3.579416275024414, + "logits/rejected": -3.324866533279419, + "logps/chosen": -226.84814453125, + "logps/rejected": -169.96875, + "loss": 0.5737, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5237652063369751, + "rewards/margins": 0.8228992223739624, + "rewards/rejected": -1.3466644287109375, + "step": 606 + }, + { + "epoch": 0.07, + "learning_rate": 2.8328456045885517e-07, + "logits/chosen": -3.7990610599517822, + "logits/rejected": -3.493673324584961, + "logps/chosen": -190.21429443359375, + "logps/rejected": -192.6749725341797, + "loss": 0.3771, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11960812658071518, + "rewards/margins": 1.3039953708648682, + "rewards/rejected": -1.1843873262405396, + "step": 607 + }, + { + "epoch": 0.07, + "learning_rate": 2.832494439892309e-07, + "logits/chosen": -2.839855670928955, + "logits/rejected": -2.872342109680176, + "logps/chosen": -369.30352783203125, + "logps/rejected": -309.098876953125, + "loss": 0.4578, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18495683372020721, + "rewards/margins": 1.3200418949127197, + "rewards/rejected": -1.135084867477417, + "step": 608 + }, + { + "epoch": 0.07, + "learning_rate": 2.832143275196067e-07, + "logits/chosen": -3.3907508850097656, + "logits/rejected": -3.0417046546936035, + "logps/chosen": -287.4632568359375, + "logps/rejected": -185.15408325195312, + "loss": 1.2906, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.335735559463501, + "rewards/margins": -0.5624992847442627, + "rewards/rejected": -0.7732362747192383, + "step": 609 + }, + { + "epoch": 0.07, + "learning_rate": 2.8317921104998243e-07, + "logits/chosen": -3.7138254642486572, + "logits/rejected": -3.705875873565674, + "logps/chosen": -302.68310546875, + "logps/rejected": -229.69387817382812, + "loss": 0.5331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14651048183441162, + "rewards/margins": 0.7846730351448059, + "rewards/rejected": -0.9311835169792175, + "step": 610 + }, + { + "epoch": 0.07, + "learning_rate": 2.831440945803582e-07, + "logits/chosen": -3.3624393939971924, + "logits/rejected": -3.0908660888671875, + "logps/chosen": -124.87248992919922, + "logps/rejected": -100.67927551269531, + "loss": 0.4742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02834758162498474, + "rewards/margins": 0.6390489935874939, + "rewards/rejected": -0.667396605014801, + "step": 611 + }, + { + "epoch": 0.07, + "learning_rate": 2.831089781107339e-07, + "logits/chosen": -3.372060775756836, + "logits/rejected": -3.1947011947631836, + "logps/chosen": -157.3802490234375, + "logps/rejected": -199.58578491210938, + "loss": 0.4947, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03317014127969742, + "rewards/margins": 0.7201344966888428, + "rewards/rejected": -0.7533047199249268, + "step": 612 + }, + { + "epoch": 0.07, + "learning_rate": 2.830738616411097e-07, + "logits/chosen": -3.2238903045654297, + "logits/rejected": -3.1187775135040283, + "logps/chosen": -125.33981323242188, + "logps/rejected": -163.28749084472656, + "loss": 0.3158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3590514659881592, + "rewards/margins": 1.5268975496292114, + "rewards/rejected": -1.1678460836410522, + "step": 613 + }, + { + "epoch": 0.07, + "learning_rate": 2.8303874517148545e-07, + "logits/chosen": -2.984804391860962, + "logits/rejected": -3.002309799194336, + "logps/chosen": -315.9145812988281, + "logps/rejected": -281.891357421875, + "loss": 0.6053, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1040472462773323, + "rewards/margins": 0.6271089911460876, + "rewards/rejected": -0.5230617523193359, + "step": 614 + }, + { + "epoch": 0.07, + "learning_rate": 2.8300362870186115e-07, + "logits/chosen": -2.814328193664551, + "logits/rejected": -2.8751745223999023, + "logps/chosen": -219.7996826171875, + "logps/rejected": -293.1571044921875, + "loss": 0.4039, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32106101512908936, + "rewards/margins": 1.3266940116882324, + "rewards/rejected": -1.0056328773498535, + "step": 615 + }, + { + "epoch": 0.07, + "learning_rate": 2.829685122322369e-07, + "logits/chosen": -2.8527355194091797, + "logits/rejected": -2.989307403564453, + "logps/chosen": -219.4447784423828, + "logps/rejected": -288.9472961425781, + "loss": 0.6222, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.05713772773742676, + "rewards/margins": 0.668530285358429, + "rewards/rejected": -0.6113924980163574, + "step": 616 + }, + { + "epoch": 0.07, + "learning_rate": 2.8293339576261265e-07, + "logits/chosen": -3.837667226791382, + "logits/rejected": -3.9089627265930176, + "logps/chosen": -198.53402709960938, + "logps/rejected": -203.71905517578125, + "loss": 0.5995, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.029162194579839706, + "rewards/margins": 0.354979544878006, + "rewards/rejected": -0.3841417729854584, + "step": 617 + }, + { + "epoch": 0.07, + "learning_rate": 2.828982792929884e-07, + "logits/chosen": -2.9510388374328613, + "logits/rejected": -2.6519482135772705, + "logps/chosen": -152.76417541503906, + "logps/rejected": -176.57406616210938, + "loss": 0.4328, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011390842497348785, + "rewards/margins": 0.7977659702301025, + "rewards/rejected": -0.8091567754745483, + "step": 618 + }, + { + "epoch": 0.07, + "learning_rate": 2.8286316282336416e-07, + "logits/chosen": -3.963745594024658, + "logits/rejected": -3.7298078536987305, + "logps/chosen": -257.343994140625, + "logps/rejected": -202.93719482421875, + "loss": 0.4738, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.036448799073696136, + "rewards/margins": 0.8925577402114868, + "rewards/rejected": -0.8561089634895325, + "step": 619 + }, + { + "epoch": 0.07, + "learning_rate": 2.8282804635373986e-07, + "logits/chosen": -3.560281753540039, + "logits/rejected": -3.3643643856048584, + "logps/chosen": -381.47979736328125, + "logps/rejected": -372.06658935546875, + "loss": 0.4669, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27751314640045166, + "rewards/margins": 0.9599399566650391, + "rewards/rejected": -1.2374531030654907, + "step": 620 + }, + { + "epoch": 0.07, + "learning_rate": 2.827929298841156e-07, + "logits/chosen": -3.530916213989258, + "logits/rejected": -3.4918291568756104, + "logps/chosen": -273.8067932128906, + "logps/rejected": -223.84823608398438, + "loss": 0.6889, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06040230020880699, + "rewards/margins": 0.6505823731422424, + "rewards/rejected": -0.7109846472740173, + "step": 621 + }, + { + "epoch": 0.07, + "learning_rate": 2.827578134144914e-07, + "logits/chosen": -2.9849705696105957, + "logits/rejected": -3.0985097885131836, + "logps/chosen": -335.08917236328125, + "logps/rejected": -304.03369140625, + "loss": 0.448, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11139983683824539, + "rewards/margins": 1.1816134452819824, + "rewards/rejected": -1.0702136754989624, + "step": 622 + }, + { + "epoch": 0.07, + "learning_rate": 2.827226969448671e-07, + "logits/chosen": -2.9607491493225098, + "logits/rejected": -3.1391518115997314, + "logps/chosen": -202.5513458251953, + "logps/rejected": -165.22268676757812, + "loss": 0.519, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.077151358127594, + "rewards/margins": 0.5873945951461792, + "rewards/rejected": -0.664546012878418, + "step": 623 + }, + { + "epoch": 0.07, + "learning_rate": 2.826875804752429e-07, + "logits/chosen": -2.4530487060546875, + "logits/rejected": -2.3164474964141846, + "logps/chosen": -210.24658203125, + "logps/rejected": -255.90866088867188, + "loss": 0.4222, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.013784952461719513, + "rewards/margins": 0.8635143637657166, + "rewards/rejected": -0.8497294187545776, + "step": 624 + }, + { + "epoch": 0.07, + "learning_rate": 2.8265246400561863e-07, + "logits/chosen": -3.2347888946533203, + "logits/rejected": -3.2915313243865967, + "logps/chosen": -130.29518127441406, + "logps/rejected": -199.05657958984375, + "loss": 0.4501, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37697330117225647, + "rewards/margins": 0.9773353934288025, + "rewards/rejected": -1.3543087244033813, + "step": 625 + }, + { + "epoch": 0.07, + "learning_rate": 2.826173475359944e-07, + "logits/chosen": -3.669151782989502, + "logits/rejected": -3.390648365020752, + "logps/chosen": -206.706787109375, + "logps/rejected": -159.48300170898438, + "loss": 0.764, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4353793263435364, + "rewards/margins": 0.03055882453918457, + "rewards/rejected": -0.46593815088272095, + "step": 626 + }, + { + "epoch": 0.07, + "learning_rate": 2.8258223106637014e-07, + "logits/chosen": -2.950985908508301, + "logits/rejected": -3.2271039485931396, + "logps/chosen": -421.0364990234375, + "logps/rejected": -423.6495361328125, + "loss": 0.5084, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11888724565505981, + "rewards/margins": 1.116855263710022, + "rewards/rejected": -0.9979678988456726, + "step": 627 + }, + { + "epoch": 0.07, + "learning_rate": 2.8254711459674584e-07, + "logits/chosen": -2.693455219268799, + "logits/rejected": -2.5099165439605713, + "logps/chosen": -296.0506286621094, + "logps/rejected": -217.6190185546875, + "loss": 0.6226, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07870311290025711, + "rewards/margins": 0.5490449666976929, + "rewards/rejected": -0.6277481913566589, + "step": 628 + }, + { + "epoch": 0.07, + "learning_rate": 2.825119981271216e-07, + "logits/chosen": -2.988558292388916, + "logits/rejected": -2.8650808334350586, + "logps/chosen": -402.98614501953125, + "logps/rejected": -315.8740539550781, + "loss": 0.4938, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0207187682390213, + "rewards/margins": 1.0009024143218994, + "rewards/rejected": -0.9801836609840393, + "step": 629 + }, + { + "epoch": 0.07, + "learning_rate": 2.8247688165749735e-07, + "logits/chosen": -2.465223789215088, + "logits/rejected": -2.769787073135376, + "logps/chosen": -220.0747833251953, + "logps/rejected": -228.54983520507812, + "loss": 0.4297, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0826532319188118, + "rewards/margins": 0.7393407225608826, + "rewards/rejected": -0.6566874980926514, + "step": 630 + }, + { + "epoch": 0.07, + "learning_rate": 2.824417651878731e-07, + "logits/chosen": -2.6062750816345215, + "logits/rejected": -2.872267723083496, + "logps/chosen": -197.28997802734375, + "logps/rejected": -284.3924560546875, + "loss": 0.629, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33210378885269165, + "rewards/margins": 1.0762338638305664, + "rewards/rejected": -1.4083375930786133, + "step": 631 + }, + { + "epoch": 0.07, + "learning_rate": 2.8240664871824886e-07, + "logits/chosen": -2.7323367595672607, + "logits/rejected": -2.745156764984131, + "logps/chosen": -308.6730041503906, + "logps/rejected": -396.23077392578125, + "loss": 0.3815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26536500453948975, + "rewards/margins": 0.9743742346763611, + "rewards/rejected": -1.239739179611206, + "step": 632 + }, + { + "epoch": 0.07, + "learning_rate": 2.8237153224862456e-07, + "logits/chosen": -3.504061698913574, + "logits/rejected": -3.7011914253234863, + "logps/chosen": -232.31907653808594, + "logps/rejected": -202.03196716308594, + "loss": 0.3384, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3494293987751007, + "rewards/margins": 1.3804587125778198, + "rewards/rejected": -1.031029224395752, + "step": 633 + }, + { + "epoch": 0.07, + "learning_rate": 2.823364157790003e-07, + "logits/chosen": -2.966700315475464, + "logits/rejected": -2.904381513595581, + "logps/chosen": -491.19134521484375, + "logps/rejected": -278.7889709472656, + "loss": 0.3915, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1501804143190384, + "rewards/margins": 0.8382871150970459, + "rewards/rejected": -0.6881066560745239, + "step": 634 + }, + { + "epoch": 0.07, + "learning_rate": 2.823012993093761e-07, + "logits/chosen": -2.5274901390075684, + "logits/rejected": -3.0725626945495605, + "logps/chosen": -445.3673095703125, + "logps/rejected": -254.85923767089844, + "loss": 0.188, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6510478258132935, + "rewards/margins": 2.1387946605682373, + "rewards/rejected": -1.4877469539642334, + "step": 635 + }, + { + "epoch": 0.07, + "learning_rate": 2.822661828397518e-07, + "logits/chosen": -3.3121562004089355, + "logits/rejected": -3.4127559661865234, + "logps/chosen": -389.0754699707031, + "logps/rejected": -291.8110046386719, + "loss": 0.4447, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12660431861877441, + "rewards/margins": 0.7087717652320862, + "rewards/rejected": -0.5821675658226013, + "step": 636 + }, + { + "epoch": 0.07, + "learning_rate": 2.8223106637012757e-07, + "logits/chosen": -2.9307665824890137, + "logits/rejected": -2.8229918479919434, + "logps/chosen": -303.55169677734375, + "logps/rejected": -310.1448059082031, + "loss": 0.5319, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.061731815338134766, + "rewards/margins": 0.9247167110443115, + "rewards/rejected": -0.9864485263824463, + "step": 637 + }, + { + "epoch": 0.07, + "learning_rate": 2.821959499005033e-07, + "logits/chosen": -3.9062395095825195, + "logits/rejected": -3.432831048965454, + "logps/chosen": -339.75433349609375, + "logps/rejected": -193.55880737304688, + "loss": 0.6059, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23130403459072113, + "rewards/margins": 0.37777069211006165, + "rewards/rejected": -0.6090747714042664, + "step": 638 + }, + { + "epoch": 0.07, + "learning_rate": 2.821608334308791e-07, + "logits/chosen": -2.6190154552459717, + "logits/rejected": -2.702080249786377, + "logps/chosen": -516.25244140625, + "logps/rejected": -289.9027099609375, + "loss": 0.5306, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21195468306541443, + "rewards/margins": 1.0171723365783691, + "rewards/rejected": -0.8052175045013428, + "step": 639 + }, + { + "epoch": 0.07, + "learning_rate": 2.8212571696125483e-07, + "logits/chosen": -2.679837465286255, + "logits/rejected": -2.923995018005371, + "logps/chosen": -197.61209106445312, + "logps/rejected": -200.4513702392578, + "loss": 0.6621, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3475823402404785, + "rewards/margins": 0.41853657364845276, + "rewards/rejected": -0.7661188840866089, + "step": 640 + }, + { + "epoch": 0.07, + "learning_rate": 2.8209060049163053e-07, + "logits/chosen": -3.0901055335998535, + "logits/rejected": -3.15689754486084, + "logps/chosen": -314.61004638671875, + "logps/rejected": -203.03675842285156, + "loss": 0.4572, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25867682695388794, + "rewards/margins": 0.7284399271011353, + "rewards/rejected": -0.4697631001472473, + "step": 641 + }, + { + "epoch": 0.07, + "learning_rate": 2.820554840220063e-07, + "logits/chosen": -3.70808744430542, + "logits/rejected": -3.533933162689209, + "logps/chosen": -228.56832885742188, + "logps/rejected": -157.4409637451172, + "loss": 0.9322, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.810427188873291, + "rewards/margins": -0.3191819190979004, + "rewards/rejected": -0.49124518036842346, + "step": 642 + }, + { + "epoch": 0.07, + "learning_rate": 2.8202036755238204e-07, + "logits/chosen": -3.4402666091918945, + "logits/rejected": -3.608651876449585, + "logps/chosen": -167.5726776123047, + "logps/rejected": -189.20660400390625, + "loss": 0.4443, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0929604098200798, + "rewards/margins": 1.208081841468811, + "rewards/rejected": -1.3010423183441162, + "step": 643 + }, + { + "epoch": 0.07, + "learning_rate": 2.819852510827578e-07, + "logits/chosen": -2.3759982585906982, + "logits/rejected": -2.7116000652313232, + "logps/chosen": -219.03952026367188, + "logps/rejected": -194.26783752441406, + "loss": 0.5063, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02527442015707493, + "rewards/margins": 0.9281631112098694, + "rewards/rejected": -0.9534375667572021, + "step": 644 + }, + { + "epoch": 0.07, + "learning_rate": 2.8195013461313355e-07, + "logits/chosen": -2.429760456085205, + "logits/rejected": -2.3235838413238525, + "logps/chosen": -186.65560913085938, + "logps/rejected": -248.0267333984375, + "loss": 0.7367, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5467196702957153, + "rewards/margins": 0.002445850521326065, + "rewards/rejected": -0.5491654872894287, + "step": 645 + }, + { + "epoch": 0.07, + "learning_rate": 2.819150181435093e-07, + "logits/chosen": -3.0598084926605225, + "logits/rejected": -3.109884262084961, + "logps/chosen": -284.8348083496094, + "logps/rejected": -388.57171630859375, + "loss": 0.2034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6189019083976746, + "rewards/margins": 1.8634790182113647, + "rewards/rejected": -1.244577169418335, + "step": 646 + }, + { + "epoch": 0.07, + "learning_rate": 2.8187990167388506e-07, + "logits/chosen": -3.4173641204833984, + "logits/rejected": -3.3686740398406982, + "logps/chosen": -203.47494506835938, + "logps/rejected": -172.414794921875, + "loss": 0.4649, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2973012328147888, + "rewards/margins": 0.8682623505592346, + "rewards/rejected": -0.5709611177444458, + "step": 647 + }, + { + "epoch": 0.07, + "learning_rate": 2.818447852042608e-07, + "logits/chosen": -2.737396717071533, + "logits/rejected": -2.8556737899780273, + "logps/chosen": -368.24993896484375, + "logps/rejected": -150.498046875, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15030935406684875, + "rewards/margins": 0.7441118955612183, + "rewards/rejected": -0.5938024520874023, + "step": 648 + }, + { + "epoch": 0.07, + "learning_rate": 2.818096687346365e-07, + "logits/chosen": -3.612534761428833, + "logits/rejected": -3.7730445861816406, + "logps/chosen": -260.3518371582031, + "logps/rejected": -237.2313690185547, + "loss": 0.3097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10116110742092133, + "rewards/margins": 1.478062391281128, + "rewards/rejected": -1.5792236328125, + "step": 649 + }, + { + "epoch": 0.07, + "learning_rate": 2.8177455226501227e-07, + "logits/chosen": -3.8730850219726562, + "logits/rejected": -3.579479932785034, + "logps/chosen": -329.912841796875, + "logps/rejected": -218.162353515625, + "loss": 0.456, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09039077162742615, + "rewards/margins": 0.7901965379714966, + "rewards/rejected": -0.699805736541748, + "step": 650 + }, + { + "epoch": 0.08, + "learning_rate": 2.81739435795388e-07, + "logits/chosen": -2.870739459991455, + "logits/rejected": -2.986448287963867, + "logps/chosen": -253.240966796875, + "logps/rejected": -235.52874755859375, + "loss": 0.3736, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3080122470855713, + "rewards/margins": 1.2318308353424072, + "rewards/rejected": -0.9238184690475464, + "step": 651 + }, + { + "epoch": 0.08, + "learning_rate": 2.817043193257638e-07, + "logits/chosen": -3.2262566089630127, + "logits/rejected": -3.2233049869537354, + "logps/chosen": -226.5541229248047, + "logps/rejected": -254.38339233398438, + "loss": 0.499, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06557979434728622, + "rewards/margins": 0.7249903082847595, + "rewards/rejected": -0.790570080280304, + "step": 652 + }, + { + "epoch": 0.08, + "learning_rate": 2.8166920285613953e-07, + "logits/chosen": -3.3718514442443848, + "logits/rejected": -3.6217567920684814, + "logps/chosen": -219.1671142578125, + "logps/rejected": -269.7743835449219, + "loss": 0.5121, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07980260252952576, + "rewards/margins": 1.1106517314910889, + "rewards/rejected": -1.0308490991592407, + "step": 653 + }, + { + "epoch": 0.08, + "learning_rate": 2.816340863865153e-07, + "logits/chosen": -3.5437674522399902, + "logits/rejected": -3.562863826751709, + "logps/chosen": -383.1075134277344, + "logps/rejected": -340.1318359375, + "loss": 0.1843, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23954106867313385, + "rewards/margins": 1.8402096033096313, + "rewards/rejected": -1.6006686687469482, + "step": 654 + }, + { + "epoch": 0.08, + "learning_rate": 2.81598969916891e-07, + "logits/chosen": -2.182158946990967, + "logits/rejected": -2.341937303543091, + "logps/chosen": -448.85687255859375, + "logps/rejected": -303.45928955078125, + "loss": 0.3674, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2605876624584198, + "rewards/margins": 0.9077305197715759, + "rewards/rejected": -0.6471428871154785, + "step": 655 + }, + { + "epoch": 0.08, + "learning_rate": 2.815638534472668e-07, + "logits/chosen": -2.663252115249634, + "logits/rejected": -2.679100275039673, + "logps/chosen": -250.4080810546875, + "logps/rejected": -328.595458984375, + "loss": 0.513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06937167793512344, + "rewards/margins": 0.8674710988998413, + "rewards/rejected": -0.9368427395820618, + "step": 656 + }, + { + "epoch": 0.08, + "learning_rate": 2.815287369776425e-07, + "logits/chosen": -2.8067548274993896, + "logits/rejected": -2.9203338623046875, + "logps/chosen": -241.26138305664062, + "logps/rejected": -186.9059295654297, + "loss": 0.4242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1027831956744194, + "rewards/margins": 0.7267370223999023, + "rewards/rejected": -0.6239538192749023, + "step": 657 + }, + { + "epoch": 0.08, + "learning_rate": 2.8149362050801824e-07, + "logits/chosen": -3.4372267723083496, + "logits/rejected": -3.178781270980835, + "logps/chosen": -376.176513671875, + "logps/rejected": -221.98355102539062, + "loss": 0.4268, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33262619376182556, + "rewards/margins": 0.9065229892730713, + "rewards/rejected": -1.2391493320465088, + "step": 658 + }, + { + "epoch": 0.08, + "learning_rate": 2.81458504038394e-07, + "logits/chosen": -3.217949390411377, + "logits/rejected": -3.089524269104004, + "logps/chosen": -228.12503051757812, + "logps/rejected": -243.04562377929688, + "loss": 0.5367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37924641370773315, + "rewards/margins": 0.4592605233192444, + "rewards/rejected": -0.8385068774223328, + "step": 659 + }, + { + "epoch": 0.08, + "learning_rate": 2.8142338756876975e-07, + "logits/chosen": -2.900501251220703, + "logits/rejected": -2.7165184020996094, + "logps/chosen": -248.76388549804688, + "logps/rejected": -207.01409912109375, + "loss": 0.3649, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15769578516483307, + "rewards/margins": 0.8768856525421143, + "rewards/rejected": -0.7191898822784424, + "step": 660 + }, + { + "epoch": 0.08, + "learning_rate": 2.813882710991455e-07, + "logits/chosen": -3.957078456878662, + "logits/rejected": -3.8362674713134766, + "logps/chosen": -145.32540893554688, + "logps/rejected": -136.26345825195312, + "loss": 0.3963, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.011360090225934982, + "rewards/margins": 1.0556401014328003, + "rewards/rejected": -1.044279932975769, + "step": 661 + }, + { + "epoch": 0.08, + "learning_rate": 2.813531546295212e-07, + "logits/chosen": -2.8501622676849365, + "logits/rejected": -3.0820937156677246, + "logps/chosen": -456.33319091796875, + "logps/rejected": -192.61392211914062, + "loss": 0.3305, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24530985951423645, + "rewards/margins": 1.1563799381256104, + "rewards/rejected": -0.9110701084136963, + "step": 662 + }, + { + "epoch": 0.08, + "learning_rate": 2.8131803815989696e-07, + "logits/chosen": -3.8404223918914795, + "logits/rejected": -3.631300449371338, + "logps/chosen": -197.45916748046875, + "logps/rejected": -113.88496398925781, + "loss": 0.6455, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3517687916755676, + "rewards/margins": 0.25402748584747314, + "rewards/rejected": -0.6057963371276855, + "step": 663 + }, + { + "epoch": 0.08, + "learning_rate": 2.812829216902727e-07, + "logits/chosen": -3.119865655899048, + "logits/rejected": -3.1801528930664062, + "logps/chosen": -287.74066162109375, + "logps/rejected": -112.96851348876953, + "loss": 0.5897, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12497826665639877, + "rewards/margins": 0.33184346556663513, + "rewards/rejected": -0.4568217396736145, + "step": 664 + }, + { + "epoch": 0.08, + "learning_rate": 2.8124780522064847e-07, + "logits/chosen": -3.4885144233703613, + "logits/rejected": -3.3222920894622803, + "logps/chosen": -86.46119689941406, + "logps/rejected": -141.62918090820312, + "loss": 0.4183, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24856294691562653, + "rewards/margins": 0.7436630129814148, + "rewards/rejected": -0.49510008096694946, + "step": 665 + }, + { + "epoch": 0.08, + "learning_rate": 2.812126887510242e-07, + "logits/chosen": -3.740874767303467, + "logits/rejected": -3.7094991207122803, + "logps/chosen": -178.31536865234375, + "logps/rejected": -139.36520385742188, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16647811233997345, + "rewards/margins": 0.2598204016685486, + "rewards/rejected": -0.4262985289096832, + "step": 666 + }, + { + "epoch": 0.08, + "learning_rate": 2.811775722814e-07, + "logits/chosen": -3.1754376888275146, + "logits/rejected": -2.932903289794922, + "logps/chosen": -263.0095520019531, + "logps/rejected": -227.65838623046875, + "loss": 0.6239, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47631216049194336, + "rewards/margins": 0.9844347238540649, + "rewards/rejected": -1.4607468843460083, + "step": 667 + }, + { + "epoch": 0.08, + "learning_rate": 2.811424558117757e-07, + "logits/chosen": -3.1097710132598877, + "logits/rejected": -2.9225053787231445, + "logps/chosen": -532.6370239257812, + "logps/rejected": -284.1167907714844, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1953025609254837, + "rewards/margins": 0.24375471472740173, + "rewards/rejected": -0.04845218360424042, + "step": 668 + }, + { + "epoch": 0.08, + "learning_rate": 2.811073393421515e-07, + "logits/chosen": -3.4223151206970215, + "logits/rejected": -3.2450737953186035, + "logps/chosen": -362.22723388671875, + "logps/rejected": -198.56187438964844, + "loss": 0.4319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03571777045726776, + "rewards/margins": 1.2749719619750977, + "rewards/rejected": -1.3106898069381714, + "step": 669 + }, + { + "epoch": 0.08, + "learning_rate": 2.810722228725272e-07, + "logits/chosen": -2.9319663047790527, + "logits/rejected": -3.094832181930542, + "logps/chosen": -232.2923583984375, + "logps/rejected": -280.48388671875, + "loss": 0.2578, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24395324289798737, + "rewards/margins": 1.5181598663330078, + "rewards/rejected": -1.2742066383361816, + "step": 670 + }, + { + "epoch": 0.08, + "learning_rate": 2.8103710640290294e-07, + "logits/chosen": -3.8010122776031494, + "logits/rejected": -3.588047742843628, + "logps/chosen": -234.400146484375, + "logps/rejected": -213.72781372070312, + "loss": 0.4428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17712001502513885, + "rewards/margins": 0.8781943321228027, + "rewards/rejected": -1.055314302444458, + "step": 671 + }, + { + "epoch": 0.08, + "learning_rate": 2.810019899332787e-07, + "logits/chosen": -2.562882900238037, + "logits/rejected": -2.899467945098877, + "logps/chosen": -320.32635498046875, + "logps/rejected": -225.9552001953125, + "loss": 0.5401, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07994194328784943, + "rewards/margins": 0.6259147524833679, + "rewards/rejected": -0.5459728240966797, + "step": 672 + }, + { + "epoch": 0.08, + "learning_rate": 2.8096687346365445e-07, + "logits/chosen": -2.525270462036133, + "logits/rejected": -2.663062810897827, + "logps/chosen": -230.9478759765625, + "logps/rejected": -269.2705078125, + "loss": 0.4953, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06332116574048996, + "rewards/margins": 0.8073590993881226, + "rewards/rejected": -0.8706803321838379, + "step": 673 + }, + { + "epoch": 0.08, + "learning_rate": 2.809317569940302e-07, + "logits/chosen": -3.27494740486145, + "logits/rejected": -3.249255657196045, + "logps/chosen": -129.63888549804688, + "logps/rejected": -192.5345916748047, + "loss": 0.3029, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.314860075712204, + "rewards/margins": 1.5779260396957397, + "rewards/rejected": -1.2630659341812134, + "step": 674 + }, + { + "epoch": 0.08, + "learning_rate": 2.8089664052440595e-07, + "logits/chosen": -2.46126389503479, + "logits/rejected": -2.5478923320770264, + "logps/chosen": -270.1539611816406, + "logps/rejected": -316.5727844238281, + "loss": 0.4632, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22709369659423828, + "rewards/margins": 0.7511131167411804, + "rewards/rejected": -0.5240194797515869, + "step": 675 + }, + { + "epoch": 0.08, + "learning_rate": 2.8086152405478166e-07, + "logits/chosen": -3.0591249465942383, + "logits/rejected": -3.4650230407714844, + "logps/chosen": -63.17518997192383, + "logps/rejected": -184.40037536621094, + "loss": 0.6821, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03870847821235657, + "rewards/margins": 0.5994771718978882, + "rewards/rejected": -0.6381855607032776, + "step": 676 + }, + { + "epoch": 0.08, + "learning_rate": 2.808264075851574e-07, + "logits/chosen": -3.387763261795044, + "logits/rejected": -3.4069201946258545, + "logps/chosen": -452.8389892578125, + "logps/rejected": -309.1654968261719, + "loss": 0.4548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3142624795436859, + "rewards/margins": 1.189105749130249, + "rewards/rejected": -1.5033681392669678, + "step": 677 + }, + { + "epoch": 0.08, + "learning_rate": 2.8079129111553316e-07, + "logits/chosen": -3.295924425125122, + "logits/rejected": -3.3116328716278076, + "logps/chosen": -263.3892822265625, + "logps/rejected": -186.02883911132812, + "loss": 0.4234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0004598647356033325, + "rewards/margins": 0.947930097579956, + "rewards/rejected": -0.9483898878097534, + "step": 678 + }, + { + "epoch": 0.08, + "learning_rate": 2.807561746459089e-07, + "logits/chosen": -3.211676597595215, + "logits/rejected": -2.964092969894409, + "logps/chosen": -282.63330078125, + "logps/rejected": -376.3515319824219, + "loss": 0.3642, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1897697150707245, + "rewards/margins": 1.3087869882583618, + "rewards/rejected": -1.1190173625946045, + "step": 679 + }, + { + "epoch": 0.08, + "learning_rate": 2.8072105817628467e-07, + "logits/chosen": -2.9880011081695557, + "logits/rejected": -3.0205297470092773, + "logps/chosen": -150.84628295898438, + "logps/rejected": -176.31793212890625, + "loss": 0.482, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04968436062335968, + "rewards/margins": 1.008692741394043, + "rewards/rejected": -1.0583771467208862, + "step": 680 + }, + { + "epoch": 0.08, + "learning_rate": 2.806859417066604e-07, + "logits/chosen": -3.131657600402832, + "logits/rejected": -2.874061346054077, + "logps/chosen": -100.8520736694336, + "logps/rejected": -181.38697814941406, + "loss": 0.3973, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08181305229663849, + "rewards/margins": 1.0862394571304321, + "rewards/rejected": -1.0044264793395996, + "step": 681 + }, + { + "epoch": 0.08, + "learning_rate": 2.806508252370362e-07, + "logits/chosen": -2.808809757232666, + "logits/rejected": -2.70902943611145, + "logps/chosen": -277.4212646484375, + "logps/rejected": -341.1982421875, + "loss": 0.373, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30684730410575867, + "rewards/margins": 1.2745519876480103, + "rewards/rejected": -0.9677046537399292, + "step": 682 + }, + { + "epoch": 0.08, + "learning_rate": 2.8061570876741193e-07, + "logits/chosen": -2.6040287017822266, + "logits/rejected": -2.595909595489502, + "logps/chosen": -240.48321533203125, + "logps/rejected": -198.50997924804688, + "loss": 0.3561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2184903919696808, + "rewards/margins": 1.0276191234588623, + "rewards/rejected": -0.8091287612915039, + "step": 683 + }, + { + "epoch": 0.08, + "learning_rate": 2.8058059229778763e-07, + "logits/chosen": -2.8333404064178467, + "logits/rejected": -2.4502015113830566, + "logps/chosen": -219.761474609375, + "logps/rejected": -142.65008544921875, + "loss": 0.5321, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2704215943813324, + "rewards/margins": 0.5716500878334045, + "rewards/rejected": -0.30122846364974976, + "step": 684 + }, + { + "epoch": 0.08, + "learning_rate": 2.805454758281634e-07, + "logits/chosen": -2.5912113189697266, + "logits/rejected": -2.7085680961608887, + "logps/chosen": -411.77685546875, + "logps/rejected": -289.8896484375, + "loss": 0.4253, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2640058696269989, + "rewards/margins": 1.0749235153198242, + "rewards/rejected": -0.8109176158905029, + "step": 685 + }, + { + "epoch": 0.08, + "learning_rate": 2.8051035935853914e-07, + "logits/chosen": -2.515194892883301, + "logits/rejected": -2.6483588218688965, + "logps/chosen": -426.415283203125, + "logps/rejected": -347.5091247558594, + "loss": 0.3039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005179956555366516, + "rewards/margins": 1.3198150396347046, + "rewards/rejected": -1.3146350383758545, + "step": 686 + }, + { + "epoch": 0.08, + "learning_rate": 2.804752428889149e-07, + "logits/chosen": -3.6294760704040527, + "logits/rejected": -3.5103511810302734, + "logps/chosen": -240.3109130859375, + "logps/rejected": -221.80860900878906, + "loss": 0.2843, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3888118863105774, + "rewards/margins": 1.8055329322814941, + "rewards/rejected": -1.4167211055755615, + "step": 687 + }, + { + "epoch": 0.08, + "learning_rate": 2.8044012641929065e-07, + "logits/chosen": -3.248447895050049, + "logits/rejected": -3.404047727584839, + "logps/chosen": -291.5246276855469, + "logps/rejected": -197.08004760742188, + "loss": 0.5457, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.538979709148407, + "rewards/margins": 1.1935218572616577, + "rewards/rejected": -0.654542088508606, + "step": 688 + }, + { + "epoch": 0.08, + "learning_rate": 2.8040500994966635e-07, + "logits/chosen": -2.7433083057403564, + "logits/rejected": -2.693399429321289, + "logps/chosen": -216.10342407226562, + "logps/rejected": -193.66622924804688, + "loss": 0.535, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09604065120220184, + "rewards/margins": 1.2432631254196167, + "rewards/rejected": -1.1472225189208984, + "step": 689 + }, + { + "epoch": 0.08, + "learning_rate": 2.8036989348004216e-07, + "logits/chosen": -3.1248443126678467, + "logits/rejected": -3.3194429874420166, + "logps/chosen": -271.44677734375, + "logps/rejected": -354.9962158203125, + "loss": 0.3927, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19200202822685242, + "rewards/margins": 1.6102817058563232, + "rewards/rejected": -1.418279767036438, + "step": 690 + }, + { + "epoch": 0.08, + "learning_rate": 2.803347770104179e-07, + "logits/chosen": -3.6547465324401855, + "logits/rejected": -3.507004737854004, + "logps/chosen": -186.81564331054688, + "logps/rejected": -220.38259887695312, + "loss": 0.2707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1647680252790451, + "rewards/margins": 1.9419746398925781, + "rewards/rejected": -2.1067428588867188, + "step": 691 + }, + { + "epoch": 0.08, + "learning_rate": 2.802996605407936e-07, + "logits/chosen": -3.6030077934265137, + "logits/rejected": -3.2871081829071045, + "logps/chosen": -316.1094970703125, + "logps/rejected": -225.51097106933594, + "loss": 0.5874, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08980468660593033, + "rewards/margins": 0.6749645471572876, + "rewards/rejected": -0.7647692561149597, + "step": 692 + }, + { + "epoch": 0.08, + "learning_rate": 2.8026454407116936e-07, + "logits/chosen": -3.0034432411193848, + "logits/rejected": -3.0887794494628906, + "logps/chosen": -272.9200134277344, + "logps/rejected": -254.30731201171875, + "loss": 0.6342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28065866231918335, + "rewards/margins": 0.8286210894584656, + "rewards/rejected": -1.109279751777649, + "step": 693 + }, + { + "epoch": 0.08, + "learning_rate": 2.802294276015451e-07, + "logits/chosen": -3.42630672454834, + "logits/rejected": -3.390458345413208, + "logps/chosen": -166.72396850585938, + "logps/rejected": -181.4721221923828, + "loss": 0.3742, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.010705262422561646, + "rewards/margins": 1.5873644351959229, + "rewards/rejected": -1.5766589641571045, + "step": 694 + }, + { + "epoch": 0.08, + "learning_rate": 2.8019431113192087e-07, + "logits/chosen": -3.5085535049438477, + "logits/rejected": -3.1806631088256836, + "logps/chosen": -335.67327880859375, + "logps/rejected": -222.08627319335938, + "loss": 0.7465, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11532177031040192, + "rewards/margins": 0.6543084979057312, + "rewards/rejected": -0.5389867424964905, + "step": 695 + }, + { + "epoch": 0.08, + "learning_rate": 2.801591946622966e-07, + "logits/chosen": -3.2428689002990723, + "logits/rejected": -3.399885654449463, + "logps/chosen": -263.11761474609375, + "logps/rejected": -248.46810913085938, + "loss": 0.4727, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12523208558559418, + "rewards/margins": 0.6779493689537048, + "rewards/rejected": -0.5527173280715942, + "step": 696 + }, + { + "epoch": 0.08, + "learning_rate": 2.8012407819267233e-07, + "logits/chosen": -2.223513603210449, + "logits/rejected": -2.4811928272247314, + "logps/chosen": -328.03662109375, + "logps/rejected": -389.5274963378906, + "loss": 0.3393, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1013520210981369, + "rewards/margins": 1.3311753273010254, + "rewards/rejected": -1.229823350906372, + "step": 697 + }, + { + "epoch": 0.08, + "learning_rate": 2.800889617230481e-07, + "logits/chosen": -3.5635762214660645, + "logits/rejected": -3.4025607109069824, + "logps/chosen": -113.25352478027344, + "logps/rejected": -212.2552947998047, + "loss": 0.3372, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.061311714351177216, + "rewards/margins": 1.4127999544143677, + "rewards/rejected": -1.4741116762161255, + "step": 698 + }, + { + "epoch": 0.08, + "learning_rate": 2.8005384525342383e-07, + "logits/chosen": -2.7048871517181396, + "logits/rejected": -2.554380416870117, + "logps/chosen": -352.5461730957031, + "logps/rejected": -265.97833251953125, + "loss": 0.542, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3755878210067749, + "rewards/margins": 0.6865990161895752, + "rewards/rejected": -0.3110112249851227, + "step": 699 + }, + { + "epoch": 0.08, + "learning_rate": 2.800187287837996e-07, + "logits/chosen": -2.2965357303619385, + "logits/rejected": -1.9074287414550781, + "logps/chosen": -199.3422088623047, + "logps/rejected": -313.6127624511719, + "loss": 0.5113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23939643800258636, + "rewards/margins": 0.7571967840194702, + "rewards/rejected": -0.996593177318573, + "step": 700 + }, + { + "epoch": 0.08, + "learning_rate": 2.7998361231417534e-07, + "logits/chosen": -3.5383293628692627, + "logits/rejected": -3.5414085388183594, + "logps/chosen": -308.8039855957031, + "logps/rejected": -323.1033935546875, + "loss": 0.4573, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06360204517841339, + "rewards/margins": 1.2177666425704956, + "rewards/rejected": -1.1541645526885986, + "step": 701 + }, + { + "epoch": 0.08, + "learning_rate": 2.7994849584455104e-07, + "logits/chosen": -3.158003807067871, + "logits/rejected": -3.127227783203125, + "logps/chosen": -330.5771484375, + "logps/rejected": -277.0057067871094, + "loss": 0.6162, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.039095744490623474, + "rewards/margins": 0.6411279439926147, + "rewards/rejected": -0.6802237033843994, + "step": 702 + }, + { + "epoch": 0.08, + "learning_rate": 2.7991337937492685e-07, + "logits/chosen": -3.391720771789551, + "logits/rejected": -3.5536458492279053, + "logps/chosen": -149.1319122314453, + "logps/rejected": -252.89488220214844, + "loss": 0.5645, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03919847309589386, + "rewards/margins": 1.124940276145935, + "rewards/rejected": -1.0857417583465576, + "step": 703 + }, + { + "epoch": 0.08, + "learning_rate": 2.798782629053026e-07, + "logits/chosen": -3.42157244682312, + "logits/rejected": -3.7671823501586914, + "logps/chosen": -261.8943176269531, + "logps/rejected": -336.5327453613281, + "loss": 0.3204, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0694407969713211, + "rewards/margins": 1.4619674682617188, + "rewards/rejected": -1.392526626586914, + "step": 704 + }, + { + "epoch": 0.08, + "learning_rate": 2.798431464356783e-07, + "logits/chosen": -3.438798427581787, + "logits/rejected": -3.211738109588623, + "logps/chosen": -222.43499755859375, + "logps/rejected": -184.21075439453125, + "loss": 0.672, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.011085599660873413, + "rewards/margins": 0.3730858564376831, + "rewards/rejected": -0.38417142629623413, + "step": 705 + }, + { + "epoch": 0.08, + "learning_rate": 2.7980802996605406e-07, + "logits/chosen": -3.0592851638793945, + "logits/rejected": -2.938581705093384, + "logps/chosen": -169.57278442382812, + "logps/rejected": -296.74969482421875, + "loss": 0.2689, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3602779507637024, + "rewards/margins": 1.771864414215088, + "rewards/rejected": -1.4115862846374512, + "step": 706 + }, + { + "epoch": 0.08, + "learning_rate": 2.797729134964298e-07, + "logits/chosen": -2.977429151535034, + "logits/rejected": -3.1957926750183105, + "logps/chosen": -262.33929443359375, + "logps/rejected": -203.01358032226562, + "loss": 0.6024, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3755205273628235, + "rewards/margins": 0.6346362233161926, + "rewards/rejected": -1.0101567506790161, + "step": 707 + }, + { + "epoch": 0.08, + "learning_rate": 2.7973779702680557e-07, + "logits/chosen": -3.205326557159424, + "logits/rejected": -3.0857486724853516, + "logps/chosen": -398.4686279296875, + "logps/rejected": -302.9078369140625, + "loss": 0.4013, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14540919661521912, + "rewards/margins": 1.6690673828125, + "rewards/rejected": -1.5236581563949585, + "step": 708 + }, + { + "epoch": 0.08, + "learning_rate": 2.797026805571813e-07, + "logits/chosen": -2.898405075073242, + "logits/rejected": -2.9792308807373047, + "logps/chosen": -131.74102783203125, + "logps/rejected": -299.36651611328125, + "loss": 0.3555, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35862088203430176, + "rewards/margins": 1.3310527801513672, + "rewards/rejected": -1.6896735429763794, + "step": 709 + }, + { + "epoch": 0.08, + "learning_rate": 2.79667564087557e-07, + "logits/chosen": -2.7953314781188965, + "logits/rejected": -2.966688394546509, + "logps/chosen": -142.406494140625, + "logps/rejected": -241.51358032226562, + "loss": 0.6928, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19121915102005005, + "rewards/margins": 0.42608654499053955, + "rewards/rejected": -0.6173057556152344, + "step": 710 + }, + { + "epoch": 0.08, + "learning_rate": 2.796324476179328e-07, + "logits/chosen": -3.5614981651306152, + "logits/rejected": -3.113609790802002, + "logps/chosen": -528.8561401367188, + "logps/rejected": -230.63150024414062, + "loss": 0.6038, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24319688975811005, + "rewards/margins": 0.9700608253479004, + "rewards/rejected": -1.2132576704025269, + "step": 711 + }, + { + "epoch": 0.08, + "learning_rate": 2.795973311483086e-07, + "logits/chosen": -3.450883388519287, + "logits/rejected": -3.1953043937683105, + "logps/chosen": -177.04537963867188, + "logps/rejected": -173.73739624023438, + "loss": 0.4407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.025586768984794617, + "rewards/margins": 1.3895102739334106, + "rewards/rejected": -1.4150969982147217, + "step": 712 + }, + { + "epoch": 0.08, + "learning_rate": 2.795622146786843e-07, + "logits/chosen": -3.022975444793701, + "logits/rejected": -3.166229724884033, + "logps/chosen": -265.0945739746094, + "logps/rejected": -336.5422668457031, + "loss": 0.3947, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2782943844795227, + "rewards/margins": 1.6208069324493408, + "rewards/rejected": -1.8991012573242188, + "step": 713 + }, + { + "epoch": 0.08, + "learning_rate": 2.7952709820906004e-07, + "logits/chosen": -2.9518818855285645, + "logits/rejected": -2.85339093208313, + "logps/chosen": -225.41339111328125, + "logps/rejected": -264.4231262207031, + "loss": 0.3502, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2141304910182953, + "rewards/margins": 1.193411946296692, + "rewards/rejected": -0.979281485080719, + "step": 714 + }, + { + "epoch": 0.08, + "learning_rate": 2.794919817394358e-07, + "logits/chosen": -3.4671685695648193, + "logits/rejected": -3.010868549346924, + "logps/chosen": -266.54180908203125, + "logps/rejected": -168.47796630859375, + "loss": 0.4375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10000243037939072, + "rewards/margins": 0.7507315874099731, + "rewards/rejected": -0.8507339954376221, + "step": 715 + }, + { + "epoch": 0.08, + "learning_rate": 2.7945686526981154e-07, + "logits/chosen": -3.561511278152466, + "logits/rejected": -3.179701566696167, + "logps/chosen": -228.364990234375, + "logps/rejected": -167.16336059570312, + "loss": 0.553, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.023554518818855286, + "rewards/margins": 0.8202447891235352, + "rewards/rejected": -0.843799352645874, + "step": 716 + }, + { + "epoch": 0.08, + "learning_rate": 2.794217488001873e-07, + "logits/chosen": -3.2105660438537598, + "logits/rejected": -3.1001129150390625, + "logps/chosen": -235.6729736328125, + "logps/rejected": -245.31349182128906, + "loss": 0.3792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21252159774303436, + "rewards/margins": 1.2790400981903076, + "rewards/rejected": -1.491561770439148, + "step": 717 + }, + { + "epoch": 0.08, + "learning_rate": 2.79386632330563e-07, + "logits/chosen": -2.600304126739502, + "logits/rejected": -2.3475260734558105, + "logps/chosen": -458.005126953125, + "logps/rejected": -353.92022705078125, + "loss": 0.6151, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21235902607440948, + "rewards/margins": 0.329931378364563, + "rewards/rejected": -0.5422903895378113, + "step": 718 + }, + { + "epoch": 0.08, + "learning_rate": 2.7935151586093875e-07, + "logits/chosen": -2.723416328430176, + "logits/rejected": -2.1372413635253906, + "logps/chosen": -479.31402587890625, + "logps/rejected": -422.8626403808594, + "loss": 0.5397, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.060451313853263855, + "rewards/margins": 0.7133290767669678, + "rewards/rejected": -0.6528778672218323, + "step": 719 + }, + { + "epoch": 0.08, + "learning_rate": 2.793163993913145e-07, + "logits/chosen": -3.8497650623321533, + "logits/rejected": -4.18699836730957, + "logps/chosen": -354.9637451171875, + "logps/rejected": -426.37371826171875, + "loss": 0.4274, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13541057705879211, + "rewards/margins": 1.0846930742263794, + "rewards/rejected": -0.9492824673652649, + "step": 720 + }, + { + "epoch": 0.08, + "learning_rate": 2.7928128292169026e-07, + "logits/chosen": -3.0595836639404297, + "logits/rejected": -2.864090919494629, + "logps/chosen": -116.8880615234375, + "logps/rejected": -276.19677734375, + "loss": 0.4279, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0646720826625824, + "rewards/margins": 1.173906922340393, + "rewards/rejected": -1.1092348098754883, + "step": 721 + }, + { + "epoch": 0.08, + "learning_rate": 2.79246166452066e-07, + "logits/chosen": -3.027130365371704, + "logits/rejected": -2.8423495292663574, + "logps/chosen": -326.50274658203125, + "logps/rejected": -260.4288330078125, + "loss": 1.2141, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9012296795845032, + "rewards/margins": -0.12403631210327148, + "rewards/rejected": -0.7771934866905212, + "step": 722 + }, + { + "epoch": 0.08, + "learning_rate": 2.792110499824417e-07, + "logits/chosen": -2.9410150051116943, + "logits/rejected": -2.9808695316314697, + "logps/chosen": -338.3072204589844, + "logps/rejected": -221.28509521484375, + "loss": 0.5662, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06646937876939774, + "rewards/margins": 0.5223274827003479, + "rewards/rejected": -0.588796854019165, + "step": 723 + }, + { + "epoch": 0.08, + "learning_rate": 2.791759335128175e-07, + "logits/chosen": -3.389832019805908, + "logits/rejected": -3.607801914215088, + "logps/chosen": -184.72640991210938, + "logps/rejected": -191.76133728027344, + "loss": 0.4964, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13443441689014435, + "rewards/margins": 0.8097478151321411, + "rewards/rejected": -0.944182276725769, + "step": 724 + }, + { + "epoch": 0.08, + "learning_rate": 2.791408170431933e-07, + "logits/chosen": -3.7416253089904785, + "logits/rejected": -3.840014696121216, + "logps/chosen": -194.82476806640625, + "logps/rejected": -185.4194793701172, + "loss": 0.5016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45439350605010986, + "rewards/margins": 0.9588402509689331, + "rewards/rejected": -1.413233757019043, + "step": 725 + }, + { + "epoch": 0.08, + "learning_rate": 2.79105700573569e-07, + "logits/chosen": -3.260671854019165, + "logits/rejected": -3.1522653102874756, + "logps/chosen": -320.48919677734375, + "logps/rejected": -305.313720703125, + "loss": 0.4002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06286229938268661, + "rewards/margins": 0.8737356066703796, + "rewards/rejected": -0.9365978837013245, + "step": 726 + }, + { + "epoch": 0.08, + "learning_rate": 2.7907058410394473e-07, + "logits/chosen": -3.0532050132751465, + "logits/rejected": -3.4129347801208496, + "logps/chosen": -195.68765258789062, + "logps/rejected": -307.7918701171875, + "loss": 0.3589, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39316853880882263, + "rewards/margins": 1.3101814985275269, + "rewards/rejected": -0.9170130491256714, + "step": 727 + }, + { + "epoch": 0.08, + "learning_rate": 2.790354676343205e-07, + "logits/chosen": -2.940598487854004, + "logits/rejected": -3.1966962814331055, + "logps/chosen": -177.54449462890625, + "logps/rejected": -275.5166015625, + "loss": 0.4473, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1405424028635025, + "rewards/margins": 1.4285423755645752, + "rewards/rejected": -1.2879998683929443, + "step": 728 + }, + { + "epoch": 0.08, + "learning_rate": 2.7900035116469624e-07, + "logits/chosen": -3.1877589225769043, + "logits/rejected": -2.766432285308838, + "logps/chosen": -304.5781555175781, + "logps/rejected": -265.7191467285156, + "loss": 0.6039, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2719941735267639, + "rewards/margins": 0.4328400194644928, + "rewards/rejected": -0.7048341631889343, + "step": 729 + }, + { + "epoch": 0.08, + "learning_rate": 2.78965234695072e-07, + "logits/chosen": -3.326904773712158, + "logits/rejected": -3.3223938941955566, + "logps/chosen": -368.2452697753906, + "logps/rejected": -283.09893798828125, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4378724694252014, + "rewards/margins": 1.6837018728256226, + "rewards/rejected": -1.2458293437957764, + "step": 730 + }, + { + "epoch": 0.08, + "learning_rate": 2.789301182254477e-07, + "logits/chosen": -3.4446797370910645, + "logits/rejected": -3.1093382835388184, + "logps/chosen": -360.2107849121094, + "logps/rejected": -256.012451171875, + "loss": 0.4546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02461787313222885, + "rewards/margins": 0.7881884574890137, + "rewards/rejected": -0.8128063678741455, + "step": 731 + }, + { + "epoch": 0.08, + "learning_rate": 2.7889500175582345e-07, + "logits/chosen": -2.9737820625305176, + "logits/rejected": -2.883054256439209, + "logps/chosen": -416.96136474609375, + "logps/rejected": -240.3757781982422, + "loss": 0.6519, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05331142991781235, + "rewards/margins": 0.8406397104263306, + "rewards/rejected": -0.8939511775970459, + "step": 732 + }, + { + "epoch": 0.08, + "learning_rate": 2.788598852861992e-07, + "logits/chosen": -3.5305378437042236, + "logits/rejected": -3.084609031677246, + "logps/chosen": -303.6122741699219, + "logps/rejected": -183.35679626464844, + "loss": 0.5426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3534061312675476, + "rewards/margins": 0.3605599105358124, + "rewards/rejected": -0.7139660120010376, + "step": 733 + }, + { + "epoch": 0.08, + "learning_rate": 2.7882476881657496e-07, + "logits/chosen": -3.27215313911438, + "logits/rejected": -3.041658878326416, + "logps/chosen": -175.80950927734375, + "logps/rejected": -137.91656494140625, + "loss": 0.8296, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22282934188842773, + "rewards/margins": -0.014165259897708893, + "rewards/rejected": -0.20866408944129944, + "step": 734 + }, + { + "epoch": 0.08, + "learning_rate": 2.787896523469507e-07, + "logits/chosen": -2.7333507537841797, + "logits/rejected": -2.6591265201568604, + "logps/chosen": -206.57362365722656, + "logps/rejected": -254.91778564453125, + "loss": 0.4769, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24605610966682434, + "rewards/margins": 0.69692462682724, + "rewards/rejected": -0.45086848735809326, + "step": 735 + }, + { + "epoch": 0.08, + "learning_rate": 2.7875453587732646e-07, + "logits/chosen": -3.360931873321533, + "logits/rejected": -3.6438777446746826, + "logps/chosen": -224.9855499267578, + "logps/rejected": -209.10198974609375, + "loss": 0.284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.023202970623970032, + "rewards/margins": 1.805598258972168, + "rewards/rejected": -1.8288013935089111, + "step": 736 + }, + { + "epoch": 0.08, + "learning_rate": 2.787194194077022e-07, + "logits/chosen": -3.1314797401428223, + "logits/rejected": -3.0882620811462402, + "logps/chosen": -220.35833740234375, + "logps/rejected": -139.05125427246094, + "loss": 0.6041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17225147783756256, + "rewards/margins": 0.3240305483341217, + "rewards/rejected": -0.49628201127052307, + "step": 737 + }, + { + "epoch": 0.09, + "learning_rate": 2.7868430293807797e-07, + "logits/chosen": -3.308011054992676, + "logits/rejected": -3.3963537216186523, + "logps/chosen": -191.07635498046875, + "logps/rejected": -203.13839721679688, + "loss": 0.6503, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41069403290748596, + "rewards/margins": 0.56345534324646, + "rewards/rejected": -0.9741493463516235, + "step": 738 + }, + { + "epoch": 0.09, + "learning_rate": 2.7864918646845367e-07, + "logits/chosen": -3.430941104888916, + "logits/rejected": -3.525658369064331, + "logps/chosen": -92.60293579101562, + "logps/rejected": -176.57913208007812, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29544633626937866, + "rewards/margins": 1.10063898563385, + "rewards/rejected": -1.396085262298584, + "step": 739 + }, + { + "epoch": 0.09, + "learning_rate": 2.786140699988294e-07, + "logits/chosen": -3.457493305206299, + "logits/rejected": -3.5594327449798584, + "logps/chosen": -176.48135375976562, + "logps/rejected": -204.83287048339844, + "loss": 0.514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2704823613166809, + "rewards/margins": 0.49225959181785583, + "rewards/rejected": -0.7627419233322144, + "step": 740 + }, + { + "epoch": 0.09, + "learning_rate": 2.785789535292052e-07, + "logits/chosen": -3.106994152069092, + "logits/rejected": -3.050036907196045, + "logps/chosen": -262.15106201171875, + "logps/rejected": -221.78695678710938, + "loss": 0.3798, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23475639522075653, + "rewards/margins": 1.4000234603881836, + "rewards/rejected": -1.1652668714523315, + "step": 741 + }, + { + "epoch": 0.09, + "learning_rate": 2.7854383705958093e-07, + "logits/chosen": -2.7205018997192383, + "logits/rejected": -2.717792272567749, + "logps/chosen": -309.83148193359375, + "logps/rejected": -224.51214599609375, + "loss": 0.564, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23166640102863312, + "rewards/margins": 0.39010512828826904, + "rewards/rejected": -0.15843872725963593, + "step": 742 + }, + { + "epoch": 0.09, + "learning_rate": 2.785087205899567e-07, + "logits/chosen": -3.3515236377716064, + "logits/rejected": -3.4007985591888428, + "logps/chosen": -157.39659118652344, + "logps/rejected": -142.01329040527344, + "loss": 0.4733, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08005556464195251, + "rewards/margins": 0.6852942109107971, + "rewards/rejected": -0.605238676071167, + "step": 743 + }, + { + "epoch": 0.09, + "learning_rate": 2.7847360412033244e-07, + "logits/chosen": -3.5198566913604736, + "logits/rejected": -3.41941499710083, + "logps/chosen": -394.3213806152344, + "logps/rejected": -339.6938781738281, + "loss": 0.2811, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6189172267913818, + "rewards/margins": 1.63225519657135, + "rewards/rejected": -1.0133379697799683, + "step": 744 + }, + { + "epoch": 0.09, + "learning_rate": 2.7843848765070814e-07, + "logits/chosen": -2.3034324645996094, + "logits/rejected": -2.556891679763794, + "logps/chosen": -265.40338134765625, + "logps/rejected": -321.1072082519531, + "loss": 0.3482, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35625535249710083, + "rewards/margins": 1.2761567831039429, + "rewards/rejected": -0.9199013710021973, + "step": 745 + }, + { + "epoch": 0.09, + "learning_rate": 2.7840337118108395e-07, + "logits/chosen": -3.111315965652466, + "logits/rejected": -3.149352550506592, + "logps/chosen": -209.0286102294922, + "logps/rejected": -259.8973693847656, + "loss": 0.4053, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27739983797073364, + "rewards/margins": 1.4222211837768555, + "rewards/rejected": -1.6996210813522339, + "step": 746 + }, + { + "epoch": 0.09, + "learning_rate": 2.7836825471145965e-07, + "logits/chosen": -3.2193336486816406, + "logits/rejected": -3.359908103942871, + "logps/chosen": -318.5334167480469, + "logps/rejected": -292.0082702636719, + "loss": 0.2583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08901204913854599, + "rewards/margins": 1.8340709209442139, + "rewards/rejected": -1.923082947731018, + "step": 747 + }, + { + "epoch": 0.09, + "learning_rate": 2.783331382418354e-07, + "logits/chosen": -3.4616434574127197, + "logits/rejected": -3.588042736053467, + "logps/chosen": -270.0181884765625, + "logps/rejected": -267.8734130859375, + "loss": 0.5003, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2925337851047516, + "rewards/margins": 1.0193285942077637, + "rewards/rejected": -0.7267947793006897, + "step": 748 + }, + { + "epoch": 0.09, + "learning_rate": 2.7829802177221116e-07, + "logits/chosen": -2.8892769813537598, + "logits/rejected": -2.9554972648620605, + "logps/chosen": -434.30645751953125, + "logps/rejected": -344.25494384765625, + "loss": 0.5749, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3680606782436371, + "rewards/margins": 0.4565792381763458, + "rewards/rejected": -0.8246399164199829, + "step": 749 + }, + { + "epoch": 0.09, + "learning_rate": 2.782629053025869e-07, + "logits/chosen": -3.30849027633667, + "logits/rejected": -3.4568932056427, + "logps/chosen": -221.63856506347656, + "logps/rejected": -243.6094970703125, + "loss": 0.3395, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4739278256893158, + "rewards/margins": 1.5499975681304932, + "rewards/rejected": -1.076069712638855, + "step": 750 + }, + { + "epoch": 0.09, + "learning_rate": 2.7822778883296266e-07, + "logits/chosen": -3.2489686012268066, + "logits/rejected": -3.207798957824707, + "logps/chosen": -320.4010314941406, + "logps/rejected": -239.68475341796875, + "loss": 0.3593, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23952628672122955, + "rewards/margins": 1.6683579683303833, + "rewards/rejected": -1.428831696510315, + "step": 751 + }, + { + "epoch": 0.09, + "learning_rate": 2.7819267236333837e-07, + "logits/chosen": -2.9491050243377686, + "logits/rejected": -2.9428305625915527, + "logps/chosen": -303.6954345703125, + "logps/rejected": -190.91000366210938, + "loss": 0.5354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04244603216648102, + "rewards/margins": 0.6577991843223572, + "rewards/rejected": -0.700245201587677, + "step": 752 + }, + { + "epoch": 0.09, + "learning_rate": 2.781575558937141e-07, + "logits/chosen": -3.8878326416015625, + "logits/rejected": -3.841360092163086, + "logps/chosen": -234.3267059326172, + "logps/rejected": -201.7291259765625, + "loss": 0.6764, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09288483113050461, + "rewards/margins": 0.6000270247459412, + "rewards/rejected": -0.6929118037223816, + "step": 753 + }, + { + "epoch": 0.09, + "learning_rate": 2.7812243942408987e-07, + "logits/chosen": -2.64681077003479, + "logits/rejected": -2.630861520767212, + "logps/chosen": -214.25994873046875, + "logps/rejected": -235.4342498779297, + "loss": 0.4287, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15708193182945251, + "rewards/margins": 0.9682724475860596, + "rewards/rejected": -1.1253544092178345, + "step": 754 + }, + { + "epoch": 0.09, + "learning_rate": 2.7808732295446563e-07, + "logits/chosen": -2.664579391479492, + "logits/rejected": -2.812045097351074, + "logps/chosen": -235.59559631347656, + "logps/rejected": -349.33685302734375, + "loss": 0.4064, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20749445259571075, + "rewards/margins": 1.0535074472427368, + "rewards/rejected": -0.8460130095481873, + "step": 755 + }, + { + "epoch": 0.09, + "learning_rate": 2.780522064848414e-07, + "logits/chosen": -2.9259274005889893, + "logits/rejected": -3.1783413887023926, + "logps/chosen": -299.782470703125, + "logps/rejected": -205.81707763671875, + "loss": 0.2766, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4122794270515442, + "rewards/margins": 1.5904133319854736, + "rewards/rejected": -1.1781339645385742, + "step": 756 + }, + { + "epoch": 0.09, + "learning_rate": 2.7801709001521714e-07, + "logits/chosen": -3.840954303741455, + "logits/rejected": -3.6058359146118164, + "logps/chosen": -572.851318359375, + "logps/rejected": -389.96197509765625, + "loss": 0.4093, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19878225028514862, + "rewards/margins": 1.4587445259094238, + "rewards/rejected": -1.2599623203277588, + "step": 757 + }, + { + "epoch": 0.09, + "learning_rate": 2.779819735455929e-07, + "logits/chosen": -2.9931206703186035, + "logits/rejected": -3.1358203887939453, + "logps/chosen": -198.49517822265625, + "logps/rejected": -294.4404602050781, + "loss": 0.4641, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4661659598350525, + "rewards/margins": 0.8074597716331482, + "rewards/rejected": -1.2736257314682007, + "step": 758 + }, + { + "epoch": 0.09, + "learning_rate": 2.7794685707596864e-07, + "logits/chosen": -2.944542407989502, + "logits/rejected": -2.916090250015259, + "logps/chosen": -337.2884216308594, + "logps/rejected": -289.6893310546875, + "loss": 0.5873, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41707825660705566, + "rewards/margins": 0.585429847240448, + "rewards/rejected": -1.0025081634521484, + "step": 759 + }, + { + "epoch": 0.09, + "learning_rate": 2.7791174060634434e-07, + "logits/chosen": -2.286489725112915, + "logits/rejected": -2.2807350158691406, + "logps/chosen": -305.55633544921875, + "logps/rejected": -229.83505249023438, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4459977149963379, + "rewards/margins": 0.5031130909919739, + "rewards/rejected": -0.05711541324853897, + "step": 760 + }, + { + "epoch": 0.09, + "learning_rate": 2.778766241367201e-07, + "logits/chosen": -3.200603485107422, + "logits/rejected": -3.091647148132324, + "logps/chosen": -225.74549865722656, + "logps/rejected": -238.61561584472656, + "loss": 0.4729, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16022247076034546, + "rewards/margins": 0.7869634032249451, + "rewards/rejected": -0.6267409920692444, + "step": 761 + }, + { + "epoch": 0.09, + "learning_rate": 2.7784150766709585e-07, + "logits/chosen": -2.5710701942443848, + "logits/rejected": -2.792001724243164, + "logps/chosen": -530.7432250976562, + "logps/rejected": -633.0314331054688, + "loss": 0.4267, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0067253075540065765, + "rewards/margins": 1.1828887462615967, + "rewards/rejected": -1.1761634349822998, + "step": 762 + }, + { + "epoch": 0.09, + "learning_rate": 2.778063911974716e-07, + "logits/chosen": -2.9660983085632324, + "logits/rejected": -2.9491803646087646, + "logps/chosen": -362.95819091796875, + "logps/rejected": -312.9381103515625, + "loss": 0.5217, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3714291453361511, + "rewards/margins": 0.9204393625259399, + "rewards/rejected": -1.2918685674667358, + "step": 763 + }, + { + "epoch": 0.09, + "learning_rate": 2.7777127472784736e-07, + "logits/chosen": -2.3638367652893066, + "logits/rejected": -2.79617977142334, + "logps/chosen": -272.8638000488281, + "logps/rejected": -167.12689208984375, + "loss": 0.7992, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24792566895484924, + "rewards/margins": -0.1430899202823639, + "rewards/rejected": -0.10483574867248535, + "step": 764 + }, + { + "epoch": 0.09, + "learning_rate": 2.777361582582231e-07, + "logits/chosen": -2.7295498847961426, + "logits/rejected": -2.5084145069122314, + "logps/chosen": -169.173095703125, + "logps/rejected": -257.80841064453125, + "loss": 0.4239, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20656678080558777, + "rewards/margins": 1.0060412883758545, + "rewards/rejected": -0.7994745373725891, + "step": 765 + }, + { + "epoch": 0.09, + "learning_rate": 2.777010417885988e-07, + "logits/chosen": -2.94695782661438, + "logits/rejected": -3.0802979469299316, + "logps/chosen": -378.9036560058594, + "logps/rejected": -298.5536193847656, + "loss": 0.5195, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37140989303588867, + "rewards/margins": 0.9030638933181763, + "rewards/rejected": -1.274473786354065, + "step": 766 + }, + { + "epoch": 0.09, + "learning_rate": 2.7766592531897457e-07, + "logits/chosen": -2.8654913902282715, + "logits/rejected": -2.941662311553955, + "logps/chosen": -303.6979675292969, + "logps/rejected": -182.4124755859375, + "loss": 0.3754, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09726090729236603, + "rewards/margins": 1.0909184217453003, + "rewards/rejected": -0.9936575293540955, + "step": 767 + }, + { + "epoch": 0.09, + "learning_rate": 2.776308088493503e-07, + "logits/chosen": -2.7466251850128174, + "logits/rejected": -2.6715352535247803, + "logps/chosen": -521.7222290039062, + "logps/rejected": -291.4895324707031, + "loss": 0.4713, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0589589886367321, + "rewards/margins": 1.0211752653121948, + "rewards/rejected": -0.9622161984443665, + "step": 768 + }, + { + "epoch": 0.09, + "learning_rate": 2.775956923797261e-07, + "logits/chosen": -3.3884201049804688, + "logits/rejected": -3.282735586166382, + "logps/chosen": -343.5610656738281, + "logps/rejected": -191.45126342773438, + "loss": 0.649, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16660653054714203, + "rewards/margins": 0.46771326661109924, + "rewards/rejected": -0.6343197822570801, + "step": 769 + }, + { + "epoch": 0.09, + "learning_rate": 2.7756057591010183e-07, + "logits/chosen": -2.86013126373291, + "logits/rejected": -2.6618406772613525, + "logps/chosen": -291.9141845703125, + "logps/rejected": -374.5934753417969, + "loss": 0.4966, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05318525433540344, + "rewards/margins": 0.9221298694610596, + "rewards/rejected": -0.8689446449279785, + "step": 770 + }, + { + "epoch": 0.09, + "learning_rate": 2.775254594404776e-07, + "logits/chosen": -2.764878273010254, + "logits/rejected": -2.7415854930877686, + "logps/chosen": -265.850341796875, + "logps/rejected": -228.15701293945312, + "loss": 0.7045, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4472038447856903, + "rewards/margins": 0.4980015158653259, + "rewards/rejected": -0.9452053308486938, + "step": 771 + }, + { + "epoch": 0.09, + "learning_rate": 2.7749034297085334e-07, + "logits/chosen": -3.147306442260742, + "logits/rejected": -3.313295841217041, + "logps/chosen": -233.81837463378906, + "logps/rejected": -319.2016906738281, + "loss": 0.3005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.408542275428772, + "rewards/margins": 1.486729621887207, + "rewards/rejected": -1.0781872272491455, + "step": 772 + }, + { + "epoch": 0.09, + "learning_rate": 2.774552265012291e-07, + "logits/chosen": -2.7149100303649902, + "logits/rejected": -3.296889066696167, + "logps/chosen": -266.28759765625, + "logps/rejected": -227.50338745117188, + "loss": 0.4875, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14019201695919037, + "rewards/margins": 1.007714033126831, + "rewards/rejected": -0.8675219416618347, + "step": 773 + }, + { + "epoch": 0.09, + "learning_rate": 2.774201100316048e-07, + "logits/chosen": -3.036712646484375, + "logits/rejected": -2.9754140377044678, + "logps/chosen": -404.80889892578125, + "logps/rejected": -301.0320739746094, + "loss": 0.4199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09466972202062607, + "rewards/margins": 0.7925654649734497, + "rewards/rejected": -0.8872352242469788, + "step": 774 + }, + { + "epoch": 0.09, + "learning_rate": 2.7738499356198055e-07, + "logits/chosen": -2.9769539833068848, + "logits/rejected": -3.0576791763305664, + "logps/chosen": -347.3688659667969, + "logps/rejected": -181.9683837890625, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24589025974273682, + "rewards/margins": 0.09954608976840973, + "rewards/rejected": -0.34543633460998535, + "step": 775 + }, + { + "epoch": 0.09, + "learning_rate": 2.773498770923563e-07, + "logits/chosen": -2.6938247680664062, + "logits/rejected": -2.8581957817077637, + "logps/chosen": -228.93560791015625, + "logps/rejected": -282.1071472167969, + "loss": 0.28, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45766735076904297, + "rewards/margins": 2.0347132682800293, + "rewards/rejected": -1.5770459175109863, + "step": 776 + }, + { + "epoch": 0.09, + "learning_rate": 2.7731476062273205e-07, + "logits/chosen": -2.81034517288208, + "logits/rejected": -2.6404664516448975, + "logps/chosen": -343.42724609375, + "logps/rejected": -247.49749755859375, + "loss": 0.7535, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1953914314508438, + "rewards/margins": 0.09828576445579529, + "rewards/rejected": -0.2936772108078003, + "step": 777 + }, + { + "epoch": 0.09, + "learning_rate": 2.772796441531078e-07, + "logits/chosen": -3.4541571140289307, + "logits/rejected": -3.171201705932617, + "logps/chosen": -265.7227478027344, + "logps/rejected": -284.701171875, + "loss": 0.4372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10160556435585022, + "rewards/margins": 1.1893612146377563, + "rewards/rejected": -1.2909667491912842, + "step": 778 + }, + { + "epoch": 0.09, + "learning_rate": 2.772445276834835e-07, + "logits/chosen": -3.4178433418273926, + "logits/rejected": -2.9332752227783203, + "logps/chosen": -246.45094299316406, + "logps/rejected": -228.9134521484375, + "loss": 0.5666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0016984790563583374, + "rewards/margins": 0.6399407982826233, + "rewards/rejected": -0.6416392922401428, + "step": 779 + }, + { + "epoch": 0.09, + "learning_rate": 2.772094112138593e-07, + "logits/chosen": -3.0161616802215576, + "logits/rejected": -3.011470317840576, + "logps/chosen": -86.36643981933594, + "logps/rejected": -119.4278793334961, + "loss": 0.6144, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04824692755937576, + "rewards/margins": 0.3826948404312134, + "rewards/rejected": -0.43094176054000854, + "step": 780 + }, + { + "epoch": 0.09, + "learning_rate": 2.7717429474423507e-07, + "logits/chosen": -3.042189598083496, + "logits/rejected": -3.003848075866699, + "logps/chosen": -192.06727600097656, + "logps/rejected": -180.68637084960938, + "loss": 0.4963, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08379801362752914, + "rewards/margins": 0.6655247807502747, + "rewards/rejected": -0.5817267894744873, + "step": 781 + }, + { + "epoch": 0.09, + "learning_rate": 2.7713917827461077e-07, + "logits/chosen": -2.302798271179199, + "logits/rejected": -2.4597132205963135, + "logps/chosen": -172.348388671875, + "logps/rejected": -189.34625244140625, + "loss": 0.6826, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2909846007823944, + "rewards/margins": 0.4485825300216675, + "rewards/rejected": -0.7395671606063843, + "step": 782 + }, + { + "epoch": 0.09, + "learning_rate": 2.771040618049865e-07, + "logits/chosen": -2.9049015045166016, + "logits/rejected": -2.8664212226867676, + "logps/chosen": -287.1170349121094, + "logps/rejected": -319.918701171875, + "loss": 0.357, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17138461768627167, + "rewards/margins": 1.514103651046753, + "rewards/rejected": -1.3427191972732544, + "step": 783 + }, + { + "epoch": 0.09, + "learning_rate": 2.770689453353623e-07, + "logits/chosen": -2.58640456199646, + "logits/rejected": -2.814419746398926, + "logps/chosen": -158.6343994140625, + "logps/rejected": -145.18853759765625, + "loss": 0.6517, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.047204241156578064, + "rewards/margins": 0.2903803288936615, + "rewards/rejected": -0.33758458495140076, + "step": 784 + }, + { + "epoch": 0.09, + "learning_rate": 2.7703382886573803e-07, + "logits/chosen": -3.515599250793457, + "logits/rejected": -3.826673984527588, + "logps/chosen": -179.5092315673828, + "logps/rejected": -316.6686706542969, + "loss": 0.2825, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13998627662658691, + "rewards/margins": 1.968587875366211, + "rewards/rejected": -1.8286018371582031, + "step": 785 + }, + { + "epoch": 0.09, + "learning_rate": 2.769987123961138e-07, + "logits/chosen": -2.4906749725341797, + "logits/rejected": -2.799945831298828, + "logps/chosen": -432.70135498046875, + "logps/rejected": -237.13162231445312, + "loss": 0.3155, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4119276702404022, + "rewards/margins": 1.6757333278656006, + "rewards/rejected": -1.2638057470321655, + "step": 786 + }, + { + "epoch": 0.09, + "learning_rate": 2.769635959264895e-07, + "logits/chosen": -3.3350143432617188, + "logits/rejected": -2.9567699432373047, + "logps/chosen": -167.20494079589844, + "logps/rejected": -120.77100372314453, + "loss": 0.8804, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12553802132606506, + "rewards/margins": 0.09677297621965408, + "rewards/rejected": 0.02876507118344307, + "step": 787 + }, + { + "epoch": 0.09, + "learning_rate": 2.7692847945686524e-07, + "logits/chosen": -2.9183406829833984, + "logits/rejected": -2.7444400787353516, + "logps/chosen": -353.8623046875, + "logps/rejected": -377.4620361328125, + "loss": 0.4291, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07431145012378693, + "rewards/margins": 1.0330679416656494, + "rewards/rejected": -0.9587564468383789, + "step": 788 + }, + { + "epoch": 0.09, + "learning_rate": 2.76893362987241e-07, + "logits/chosen": -3.5005249977111816, + "logits/rejected": -3.4073243141174316, + "logps/chosen": -304.9727783203125, + "logps/rejected": -259.5607604980469, + "loss": 0.3121, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35121098160743713, + "rewards/margins": 1.8979530334472656, + "rewards/rejected": -1.5467422008514404, + "step": 789 + }, + { + "epoch": 0.09, + "learning_rate": 2.7685824651761675e-07, + "logits/chosen": -2.9321601390838623, + "logits/rejected": -3.0256450176239014, + "logps/chosen": -180.75299072265625, + "logps/rejected": -232.77297973632812, + "loss": 0.4668, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013980764895677567, + "rewards/margins": 1.9283115863800049, + "rewards/rejected": -1.9143307209014893, + "step": 790 + }, + { + "epoch": 0.09, + "learning_rate": 2.768231300479925e-07, + "logits/chosen": -2.7284204959869385, + "logits/rejected": -2.7354629039764404, + "logps/chosen": -211.25372314453125, + "logps/rejected": -270.4598388671875, + "loss": 0.4324, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17289772629737854, + "rewards/margins": 0.8530900478363037, + "rewards/rejected": -0.6801923513412476, + "step": 791 + }, + { + "epoch": 0.09, + "learning_rate": 2.7678801357836826e-07, + "logits/chosen": -2.7718188762664795, + "logits/rejected": -2.828396797180176, + "logps/chosen": -333.40509033203125, + "logps/rejected": -246.76522827148438, + "loss": 0.5144, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13157925009727478, + "rewards/margins": 0.6368228793144226, + "rewards/rejected": -0.5052436590194702, + "step": 792 + }, + { + "epoch": 0.09, + "learning_rate": 2.76752897108744e-07, + "logits/chosen": -3.1648764610290527, + "logits/rejected": -3.386141777038574, + "logps/chosen": -187.5034942626953, + "logps/rejected": -299.50665283203125, + "loss": 0.3324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35229629278182983, + "rewards/margins": 1.4573674201965332, + "rewards/rejected": -1.8096636533737183, + "step": 793 + }, + { + "epoch": 0.09, + "learning_rate": 2.7671778063911976e-07, + "logits/chosen": -2.5413899421691895, + "logits/rejected": -2.4490833282470703, + "logps/chosen": -193.41082763671875, + "logps/rejected": -264.7743225097656, + "loss": 0.4126, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2964949607849121, + "rewards/margins": 0.988365650177002, + "rewards/rejected": -0.6918706893920898, + "step": 794 + }, + { + "epoch": 0.09, + "learning_rate": 2.7668266416949546e-07, + "logits/chosen": -3.1381118297576904, + "logits/rejected": -3.390993595123291, + "logps/chosen": -166.88821411132812, + "logps/rejected": -233.92172241210938, + "loss": 0.3307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00010221544653177261, + "rewards/margins": 1.4514660835266113, + "rewards/rejected": -1.4513638019561768, + "step": 795 + }, + { + "epoch": 0.09, + "learning_rate": 2.766475476998712e-07, + "logits/chosen": -3.5331833362579346, + "logits/rejected": -3.3989603519439697, + "logps/chosen": -178.17892456054688, + "logps/rejected": -175.97079467773438, + "loss": 0.4038, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16475343704223633, + "rewards/margins": 1.1457151174545288, + "rewards/rejected": -0.9809616804122925, + "step": 796 + }, + { + "epoch": 0.09, + "learning_rate": 2.7661243123024697e-07, + "logits/chosen": -3.75699520111084, + "logits/rejected": -3.80007266998291, + "logps/chosen": -246.5064697265625, + "logps/rejected": -291.75048828125, + "loss": 0.4595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10673819482326508, + "rewards/margins": 0.8938522934913635, + "rewards/rejected": -1.0005903244018555, + "step": 797 + }, + { + "epoch": 0.09, + "learning_rate": 2.765773147606227e-07, + "logits/chosen": -2.968410015106201, + "logits/rejected": -3.3296732902526855, + "logps/chosen": -205.11862182617188, + "logps/rejected": -210.55772399902344, + "loss": 0.3367, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09104548394680023, + "rewards/margins": 1.931513786315918, + "rewards/rejected": -1.840468406677246, + "step": 798 + }, + { + "epoch": 0.09, + "learning_rate": 2.765421982909985e-07, + "logits/chosen": -3.4944396018981934, + "logits/rejected": -3.2029104232788086, + "logps/chosen": -191.5533447265625, + "logps/rejected": -205.69796752929688, + "loss": 0.3785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1512124389410019, + "rewards/margins": 1.1898620128631592, + "rewards/rejected": -1.3410743474960327, + "step": 799 + }, + { + "epoch": 0.09, + "learning_rate": 2.765070818213742e-07, + "logits/chosen": -3.664780616760254, + "logits/rejected": -3.68318510055542, + "logps/chosen": -230.27467346191406, + "logps/rejected": -275.4501037597656, + "loss": 0.2577, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15244176983833313, + "rewards/margins": 2.1232802867889404, + "rewards/rejected": -1.9708384275436401, + "step": 800 + }, + { + "epoch": 0.09, + "learning_rate": 2.7647196535174993e-07, + "logits/chosen": -3.46421480178833, + "logits/rejected": -3.676669120788574, + "logps/chosen": -209.22488403320312, + "logps/rejected": -247.3558349609375, + "loss": 0.4863, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1414693295955658, + "rewards/margins": 1.1148982048034668, + "rewards/rejected": -0.9734289050102234, + "step": 801 + }, + { + "epoch": 0.09, + "learning_rate": 2.7643684888212574e-07, + "logits/chosen": -3.051532745361328, + "logits/rejected": -2.6125741004943848, + "logps/chosen": -296.45037841796875, + "logps/rejected": -178.32009887695312, + "loss": 0.4827, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17951726913452148, + "rewards/margins": 0.6749148368835449, + "rewards/rejected": -0.49539750814437866, + "step": 802 + }, + { + "epoch": 0.09, + "learning_rate": 2.7640173241250144e-07, + "logits/chosen": -3.62080717086792, + "logits/rejected": -3.388254404067993, + "logps/chosen": -241.11851501464844, + "logps/rejected": -130.45437622070312, + "loss": 0.6223, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14625240862369537, + "rewards/margins": 0.4451746940612793, + "rewards/rejected": -0.29892224073410034, + "step": 803 + }, + { + "epoch": 0.09, + "learning_rate": 2.763666159428772e-07, + "logits/chosen": -3.0720181465148926, + "logits/rejected": -3.234790802001953, + "logps/chosen": -272.6170349121094, + "logps/rejected": -239.00579833984375, + "loss": 0.6243, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4109915494918823, + "rewards/margins": 0.5064557790756226, + "rewards/rejected": -0.9174473881721497, + "step": 804 + }, + { + "epoch": 0.09, + "learning_rate": 2.7633149947325295e-07, + "logits/chosen": -2.8254270553588867, + "logits/rejected": -2.8319807052612305, + "logps/chosen": -528.080810546875, + "logps/rejected": -239.9102783203125, + "loss": 0.4123, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0475018247961998, + "rewards/margins": 1.0276113748550415, + "rewards/rejected": -0.9801095724105835, + "step": 805 + }, + { + "epoch": 0.09, + "learning_rate": 2.762963830036287e-07, + "logits/chosen": -3.577845335006714, + "logits/rejected": -3.316768169403076, + "logps/chosen": -264.29925537109375, + "logps/rejected": -248.85610961914062, + "loss": 0.5777, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012349037453532219, + "rewards/margins": 0.6933183670043945, + "rewards/rejected": -0.6809692978858948, + "step": 806 + }, + { + "epoch": 0.09, + "learning_rate": 2.7626126653400446e-07, + "logits/chosen": -2.477569103240967, + "logits/rejected": -2.512051582336426, + "logps/chosen": -417.02008056640625, + "logps/rejected": -274.2406005859375, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21478520333766937, + "rewards/margins": 1.5594470500946045, + "rewards/rejected": -1.344661831855774, + "step": 807 + }, + { + "epoch": 0.09, + "learning_rate": 2.7622615006438016e-07, + "logits/chosen": -3.7279951572418213, + "logits/rejected": -3.8948373794555664, + "logps/chosen": -129.264892578125, + "logps/rejected": -267.4670104980469, + "loss": 0.507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2151148021221161, + "rewards/margins": 0.9731276631355286, + "rewards/rejected": -1.1882424354553223, + "step": 808 + }, + { + "epoch": 0.09, + "learning_rate": 2.761910335947559e-07, + "logits/chosen": -2.6491005420684814, + "logits/rejected": -2.611477851867676, + "logps/chosen": -612.55419921875, + "logps/rejected": -386.42523193359375, + "loss": 0.2611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08794021606445312, + "rewards/margins": 1.4910738468170166, + "rewards/rejected": -1.4031336307525635, + "step": 809 + }, + { + "epoch": 0.09, + "learning_rate": 2.7615591712513167e-07, + "logits/chosen": -3.9967260360717773, + "logits/rejected": -3.5841431617736816, + "logps/chosen": -333.91021728515625, + "logps/rejected": -238.244873046875, + "loss": 0.4027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3397977352142334, + "rewards/margins": 1.2786295413970947, + "rewards/rejected": -1.6184272766113281, + "step": 810 + }, + { + "epoch": 0.09, + "learning_rate": 2.761208006555074e-07, + "logits/chosen": -2.7773990631103516, + "logits/rejected": -2.9145822525024414, + "logps/chosen": -298.7373046875, + "logps/rejected": -240.71343994140625, + "loss": 0.4287, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04489287734031677, + "rewards/margins": 0.9263416528701782, + "rewards/rejected": -0.8814487457275391, + "step": 811 + }, + { + "epoch": 0.09, + "learning_rate": 2.7608568418588317e-07, + "logits/chosen": -2.893343448638916, + "logits/rejected": -2.9592838287353516, + "logps/chosen": -276.739013671875, + "logps/rejected": -235.43222045898438, + "loss": 0.3445, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05172871798276901, + "rewards/margins": 1.1782557964324951, + "rewards/rejected": -1.126527190208435, + "step": 812 + }, + { + "epoch": 0.09, + "learning_rate": 2.760505677162589e-07, + "logits/chosen": -3.150068759918213, + "logits/rejected": -3.132845401763916, + "logps/chosen": -199.677001953125, + "logps/rejected": -185.10208129882812, + "loss": 0.4059, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07125253975391388, + "rewards/margins": 1.0133061408996582, + "rewards/rejected": -1.0845587253570557, + "step": 813 + }, + { + "epoch": 0.09, + "learning_rate": 2.760154512466347e-07, + "logits/chosen": -2.9751601219177246, + "logits/rejected": -3.251437187194824, + "logps/chosen": -136.62762451171875, + "logps/rejected": -210.20529174804688, + "loss": 0.4626, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09316956251859665, + "rewards/margins": 1.0490405559539795, + "rewards/rejected": -0.9558709859848022, + "step": 814 + }, + { + "epoch": 0.09, + "learning_rate": 2.7598033477701044e-07, + "logits/chosen": -2.8333122730255127, + "logits/rejected": -2.6702632904052734, + "logps/chosen": -253.98123168945312, + "logps/rejected": -329.1234130859375, + "loss": 0.6227, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02935744822025299, + "rewards/margins": 0.8620047569274902, + "rewards/rejected": -0.8326473832130432, + "step": 815 + }, + { + "epoch": 0.09, + "learning_rate": 2.7594521830738614e-07, + "logits/chosen": -3.494313955307007, + "logits/rejected": -3.568282127380371, + "logps/chosen": -188.5488739013672, + "logps/rejected": -340.9733581542969, + "loss": 0.7583, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7036993503570557, + "rewards/margins": -0.027393028140068054, + "rewards/rejected": -0.6763062477111816, + "step": 816 + }, + { + "epoch": 0.09, + "learning_rate": 2.759101018377619e-07, + "logits/chosen": -3.356037139892578, + "logits/rejected": -3.520613431930542, + "logps/chosen": -255.93846130371094, + "logps/rejected": -216.89398193359375, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1564173698425293, + "rewards/margins": 1.223031759262085, + "rewards/rejected": -1.0666143894195557, + "step": 817 + }, + { + "epoch": 0.09, + "learning_rate": 2.7587498536813764e-07, + "logits/chosen": -2.651693105697632, + "logits/rejected": -2.681769847869873, + "logps/chosen": -238.81459045410156, + "logps/rejected": -188.87985229492188, + "loss": 0.7093, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2916595935821533, + "rewards/margins": 0.1870458424091339, + "rewards/rejected": -0.478705495595932, + "step": 818 + }, + { + "epoch": 0.09, + "learning_rate": 2.758398688985134e-07, + "logits/chosen": -3.432954788208008, + "logits/rejected": -3.379977226257324, + "logps/chosen": -304.18353271484375, + "logps/rejected": -181.3824005126953, + "loss": 0.3633, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45107027888298035, + "rewards/margins": 1.181410551071167, + "rewards/rejected": -0.730340301990509, + "step": 819 + }, + { + "epoch": 0.09, + "learning_rate": 2.7580475242888915e-07, + "logits/chosen": -3.4822726249694824, + "logits/rejected": -3.2820496559143066, + "logps/chosen": -168.00473022460938, + "logps/rejected": -212.29330444335938, + "loss": 0.3012, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3096015155315399, + "rewards/margins": 1.4323790073394775, + "rewards/rejected": -1.1227775812149048, + "step": 820 + }, + { + "epoch": 0.09, + "learning_rate": 2.7576963595926485e-07, + "logits/chosen": -2.952803373336792, + "logits/rejected": -3.2074875831604004, + "logps/chosen": -200.18057250976562, + "logps/rejected": -173.583984375, + "loss": 0.5475, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14085188508033752, + "rewards/margins": 0.47980016469955444, + "rewards/rejected": -0.6206520795822144, + "step": 821 + }, + { + "epoch": 0.09, + "learning_rate": 2.757345194896406e-07, + "logits/chosen": -4.037409782409668, + "logits/rejected": -3.581698417663574, + "logps/chosen": -215.97024536132812, + "logps/rejected": -261.63427734375, + "loss": 0.6184, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34472501277923584, + "rewards/margins": 0.3228113055229187, + "rewards/rejected": -0.6675362586975098, + "step": 822 + }, + { + "epoch": 0.09, + "learning_rate": 2.7569940302001636e-07, + "logits/chosen": -3.261975049972534, + "logits/rejected": -3.4368786811828613, + "logps/chosen": -180.62832641601562, + "logps/rejected": -250.05682373046875, + "loss": 0.478, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21257230639457703, + "rewards/margins": 1.1826269626617432, + "rewards/rejected": -0.970054566860199, + "step": 823 + }, + { + "epoch": 0.09, + "learning_rate": 2.756642865503921e-07, + "logits/chosen": -3.1896753311157227, + "logits/rejected": -2.911609172821045, + "logps/chosen": -180.71978759765625, + "logps/rejected": -230.10848999023438, + "loss": 0.3973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03788638114929199, + "rewards/margins": 1.1634998321533203, + "rewards/rejected": -1.2013862133026123, + "step": 824 + }, + { + "epoch": 0.1, + "learning_rate": 2.7562917008076787e-07, + "logits/chosen": -2.711021900177002, + "logits/rejected": -2.6782000064849854, + "logps/chosen": -178.26597595214844, + "logps/rejected": -254.74911499023438, + "loss": 0.3755, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09842272102832794, + "rewards/margins": 1.270041584968567, + "rewards/rejected": -1.3684642314910889, + "step": 825 + }, + { + "epoch": 0.1, + "learning_rate": 2.755940536111436e-07, + "logits/chosen": -2.9815597534179688, + "logits/rejected": -3.1267921924591064, + "logps/chosen": -253.5839080810547, + "logps/rejected": -224.4720458984375, + "loss": 0.5343, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1619645357131958, + "rewards/margins": 0.6192668080329895, + "rewards/rejected": -0.7812313437461853, + "step": 826 + }, + { + "epoch": 0.1, + "learning_rate": 2.755589371415194e-07, + "logits/chosen": -3.3388805389404297, + "logits/rejected": -3.301466464996338, + "logps/chosen": -185.37667846679688, + "logps/rejected": -427.2044677734375, + "loss": 0.726, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13455341756343842, + "rewards/margins": 0.20535269379615784, + "rewards/rejected": -0.33990606665611267, + "step": 827 + }, + { + "epoch": 0.1, + "learning_rate": 2.7552382067189513e-07, + "logits/chosen": -1.9774956703186035, + "logits/rejected": -1.982587456703186, + "logps/chosen": -225.5425567626953, + "logps/rejected": -163.74658203125, + "loss": 0.5317, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3739159405231476, + "rewards/margins": 0.5552279949188232, + "rewards/rejected": -0.18131211400032043, + "step": 828 + }, + { + "epoch": 0.1, + "learning_rate": 2.7548870420227083e-07, + "logits/chosen": -2.386810302734375, + "logits/rejected": -2.4659695625305176, + "logps/chosen": -403.2242431640625, + "logps/rejected": -334.8184814453125, + "loss": 0.6501, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06281670928001404, + "rewards/margins": 0.6998395323753357, + "rewards/rejected": -0.637022852897644, + "step": 829 + }, + { + "epoch": 0.1, + "learning_rate": 2.754535877326466e-07, + "logits/chosen": -3.265871047973633, + "logits/rejected": -3.0587384700775146, + "logps/chosen": -373.1357727050781, + "logps/rejected": -258.71319580078125, + "loss": 0.7587, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2828088700771332, + "rewards/margins": 0.06885270774364471, + "rewards/rejected": -0.3516615927219391, + "step": 830 + }, + { + "epoch": 0.1, + "learning_rate": 2.7541847126302234e-07, + "logits/chosen": -3.139171838760376, + "logits/rejected": -2.7939915657043457, + "logps/chosen": -218.7184295654297, + "logps/rejected": -242.91561889648438, + "loss": 0.8487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45756590366363525, + "rewards/margins": 1.143376350402832, + "rewards/rejected": -1.6009422540664673, + "step": 831 + }, + { + "epoch": 0.1, + "learning_rate": 2.753833547933981e-07, + "logits/chosen": -3.090639591217041, + "logits/rejected": -3.123727321624756, + "logps/chosen": -301.5442199707031, + "logps/rejected": -285.9490661621094, + "loss": 0.578, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04124303162097931, + "rewards/margins": 0.6752254366874695, + "rewards/rejected": -0.7164684534072876, + "step": 832 + }, + { + "epoch": 0.1, + "learning_rate": 2.7534823832377385e-07, + "logits/chosen": -3.0568671226501465, + "logits/rejected": -3.0110726356506348, + "logps/chosen": -183.74142456054688, + "logps/rejected": -130.54478454589844, + "loss": 0.5414, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14824596047401428, + "rewards/margins": 0.8876832723617554, + "rewards/rejected": -0.7394372820854187, + "step": 833 + }, + { + "epoch": 0.1, + "learning_rate": 2.753131218541496e-07, + "logits/chosen": -2.9822592735290527, + "logits/rejected": -2.7962350845336914, + "logps/chosen": -286.7266845703125, + "logps/rejected": -265.45428466796875, + "loss": 0.4306, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18657168745994568, + "rewards/margins": 1.0591834783554077, + "rewards/rejected": -0.8726117610931396, + "step": 834 + }, + { + "epoch": 0.1, + "learning_rate": 2.752780053845253e-07, + "logits/chosen": -3.128429651260376, + "logits/rejected": -2.7803685665130615, + "logps/chosen": -256.7389831542969, + "logps/rejected": -218.9199676513672, + "loss": 0.6355, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1512812227010727, + "rewards/margins": 0.4312949478626251, + "rewards/rejected": -0.5825761556625366, + "step": 835 + }, + { + "epoch": 0.1, + "learning_rate": 2.752428889149011e-07, + "logits/chosen": -2.8643195629119873, + "logits/rejected": -2.8368594646453857, + "logps/chosen": -246.2068634033203, + "logps/rejected": -355.43646240234375, + "loss": 0.4653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08422783017158508, + "rewards/margins": 1.0881870985031128, + "rewards/rejected": -1.1724148988723755, + "step": 836 + }, + { + "epoch": 0.1, + "learning_rate": 2.752077724452768e-07, + "logits/chosen": -3.3321077823638916, + "logits/rejected": -3.1884727478027344, + "logps/chosen": -169.97579956054688, + "logps/rejected": -481.00616455078125, + "loss": 0.2769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13322196900844574, + "rewards/margins": 2.267456531524658, + "rewards/rejected": -2.4006783962249756, + "step": 837 + }, + { + "epoch": 0.1, + "learning_rate": 2.7517265597565256e-07, + "logits/chosen": -3.2844486236572266, + "logits/rejected": -3.2791526317596436, + "logps/chosen": -167.28054809570312, + "logps/rejected": -237.37396240234375, + "loss": 0.5303, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38232889771461487, + "rewards/margins": 1.035386562347412, + "rewards/rejected": -0.6530575752258301, + "step": 838 + }, + { + "epoch": 0.1, + "learning_rate": 2.751375395060283e-07, + "logits/chosen": -2.8910608291625977, + "logits/rejected": -2.490525722503662, + "logps/chosen": -239.01239013671875, + "logps/rejected": -161.4678192138672, + "loss": 0.5792, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18761992454528809, + "rewards/margins": 0.32470637559890747, + "rewards/rejected": -0.5123263001441956, + "step": 839 + }, + { + "epoch": 0.1, + "learning_rate": 2.7510242303640407e-07, + "logits/chosen": -3.2921488285064697, + "logits/rejected": -3.3097188472747803, + "logps/chosen": -295.316650390625, + "logps/rejected": -115.58242797851562, + "loss": 0.4937, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04165259748697281, + "rewards/margins": 0.5823967456817627, + "rewards/rejected": -0.6240493655204773, + "step": 840 + }, + { + "epoch": 0.1, + "learning_rate": 2.750673065667798e-07, + "logits/chosen": -2.654162645339966, + "logits/rejected": -2.853571891784668, + "logps/chosen": -246.6444091796875, + "logps/rejected": -366.7325134277344, + "loss": 0.5326, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4231809377670288, + "rewards/margins": 0.7851053476333618, + "rewards/rejected": -1.2082862854003906, + "step": 841 + }, + { + "epoch": 0.1, + "learning_rate": 2.750321900971556e-07, + "logits/chosen": -3.2783055305480957, + "logits/rejected": -3.299618721008301, + "logps/chosen": -117.82460021972656, + "logps/rejected": -244.18411254882812, + "loss": 0.4121, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24455642700195312, + "rewards/margins": 1.2014939785003662, + "rewards/rejected": -0.9569374918937683, + "step": 842 + }, + { + "epoch": 0.1, + "learning_rate": 2.749970736275313e-07, + "logits/chosen": -2.9774930477142334, + "logits/rejected": -2.6807808876037598, + "logps/chosen": -351.3978576660156, + "logps/rejected": -279.03704833984375, + "loss": 0.8492, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23986348509788513, + "rewards/margins": -0.16272865235805511, + "rewards/rejected": -0.07713484764099121, + "step": 843 + }, + { + "epoch": 0.1, + "learning_rate": 2.7496195715790703e-07, + "logits/chosen": -2.1806719303131104, + "logits/rejected": -2.1184065341949463, + "logps/chosen": -358.91851806640625, + "logps/rejected": -235.81985473632812, + "loss": 0.4973, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.027127843350172043, + "rewards/margins": 0.9274944067001343, + "rewards/rejected": -0.9003665447235107, + "step": 844 + }, + { + "epoch": 0.1, + "learning_rate": 2.749268406882828e-07, + "logits/chosen": -2.5226049423217773, + "logits/rejected": -2.5325746536254883, + "logps/chosen": -367.41192626953125, + "logps/rejected": -411.62506103515625, + "loss": 0.4258, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27732980251312256, + "rewards/margins": 1.2960268259048462, + "rewards/rejected": -1.018696904182434, + "step": 845 + }, + { + "epoch": 0.1, + "learning_rate": 2.7489172421865854e-07, + "logits/chosen": -4.083926200866699, + "logits/rejected": -3.8236048221588135, + "logps/chosen": -106.34597778320312, + "logps/rejected": -89.45291137695312, + "loss": 0.7906, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.29099392890930176, + "rewards/margins": -0.10117463767528534, + "rewards/rejected": -0.18981926143169403, + "step": 846 + }, + { + "epoch": 0.1, + "learning_rate": 2.748566077490343e-07, + "logits/chosen": -3.0409655570983887, + "logits/rejected": -3.036008358001709, + "logps/chosen": -146.0890350341797, + "logps/rejected": -230.4691925048828, + "loss": 0.4155, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3017933666706085, + "rewards/margins": 1.3907811641693115, + "rewards/rejected": -1.0889878273010254, + "step": 847 + }, + { + "epoch": 0.1, + "learning_rate": 2.7482149127941005e-07, + "logits/chosen": -3.3585784435272217, + "logits/rejected": -3.127389907836914, + "logps/chosen": -310.48931884765625, + "logps/rejected": -283.6368408203125, + "loss": 0.5293, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39480113983154297, + "rewards/margins": 0.7650704383850098, + "rewards/rejected": -1.1598716974258423, + "step": 848 + }, + { + "epoch": 0.1, + "learning_rate": 2.747863748097858e-07, + "logits/chosen": -2.850364923477173, + "logits/rejected": -2.8961315155029297, + "logps/chosen": -255.3311309814453, + "logps/rejected": -333.34375, + "loss": 0.933, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19613291323184967, + "rewards/margins": 0.05653604865074158, + "rewards/rejected": -0.25266897678375244, + "step": 849 + }, + { + "epoch": 0.1, + "learning_rate": 2.747512583401615e-07, + "logits/chosen": -2.9712603092193604, + "logits/rejected": -3.238640546798706, + "logps/chosen": -202.01333618164062, + "logps/rejected": -219.466796875, + "loss": 0.7301, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2153870016336441, + "rewards/margins": 0.3994392156600952, + "rewards/rejected": -0.6148262619972229, + "step": 850 + }, + { + "epoch": 0.1, + "learning_rate": 2.7471614187053726e-07, + "logits/chosen": -3.9397530555725098, + "logits/rejected": -3.958695650100708, + "logps/chosen": -234.10787963867188, + "logps/rejected": -176.3972930908203, + "loss": 0.2293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007220447063446045, + "rewards/margins": 1.64689302444458, + "rewards/rejected": -1.6541134119033813, + "step": 851 + }, + { + "epoch": 0.1, + "learning_rate": 2.74681025400913e-07, + "logits/chosen": -3.089216709136963, + "logits/rejected": -3.2197213172912598, + "logps/chosen": -311.3764343261719, + "logps/rejected": -236.69052124023438, + "loss": 0.5464, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2225361466407776, + "rewards/margins": 0.6139265894889832, + "rewards/rejected": -0.8364627361297607, + "step": 852 + }, + { + "epoch": 0.1, + "learning_rate": 2.7464590893128876e-07, + "logits/chosen": -3.5604469776153564, + "logits/rejected": -3.5403032302856445, + "logps/chosen": -192.53421020507812, + "logps/rejected": -211.67202758789062, + "loss": 0.3748, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1529182344675064, + "rewards/margins": 1.1420307159423828, + "rewards/rejected": -0.9891124963760376, + "step": 853 + }, + { + "epoch": 0.1, + "learning_rate": 2.746107924616645e-07, + "logits/chosen": -2.6424708366394043, + "logits/rejected": -2.614380359649658, + "logps/chosen": -266.469482421875, + "logps/rejected": -240.80807495117188, + "loss": 0.4944, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21823687851428986, + "rewards/margins": 0.7156004905700684, + "rewards/rejected": -0.9338374137878418, + "step": 854 + }, + { + "epoch": 0.1, + "learning_rate": 2.7457567599204027e-07, + "logits/chosen": -3.251129150390625, + "logits/rejected": -3.387636423110962, + "logps/chosen": -175.33871459960938, + "logps/rejected": -150.60842895507812, + "loss": 0.5048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012763511389493942, + "rewards/margins": 1.0121045112609863, + "rewards/rejected": -1.0248678922653198, + "step": 855 + }, + { + "epoch": 0.1, + "learning_rate": 2.7454055952241597e-07, + "logits/chosen": -2.779937744140625, + "logits/rejected": -2.8106565475463867, + "logps/chosen": -113.65443420410156, + "logps/rejected": -220.91098022460938, + "loss": 0.464, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08553160727024078, + "rewards/margins": 1.1640958786010742, + "rewards/rejected": -1.2496274709701538, + "step": 856 + }, + { + "epoch": 0.1, + "learning_rate": 2.745054430527917e-07, + "logits/chosen": -2.8897969722747803, + "logits/rejected": -2.5848641395568848, + "logps/chosen": -314.1836242675781, + "logps/rejected": -306.003173828125, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28883516788482666, + "rewards/margins": 1.6706345081329346, + "rewards/rejected": -1.3817994594573975, + "step": 857 + }, + { + "epoch": 0.1, + "learning_rate": 2.744703265831675e-07, + "logits/chosen": -3.357424736022949, + "logits/rejected": -3.068399667739868, + "logps/chosen": -221.11459350585938, + "logps/rejected": -202.184326171875, + "loss": 0.3669, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06499813497066498, + "rewards/margins": 1.5424292087554932, + "rewards/rejected": -1.4774311780929565, + "step": 858 + }, + { + "epoch": 0.1, + "learning_rate": 2.7443521011354323e-07, + "logits/chosen": -2.8036627769470215, + "logits/rejected": -3.0134940147399902, + "logps/chosen": -296.6165466308594, + "logps/rejected": -218.00172424316406, + "loss": 0.4799, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07972222566604614, + "rewards/margins": 0.7851074934005737, + "rewards/rejected": -0.7053853273391724, + "step": 859 + }, + { + "epoch": 0.1, + "learning_rate": 2.74400093643919e-07, + "logits/chosen": -3.318068265914917, + "logits/rejected": -3.22570538520813, + "logps/chosen": -304.4339599609375, + "logps/rejected": -217.77716064453125, + "loss": 0.4289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3415123224258423, + "rewards/margins": 0.7825182676315308, + "rewards/rejected": -1.124030590057373, + "step": 860 + }, + { + "epoch": 0.1, + "learning_rate": 2.7436497717429474e-07, + "logits/chosen": -2.879946708679199, + "logits/rejected": -2.668354034423828, + "logps/chosen": -247.2313232421875, + "logps/rejected": -378.1593017578125, + "loss": 0.4276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41900256276130676, + "rewards/margins": 0.9543185830116272, + "rewards/rejected": -1.3733210563659668, + "step": 861 + }, + { + "epoch": 0.1, + "learning_rate": 2.743298607046705e-07, + "logits/chosen": -3.0926265716552734, + "logits/rejected": -3.1871871948242188, + "logps/chosen": -268.2755126953125, + "logps/rejected": -205.275146484375, + "loss": 0.4498, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2651245594024658, + "rewards/margins": 0.7635625600814819, + "rewards/rejected": -0.49843794107437134, + "step": 862 + }, + { + "epoch": 0.1, + "learning_rate": 2.7429474423504625e-07, + "logits/chosen": -2.900203227996826, + "logits/rejected": -3.3673105239868164, + "logps/chosen": -234.1897430419922, + "logps/rejected": -260.4333190917969, + "loss": 0.3427, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29174286127090454, + "rewards/margins": 2.3300509452819824, + "rewards/rejected": -2.0383083820343018, + "step": 863 + }, + { + "epoch": 0.1, + "learning_rate": 2.7425962776542195e-07, + "logits/chosen": -2.6148953437805176, + "logits/rejected": -2.6465742588043213, + "logps/chosen": -392.43109130859375, + "logps/rejected": -368.469970703125, + "loss": 0.4, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09190759807825089, + "rewards/margins": 1.0077214241027832, + "rewards/rejected": -0.9158138036727905, + "step": 864 + }, + { + "epoch": 0.1, + "learning_rate": 2.742245112957977e-07, + "logits/chosen": -3.818584680557251, + "logits/rejected": -3.352694511413574, + "logps/chosen": -294.89739990234375, + "logps/rejected": -255.57516479492188, + "loss": 0.3533, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1025727316737175, + "rewards/margins": 1.863452434539795, + "rewards/rejected": -1.9660251140594482, + "step": 865 + }, + { + "epoch": 0.1, + "learning_rate": 2.7418939482617346e-07, + "logits/chosen": -3.4445462226867676, + "logits/rejected": -2.934802532196045, + "logps/chosen": -610.8291015625, + "logps/rejected": -206.37393188476562, + "loss": 0.4809, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3013654351234436, + "rewards/margins": 0.7586274147033691, + "rewards/rejected": -1.059992790222168, + "step": 866 + }, + { + "epoch": 0.1, + "learning_rate": 2.741542783565492e-07, + "logits/chosen": -3.7697229385375977, + "logits/rejected": -3.583829402923584, + "logps/chosen": -315.19659423828125, + "logps/rejected": -248.25814819335938, + "loss": 0.8446, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4225854277610779, + "rewards/margins": 0.012507237493991852, + "rewards/rejected": -0.43509265780448914, + "step": 867 + }, + { + "epoch": 0.1, + "learning_rate": 2.7411916188692497e-07, + "logits/chosen": -3.731266975402832, + "logits/rejected": -3.948967456817627, + "logps/chosen": -89.87312316894531, + "logps/rejected": -165.8406524658203, + "loss": 0.5956, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06773865222930908, + "rewards/margins": 0.342643141746521, + "rewards/rejected": -0.41038182377815247, + "step": 868 + }, + { + "epoch": 0.1, + "learning_rate": 2.7408404541730067e-07, + "logits/chosen": -2.8064169883728027, + "logits/rejected": -2.845860242843628, + "logps/chosen": -443.707763671875, + "logps/rejected": -338.8929748535156, + "loss": 0.59, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14651471376419067, + "rewards/margins": 0.48660218715667725, + "rewards/rejected": -0.6331169009208679, + "step": 869 + }, + { + "epoch": 0.1, + "learning_rate": 2.740489289476765e-07, + "logits/chosen": -2.7667996883392334, + "logits/rejected": -3.015601873397827, + "logps/chosen": -425.88690185546875, + "logps/rejected": -338.3902893066406, + "loss": 0.5371, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13016009330749512, + "rewards/margins": 0.8180822134017944, + "rewards/rejected": -0.9482423067092896, + "step": 870 + }, + { + "epoch": 0.1, + "learning_rate": 2.7401381247805223e-07, + "logits/chosen": -2.478245973587036, + "logits/rejected": -2.573784828186035, + "logps/chosen": -361.21142578125, + "logps/rejected": -276.9137878417969, + "loss": 0.4985, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12616321444511414, + "rewards/margins": 0.7055253386497498, + "rewards/rejected": -0.579362154006958, + "step": 871 + }, + { + "epoch": 0.1, + "learning_rate": 2.7397869600842793e-07, + "logits/chosen": -2.9991977214813232, + "logits/rejected": -3.1077451705932617, + "logps/chosen": -291.0771484375, + "logps/rejected": -259.90863037109375, + "loss": 0.5261, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08637861907482147, + "rewards/margins": 0.5141648054122925, + "rewards/rejected": -0.4277861714363098, + "step": 872 + }, + { + "epoch": 0.1, + "learning_rate": 2.739435795388037e-07, + "logits/chosen": -3.3164827823638916, + "logits/rejected": -3.3662006855010986, + "logps/chosen": -166.0374298095703, + "logps/rejected": -248.99966430664062, + "loss": 0.7578, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12946414947509766, + "rewards/margins": -0.05476903170347214, + "rewards/rejected": -0.07469511032104492, + "step": 873 + }, + { + "epoch": 0.1, + "learning_rate": 2.7390846306917944e-07, + "logits/chosen": -3.2020163536071777, + "logits/rejected": -3.1777071952819824, + "logps/chosen": -154.885498046875, + "logps/rejected": -153.17678833007812, + "loss": 0.9752, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.11419603228569031, + "rewards/margins": -0.42291784286499023, + "rewards/rejected": 0.3087218403816223, + "step": 874 + }, + { + "epoch": 0.1, + "learning_rate": 2.738733465995552e-07, + "logits/chosen": -3.5250675678253174, + "logits/rejected": -3.100283622741699, + "logps/chosen": -140.64231872558594, + "logps/rejected": -166.44606018066406, + "loss": 0.3511, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19421644508838654, + "rewards/margins": 1.327646255493164, + "rewards/rejected": -1.133429765701294, + "step": 875 + }, + { + "epoch": 0.1, + "learning_rate": 2.7383823012993094e-07, + "logits/chosen": -3.2447712421417236, + "logits/rejected": -3.0207910537719727, + "logps/chosen": -496.02557373046875, + "logps/rejected": -273.6380310058594, + "loss": 0.3026, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4117898941040039, + "rewards/margins": 1.565908432006836, + "rewards/rejected": -1.154118537902832, + "step": 876 + }, + { + "epoch": 0.1, + "learning_rate": 2.7380311366030664e-07, + "logits/chosen": -3.3602166175842285, + "logits/rejected": -3.648688793182373, + "logps/chosen": -256.692626953125, + "logps/rejected": -211.4127655029297, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3914310038089752, + "rewards/margins": 0.5668737888336182, + "rewards/rejected": -0.958304762840271, + "step": 877 + }, + { + "epoch": 0.1, + "learning_rate": 2.737679971906824e-07, + "logits/chosen": -3.647803783416748, + "logits/rejected": -3.6295180320739746, + "logps/chosen": -208.53182983398438, + "logps/rejected": -188.347900390625, + "loss": 0.2912, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13901200890541077, + "rewards/margins": 1.7613424062728882, + "rewards/rejected": -1.6223305463790894, + "step": 878 + }, + { + "epoch": 0.1, + "learning_rate": 2.737328807210582e-07, + "logits/chosen": -2.697908878326416, + "logits/rejected": -2.2430458068847656, + "logps/chosen": -354.37591552734375, + "logps/rejected": -455.57562255859375, + "loss": 0.5494, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15349599719047546, + "rewards/margins": 1.0928208827972412, + "rewards/rejected": -0.9393249750137329, + "step": 879 + }, + { + "epoch": 0.1, + "learning_rate": 2.736977642514339e-07, + "logits/chosen": -3.46490478515625, + "logits/rejected": -3.2499845027923584, + "logps/chosen": -328.3086853027344, + "logps/rejected": -312.980712890625, + "loss": 0.3316, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37826910614967346, + "rewards/margins": 1.6406892538070679, + "rewards/rejected": -1.2624201774597168, + "step": 880 + }, + { + "epoch": 0.1, + "learning_rate": 2.7366264778180966e-07, + "logits/chosen": -2.9198622703552246, + "logits/rejected": -2.7697479724884033, + "logps/chosen": -210.04257202148438, + "logps/rejected": -166.7701873779297, + "loss": 0.582, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05798802897334099, + "rewards/margins": 0.3633895814418793, + "rewards/rejected": -0.3054015636444092, + "step": 881 + }, + { + "epoch": 0.1, + "learning_rate": 2.736275313121854e-07, + "logits/chosen": -3.456423044204712, + "logits/rejected": -3.374634027481079, + "logps/chosen": -201.5541534423828, + "logps/rejected": -162.4180908203125, + "loss": 0.5269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02390441484749317, + "rewards/margins": 0.8680941462516785, + "rewards/rejected": -0.8919986486434937, + "step": 882 + }, + { + "epoch": 0.1, + "learning_rate": 2.7359241484256117e-07, + "logits/chosen": -2.9666905403137207, + "logits/rejected": -3.176607370376587, + "logps/chosen": -269.659912109375, + "logps/rejected": -325.2682189941406, + "loss": 0.4769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.002189110964536667, + "rewards/margins": 1.018014907836914, + "rewards/rejected": -1.0202040672302246, + "step": 883 + }, + { + "epoch": 0.1, + "learning_rate": 2.735572983729369e-07, + "logits/chosen": -3.3751349449157715, + "logits/rejected": -3.492903232574463, + "logps/chosen": -308.1128234863281, + "logps/rejected": -337.5738525390625, + "loss": 0.2612, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3474319577217102, + "rewards/margins": 2.1836965084075928, + "rewards/rejected": -1.8362646102905273, + "step": 884 + }, + { + "epoch": 0.1, + "learning_rate": 2.735221819033126e-07, + "logits/chosen": -2.870685577392578, + "logits/rejected": -2.6847031116485596, + "logps/chosen": -263.741455078125, + "logps/rejected": -230.78762817382812, + "loss": 0.4276, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4300898015499115, + "rewards/margins": 0.8930739164352417, + "rewards/rejected": -0.4629840850830078, + "step": 885 + }, + { + "epoch": 0.1, + "learning_rate": 2.734870654336884e-07, + "logits/chosen": -3.4636075496673584, + "logits/rejected": -3.4705357551574707, + "logps/chosen": -268.2343444824219, + "logps/rejected": -314.6069030761719, + "loss": 0.3378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06484910845756531, + "rewards/margins": 1.4335392713546753, + "rewards/rejected": -1.498388409614563, + "step": 886 + }, + { + "epoch": 0.1, + "learning_rate": 2.7345194896406413e-07, + "logits/chosen": -3.4642715454101562, + "logits/rejected": -3.288006544113159, + "logps/chosen": -168.78045654296875, + "logps/rejected": -226.36146545410156, + "loss": 0.5041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09172104299068451, + "rewards/margins": 1.519975185394287, + "rewards/rejected": -1.6116962432861328, + "step": 887 + }, + { + "epoch": 0.1, + "learning_rate": 2.734168324944399e-07, + "logits/chosen": -3.482382297515869, + "logits/rejected": -3.510183095932007, + "logps/chosen": -225.66082763671875, + "logps/rejected": -179.1909637451172, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04460671544075012, + "rewards/margins": 0.8375352025032043, + "rewards/rejected": -0.7929285168647766, + "step": 888 + }, + { + "epoch": 0.1, + "learning_rate": 2.7338171602481564e-07, + "logits/chosen": -3.683042526245117, + "logits/rejected": -3.3677713871002197, + "logps/chosen": -290.6557922363281, + "logps/rejected": -243.47109985351562, + "loss": 0.4793, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3878090977668762, + "rewards/margins": 1.1898540258407593, + "rewards/rejected": -1.5776631832122803, + "step": 889 + }, + { + "epoch": 0.1, + "learning_rate": 2.7334659955519134e-07, + "logits/chosen": -3.787473678588867, + "logits/rejected": -3.6991310119628906, + "logps/chosen": -175.05210876464844, + "logps/rejected": -180.95370483398438, + "loss": 0.418, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.010378304868936539, + "rewards/margins": 1.3296396732330322, + "rewards/rejected": -1.3192614316940308, + "step": 890 + }, + { + "epoch": 0.1, + "learning_rate": 2.733114830855671e-07, + "logits/chosen": -2.6952500343322754, + "logits/rejected": -2.800529718399048, + "logps/chosen": -235.1688995361328, + "logps/rejected": -286.77069091796875, + "loss": 0.4325, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2818378508090973, + "rewards/margins": 0.9569096565246582, + "rewards/rejected": -0.6750718355178833, + "step": 891 + }, + { + "epoch": 0.1, + "learning_rate": 2.732763666159429e-07, + "logits/chosen": -3.5838582515716553, + "logits/rejected": -3.304550886154175, + "logps/chosen": -289.6060791015625, + "logps/rejected": -213.3480224609375, + "loss": 0.7233, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07249242067337036, + "rewards/margins": 0.8512861728668213, + "rewards/rejected": -0.9237786531448364, + "step": 892 + }, + { + "epoch": 0.1, + "learning_rate": 2.732412501463186e-07, + "logits/chosen": -3.2545435428619385, + "logits/rejected": -3.603292465209961, + "logps/chosen": -293.82000732421875, + "logps/rejected": -291.3116149902344, + "loss": 0.3847, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29506242275238037, + "rewards/margins": 1.4451227188110352, + "rewards/rejected": -1.1500602960586548, + "step": 893 + }, + { + "epoch": 0.1, + "learning_rate": 2.7320613367669435e-07, + "logits/chosen": -3.7829883098602295, + "logits/rejected": -3.668048858642578, + "logps/chosen": -237.40284729003906, + "logps/rejected": -257.0686340332031, + "loss": 0.2857, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2788296937942505, + "rewards/margins": 1.8316324949264526, + "rewards/rejected": -1.5528028011322021, + "step": 894 + }, + { + "epoch": 0.1, + "learning_rate": 2.731710172070701e-07, + "logits/chosen": -2.9281060695648193, + "logits/rejected": -2.9708497524261475, + "logps/chosen": -242.07452392578125, + "logps/rejected": -432.1681823730469, + "loss": 0.2136, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2294134646654129, + "rewards/margins": 2.5995736122131348, + "rewards/rejected": -2.3701601028442383, + "step": 895 + }, + { + "epoch": 0.1, + "learning_rate": 2.7313590073744586e-07, + "logits/chosen": -2.279395580291748, + "logits/rejected": -2.5472590923309326, + "logps/chosen": -320.8092956542969, + "logps/rejected": -230.53469848632812, + "loss": 0.5916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1299741566181183, + "rewards/margins": 0.3151235282421112, + "rewards/rejected": -0.4450976848602295, + "step": 896 + }, + { + "epoch": 0.1, + "learning_rate": 2.731007842678216e-07, + "logits/chosen": -3.36965012550354, + "logits/rejected": -3.397615909576416, + "logps/chosen": -161.10597229003906, + "logps/rejected": -167.104736328125, + "loss": 0.8153, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006314871832728386, + "rewards/margins": 0.4814962148666382, + "rewards/rejected": -0.4751812815666199, + "step": 897 + }, + { + "epoch": 0.1, + "learning_rate": 2.730656677981973e-07, + "logits/chosen": -3.173219680786133, + "logits/rejected": -3.1868748664855957, + "logps/chosen": -223.11569213867188, + "logps/rejected": -240.66107177734375, + "loss": 0.6403, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48031431436538696, + "rewards/margins": 0.7474995851516724, + "rewards/rejected": -1.2278138399124146, + "step": 898 + }, + { + "epoch": 0.1, + "learning_rate": 2.7303055132857307e-07, + "logits/chosen": -3.792707920074463, + "logits/rejected": -3.7691166400909424, + "logps/chosen": -180.13307189941406, + "logps/rejected": -255.35284423828125, + "loss": 0.3052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39928966760635376, + "rewards/margins": 1.502578616142273, + "rewards/rejected": -1.1032888889312744, + "step": 899 + }, + { + "epoch": 0.1, + "learning_rate": 2.729954348589488e-07, + "logits/chosen": -3.1248369216918945, + "logits/rejected": -3.1994898319244385, + "logps/chosen": -235.48703002929688, + "logps/rejected": -257.3819580078125, + "loss": 0.559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27769044041633606, + "rewards/margins": 1.2040843963623047, + "rewards/rejected": -1.481774926185608, + "step": 900 + }, + { + "epoch": 0.1, + "learning_rate": 2.729603183893246e-07, + "logits/chosen": -3.0169053077697754, + "logits/rejected": -3.217195510864258, + "logps/chosen": -356.239013671875, + "logps/rejected": -248.81185913085938, + "loss": 0.4039, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07552237808704376, + "rewards/margins": 1.4441397190093994, + "rewards/rejected": -1.368617296218872, + "step": 901 + }, + { + "epoch": 0.1, + "learning_rate": 2.7292520191970033e-07, + "logits/chosen": -3.0921874046325684, + "logits/rejected": -3.1441752910614014, + "logps/chosen": -246.675048828125, + "logps/rejected": -227.9263916015625, + "loss": 0.3748, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21944963932037354, + "rewards/margins": 1.3254164457321167, + "rewards/rejected": -1.1059668064117432, + "step": 902 + }, + { + "epoch": 0.1, + "learning_rate": 2.7289008545007603e-07, + "logits/chosen": -2.5496089458465576, + "logits/rejected": -2.510307788848877, + "logps/chosen": -278.2304382324219, + "logps/rejected": -248.73326110839844, + "loss": 0.2501, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06262153387069702, + "rewards/margins": 1.8063157796859741, + "rewards/rejected": -1.7436943054199219, + "step": 903 + }, + { + "epoch": 0.1, + "learning_rate": 2.7285496898045184e-07, + "logits/chosen": -3.1528491973876953, + "logits/rejected": -2.5254809856414795, + "logps/chosen": -306.5439453125, + "logps/rejected": -283.7752380371094, + "loss": 0.6391, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5247014760971069, + "rewards/margins": 0.3074069917201996, + "rewards/rejected": -0.8321084976196289, + "step": 904 + }, + { + "epoch": 0.1, + "learning_rate": 2.728198525108276e-07, + "logits/chosen": -3.4172539710998535, + "logits/rejected": -3.4588940143585205, + "logps/chosen": -246.3363494873047, + "logps/rejected": -179.74822998046875, + "loss": 0.3563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08095742762088776, + "rewards/margins": 1.3279359340667725, + "rewards/rejected": -1.40889310836792, + "step": 905 + }, + { + "epoch": 0.1, + "learning_rate": 2.727847360412033e-07, + "logits/chosen": -3.248837471008301, + "logits/rejected": -3.330338954925537, + "logps/chosen": -115.96588134765625, + "logps/rejected": -240.95388793945312, + "loss": 0.3741, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1359207034111023, + "rewards/margins": 1.4903448820114136, + "rewards/rejected": -1.354424238204956, + "step": 906 + }, + { + "epoch": 0.1, + "learning_rate": 2.7274961957157905e-07, + "logits/chosen": -3.1344480514526367, + "logits/rejected": -3.188601493835449, + "logps/chosen": -265.72772216796875, + "logps/rejected": -432.5203857421875, + "loss": 0.3431, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.004061520099639893, + "rewards/margins": 1.6172616481781006, + "rewards/rejected": -1.6131999492645264, + "step": 907 + }, + { + "epoch": 0.1, + "learning_rate": 2.727145031019548e-07, + "logits/chosen": -2.7339181900024414, + "logits/rejected": -2.841093063354492, + "logps/chosen": -196.99000549316406, + "logps/rejected": -217.36041259765625, + "loss": 0.6587, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14531797170639038, + "rewards/margins": 0.36278384923934937, + "rewards/rejected": -0.21746587753295898, + "step": 908 + }, + { + "epoch": 0.1, + "learning_rate": 2.7267938663233056e-07, + "logits/chosen": -2.3694746494293213, + "logits/rejected": -2.571016311645508, + "logps/chosen": -303.3848876953125, + "logps/rejected": -279.5934143066406, + "loss": 0.7169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14588168263435364, + "rewards/margins": 0.15638813376426697, + "rewards/rejected": -0.302269846200943, + "step": 909 + }, + { + "epoch": 0.1, + "learning_rate": 2.726442701627063e-07, + "logits/chosen": -2.7573957443237305, + "logits/rejected": -2.4185047149658203, + "logps/chosen": -405.59478759765625, + "logps/rejected": -390.17626953125, + "loss": 0.3234, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0948440432548523, + "rewards/margins": 1.5815706253051758, + "rewards/rejected": -1.4867265224456787, + "step": 910 + }, + { + "epoch": 0.11, + "learning_rate": 2.72609153693082e-07, + "logits/chosen": -1.918601632118225, + "logits/rejected": -2.0897252559661865, + "logps/chosen": -403.3478088378906, + "logps/rejected": -339.4903869628906, + "loss": 0.3581, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29311293363571167, + "rewards/margins": 1.606999397277832, + "rewards/rejected": -1.3138864040374756, + "step": 911 + }, + { + "epoch": 0.11, + "learning_rate": 2.7257403722345776e-07, + "logits/chosen": -3.1630916595458984, + "logits/rejected": -3.2855231761932373, + "logps/chosen": -163.98687744140625, + "logps/rejected": -200.6642303466797, + "loss": 0.4939, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004030302166938782, + "rewards/margins": 1.1513805389404297, + "rewards/rejected": -1.1473501920700073, + "step": 912 + }, + { + "epoch": 0.11, + "learning_rate": 2.7253892075383357e-07, + "logits/chosen": -3.1772027015686035, + "logits/rejected": -2.925394058227539, + "logps/chosen": -384.686279296875, + "logps/rejected": -375.16949462890625, + "loss": 0.634, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3023124933242798, + "rewards/margins": 0.5147613883018494, + "rewards/rejected": -0.8170739412307739, + "step": 913 + }, + { + "epoch": 0.11, + "learning_rate": 2.7250380428420927e-07, + "logits/chosen": -2.9498748779296875, + "logits/rejected": -2.8009510040283203, + "logps/chosen": -205.6226348876953, + "logps/rejected": -242.76422119140625, + "loss": 0.5356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21075215935707092, + "rewards/margins": 0.7407183647155762, + "rewards/rejected": -0.9514705538749695, + "step": 914 + }, + { + "epoch": 0.11, + "learning_rate": 2.7246868781458503e-07, + "logits/chosen": -3.302330493927002, + "logits/rejected": -3.010176658630371, + "logps/chosen": -420.05255126953125, + "logps/rejected": -299.21136474609375, + "loss": 0.4446, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2245606631040573, + "rewards/margins": 1.1491971015930176, + "rewards/rejected": -0.9246364831924438, + "step": 915 + }, + { + "epoch": 0.11, + "learning_rate": 2.724335713449608e-07, + "logits/chosen": -2.8898839950561523, + "logits/rejected": -3.015110731124878, + "logps/chosen": -287.5570373535156, + "logps/rejected": -335.2170104980469, + "loss": 0.2438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01172279566526413, + "rewards/margins": 2.0249836444854736, + "rewards/rejected": -2.0367064476013184, + "step": 916 + }, + { + "epoch": 0.11, + "learning_rate": 2.7239845487533653e-07, + "logits/chosen": -3.9990854263305664, + "logits/rejected": -3.486722469329834, + "logps/chosen": -389.77490234375, + "logps/rejected": -188.4969024658203, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.025478176772594452, + "rewards/margins": 2.083963394165039, + "rewards/rejected": -2.0584850311279297, + "step": 917 + }, + { + "epoch": 0.11, + "learning_rate": 2.723633384057123e-07, + "logits/chosen": -2.306894540786743, + "logits/rejected": -2.420416831970215, + "logps/chosen": -440.85333251953125, + "logps/rejected": -234.17715454101562, + "loss": 0.6276, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07106418907642365, + "rewards/margins": 0.3188053369522095, + "rewards/rejected": -0.24774114787578583, + "step": 918 + }, + { + "epoch": 0.11, + "learning_rate": 2.72328221936088e-07, + "logits/chosen": -2.4436309337615967, + "logits/rejected": -2.366974353790283, + "logps/chosen": -336.9437561035156, + "logps/rejected": -248.24087524414062, + "loss": 0.704, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29611319303512573, + "rewards/margins": 0.31762832403182983, + "rewards/rejected": -0.6137415170669556, + "step": 919 + }, + { + "epoch": 0.11, + "learning_rate": 2.7229310546646374e-07, + "logits/chosen": -3.344815492630005, + "logits/rejected": -3.564399242401123, + "logps/chosen": -174.67694091796875, + "logps/rejected": -193.72999572753906, + "loss": 0.2455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4308107793331146, + "rewards/margins": 1.627485752105713, + "rewards/rejected": -1.1966749429702759, + "step": 920 + }, + { + "epoch": 0.11, + "learning_rate": 2.722579889968395e-07, + "logits/chosen": -3.5176708698272705, + "logits/rejected": -3.2607157230377197, + "logps/chosen": -300.05047607421875, + "logps/rejected": -294.6630859375, + "loss": 0.6199, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43917638063430786, + "rewards/margins": 0.5336997509002686, + "rewards/rejected": -0.9728761911392212, + "step": 921 + }, + { + "epoch": 0.11, + "learning_rate": 2.7222287252721525e-07, + "logits/chosen": -2.4959073066711426, + "logits/rejected": -2.5892107486724854, + "logps/chosen": -199.0511474609375, + "logps/rejected": -280.9212646484375, + "loss": 0.8191, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36172452569007874, + "rewards/margins": -0.06297557055950165, + "rewards/rejected": -0.2987489700317383, + "step": 922 + }, + { + "epoch": 0.11, + "learning_rate": 2.72187756057591e-07, + "logits/chosen": -3.241739273071289, + "logits/rejected": -3.280200958251953, + "logps/chosen": -300.1468200683594, + "logps/rejected": -202.93215942382812, + "loss": 0.3672, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11721419543027878, + "rewards/margins": 1.380391240119934, + "rewards/rejected": -1.2631769180297852, + "step": 923 + }, + { + "epoch": 0.11, + "learning_rate": 2.7215263958796676e-07, + "logits/chosen": -2.7592411041259766, + "logits/rejected": -2.867053747177124, + "logps/chosen": -206.97164916992188, + "logps/rejected": -286.4378662109375, + "loss": 0.5792, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12773418426513672, + "rewards/margins": 0.702996015548706, + "rewards/rejected": -0.5752618312835693, + "step": 924 + }, + { + "epoch": 0.11, + "learning_rate": 2.7211752311834246e-07, + "logits/chosen": -3.023634672164917, + "logits/rejected": -3.0151944160461426, + "logps/chosen": -167.0782928466797, + "logps/rejected": -216.24655151367188, + "loss": 0.4429, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39342784881591797, + "rewards/margins": 1.2722269296646118, + "rewards/rejected": -0.8787990808486938, + "step": 925 + }, + { + "epoch": 0.11, + "learning_rate": 2.7208240664871827e-07, + "logits/chosen": -2.6129889488220215, + "logits/rejected": -2.501737594604492, + "logps/chosen": -444.32354736328125, + "logps/rejected": -505.006591796875, + "loss": 0.4616, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3884381055831909, + "rewards/margins": 0.7828586101531982, + "rewards/rejected": -0.3944205343723297, + "step": 926 + }, + { + "epoch": 0.11, + "learning_rate": 2.7204729017909397e-07, + "logits/chosen": -3.0094833374023438, + "logits/rejected": -2.943114757537842, + "logps/chosen": -360.1683654785156, + "logps/rejected": -304.1644287109375, + "loss": 0.7426, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3677533268928528, + "rewards/margins": 0.00048439204692840576, + "rewards/rejected": -0.36823770403862, + "step": 927 + }, + { + "epoch": 0.11, + "learning_rate": 2.720121737094697e-07, + "logits/chosen": -3.487783432006836, + "logits/rejected": -3.646026134490967, + "logps/chosen": -183.87905883789062, + "logps/rejected": -213.95388793945312, + "loss": 0.4562, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08492925018072128, + "rewards/margins": 1.190995693206787, + "rewards/rejected": -1.2759250402450562, + "step": 928 + }, + { + "epoch": 0.11, + "learning_rate": 2.719770572398455e-07, + "logits/chosen": -3.632976531982422, + "logits/rejected": -3.083171844482422, + "logps/chosen": -386.4703369140625, + "logps/rejected": -171.87962341308594, + "loss": 0.3675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01657090336084366, + "rewards/margins": 1.3287577629089355, + "rewards/rejected": -1.3121867179870605, + "step": 929 + }, + { + "epoch": 0.11, + "learning_rate": 2.7194194077022123e-07, + "logits/chosen": -3.493985652923584, + "logits/rejected": -3.3654446601867676, + "logps/chosen": -154.06573486328125, + "logps/rejected": -172.19325256347656, + "loss": 0.343, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11701072752475739, + "rewards/margins": 1.1649894714355469, + "rewards/rejected": -1.0479786396026611, + "step": 930 + }, + { + "epoch": 0.11, + "learning_rate": 2.71906824300597e-07, + "logits/chosen": -3.280346393585205, + "logits/rejected": -3.0646750926971436, + "logps/chosen": -302.97088623046875, + "logps/rejected": -255.5186767578125, + "loss": 0.6315, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18493299186229706, + "rewards/margins": 0.6485589742660522, + "rewards/rejected": -0.8334920406341553, + "step": 931 + }, + { + "epoch": 0.11, + "learning_rate": 2.7187170783097274e-07, + "logits/chosen": -3.14422607421875, + "logits/rejected": -3.1548752784729004, + "logps/chosen": -259.3439025878906, + "logps/rejected": -301.1274108886719, + "loss": 0.3588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05870581790804863, + "rewards/margins": 1.8426764011383057, + "rewards/rejected": -1.9013820886611938, + "step": 932 + }, + { + "epoch": 0.11, + "learning_rate": 2.7183659136134844e-07, + "logits/chosen": -2.8842153549194336, + "logits/rejected": -2.8962459564208984, + "logps/chosen": -154.29705810546875, + "logps/rejected": -243.53192138671875, + "loss": 0.3651, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09773492813110352, + "rewards/margins": 1.3322856426239014, + "rewards/rejected": -1.2345507144927979, + "step": 933 + }, + { + "epoch": 0.11, + "learning_rate": 2.718014748917242e-07, + "logits/chosen": -2.6701111793518066, + "logits/rejected": -2.807131290435791, + "logps/chosen": -218.81524658203125, + "logps/rejected": -278.7330017089844, + "loss": 0.5134, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23717649281024933, + "rewards/margins": 1.611145257949829, + "rewards/rejected": -1.848321795463562, + "step": 934 + }, + { + "epoch": 0.11, + "learning_rate": 2.7176635842209994e-07, + "logits/chosen": -2.975229024887085, + "logits/rejected": -2.8885738849639893, + "logps/chosen": -122.40269470214844, + "logps/rejected": -128.49169921875, + "loss": 0.5268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09790345281362534, + "rewards/margins": 0.7106057405471802, + "rewards/rejected": -0.8085091710090637, + "step": 935 + }, + { + "epoch": 0.11, + "learning_rate": 2.717312419524757e-07, + "logits/chosen": -3.4507875442504883, + "logits/rejected": -3.414834499359131, + "logps/chosen": -243.28448486328125, + "logps/rejected": -265.5135803222656, + "loss": 0.2805, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5943059921264648, + "rewards/margins": 1.720097541809082, + "rewards/rejected": -1.1257915496826172, + "step": 936 + }, + { + "epoch": 0.11, + "learning_rate": 2.7169612548285145e-07, + "logits/chosen": -2.889873504638672, + "logits/rejected": -2.9832043647766113, + "logps/chosen": -197.5057373046875, + "logps/rejected": -114.00299835205078, + "loss": 0.5697, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16511191427707672, + "rewards/margins": 0.49695950746536255, + "rewards/rejected": -0.6620713472366333, + "step": 937 + }, + { + "epoch": 0.11, + "learning_rate": 2.716610090132272e-07, + "logits/chosen": -3.812404155731201, + "logits/rejected": -3.8224549293518066, + "logps/chosen": -139.3220672607422, + "logps/rejected": -162.1681365966797, + "loss": 0.4036, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0451909676194191, + "rewards/margins": 1.1646888256072998, + "rewards/rejected": -1.2098798751831055, + "step": 938 + }, + { + "epoch": 0.11, + "learning_rate": 2.7162589254360296e-07, + "logits/chosen": -3.9208450317382812, + "logits/rejected": -3.644503116607666, + "logps/chosen": -429.77001953125, + "logps/rejected": -298.3420104980469, + "loss": 0.761, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8798959851264954, + "rewards/margins": 0.1864919811487198, + "rewards/rejected": -1.0663880109786987, + "step": 939 + }, + { + "epoch": 0.11, + "learning_rate": 2.715907760739787e-07, + "logits/chosen": -2.138636589050293, + "logits/rejected": -2.1230974197387695, + "logps/chosen": -284.8170166015625, + "logps/rejected": -295.7038269042969, + "loss": 0.5185, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1885736733675003, + "rewards/margins": 0.983759880065918, + "rewards/rejected": -1.1723335981369019, + "step": 940 + }, + { + "epoch": 0.11, + "learning_rate": 2.715556596043544e-07, + "logits/chosen": -3.553351402282715, + "logits/rejected": -3.8664088249206543, + "logps/chosen": -247.04934692382812, + "logps/rejected": -362.0994567871094, + "loss": 0.4512, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23752403259277344, + "rewards/margins": 1.1279785633087158, + "rewards/rejected": -1.3655025959014893, + "step": 941 + }, + { + "epoch": 0.11, + "learning_rate": 2.7152054313473017e-07, + "logits/chosen": -2.975712299346924, + "logits/rejected": -3.1567800045013428, + "logps/chosen": -306.24237060546875, + "logps/rejected": -264.3021240234375, + "loss": 0.4559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09428466856479645, + "rewards/margins": 1.0892207622528076, + "rewards/rejected": -1.1835055351257324, + "step": 942 + }, + { + "epoch": 0.11, + "learning_rate": 2.714854266651059e-07, + "logits/chosen": -3.493198871612549, + "logits/rejected": -3.3508262634277344, + "logps/chosen": -302.9748229980469, + "logps/rejected": -291.0611267089844, + "loss": 0.5418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.351968377828598, + "rewards/margins": 1.118449091911316, + "rewards/rejected": -1.4704174995422363, + "step": 943 + }, + { + "epoch": 0.11, + "learning_rate": 2.714503101954817e-07, + "logits/chosen": -3.0290277004241943, + "logits/rejected": -3.262327194213867, + "logps/chosen": -270.17156982421875, + "logps/rejected": -286.8800354003906, + "loss": 0.652, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09464433789253235, + "rewards/margins": 1.1609529256820679, + "rewards/rejected": -1.066308617591858, + "step": 944 + }, + { + "epoch": 0.11, + "learning_rate": 2.7141519372585743e-07, + "logits/chosen": -2.8107359409332275, + "logits/rejected": -2.7836945056915283, + "logps/chosen": -263.7593688964844, + "logps/rejected": -237.64352416992188, + "loss": 0.5396, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.025854304432868958, + "rewards/margins": 1.1632884740829468, + "rewards/rejected": -1.1891427040100098, + "step": 945 + }, + { + "epoch": 0.11, + "learning_rate": 2.7138007725623313e-07, + "logits/chosen": -2.8193705081939697, + "logits/rejected": -2.9974308013916016, + "logps/chosen": -303.5420837402344, + "logps/rejected": -171.15150451660156, + "loss": 0.6617, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16644184291362762, + "rewards/margins": 0.5049163103103638, + "rewards/rejected": -0.3384745121002197, + "step": 946 + }, + { + "epoch": 0.11, + "learning_rate": 2.7134496078660894e-07, + "logits/chosen": -3.325385093688965, + "logits/rejected": -3.2183837890625, + "logps/chosen": -197.38986206054688, + "logps/rejected": -238.8984832763672, + "loss": 0.3043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14208994805812836, + "rewards/margins": 1.0950318574905396, + "rewards/rejected": -0.9529418349266052, + "step": 947 + }, + { + "epoch": 0.11, + "learning_rate": 2.7130984431698464e-07, + "logits/chosen": -3.369166612625122, + "logits/rejected": -3.4006786346435547, + "logps/chosen": -157.16506958007812, + "logps/rejected": -160.9402313232422, + "loss": 0.4501, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14649298787117004, + "rewards/margins": 1.0613946914672852, + "rewards/rejected": -1.2078876495361328, + "step": 948 + }, + { + "epoch": 0.11, + "learning_rate": 2.712747278473604e-07, + "logits/chosen": -3.165745735168457, + "logits/rejected": -3.417111873626709, + "logps/chosen": -222.613525390625, + "logps/rejected": -263.4788513183594, + "loss": 0.5029, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18515644967556, + "rewards/margins": 1.022705078125, + "rewards/rejected": -1.2078614234924316, + "step": 949 + }, + { + "epoch": 0.11, + "learning_rate": 2.7123961137773615e-07, + "logits/chosen": -3.3367128372192383, + "logits/rejected": -3.596122980117798, + "logps/chosen": -325.55694580078125, + "logps/rejected": -176.81243896484375, + "loss": 0.5394, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6319919228553772, + "rewards/margins": 0.41301268339157104, + "rewards/rejected": -1.0450046062469482, + "step": 950 + }, + { + "epoch": 0.11, + "learning_rate": 2.712044949081119e-07, + "logits/chosen": -3.745450019836426, + "logits/rejected": -3.401707887649536, + "logps/chosen": -331.63543701171875, + "logps/rejected": -164.2527618408203, + "loss": 0.2693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1420428454875946, + "rewards/margins": 1.5322678089141846, + "rewards/rejected": -1.390224814414978, + "step": 951 + }, + { + "epoch": 0.11, + "learning_rate": 2.7116937843848765e-07, + "logits/chosen": -2.7094602584838867, + "logits/rejected": -2.7347474098205566, + "logps/chosen": -455.28558349609375, + "logps/rejected": -379.8451232910156, + "loss": 0.4634, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03492278605699539, + "rewards/margins": 0.9114267230033875, + "rewards/rejected": -0.9463495016098022, + "step": 952 + }, + { + "epoch": 0.11, + "learning_rate": 2.711342619688634e-07, + "logits/chosen": -3.1910560131073, + "logits/rejected": -2.6987390518188477, + "logps/chosen": -333.8016662597656, + "logps/rejected": -243.1864013671875, + "loss": 0.322, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13402239978313446, + "rewards/margins": 1.1437492370605469, + "rewards/rejected": -1.2777715921401978, + "step": 953 + }, + { + "epoch": 0.11, + "learning_rate": 2.710991454992391e-07, + "logits/chosen": -2.849750518798828, + "logits/rejected": -2.8113839626312256, + "logps/chosen": -252.40562438964844, + "logps/rejected": -261.11541748046875, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15758399665355682, + "rewards/margins": 0.8513338565826416, + "rewards/rejected": -1.0089179277420044, + "step": 954 + }, + { + "epoch": 0.11, + "learning_rate": 2.7106402902961486e-07, + "logits/chosen": -2.7115280628204346, + "logits/rejected": -2.7580766677856445, + "logps/chosen": -495.89532470703125, + "logps/rejected": -405.28082275390625, + "loss": 0.3589, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.460294246673584, + "rewards/margins": 1.2527241706848145, + "rewards/rejected": -0.7924299240112305, + "step": 955 + }, + { + "epoch": 0.11, + "learning_rate": 2.710289125599906e-07, + "logits/chosen": -3.030745029449463, + "logits/rejected": -3.0923924446105957, + "logps/chosen": -208.54684448242188, + "logps/rejected": -281.8843078613281, + "loss": 0.3004, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15190669894218445, + "rewards/margins": 2.155510902404785, + "rewards/rejected": -2.0036041736602783, + "step": 956 + }, + { + "epoch": 0.11, + "learning_rate": 2.7099379609036637e-07, + "logits/chosen": -3.021512031555176, + "logits/rejected": -3.050847053527832, + "logps/chosen": -114.19599914550781, + "logps/rejected": -190.4558868408203, + "loss": 0.5362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1878015398979187, + "rewards/margins": 1.601518154144287, + "rewards/rejected": -1.7893197536468506, + "step": 957 + }, + { + "epoch": 0.11, + "learning_rate": 2.709586796207421e-07, + "logits/chosen": -2.7128593921661377, + "logits/rejected": -2.8246259689331055, + "logps/chosen": -439.3732604980469, + "logps/rejected": -315.50213623046875, + "loss": 0.3405, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17414307594299316, + "rewards/margins": 1.7964141368865967, + "rewards/rejected": -1.6222712993621826, + "step": 958 + }, + { + "epoch": 0.11, + "learning_rate": 2.709235631511178e-07, + "logits/chosen": -2.8361897468566895, + "logits/rejected": -3.0565404891967773, + "logps/chosen": -168.5650177001953, + "logps/rejected": -226.47120666503906, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5012440085411072, + "rewards/margins": 0.6525136232376099, + "rewards/rejected": -1.1537576913833618, + "step": 959 + }, + { + "epoch": 0.11, + "learning_rate": 2.7088844668149363e-07, + "logits/chosen": -2.7673158645629883, + "logits/rejected": -2.9847230911254883, + "logps/chosen": -155.2431640625, + "logps/rejected": -146.95323181152344, + "loss": 0.6291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11602359265089035, + "rewards/margins": 0.9629310369491577, + "rewards/rejected": -1.0789545774459839, + "step": 960 + }, + { + "epoch": 0.11, + "learning_rate": 2.708533302118694e-07, + "logits/chosen": -2.4599609375, + "logits/rejected": -2.3996639251708984, + "logps/chosen": -435.77947998046875, + "logps/rejected": -361.87213134765625, + "loss": 0.6775, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2719172537326813, + "rewards/margins": 0.47527581453323364, + "rewards/rejected": -0.20335856080055237, + "step": 961 + }, + { + "epoch": 0.11, + "learning_rate": 2.708182137422451e-07, + "logits/chosen": -3.1659483909606934, + "logits/rejected": -3.0268325805664062, + "logps/chosen": -264.2563171386719, + "logps/rejected": -219.39776611328125, + "loss": 0.5216, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2708269953727722, + "rewards/margins": 0.5214576721191406, + "rewards/rejected": -0.2506306767463684, + "step": 962 + }, + { + "epoch": 0.11, + "learning_rate": 2.7078309727262084e-07, + "logits/chosen": -2.314093589782715, + "logits/rejected": -2.2031283378601074, + "logps/chosen": -188.3428497314453, + "logps/rejected": -242.99642944335938, + "loss": 0.6951, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10376887768507004, + "rewards/margins": 0.18178167939186096, + "rewards/rejected": -0.07801275700330734, + "step": 963 + }, + { + "epoch": 0.11, + "learning_rate": 2.707479808029966e-07, + "logits/chosen": -3.683884620666504, + "logits/rejected": -3.606537342071533, + "logps/chosen": -190.3939208984375, + "logps/rejected": -177.0293731689453, + "loss": 0.5049, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09123976528644562, + "rewards/margins": 0.7861461639404297, + "rewards/rejected": -0.6949064135551453, + "step": 964 + }, + { + "epoch": 0.11, + "learning_rate": 2.7071286433337235e-07, + "logits/chosen": -2.5376155376434326, + "logits/rejected": -2.6280434131622314, + "logps/chosen": -388.2626647949219, + "logps/rejected": -300.81170654296875, + "loss": 0.3047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6321220397949219, + "rewards/margins": 1.5867618322372437, + "rewards/rejected": -0.9546397924423218, + "step": 965 + }, + { + "epoch": 0.11, + "learning_rate": 2.706777478637481e-07, + "logits/chosen": -3.2188303470611572, + "logits/rejected": -3.3861145973205566, + "logps/chosen": -209.6493682861328, + "logps/rejected": -205.88406372070312, + "loss": 0.395, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3750436305999756, + "rewards/margins": 1.3258228302001953, + "rewards/rejected": -0.9507793188095093, + "step": 966 + }, + { + "epoch": 0.11, + "learning_rate": 2.706426313941238e-07, + "logits/chosen": -3.0122857093811035, + "logits/rejected": -3.042473077774048, + "logps/chosen": -439.5854797363281, + "logps/rejected": -385.3846435546875, + "loss": 0.5461, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20541119575500488, + "rewards/margins": 0.6080130338668823, + "rewards/rejected": -0.40260183811187744, + "step": 967 + }, + { + "epoch": 0.11, + "learning_rate": 2.7060751492449956e-07, + "logits/chosen": -3.0006396770477295, + "logits/rejected": -2.839611291885376, + "logps/chosen": -250.30145263671875, + "logps/rejected": -214.79049682617188, + "loss": 0.3444, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05845535174012184, + "rewards/margins": 1.7147998809814453, + "rewards/rejected": -1.6563444137573242, + "step": 968 + }, + { + "epoch": 0.11, + "learning_rate": 2.7057239845487536e-07, + "logits/chosen": -3.455221652984619, + "logits/rejected": -3.0053658485412598, + "logps/chosen": -274.49749755859375, + "logps/rejected": -263.79644775390625, + "loss": 0.5441, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2278425395488739, + "rewards/margins": 0.7620329856872559, + "rewards/rejected": -0.9898754954338074, + "step": 969 + }, + { + "epoch": 0.11, + "learning_rate": 2.7053728198525106e-07, + "logits/chosen": -2.83850359916687, + "logits/rejected": -2.9234111309051514, + "logps/chosen": -200.509033203125, + "logps/rejected": -211.42225646972656, + "loss": 0.3536, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18831130862236023, + "rewards/margins": 1.7086939811706543, + "rewards/rejected": -1.5203826427459717, + "step": 970 + }, + { + "epoch": 0.11, + "learning_rate": 2.705021655156268e-07, + "logits/chosen": -3.025434732437134, + "logits/rejected": -3.2490336894989014, + "logps/chosen": -203.34445190429688, + "logps/rejected": -320.43096923828125, + "loss": 0.7002, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4108351171016693, + "rewards/margins": 0.7105294466018677, + "rewards/rejected": -0.299694299697876, + "step": 971 + }, + { + "epoch": 0.11, + "learning_rate": 2.7046704904600257e-07, + "logits/chosen": -2.8695127964019775, + "logits/rejected": -3.0990819931030273, + "logps/chosen": -278.72174072265625, + "logps/rejected": -235.55712890625, + "loss": 0.6387, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.027105100452899933, + "rewards/margins": 0.9839267730712891, + "rewards/rejected": -1.011031985282898, + "step": 972 + }, + { + "epoch": 0.11, + "learning_rate": 2.7043193257637833e-07, + "logits/chosen": -3.7534422874450684, + "logits/rejected": -3.6492557525634766, + "logps/chosen": -210.11024475097656, + "logps/rejected": -231.05670166015625, + "loss": 0.2903, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.035855215042829514, + "rewards/margins": 1.829070806503296, + "rewards/rejected": -1.7932155132293701, + "step": 973 + }, + { + "epoch": 0.11, + "learning_rate": 2.703968161067541e-07, + "logits/chosen": -3.133993625640869, + "logits/rejected": -2.5676791667938232, + "logps/chosen": -329.77996826171875, + "logps/rejected": -317.7174377441406, + "loss": 0.2575, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.042493656277656555, + "rewards/margins": 1.4304752349853516, + "rewards/rejected": -1.3879815340042114, + "step": 974 + }, + { + "epoch": 0.11, + "learning_rate": 2.703616996371298e-07, + "logits/chosen": -2.669872283935547, + "logits/rejected": -2.8968656063079834, + "logps/chosen": -182.85948181152344, + "logps/rejected": -162.0805206298828, + "loss": 0.3091, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4360937476158142, + "rewards/margins": 1.4947019815444946, + "rewards/rejected": -1.0586082935333252, + "step": 975 + }, + { + "epoch": 0.11, + "learning_rate": 2.7032658316750554e-07, + "logits/chosen": -3.2783894538879395, + "logits/rejected": -3.618631362915039, + "logps/chosen": -152.9832763671875, + "logps/rejected": -190.10696411132812, + "loss": 0.5047, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11848974227905273, + "rewards/margins": 1.1498838663101196, + "rewards/rejected": -1.031394124031067, + "step": 976 + }, + { + "epoch": 0.11, + "learning_rate": 2.702914666978813e-07, + "logits/chosen": -2.243051528930664, + "logits/rejected": -2.3684303760528564, + "logps/chosen": -355.09503173828125, + "logps/rejected": -228.03611755371094, + "loss": 0.5008, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0780981108546257, + "rewards/margins": 0.7269706130027771, + "rewards/rejected": -0.8050686717033386, + "step": 977 + }, + { + "epoch": 0.11, + "learning_rate": 2.7025635022825704e-07, + "logits/chosen": -2.8946638107299805, + "logits/rejected": -2.8324694633483887, + "logps/chosen": -151.55172729492188, + "logps/rejected": -208.7655487060547, + "loss": 0.3202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08181506395339966, + "rewards/margins": 1.2029764652252197, + "rewards/rejected": -1.2847914695739746, + "step": 978 + }, + { + "epoch": 0.11, + "learning_rate": 2.702212337586328e-07, + "logits/chosen": -2.8441410064697266, + "logits/rejected": -2.9558866024017334, + "logps/chosen": -522.5399780273438, + "logps/rejected": -342.52520751953125, + "loss": 0.2877, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48758476972579956, + "rewards/margins": 1.563379168510437, + "rewards/rejected": -1.0757943391799927, + "step": 979 + }, + { + "epoch": 0.11, + "learning_rate": 2.701861172890085e-07, + "logits/chosen": -3.30424165725708, + "logits/rejected": -3.3372042179107666, + "logps/chosen": -343.93792724609375, + "logps/rejected": -470.9925842285156, + "loss": 0.4793, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21383866667747498, + "rewards/margins": 1.3687883615493774, + "rewards/rejected": -1.5826270580291748, + "step": 980 + }, + { + "epoch": 0.11, + "learning_rate": 2.701510008193843e-07, + "logits/chosen": -3.6467607021331787, + "logits/rejected": -3.415522575378418, + "logps/chosen": -307.7083740234375, + "logps/rejected": -232.10049438476562, + "loss": 0.7704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5199838280677795, + "rewards/margins": 0.3384891152381897, + "rewards/rejected": -0.8584729433059692, + "step": 981 + }, + { + "epoch": 0.11, + "learning_rate": 2.7011588434976006e-07, + "logits/chosen": -2.210667133331299, + "logits/rejected": -2.431813955307007, + "logps/chosen": -383.2902526855469, + "logps/rejected": -377.5415954589844, + "loss": 0.536, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22913381457328796, + "rewards/margins": 0.573865532875061, + "rewards/rejected": -0.34473171830177307, + "step": 982 + }, + { + "epoch": 0.11, + "learning_rate": 2.7008076788013576e-07, + "logits/chosen": -3.2514007091522217, + "logits/rejected": -3.481778144836426, + "logps/chosen": -143.31439208984375, + "logps/rejected": -201.0032958984375, + "loss": 0.5603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14243854582309723, + "rewards/margins": 0.7480747699737549, + "rewards/rejected": -0.8905133605003357, + "step": 983 + }, + { + "epoch": 0.11, + "learning_rate": 2.700456514105115e-07, + "logits/chosen": -3.275930404663086, + "logits/rejected": -3.127488851547241, + "logps/chosen": -339.2230529785156, + "logps/rejected": -323.06976318359375, + "loss": 0.2795, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37228983640670776, + "rewards/margins": 1.5589337348937988, + "rewards/rejected": -1.1866439580917358, + "step": 984 + }, + { + "epoch": 0.11, + "learning_rate": 2.7001053494088727e-07, + "logits/chosen": -3.872694492340088, + "logits/rejected": -3.9138529300689697, + "logps/chosen": -101.06723022460938, + "logps/rejected": -172.11981201171875, + "loss": 0.4884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38010311126708984, + "rewards/margins": 0.6716045141220093, + "rewards/rejected": -1.0517076253890991, + "step": 985 + }, + { + "epoch": 0.11, + "learning_rate": 2.69975418471263e-07, + "logits/chosen": -3.3201053142547607, + "logits/rejected": -3.2113113403320312, + "logps/chosen": -291.1360778808594, + "logps/rejected": -248.64578247070312, + "loss": 0.4353, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15845462679862976, + "rewards/margins": 0.7811118364334106, + "rewards/rejected": -0.6226572394371033, + "step": 986 + }, + { + "epoch": 0.11, + "learning_rate": 2.699403020016388e-07, + "logits/chosen": -2.7711539268493652, + "logits/rejected": -3.1714000701904297, + "logps/chosen": -334.8018798828125, + "logps/rejected": -209.05471801757812, + "loss": 0.6539, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09502878785133362, + "rewards/margins": 0.4424913227558136, + "rewards/rejected": -0.5375201106071472, + "step": 987 + }, + { + "epoch": 0.11, + "learning_rate": 2.699051855320145e-07, + "logits/chosen": -2.6127400398254395, + "logits/rejected": -2.5484299659729004, + "logps/chosen": -254.2086944580078, + "logps/rejected": -260.4604187011719, + "loss": 0.4083, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1699618250131607, + "rewards/margins": 1.173986792564392, + "rewards/rejected": -1.0040249824523926, + "step": 988 + }, + { + "epoch": 0.11, + "learning_rate": 2.6987006906239023e-07, + "logits/chosen": -2.0360960960388184, + "logits/rejected": -2.2439823150634766, + "logps/chosen": -236.25289916992188, + "logps/rejected": -241.26095581054688, + "loss": 0.56, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17995098233222961, + "rewards/margins": 0.8275477886199951, + "rewards/rejected": -1.007498860359192, + "step": 989 + }, + { + "epoch": 0.11, + "learning_rate": 2.69834952592766e-07, + "logits/chosen": -3.196899890899658, + "logits/rejected": -2.928715944290161, + "logps/chosen": -234.94532775878906, + "logps/rejected": -165.67864990234375, + "loss": 0.5407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08156568557024002, + "rewards/margins": 0.6188755631446838, + "rewards/rejected": -0.7004412412643433, + "step": 990 + }, + { + "epoch": 0.11, + "learning_rate": 2.6979983612314174e-07, + "logits/chosen": -2.80786395072937, + "logits/rejected": -2.6346147060394287, + "logps/chosen": -308.9167175292969, + "logps/rejected": -256.677734375, + "loss": 0.593, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2223612666130066, + "rewards/margins": 0.36026954650878906, + "rewards/rejected": -0.13790826499462128, + "step": 991 + }, + { + "epoch": 0.11, + "learning_rate": 2.697647196535175e-07, + "logits/chosen": -3.226032257080078, + "logits/rejected": -3.506516933441162, + "logps/chosen": -162.94314575195312, + "logps/rejected": -271.2950744628906, + "loss": 0.5812, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27302369475364685, + "rewards/margins": 0.9992998838424683, + "rewards/rejected": -1.272323489189148, + "step": 992 + }, + { + "epoch": 0.11, + "learning_rate": 2.697296031838932e-07, + "logits/chosen": -2.8696129322052, + "logits/rejected": -2.801812171936035, + "logps/chosen": -224.36204528808594, + "logps/rejected": -264.31707763671875, + "loss": 0.4144, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21553891897201538, + "rewards/margins": 0.9869098663330078, + "rewards/rejected": -0.7713708877563477, + "step": 993 + }, + { + "epoch": 0.11, + "learning_rate": 2.69694486714269e-07, + "logits/chosen": -3.83064866065979, + "logits/rejected": -3.643404245376587, + "logps/chosen": -194.24085998535156, + "logps/rejected": -172.23036193847656, + "loss": 0.4398, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17081782221794128, + "rewards/margins": 1.6466740369796753, + "rewards/rejected": -1.8174920082092285, + "step": 994 + }, + { + "epoch": 0.11, + "learning_rate": 2.6965937024464475e-07, + "logits/chosen": -3.7284765243530273, + "logits/rejected": -3.479773998260498, + "logps/chosen": -241.43002319335938, + "logps/rejected": -201.12896728515625, + "loss": 0.4834, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09852510690689087, + "rewards/margins": 1.2628109455108643, + "rewards/rejected": -1.1642858982086182, + "step": 995 + }, + { + "epoch": 0.11, + "learning_rate": 2.6962425377502045e-07, + "logits/chosen": -3.3035807609558105, + "logits/rejected": -3.288422107696533, + "logps/chosen": -347.50457763671875, + "logps/rejected": -234.35922241210938, + "loss": 0.392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39147573709487915, + "rewards/margins": 1.139094591140747, + "rewards/rejected": -0.7476187944412231, + "step": 996 + }, + { + "epoch": 0.11, + "learning_rate": 2.695891373053962e-07, + "logits/chosen": -2.641629934310913, + "logits/rejected": -2.8399672508239746, + "logps/chosen": -487.91217041015625, + "logps/rejected": -400.4972839355469, + "loss": 0.5717, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08460550010204315, + "rewards/margins": 0.42655980587005615, + "rewards/rejected": -0.3419543504714966, + "step": 997 + }, + { + "epoch": 0.12, + "learning_rate": 2.6955402083577196e-07, + "logits/chosen": -3.0048513412475586, + "logits/rejected": -2.698700428009033, + "logps/chosen": -161.59068298339844, + "logps/rejected": -260.7747497558594, + "loss": 0.4292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04530215263366699, + "rewards/margins": 0.8997824788093567, + "rewards/rejected": -0.9450846314430237, + "step": 998 + }, + { + "epoch": 0.12, + "learning_rate": 2.695189043661477e-07, + "logits/chosen": -2.8768482208251953, + "logits/rejected": -2.7550835609436035, + "logps/chosen": -314.4839172363281, + "logps/rejected": -257.24560546875, + "loss": 0.5145, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.544773280620575, + "rewards/margins": 0.7183042168617249, + "rewards/rejected": -1.2630774974822998, + "step": 999 + }, + { + "epoch": 0.12, + "learning_rate": 2.6948378789652347e-07, + "logits/chosen": -2.9549267292022705, + "logits/rejected": -3.0565500259399414, + "logps/chosen": -488.66455078125, + "logps/rejected": -401.908203125, + "loss": 0.2682, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47644034028053284, + "rewards/margins": 1.7262794971466064, + "rewards/rejected": -1.249839186668396, + "step": 1000 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.8467764854431152, + "eval_logits/rejected": -2.8113934993743896, + "eval_logps/chosen": -292.5613098144531, + "eval_logps/rejected": -230.89024353027344, + "eval_loss": 0.4792271852493286, + "eval_rewards/accuracies": 0.7857142686843872, + "eval_rewards/chosen": 0.14936694502830505, + "eval_rewards/margins": 0.8041319847106934, + "eval_rewards/rejected": -0.6547650694847107, + "eval_runtime": 32.6014, + "eval_samples_per_second": 2.147, + "eval_steps_per_second": 1.074, + "step": 1000 + }, + { + "epoch": 0.12, + "learning_rate": 2.6944867142689917e-07, + "logits/chosen": -2.9890384674072266, + "logits/rejected": -3.0015764236450195, + "logps/chosen": -294.6145324707031, + "logps/rejected": -180.47592163085938, + "loss": 0.5646, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3045096695423126, + "rewards/margins": 0.5944913625717163, + "rewards/rejected": -0.8990010023117065, + "step": 1001 + }, + { + "epoch": 0.12, + "learning_rate": 2.694135549572749e-07, + "logits/chosen": -2.927506923675537, + "logits/rejected": -3.082932710647583, + "logps/chosen": -250.5115203857422, + "logps/rejected": -169.50790405273438, + "loss": 0.55, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08371113240718842, + "rewards/margins": 0.6469983458518982, + "rewards/rejected": -0.7307094931602478, + "step": 1002 + }, + { + "epoch": 0.12, + "learning_rate": 2.6937843848765073e-07, + "logits/chosen": -2.3061983585357666, + "logits/rejected": -2.2522501945495605, + "logps/chosen": -221.78208923339844, + "logps/rejected": -228.43289184570312, + "loss": 0.3965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03845285251736641, + "rewards/margins": 1.173314094543457, + "rewards/rejected": -1.2117670774459839, + "step": 1003 + }, + { + "epoch": 0.12, + "learning_rate": 2.6934332201802643e-07, + "logits/chosen": -3.0025792121887207, + "logits/rejected": -3.091940402984619, + "logps/chosen": -211.87171936035156, + "logps/rejected": -298.54644775390625, + "loss": 0.4181, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08777110278606415, + "rewards/margins": 1.828642725944519, + "rewards/rejected": -1.9164137840270996, + "step": 1004 + }, + { + "epoch": 0.12, + "learning_rate": 2.693082055484022e-07, + "logits/chosen": -3.2955312728881836, + "logits/rejected": -3.3115639686584473, + "logps/chosen": -129.1331329345703, + "logps/rejected": -108.62527465820312, + "loss": 0.493, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21543221175670624, + "rewards/margins": 0.5740291476249695, + "rewards/rejected": -0.7894613742828369, + "step": 1005 + }, + { + "epoch": 0.12, + "learning_rate": 2.6927308907877794e-07, + "logits/chosen": -3.173074960708618, + "logits/rejected": -3.3787930011749268, + "logps/chosen": -190.12193298339844, + "logps/rejected": -173.813720703125, + "loss": 0.3139, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12221789360046387, + "rewards/margins": 1.5886244773864746, + "rewards/rejected": -1.4664065837860107, + "step": 1006 + }, + { + "epoch": 0.12, + "learning_rate": 2.692379726091537e-07, + "logits/chosen": -3.27113676071167, + "logits/rejected": -3.5632402896881104, + "logps/chosen": -244.30538940429688, + "logps/rejected": -267.55548095703125, + "loss": 0.3038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22619611024856567, + "rewards/margins": 1.4454288482666016, + "rewards/rejected": -1.2192325592041016, + "step": 1007 + }, + { + "epoch": 0.12, + "learning_rate": 2.6920285613952945e-07, + "logits/chosen": -3.2622568607330322, + "logits/rejected": -3.4214370250701904, + "logps/chosen": -179.5801239013672, + "logps/rejected": -128.70155334472656, + "loss": 0.7735, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1666165292263031, + "rewards/margins": -0.05037976801395416, + "rewards/rejected": -0.11623678356409073, + "step": 1008 + }, + { + "epoch": 0.12, + "learning_rate": 2.6916773966990515e-07, + "logits/chosen": -3.065295696258545, + "logits/rejected": -3.0291500091552734, + "logps/chosen": -473.509765625, + "logps/rejected": -407.18243408203125, + "loss": 0.5594, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2686772346496582, + "rewards/margins": 0.526474118232727, + "rewards/rejected": -0.25779685378074646, + "step": 1009 + }, + { + "epoch": 0.12, + "learning_rate": 2.691326232002809e-07, + "logits/chosen": -3.5243821144104004, + "logits/rejected": -3.3372323513031006, + "logps/chosen": -464.8660888671875, + "logps/rejected": -307.4770202636719, + "loss": 0.3817, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44373583793640137, + "rewards/margins": 1.2967848777770996, + "rewards/rejected": -0.8530490398406982, + "step": 1010 + }, + { + "epoch": 0.12, + "learning_rate": 2.6909750673065666e-07, + "logits/chosen": -3.042860984802246, + "logits/rejected": -3.187589168548584, + "logps/chosen": -440.2353515625, + "logps/rejected": -380.84912109375, + "loss": 0.3126, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36759084463119507, + "rewards/margins": 1.452439308166504, + "rewards/rejected": -1.084848403930664, + "step": 1011 + }, + { + "epoch": 0.12, + "learning_rate": 2.690623902610324e-07, + "logits/chosen": -3.1054091453552246, + "logits/rejected": -2.9565813541412354, + "logps/chosen": -209.68051147460938, + "logps/rejected": -263.39630126953125, + "loss": 0.2324, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49350109696388245, + "rewards/margins": 2.3182191848754883, + "rewards/rejected": -1.8247181177139282, + "step": 1012 + }, + { + "epoch": 0.12, + "learning_rate": 2.6902727379140816e-07, + "logits/chosen": -2.4899420738220215, + "logits/rejected": -2.744485855102539, + "logps/chosen": -208.46820068359375, + "logps/rejected": -210.41334533691406, + "loss": 0.4245, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003124713897705078, + "rewards/margins": 1.0875492095947266, + "rewards/rejected": -1.090673804283142, + "step": 1013 + }, + { + "epoch": 0.12, + "learning_rate": 2.689921573217839e-07, + "logits/chosen": -3.222302198410034, + "logits/rejected": -3.0362987518310547, + "logps/chosen": -244.73927307128906, + "logps/rejected": -309.02520751953125, + "loss": 0.418, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005220465362071991, + "rewards/margins": 1.5701682567596436, + "rewards/rejected": -1.5649478435516357, + "step": 1014 + }, + { + "epoch": 0.12, + "learning_rate": 2.6895704085215967e-07, + "logits/chosen": -3.699026346206665, + "logits/rejected": -3.897007703781128, + "logps/chosen": -112.20047760009766, + "logps/rejected": -253.962646484375, + "loss": 0.3433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.005344323813915253, + "rewards/margins": 1.6780694723129272, + "rewards/rejected": -1.6834138631820679, + "step": 1015 + }, + { + "epoch": 0.12, + "learning_rate": 2.689219243825354e-07, + "logits/chosen": -2.9722886085510254, + "logits/rejected": -3.029719591140747, + "logps/chosen": -179.494384765625, + "logps/rejected": -192.71896362304688, + "loss": 0.4079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2800194323062897, + "rewards/margins": 0.7049951553344727, + "rewards/rejected": -0.9850145578384399, + "step": 1016 + }, + { + "epoch": 0.12, + "learning_rate": 2.688868079129111e-07, + "logits/chosen": -2.727360248565674, + "logits/rejected": -2.815967559814453, + "logps/chosen": -235.6004180908203, + "logps/rejected": -271.8351135253906, + "loss": 0.3562, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2899309992790222, + "rewards/margins": 1.3568127155303955, + "rewards/rejected": -1.0668818950653076, + "step": 1017 + }, + { + "epoch": 0.12, + "learning_rate": 2.688516914432869e-07, + "logits/chosen": -3.189326524734497, + "logits/rejected": -3.5165343284606934, + "logps/chosen": -136.04640197753906, + "logps/rejected": -152.27615356445312, + "loss": 0.4583, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012683719396591187, + "rewards/margins": 1.0235559940338135, + "rewards/rejected": -1.0108723640441895, + "step": 1018 + }, + { + "epoch": 0.12, + "learning_rate": 2.6881657497366263e-07, + "logits/chosen": -3.616323471069336, + "logits/rejected": -3.1945323944091797, + "logps/chosen": -420.41229248046875, + "logps/rejected": -269.4302978515625, + "loss": 0.4729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5952157974243164, + "rewards/margins": 1.3824422359466553, + "rewards/rejected": -1.9776577949523926, + "step": 1019 + }, + { + "epoch": 0.12, + "learning_rate": 2.687814585040384e-07, + "logits/chosen": -3.232095718383789, + "logits/rejected": -3.3242034912109375, + "logps/chosen": -202.30516052246094, + "logps/rejected": -243.49832153320312, + "loss": 0.4645, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0598047710955143, + "rewards/margins": 0.7807599306106567, + "rewards/rejected": -0.7209552526473999, + "step": 1020 + }, + { + "epoch": 0.12, + "learning_rate": 2.6874634203441414e-07, + "logits/chosen": -3.7904224395751953, + "logits/rejected": -3.6266684532165527, + "logps/chosen": -229.26454162597656, + "logps/rejected": -261.8208312988281, + "loss": 0.2976, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44723767042160034, + "rewards/margins": 2.6612606048583984, + "rewards/rejected": -2.2140228748321533, + "step": 1021 + }, + { + "epoch": 0.12, + "learning_rate": 2.687112255647899e-07, + "logits/chosen": -2.679919958114624, + "logits/rejected": -2.5095417499542236, + "logps/chosen": -245.14520263671875, + "logps/rejected": -310.1107177734375, + "loss": 0.3998, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31252458691596985, + "rewards/margins": 1.1531708240509033, + "rewards/rejected": -0.8406461477279663, + "step": 1022 + }, + { + "epoch": 0.12, + "learning_rate": 2.686761090951656e-07, + "logits/chosen": -3.5106751918792725, + "logits/rejected": -3.803213119506836, + "logps/chosen": -322.8358459472656, + "logps/rejected": -402.86181640625, + "loss": 0.255, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4084693193435669, + "rewards/margins": 3.093571901321411, + "rewards/rejected": -2.685102701187134, + "step": 1023 + }, + { + "epoch": 0.12, + "learning_rate": 2.6864099262554135e-07, + "logits/chosen": -3.5380992889404297, + "logits/rejected": -3.8108534812927246, + "logps/chosen": -249.86752319335938, + "logps/rejected": -230.67649841308594, + "loss": 0.2106, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3425680994987488, + "rewards/margins": 1.9143743515014648, + "rewards/rejected": -1.5718063116073608, + "step": 1024 + }, + { + "epoch": 0.12, + "learning_rate": 2.686058761559171e-07, + "logits/chosen": -3.6146469116210938, + "logits/rejected": -3.470076084136963, + "logps/chosen": -227.20889282226562, + "logps/rejected": -285.7851867675781, + "loss": 0.4603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21230866014957428, + "rewards/margins": 0.6457839608192444, + "rewards/rejected": -0.8580926060676575, + "step": 1025 + }, + { + "epoch": 0.12, + "learning_rate": 2.6857075968629286e-07, + "logits/chosen": -3.1639404296875, + "logits/rejected": -3.043987274169922, + "logps/chosen": -370.7873229980469, + "logps/rejected": -211.15228271484375, + "loss": 0.2411, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2567855417728424, + "rewards/margins": 1.548858642578125, + "rewards/rejected": -1.2920732498168945, + "step": 1026 + }, + { + "epoch": 0.12, + "learning_rate": 2.685356432166686e-07, + "logits/chosen": -3.4740045070648193, + "logits/rejected": -3.239774227142334, + "logps/chosen": -228.8424072265625, + "logps/rejected": -251.1282196044922, + "loss": 0.5786, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5096825361251831, + "rewards/margins": 0.7991372346878052, + "rewards/rejected": -1.3088197708129883, + "step": 1027 + }, + { + "epoch": 0.12, + "learning_rate": 2.6850052674704437e-07, + "logits/chosen": -3.0004751682281494, + "logits/rejected": -2.838207960128784, + "logps/chosen": -319.01763916015625, + "logps/rejected": -396.8262023925781, + "loss": 0.4932, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34719160199165344, + "rewards/margins": 0.6924643516540527, + "rewards/rejected": -0.3452726900577545, + "step": 1028 + }, + { + "epoch": 0.12, + "learning_rate": 2.684654102774201e-07, + "logits/chosen": -4.0099639892578125, + "logits/rejected": -3.4964346885681152, + "logps/chosen": -192.5290069580078, + "logps/rejected": -176.32797241210938, + "loss": 0.6351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8147139549255371, + "rewards/margins": 0.3653281331062317, + "rewards/rejected": -1.1800421476364136, + "step": 1029 + }, + { + "epoch": 0.12, + "learning_rate": 2.6843029380779587e-07, + "logits/chosen": -2.4619901180267334, + "logits/rejected": -3.058591842651367, + "logps/chosen": -288.5072021484375, + "logps/rejected": -169.99966430664062, + "loss": 0.3636, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3259803056716919, + "rewards/margins": 1.6095463037490845, + "rewards/rejected": -1.2835659980773926, + "step": 1030 + }, + { + "epoch": 0.12, + "learning_rate": 2.683951773381716e-07, + "logits/chosen": -3.937607526779175, + "logits/rejected": -3.728919506072998, + "logps/chosen": -200.14173889160156, + "logps/rejected": -166.66603088378906, + "loss": 0.3692, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12413468956947327, + "rewards/margins": 1.0714726448059082, + "rewards/rejected": -0.9473379254341125, + "step": 1031 + }, + { + "epoch": 0.12, + "learning_rate": 2.6836006086854733e-07, + "logits/chosen": -2.6367554664611816, + "logits/rejected": -2.5294089317321777, + "logps/chosen": -303.33203125, + "logps/rejected": -362.5694274902344, + "loss": 0.3346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10377032309770584, + "rewards/margins": 1.1775587797164917, + "rewards/rejected": -1.2813290357589722, + "step": 1032 + }, + { + "epoch": 0.12, + "learning_rate": 2.683249443989231e-07, + "logits/chosen": -3.3861491680145264, + "logits/rejected": -3.283461809158325, + "logps/chosen": -314.738525390625, + "logps/rejected": -295.21697998046875, + "loss": 0.3442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03976573050022125, + "rewards/margins": 1.329036831855774, + "rewards/rejected": -1.3688024282455444, + "step": 1033 + }, + { + "epoch": 0.12, + "learning_rate": 2.6828982792929884e-07, + "logits/chosen": -3.5183475017547607, + "logits/rejected": -3.4384169578552246, + "logps/chosen": -223.3145294189453, + "logps/rejected": -269.67437744140625, + "loss": 0.3638, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02993015944957733, + "rewards/margins": 1.160941243171692, + "rewards/rejected": -1.1310110092163086, + "step": 1034 + }, + { + "epoch": 0.12, + "learning_rate": 2.682547114596746e-07, + "logits/chosen": -2.67248797416687, + "logits/rejected": -2.6747329235076904, + "logps/chosen": -152.82984924316406, + "logps/rejected": -178.09820556640625, + "loss": 0.5968, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16354693472385406, + "rewards/margins": 0.4074639081954956, + "rewards/rejected": -0.24391698837280273, + "step": 1035 + }, + { + "epoch": 0.12, + "learning_rate": 2.682195949900503e-07, + "logits/chosen": -2.944822311401367, + "logits/rejected": -2.8333382606506348, + "logps/chosen": -282.808837890625, + "logps/rejected": -256.447509765625, + "loss": 0.8227, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12120199203491211, + "rewards/margins": 0.14242622256278992, + "rewards/rejected": -0.263628214597702, + "step": 1036 + }, + { + "epoch": 0.12, + "learning_rate": 2.681844785204261e-07, + "logits/chosen": -3.3827338218688965, + "logits/rejected": -3.1289026737213135, + "logps/chosen": -256.9677734375, + "logps/rejected": -183.3800811767578, + "loss": 0.4279, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23338547348976135, + "rewards/margins": 1.2781083583831787, + "rewards/rejected": -1.0447227954864502, + "step": 1037 + }, + { + "epoch": 0.12, + "learning_rate": 2.681493620508018e-07, + "logits/chosen": -3.1724696159362793, + "logits/rejected": -3.282991409301758, + "logps/chosen": -218.65682983398438, + "logps/rejected": -216.284423828125, + "loss": 0.4224, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07080493122339249, + "rewards/margins": 1.2737340927124023, + "rewards/rejected": -1.2029290199279785, + "step": 1038 + }, + { + "epoch": 0.12, + "learning_rate": 2.6811424558117755e-07, + "logits/chosen": -2.9321727752685547, + "logits/rejected": -2.8633341789245605, + "logps/chosen": -239.38882446289062, + "logps/rejected": -242.52133178710938, + "loss": 0.5855, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.057997554540634155, + "rewards/margins": 0.5173536539077759, + "rewards/rejected": -0.4593561589717865, + "step": 1039 + }, + { + "epoch": 0.12, + "learning_rate": 2.680791291115533e-07, + "logits/chosen": -1.744834065437317, + "logits/rejected": -1.8946303129196167, + "logps/chosen": -364.2138671875, + "logps/rejected": -296.4847106933594, + "loss": 0.5446, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2992771863937378, + "rewards/margins": 1.4173939228057861, + "rewards/rejected": -1.1181166172027588, + "step": 1040 + }, + { + "epoch": 0.12, + "learning_rate": 2.6804401264192906e-07, + "logits/chosen": -2.891101121902466, + "logits/rejected": -3.454953670501709, + "logps/chosen": -253.30838012695312, + "logps/rejected": -244.77305603027344, + "loss": 0.434, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4544670879840851, + "rewards/margins": 1.174118995666504, + "rewards/rejected": -0.7196518778800964, + "step": 1041 + }, + { + "epoch": 0.12, + "learning_rate": 2.680088961723048e-07, + "logits/chosen": -2.6055498123168945, + "logits/rejected": -2.8302998542785645, + "logps/chosen": -289.50347900390625, + "logps/rejected": -289.9058532714844, + "loss": 0.409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11054809391498566, + "rewards/margins": 1.4239352941513062, + "rewards/rejected": -1.5344834327697754, + "step": 1042 + }, + { + "epoch": 0.12, + "learning_rate": 2.6797377970268057e-07, + "logits/chosen": -3.259413719177246, + "logits/rejected": -3.3293957710266113, + "logps/chosen": -301.2493896484375, + "logps/rejected": -260.6601257324219, + "loss": 0.2578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09799051284790039, + "rewards/margins": 1.4851690530776978, + "rewards/rejected": -1.3871784210205078, + "step": 1043 + }, + { + "epoch": 0.12, + "learning_rate": 2.6793866323305627e-07, + "logits/chosen": -2.489177703857422, + "logits/rejected": -2.4383292198181152, + "logps/chosen": -183.40313720703125, + "logps/rejected": -242.26174926757812, + "loss": 0.341, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3571731746196747, + "rewards/margins": 2.117004871368408, + "rewards/rejected": -1.7598316669464111, + "step": 1044 + }, + { + "epoch": 0.12, + "learning_rate": 2.67903546763432e-07, + "logits/chosen": -3.4089417457580566, + "logits/rejected": -3.052192211151123, + "logps/chosen": -347.5773010253906, + "logps/rejected": -349.1462707519531, + "loss": 0.5503, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005301922559738159, + "rewards/margins": 1.3491414785385132, + "rewards/rejected": -1.3438396453857422, + "step": 1045 + }, + { + "epoch": 0.12, + "learning_rate": 2.678684302938078e-07, + "logits/chosen": -2.7101449966430664, + "logits/rejected": -2.726557493209839, + "logps/chosen": -291.880859375, + "logps/rejected": -336.6107177734375, + "loss": 0.2856, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0006092656403779984, + "rewards/margins": 1.6222069263458252, + "rewards/rejected": -1.6215977668762207, + "step": 1046 + }, + { + "epoch": 0.12, + "learning_rate": 2.6783331382418353e-07, + "logits/chosen": -3.1871161460876465, + "logits/rejected": -3.429884910583496, + "logps/chosen": -203.8602294921875, + "logps/rejected": -251.33966064453125, + "loss": 0.6293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17574141919612885, + "rewards/margins": 0.2031751573085785, + "rewards/rejected": -0.37891656160354614, + "step": 1047 + }, + { + "epoch": 0.12, + "learning_rate": 2.677981973545593e-07, + "logits/chosen": -2.3977134227752686, + "logits/rejected": -2.600922107696533, + "logps/chosen": -208.11224365234375, + "logps/rejected": -247.62356567382812, + "loss": 0.2661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014521032571792603, + "rewards/margins": 1.5620085000991821, + "rewards/rejected": -1.5474873781204224, + "step": 1048 + }, + { + "epoch": 0.12, + "learning_rate": 2.6776308088493504e-07, + "logits/chosen": -3.1489908695220947, + "logits/rejected": -3.1833267211914062, + "logps/chosen": -159.6882781982422, + "logps/rejected": -157.5350341796875, + "loss": 0.6725, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04417683556675911, + "rewards/margins": 0.305560439825058, + "rewards/rejected": -0.26138362288475037, + "step": 1049 + }, + { + "epoch": 0.12, + "learning_rate": 2.677279644153108e-07, + "logits/chosen": -2.7399721145629883, + "logits/rejected": -2.56968355178833, + "logps/chosen": -375.86492919921875, + "logps/rejected": -251.56936645507812, + "loss": 0.8015, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3396145701408386, + "rewards/margins": 0.2890181541442871, + "rewards/rejected": -0.628632664680481, + "step": 1050 + }, + { + "epoch": 0.12, + "learning_rate": 2.6769284794568654e-07, + "logits/chosen": -2.6026909351348877, + "logits/rejected": -2.691554069519043, + "logps/chosen": -326.17657470703125, + "logps/rejected": -278.4223937988281, + "loss": 0.474, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4548858404159546, + "rewards/margins": 0.9007307291030884, + "rewards/rejected": -0.445844829082489, + "step": 1051 + }, + { + "epoch": 0.12, + "learning_rate": 2.6765773147606225e-07, + "logits/chosen": -2.9153225421905518, + "logits/rejected": -2.8656976222991943, + "logps/chosen": -403.8117370605469, + "logps/rejected": -272.4035949707031, + "loss": 0.4506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37804603576660156, + "rewards/margins": 1.2277987003326416, + "rewards/rejected": -1.6058447360992432, + "step": 1052 + }, + { + "epoch": 0.12, + "learning_rate": 2.67622615006438e-07, + "logits/chosen": -2.333549737930298, + "logits/rejected": -2.3535423278808594, + "logps/chosen": -356.178955078125, + "logps/rejected": -322.5814514160156, + "loss": 0.3391, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.032656386494636536, + "rewards/margins": 1.3281056880950928, + "rewards/rejected": -1.2954492568969727, + "step": 1053 + }, + { + "epoch": 0.12, + "learning_rate": 2.6758749853681375e-07, + "logits/chosen": -3.3473165035247803, + "logits/rejected": -3.547835350036621, + "logps/chosen": -197.41567993164062, + "logps/rejected": -238.34506225585938, + "loss": 0.2488, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38756442070007324, + "rewards/margins": 2.175715208053589, + "rewards/rejected": -1.7881507873535156, + "step": 1054 + }, + { + "epoch": 0.12, + "learning_rate": 2.675523820671895e-07, + "logits/chosen": -3.1267170906066895, + "logits/rejected": -3.072960615158081, + "logps/chosen": -302.6055908203125, + "logps/rejected": -336.59906005859375, + "loss": 0.2842, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4374012053012848, + "rewards/margins": 1.9779233932495117, + "rewards/rejected": -1.5405220985412598, + "step": 1055 + }, + { + "epoch": 0.12, + "learning_rate": 2.6751726559756526e-07, + "logits/chosen": -3.085273027420044, + "logits/rejected": -2.912855863571167, + "logps/chosen": -208.47203063964844, + "logps/rejected": -257.9039306640625, + "loss": 0.4504, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15865278244018555, + "rewards/margins": 0.742802083492279, + "rewards/rejected": -0.5841493010520935, + "step": 1056 + }, + { + "epoch": 0.12, + "learning_rate": 2.6748214912794096e-07, + "logits/chosen": -3.421421527862549, + "logits/rejected": -3.3457555770874023, + "logps/chosen": -367.41168212890625, + "logps/rejected": -197.7659454345703, + "loss": 0.4109, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29268813133239746, + "rewards/margins": 0.9864341020584106, + "rewards/rejected": -1.2791221141815186, + "step": 1057 + }, + { + "epoch": 0.12, + "learning_rate": 2.674470326583167e-07, + "logits/chosen": -3.323819398880005, + "logits/rejected": -3.4420759677886963, + "logps/chosen": -286.00201416015625, + "logps/rejected": -293.2824401855469, + "loss": 0.5627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4576173722743988, + "rewards/margins": 0.4057440757751465, + "rewards/rejected": -0.8633613586425781, + "step": 1058 + }, + { + "epoch": 0.12, + "learning_rate": 2.674119161886925e-07, + "logits/chosen": -2.7999916076660156, + "logits/rejected": -2.7776260375976562, + "logps/chosen": -239.51145935058594, + "logps/rejected": -280.16815185546875, + "loss": 0.3613, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2060440629720688, + "rewards/margins": 1.3570518493652344, + "rewards/rejected": -1.151007890701294, + "step": 1059 + }, + { + "epoch": 0.12, + "learning_rate": 2.673767997190682e-07, + "logits/chosen": -2.5541467666625977, + "logits/rejected": -2.443340539932251, + "logps/chosen": -136.96914672851562, + "logps/rejected": -220.15000915527344, + "loss": 0.4591, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1561250239610672, + "rewards/margins": 0.828513503074646, + "rewards/rejected": -0.6723884344100952, + "step": 1060 + }, + { + "epoch": 0.12, + "learning_rate": 2.67341683249444e-07, + "logits/chosen": -2.7907042503356934, + "logits/rejected": -2.606461524963379, + "logps/chosen": -549.8987426757812, + "logps/rejected": -331.3394470214844, + "loss": 0.4905, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4911750853061676, + "rewards/margins": 0.9159469604492188, + "rewards/rejected": -1.4071221351623535, + "step": 1061 + }, + { + "epoch": 0.12, + "learning_rate": 2.6730656677981973e-07, + "logits/chosen": -3.384805679321289, + "logits/rejected": -3.3100056648254395, + "logps/chosen": -265.1316223144531, + "logps/rejected": -303.02789306640625, + "loss": 0.7555, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18150471150875092, + "rewards/margins": 0.9252256155014038, + "rewards/rejected": -1.1067302227020264, + "step": 1062 + }, + { + "epoch": 0.12, + "learning_rate": 2.672714503101955e-07, + "logits/chosen": -3.2793126106262207, + "logits/rejected": -3.299539566040039, + "logps/chosen": -416.5001525878906, + "logps/rejected": -247.2704620361328, + "loss": 0.5255, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14084531366825104, + "rewards/margins": 0.811980128288269, + "rewards/rejected": -0.6711347699165344, + "step": 1063 + }, + { + "epoch": 0.12, + "learning_rate": 2.6723633384057124e-07, + "logits/chosen": -2.9731521606445312, + "logits/rejected": -3.0756006240844727, + "logps/chosen": -182.33493041992188, + "logps/rejected": -201.4945831298828, + "loss": 0.4503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.186550110578537, + "rewards/margins": 0.8320742249488831, + "rewards/rejected": -1.0186243057250977, + "step": 1064 + }, + { + "epoch": 0.12, + "learning_rate": 2.6720121737094694e-07, + "logits/chosen": -2.324869394302368, + "logits/rejected": -2.5463755130767822, + "logps/chosen": -211.65577697753906, + "logps/rejected": -219.51858520507812, + "loss": 0.5382, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16899175941944122, + "rewards/margins": 0.8738133907318115, + "rewards/rejected": -1.0428051948547363, + "step": 1065 + }, + { + "epoch": 0.12, + "learning_rate": 2.671661009013227e-07, + "logits/chosen": -2.910780906677246, + "logits/rejected": -2.7264161109924316, + "logps/chosen": -341.3624267578125, + "logps/rejected": -404.66522216796875, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10699562728404999, + "rewards/margins": 1.8566417694091797, + "rewards/rejected": -1.7496461868286133, + "step": 1066 + }, + { + "epoch": 0.12, + "learning_rate": 2.6713098443169845e-07, + "logits/chosen": -3.347903251647949, + "logits/rejected": -3.618755578994751, + "logps/chosen": -272.55792236328125, + "logps/rejected": -255.17843627929688, + "loss": 0.2666, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20856042206287384, + "rewards/margins": 2.3251523971557617, + "rewards/rejected": -2.116591691970825, + "step": 1067 + }, + { + "epoch": 0.12, + "learning_rate": 2.670958679620742e-07, + "logits/chosen": -3.202124834060669, + "logits/rejected": -3.703721046447754, + "logps/chosen": -206.53793334960938, + "logps/rejected": -279.0238037109375, + "loss": 0.4358, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09561172127723694, + "rewards/margins": 1.4642515182495117, + "rewards/rejected": -1.3686398267745972, + "step": 1068 + }, + { + "epoch": 0.12, + "learning_rate": 2.6706075149244996e-07, + "logits/chosen": -2.863401174545288, + "logits/rejected": -3.3243112564086914, + "logps/chosen": -258.0196228027344, + "logps/rejected": -268.0166015625, + "loss": 0.5218, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04101572930812836, + "rewards/margins": 1.4315996170043945, + "rewards/rejected": -1.3905837535858154, + "step": 1069 + }, + { + "epoch": 0.12, + "learning_rate": 2.6702563502282566e-07, + "logits/chosen": -3.385283946990967, + "logits/rejected": -3.264040946960449, + "logps/chosen": -231.66885375976562, + "logps/rejected": -324.6797180175781, + "loss": 0.532, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3443859815597534, + "rewards/margins": 0.9833326935768127, + "rewards/rejected": -1.327718734741211, + "step": 1070 + }, + { + "epoch": 0.12, + "learning_rate": 2.6699051855320146e-07, + "logits/chosen": -2.8875722885131836, + "logits/rejected": -2.58618426322937, + "logps/chosen": -527.53076171875, + "logps/rejected": -305.5352783203125, + "loss": 0.2139, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.61677485704422, + "rewards/margins": 2.2003703117370605, + "rewards/rejected": -1.583595633506775, + "step": 1071 + }, + { + "epoch": 0.12, + "learning_rate": 2.669554020835772e-07, + "logits/chosen": -3.650496244430542, + "logits/rejected": -3.6738176345825195, + "logps/chosen": -400.08001708984375, + "logps/rejected": -283.344970703125, + "loss": 0.2575, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.046713605523109436, + "rewards/margins": 1.6101597547531128, + "rewards/rejected": -1.563446283340454, + "step": 1072 + }, + { + "epoch": 0.12, + "learning_rate": 2.669202856139529e-07, + "logits/chosen": -2.981619119644165, + "logits/rejected": -2.9584925174713135, + "logps/chosen": -412.3600158691406, + "logps/rejected": -236.72940063476562, + "loss": 0.4286, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5699102878570557, + "rewards/margins": 0.8904027938842773, + "rewards/rejected": -0.3204925060272217, + "step": 1073 + }, + { + "epoch": 0.12, + "learning_rate": 2.6688516914432867e-07, + "logits/chosen": -2.1321170330047607, + "logits/rejected": -2.559342622756958, + "logps/chosen": -342.568603515625, + "logps/rejected": -226.67481994628906, + "loss": 0.5178, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.030689842998981476, + "rewards/margins": 1.3157991170883179, + "rewards/rejected": -1.2851094007492065, + "step": 1074 + }, + { + "epoch": 0.12, + "learning_rate": 2.668500526747044e-07, + "logits/chosen": -2.3557770252227783, + "logits/rejected": -2.2170639038085938, + "logps/chosen": -384.9610595703125, + "logps/rejected": -368.3170471191406, + "loss": 0.4829, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11112399399280548, + "rewards/margins": 0.7109432816505432, + "rewards/rejected": -0.8220672607421875, + "step": 1075 + }, + { + "epoch": 0.12, + "learning_rate": 2.668149362050802e-07, + "logits/chosen": -3.3153393268585205, + "logits/rejected": -3.2915453910827637, + "logps/chosen": -93.95964050292969, + "logps/rejected": -157.31948852539062, + "loss": 0.5637, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3066788911819458, + "rewards/margins": 0.5229943990707397, + "rewards/rejected": -0.8296732902526855, + "step": 1076 + }, + { + "epoch": 0.12, + "learning_rate": 2.6677981973545593e-07, + "logits/chosen": -2.7158753871917725, + "logits/rejected": -2.593374729156494, + "logps/chosen": -316.8660888671875, + "logps/rejected": -325.7777099609375, + "loss": 0.561, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7072499394416809, + "rewards/margins": 0.40227651596069336, + "rewards/rejected": -1.109526515007019, + "step": 1077 + }, + { + "epoch": 0.12, + "learning_rate": 2.6674470326583163e-07, + "logits/chosen": -3.1633102893829346, + "logits/rejected": -3.722296714782715, + "logps/chosen": -301.582275390625, + "logps/rejected": -209.6153564453125, + "loss": 0.6795, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14551995694637299, + "rewards/margins": 0.6555773019790649, + "rewards/rejected": -0.5100574493408203, + "step": 1078 + }, + { + "epoch": 0.12, + "learning_rate": 2.667095867962074e-07, + "logits/chosen": -3.211073398590088, + "logits/rejected": -2.792351484298706, + "logps/chosen": -423.162353515625, + "logps/rejected": -229.6708984375, + "loss": 0.3373, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5055925250053406, + "rewards/margins": 2.000922679901123, + "rewards/rejected": -1.4953300952911377, + "step": 1079 + }, + { + "epoch": 0.12, + "learning_rate": 2.6667447032658314e-07, + "logits/chosen": -2.6621952056884766, + "logits/rejected": -2.622739315032959, + "logps/chosen": -207.3436737060547, + "logps/rejected": -267.5551452636719, + "loss": 0.3238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07570178806781769, + "rewards/margins": 2.126349687576294, + "rewards/rejected": -2.2020514011383057, + "step": 1080 + }, + { + "epoch": 0.12, + "learning_rate": 2.666393538569589e-07, + "logits/chosen": -3.1314122676849365, + "logits/rejected": -3.0342020988464355, + "logps/chosen": -131.2550048828125, + "logps/rejected": -276.4896545410156, + "loss": 0.6655, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20077301561832428, + "rewards/margins": 0.6015779376029968, + "rewards/rejected": -0.8023509383201599, + "step": 1081 + }, + { + "epoch": 0.12, + "learning_rate": 2.6660423738733465e-07, + "logits/chosen": -3.6032538414001465, + "logits/rejected": -3.600999355316162, + "logps/chosen": -245.648681640625, + "logps/rejected": -175.40623474121094, + "loss": 0.3843, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3788723051548004, + "rewards/margins": 1.3567819595336914, + "rewards/rejected": -0.9779095649719238, + "step": 1082 + }, + { + "epoch": 0.12, + "learning_rate": 2.665691209177104e-07, + "logits/chosen": -2.820107936859131, + "logits/rejected": -2.7412166595458984, + "logps/chosen": -329.960693359375, + "logps/rejected": -177.3825225830078, + "loss": 0.3421, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23718053102493286, + "rewards/margins": 1.4442846775054932, + "rewards/rejected": -1.2071040868759155, + "step": 1083 + }, + { + "epoch": 0.12, + "learning_rate": 2.6653400444808616e-07, + "logits/chosen": -2.8122293949127197, + "logits/rejected": -2.8643693923950195, + "logps/chosen": -160.08087158203125, + "logps/rejected": -227.11929321289062, + "loss": 0.7896, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5910912752151489, + "rewards/margins": 0.23349127173423767, + "rewards/rejected": -0.824582576751709, + "step": 1084 + }, + { + "epoch": 0.13, + "learning_rate": 2.664988879784619e-07, + "logits/chosen": -2.9615302085876465, + "logits/rejected": -2.979785919189453, + "logps/chosen": -348.69891357421875, + "logps/rejected": -274.6491394042969, + "loss": 0.4069, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18154031038284302, + "rewards/margins": 1.5098885297775269, + "rewards/rejected": -1.3283482789993286, + "step": 1085 + }, + { + "epoch": 0.13, + "learning_rate": 2.664637715088376e-07, + "logits/chosen": -3.1836366653442383, + "logits/rejected": -3.4585728645324707, + "logps/chosen": -244.2884979248047, + "logps/rejected": -358.07452392578125, + "loss": 0.4327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.034999266266822815, + "rewards/margins": 1.1542115211486816, + "rewards/rejected": -1.1892108917236328, + "step": 1086 + }, + { + "epoch": 0.13, + "learning_rate": 2.6642865503921337e-07, + "logits/chosen": -3.0731682777404785, + "logits/rejected": -2.764840602874756, + "logps/chosen": -336.82794189453125, + "logps/rejected": -254.755859375, + "loss": 0.7336, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5028422474861145, + "rewards/margins": 0.6855195760726929, + "rewards/rejected": -1.1883617639541626, + "step": 1087 + }, + { + "epoch": 0.13, + "learning_rate": 2.663935385695891e-07, + "logits/chosen": -3.5641391277313232, + "logits/rejected": -3.512181282043457, + "logps/chosen": -220.45806884765625, + "logps/rejected": -265.07550048828125, + "loss": 0.7004, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3117159605026245, + "rewards/margins": 0.05495902895927429, + "rewards/rejected": -0.3666749596595764, + "step": 1088 + }, + { + "epoch": 0.13, + "learning_rate": 2.663584220999649e-07, + "logits/chosen": -3.6256418228149414, + "logits/rejected": -3.539555311203003, + "logps/chosen": -264.65911865234375, + "logps/rejected": -226.02462768554688, + "loss": 0.4815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21201497316360474, + "rewards/margins": 1.1389046907424927, + "rewards/rejected": -1.3509197235107422, + "step": 1089 + }, + { + "epoch": 0.13, + "learning_rate": 2.6632330563034063e-07, + "logits/chosen": -4.0398406982421875, + "logits/rejected": -3.7346415519714355, + "logps/chosen": -213.818359375, + "logps/rejected": -209.65670776367188, + "loss": 0.4201, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07279305160045624, + "rewards/margins": 1.0541012287139893, + "rewards/rejected": -1.126894235610962, + "step": 1090 + }, + { + "epoch": 0.13, + "learning_rate": 2.6628818916071633e-07, + "logits/chosen": -3.548835039138794, + "logits/rejected": -3.646570920944214, + "logps/chosen": -66.86070251464844, + "logps/rejected": -151.00665283203125, + "loss": 0.477, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.031580112874507904, + "rewards/margins": 1.167687177658081, + "rewards/rejected": -1.1361072063446045, + "step": 1091 + }, + { + "epoch": 0.13, + "learning_rate": 2.662530726910921e-07, + "logits/chosen": -2.8751325607299805, + "logits/rejected": -2.998018264770508, + "logps/chosen": -367.55194091796875, + "logps/rejected": -322.14599609375, + "loss": 0.4341, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14278297126293182, + "rewards/margins": 0.9159032106399536, + "rewards/rejected": -1.0586862564086914, + "step": 1092 + }, + { + "epoch": 0.13, + "learning_rate": 2.662179562214679e-07, + "logits/chosen": -2.8614587783813477, + "logits/rejected": -2.7682926654815674, + "logps/chosen": -227.6217803955078, + "logps/rejected": -198.5792694091797, + "loss": 0.4673, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0648302361369133, + "rewards/margins": 1.257708191871643, + "rewards/rejected": -1.1928777694702148, + "step": 1093 + }, + { + "epoch": 0.13, + "learning_rate": 2.661828397518436e-07, + "logits/chosen": -4.009253978729248, + "logits/rejected": -3.6083920001983643, + "logps/chosen": -340.5731201171875, + "logps/rejected": -225.06423950195312, + "loss": 0.3959, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17818161845207214, + "rewards/margins": 1.3058801889419556, + "rewards/rejected": -1.127698540687561, + "step": 1094 + }, + { + "epoch": 0.13, + "learning_rate": 2.6614772328221934e-07, + "logits/chosen": -3.0888280868530273, + "logits/rejected": -4.110617160797119, + "logps/chosen": -113.39640808105469, + "logps/rejected": -321.6815185546875, + "loss": 0.4615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0016039982438087463, + "rewards/margins": 1.353739619255066, + "rewards/rejected": -1.3553435802459717, + "step": 1095 + }, + { + "epoch": 0.13, + "learning_rate": 2.661126068125951e-07, + "logits/chosen": -2.986386775970459, + "logits/rejected": -3.0536322593688965, + "logps/chosen": -253.0177001953125, + "logps/rejected": -215.14727783203125, + "loss": 0.407, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2331954538822174, + "rewards/margins": 1.0443871021270752, + "rewards/rejected": -0.8111915588378906, + "step": 1096 + }, + { + "epoch": 0.13, + "learning_rate": 2.6607749034297085e-07, + "logits/chosen": -2.886111259460449, + "logits/rejected": -2.669189691543579, + "logps/chosen": -281.52105712890625, + "logps/rejected": -261.98297119140625, + "loss": 0.4427, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16390664875507355, + "rewards/margins": 1.2526737451553345, + "rewards/rejected": -1.4165804386138916, + "step": 1097 + }, + { + "epoch": 0.13, + "learning_rate": 2.660423738733466e-07, + "logits/chosen": -3.1427114009857178, + "logits/rejected": -3.1371700763702393, + "logps/chosen": -134.64695739746094, + "logps/rejected": -276.44525146484375, + "loss": 0.4604, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1724531352519989, + "rewards/margins": 1.3602094650268555, + "rewards/rejected": -1.5326625108718872, + "step": 1098 + }, + { + "epoch": 0.13, + "learning_rate": 2.660072574037223e-07, + "logits/chosen": -3.1629531383514404, + "logits/rejected": -3.320312976837158, + "logps/chosen": -305.5906982421875, + "logps/rejected": -248.65670776367188, + "loss": 0.307, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.395014226436615, + "rewards/margins": 1.9799516201019287, + "rewards/rejected": -2.3749659061431885, + "step": 1099 + }, + { + "epoch": 0.13, + "learning_rate": 2.6597214093409806e-07, + "logits/chosen": -3.239506244659424, + "logits/rejected": -3.213106155395508, + "logps/chosen": -154.67005920410156, + "logps/rejected": -191.02911376953125, + "loss": 0.6085, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2719899117946625, + "rewards/margins": 0.33840304613113403, + "rewards/rejected": -0.6103929877281189, + "step": 1100 + }, + { + "epoch": 0.13, + "learning_rate": 2.659370244644738e-07, + "logits/chosen": -3.1850335597991943, + "logits/rejected": -3.552513599395752, + "logps/chosen": -136.6389923095703, + "logps/rejected": -201.15696716308594, + "loss": 0.4944, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1187017560005188, + "rewards/margins": 0.5159597396850586, + "rewards/rejected": -0.6346614956855774, + "step": 1101 + }, + { + "epoch": 0.13, + "learning_rate": 2.6590190799484957e-07, + "logits/chosen": -2.8145833015441895, + "logits/rejected": -2.9616990089416504, + "logps/chosen": -306.9453125, + "logps/rejected": -241.09246826171875, + "loss": 0.4184, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.055389538407325745, + "rewards/margins": 0.9024931192398071, + "rewards/rejected": -0.8471035957336426, + "step": 1102 + }, + { + "epoch": 0.13, + "learning_rate": 2.658667915252253e-07, + "logits/chosen": -3.4070816040039062, + "logits/rejected": -3.544999361038208, + "logps/chosen": -230.68817138671875, + "logps/rejected": -215.06195068359375, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21901550889015198, + "rewards/margins": 2.616081476211548, + "rewards/rejected": -2.3970658779144287, + "step": 1103 + }, + { + "epoch": 0.13, + "learning_rate": 2.658316750556011e-07, + "logits/chosen": -3.8440022468566895, + "logits/rejected": -3.4502382278442383, + "logps/chosen": -208.4144287109375, + "logps/rejected": -165.04470825195312, + "loss": 0.3298, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1886543333530426, + "rewards/margins": 1.1106518507003784, + "rewards/rejected": -0.9219975471496582, + "step": 1104 + }, + { + "epoch": 0.13, + "learning_rate": 2.6579655858597683e-07, + "logits/chosen": -2.5575389862060547, + "logits/rejected": -2.8185529708862305, + "logps/chosen": -279.93829345703125, + "logps/rejected": -322.7536926269531, + "loss": 0.1733, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5039827823638916, + "rewards/margins": 2.1893160343170166, + "rewards/rejected": -1.6853331327438354, + "step": 1105 + }, + { + "epoch": 0.13, + "learning_rate": 2.657614421163526e-07, + "logits/chosen": -3.8403432369232178, + "logits/rejected": -3.906801462173462, + "logps/chosen": -230.33114624023438, + "logps/rejected": -251.86447143554688, + "loss": 0.4141, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.013188734650611877, + "rewards/margins": 1.3897064924240112, + "rewards/rejected": -1.402895212173462, + "step": 1106 + }, + { + "epoch": 0.13, + "learning_rate": 2.657263256467283e-07, + "logits/chosen": -2.2037038803100586, + "logits/rejected": -2.3885562419891357, + "logps/chosen": -231.07699584960938, + "logps/rejected": -227.6617431640625, + "loss": 0.5779, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.038130901753902435, + "rewards/margins": 0.7168096899986267, + "rewards/rejected": -0.6786787509918213, + "step": 1107 + }, + { + "epoch": 0.13, + "learning_rate": 2.6569120917710404e-07, + "logits/chosen": -3.2889950275421143, + "logits/rejected": -3.2071967124938965, + "logps/chosen": -124.60722351074219, + "logps/rejected": -165.28273010253906, + "loss": 0.3187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13064639270305634, + "rewards/margins": 1.4673404693603516, + "rewards/rejected": -1.5979869365692139, + "step": 1108 + }, + { + "epoch": 0.13, + "learning_rate": 2.656560927074798e-07, + "logits/chosen": -2.0073370933532715, + "logits/rejected": -2.2956976890563965, + "logps/chosen": -300.39739990234375, + "logps/rejected": -172.28770446777344, + "loss": 0.4172, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22266235947608948, + "rewards/margins": 0.9134190082550049, + "rewards/rejected": -0.6907566785812378, + "step": 1109 + }, + { + "epoch": 0.13, + "learning_rate": 2.6562097623785555e-07, + "logits/chosen": -3.15394926071167, + "logits/rejected": -2.9639358520507812, + "logps/chosen": -376.3689270019531, + "logps/rejected": -422.06671142578125, + "loss": 0.3243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3569018542766571, + "rewards/margins": 1.726264238357544, + "rewards/rejected": -2.0831661224365234, + "step": 1110 + }, + { + "epoch": 0.13, + "learning_rate": 2.655858597682313e-07, + "logits/chosen": -3.0604772567749023, + "logits/rejected": -3.125676393508911, + "logps/chosen": -159.4078369140625, + "logps/rejected": -174.72250366210938, + "loss": 0.494, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1539372205734253, + "rewards/margins": 1.3404419422149658, + "rewards/rejected": -1.186504602432251, + "step": 1111 + }, + { + "epoch": 0.13, + "learning_rate": 2.6555074329860705e-07, + "logits/chosen": -3.1175551414489746, + "logits/rejected": -3.2056047916412354, + "logps/chosen": -431.451171875, + "logps/rejected": -279.39337158203125, + "loss": 0.2793, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12195511907339096, + "rewards/margins": 1.8242889642715454, + "rewards/rejected": -1.7023338079452515, + "step": 1112 + }, + { + "epoch": 0.13, + "learning_rate": 2.6551562682898275e-07, + "logits/chosen": -3.4403445720672607, + "logits/rejected": -2.997777223587036, + "logps/chosen": -290.8890075683594, + "logps/rejected": -202.0396270751953, + "loss": 0.4725, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09080486744642258, + "rewards/margins": 0.906679630279541, + "rewards/rejected": -0.8158748149871826, + "step": 1113 + }, + { + "epoch": 0.13, + "learning_rate": 2.654805103593585e-07, + "logits/chosen": -3.6513216495513916, + "logits/rejected": -3.675164222717285, + "logps/chosen": -189.81710815429688, + "logps/rejected": -122.24368286132812, + "loss": 0.3833, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34641045331954956, + "rewards/margins": 1.2991657257080078, + "rewards/rejected": -0.9527552127838135, + "step": 1114 + }, + { + "epoch": 0.13, + "learning_rate": 2.6544539388973426e-07, + "logits/chosen": -3.4919981956481934, + "logits/rejected": -3.3947594165802, + "logps/chosen": -293.4674072265625, + "logps/rejected": -342.3107604980469, + "loss": 0.7052, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5479456186294556, + "rewards/margins": 0.49130839109420776, + "rewards/rejected": -1.0392539501190186, + "step": 1115 + }, + { + "epoch": 0.13, + "learning_rate": 2.6541027742011e-07, + "logits/chosen": -3.504812717437744, + "logits/rejected": -3.1602911949157715, + "logps/chosen": -163.70880126953125, + "logps/rejected": -177.30487060546875, + "loss": 0.5358, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13161659240722656, + "rewards/margins": 1.2275633811950684, + "rewards/rejected": -1.3591800928115845, + "step": 1116 + }, + { + "epoch": 0.13, + "learning_rate": 2.6537516095048577e-07, + "logits/chosen": -3.481541156768799, + "logits/rejected": -3.0874714851379395, + "logps/chosen": -221.72964477539062, + "logps/rejected": -269.4653015136719, + "loss": 0.3156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1969405710697174, + "rewards/margins": 1.3327747583389282, + "rewards/rejected": -1.5297152996063232, + "step": 1117 + }, + { + "epoch": 0.13, + "learning_rate": 2.653400444808615e-07, + "logits/chosen": -3.1617136001586914, + "logits/rejected": -3.3838589191436768, + "logps/chosen": -221.19395446777344, + "logps/rejected": -188.6819305419922, + "loss": 0.5127, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3910120725631714, + "rewards/margins": 0.7642046213150024, + "rewards/rejected": -1.1552165746688843, + "step": 1118 + }, + { + "epoch": 0.13, + "learning_rate": 2.653049280112373e-07, + "logits/chosen": -3.4359350204467773, + "logits/rejected": -3.5465497970581055, + "logps/chosen": -286.7685852050781, + "logps/rejected": -167.11415100097656, + "loss": 0.384, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14442375302314758, + "rewards/margins": 1.4920531511306763, + "rewards/rejected": -1.3476293087005615, + "step": 1119 + }, + { + "epoch": 0.13, + "learning_rate": 2.6526981154161303e-07, + "logits/chosen": -3.0433547496795654, + "logits/rejected": -3.186009168624878, + "logps/chosen": -403.11566162109375, + "logps/rejected": -269.0381164550781, + "loss": 0.4046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42417412996292114, + "rewards/margins": 0.8064361810684204, + "rewards/rejected": -0.38226208090782166, + "step": 1120 + }, + { + "epoch": 0.13, + "learning_rate": 2.6523469507198873e-07, + "logits/chosen": -3.0856523513793945, + "logits/rejected": -3.4052882194519043, + "logps/chosen": -220.1566925048828, + "logps/rejected": -276.8841552734375, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6537335515022278, + "rewards/margins": 1.9068706035614014, + "rewards/rejected": -1.2531368732452393, + "step": 1121 + }, + { + "epoch": 0.13, + "learning_rate": 2.651995786023645e-07, + "logits/chosen": -3.0048563480377197, + "logits/rejected": -2.9592180252075195, + "logps/chosen": -355.81793212890625, + "logps/rejected": -223.06158447265625, + "loss": 0.6845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4326724410057068, + "rewards/margins": 0.44764286279678345, + "rewards/rejected": -0.8803153038024902, + "step": 1122 + }, + { + "epoch": 0.13, + "learning_rate": 2.6516446213274024e-07, + "logits/chosen": -2.7078959941864014, + "logits/rejected": -2.700967788696289, + "logps/chosen": -294.5287170410156, + "logps/rejected": -214.2689666748047, + "loss": 0.3838, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1467777043581009, + "rewards/margins": 0.9895541667938232, + "rewards/rejected": -1.1363317966461182, + "step": 1123 + }, + { + "epoch": 0.13, + "learning_rate": 2.65129345663116e-07, + "logits/chosen": -3.276919364929199, + "logits/rejected": -3.485095739364624, + "logps/chosen": -152.3216552734375, + "logps/rejected": -181.8409423828125, + "loss": 0.2962, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1355457752943039, + "rewards/margins": 1.6287325620651245, + "rewards/rejected": -1.4931867122650146, + "step": 1124 + }, + { + "epoch": 0.13, + "learning_rate": 2.6509422919349175e-07, + "logits/chosen": -2.919196605682373, + "logits/rejected": -2.790947675704956, + "logps/chosen": -538.43994140625, + "logps/rejected": -286.8423767089844, + "loss": 0.8325, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.058187201619148254, + "rewards/margins": 0.30952396988868713, + "rewards/rejected": -0.36771121621131897, + "step": 1125 + }, + { + "epoch": 0.13, + "learning_rate": 2.6505911272386745e-07, + "logits/chosen": -2.921529769897461, + "logits/rejected": -3.069629669189453, + "logps/chosen": -388.2000427246094, + "logps/rejected": -261.0193786621094, + "loss": 0.5618, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41262391209602356, + "rewards/margins": 0.421667218208313, + "rewards/rejected": -0.009043306112289429, + "step": 1126 + }, + { + "epoch": 0.13, + "learning_rate": 2.6502399625424326e-07, + "logits/chosen": -2.9504518508911133, + "logits/rejected": -3.107922315597534, + "logps/chosen": -121.18948364257812, + "logps/rejected": -274.86407470703125, + "loss": 0.4565, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46845734119415283, + "rewards/margins": 1.2508995532989502, + "rewards/rejected": -0.7824422121047974, + "step": 1127 + }, + { + "epoch": 0.13, + "learning_rate": 2.64988879784619e-07, + "logits/chosen": -3.111607789993286, + "logits/rejected": -3.314397096633911, + "logps/chosen": -153.505615234375, + "logps/rejected": -221.0738983154297, + "loss": 0.2983, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3992217183113098, + "rewards/margins": 1.3036235570907593, + "rewards/rejected": -0.9044018983840942, + "step": 1128 + }, + { + "epoch": 0.13, + "learning_rate": 2.649537633149947e-07, + "logits/chosen": -3.0053534507751465, + "logits/rejected": -3.0161592960357666, + "logps/chosen": -272.62664794921875, + "logps/rejected": -172.41371154785156, + "loss": 0.6409, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41247886419296265, + "rewards/margins": 0.5148661732673645, + "rewards/rejected": -0.9273449778556824, + "step": 1129 + }, + { + "epoch": 0.13, + "learning_rate": 2.6491864684537046e-07, + "logits/chosen": -2.398597002029419, + "logits/rejected": -2.696943998336792, + "logps/chosen": -188.15185546875, + "logps/rejected": -132.21063232421875, + "loss": 0.615, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.060358427464962006, + "rewards/margins": 0.21854951977729797, + "rewards/rejected": -0.2789079546928406, + "step": 1130 + }, + { + "epoch": 0.13, + "learning_rate": 2.648835303757462e-07, + "logits/chosen": -3.0550076961517334, + "logits/rejected": -2.8901286125183105, + "logps/chosen": -392.510498046875, + "logps/rejected": -505.06134033203125, + "loss": 0.4503, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31051135063171387, + "rewards/margins": 1.561906099319458, + "rewards/rejected": -1.8724174499511719, + "step": 1131 + }, + { + "epoch": 0.13, + "learning_rate": 2.6484841390612197e-07, + "logits/chosen": -3.2196896076202393, + "logits/rejected": -3.2791953086853027, + "logps/chosen": -130.60301208496094, + "logps/rejected": -175.105224609375, + "loss": 0.6484, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07421918958425522, + "rewards/margins": 0.8465479612350464, + "rewards/rejected": -0.7723287343978882, + "step": 1132 + }, + { + "epoch": 0.13, + "learning_rate": 2.648132974364977e-07, + "logits/chosen": -3.818188190460205, + "logits/rejected": -3.7487478256225586, + "logps/chosen": -237.87509155273438, + "logps/rejected": -215.31460571289062, + "loss": 0.537, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11975985765457153, + "rewards/margins": 0.8998790383338928, + "rewards/rejected": -1.019639015197754, + "step": 1133 + }, + { + "epoch": 0.13, + "learning_rate": 2.6477818096687343e-07, + "logits/chosen": -2.8150267601013184, + "logits/rejected": -2.9363014698028564, + "logps/chosen": -335.4690246582031, + "logps/rejected": -257.10845947265625, + "loss": 0.506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3079419434070587, + "rewards/margins": 0.6567952632904053, + "rewards/rejected": -0.9647372961044312, + "step": 1134 + }, + { + "epoch": 0.13, + "learning_rate": 2.647430644972492e-07, + "logits/chosen": -3.524766206741333, + "logits/rejected": -3.2200090885162354, + "logps/chosen": -422.7547912597656, + "logps/rejected": -306.89923095703125, + "loss": 0.4998, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01270398497581482, + "rewards/margins": 0.7448458671569824, + "rewards/rejected": -0.7575498819351196, + "step": 1135 + }, + { + "epoch": 0.13, + "learning_rate": 2.6470794802762493e-07, + "logits/chosen": -3.0551304817199707, + "logits/rejected": -3.240947723388672, + "logps/chosen": -324.19525146484375, + "logps/rejected": -278.0753479003906, + "loss": 0.2814, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37303459644317627, + "rewards/margins": 1.9975358247756958, + "rewards/rejected": -1.6245012283325195, + "step": 1136 + }, + { + "epoch": 0.13, + "learning_rate": 2.646728315580007e-07, + "logits/chosen": -3.1667635440826416, + "logits/rejected": -2.9411778450012207, + "logps/chosen": -164.29660034179688, + "logps/rejected": -266.9853515625, + "loss": 0.3283, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.041577816009521484, + "rewards/margins": 2.3180017471313477, + "rewards/rejected": -2.359579563140869, + "step": 1137 + }, + { + "epoch": 0.13, + "learning_rate": 2.6463771508837644e-07, + "logits/chosen": -2.3710896968841553, + "logits/rejected": -2.5383169651031494, + "logps/chosen": -450.00567626953125, + "logps/rejected": -319.1898193359375, + "loss": 0.4053, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3168097734451294, + "rewards/margins": 1.349205732345581, + "rewards/rejected": -1.0323959589004517, + "step": 1138 + }, + { + "epoch": 0.13, + "learning_rate": 2.646025986187522e-07, + "logits/chosen": -3.3401284217834473, + "logits/rejected": -3.5005054473876953, + "logps/chosen": -66.05833435058594, + "logps/rejected": -163.06019592285156, + "loss": 0.5837, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03818170726299286, + "rewards/margins": 0.8913490176200867, + "rewards/rejected": -0.929530680179596, + "step": 1139 + }, + { + "epoch": 0.13, + "learning_rate": 2.6456748214912795e-07, + "logits/chosen": -3.113823652267456, + "logits/rejected": -3.3339028358459473, + "logps/chosen": -336.054931640625, + "logps/rejected": -253.0380096435547, + "loss": 0.3977, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2362760752439499, + "rewards/margins": 1.52045738697052, + "rewards/rejected": -1.2841812372207642, + "step": 1140 + }, + { + "epoch": 0.13, + "learning_rate": 2.645323656795037e-07, + "logits/chosen": -3.5555615425109863, + "logits/rejected": -3.581885576248169, + "logps/chosen": -404.1273498535156, + "logps/rejected": -256.18072509765625, + "loss": 0.3216, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08714514225721359, + "rewards/margins": 1.3914008140563965, + "rewards/rejected": -1.3042556047439575, + "step": 1141 + }, + { + "epoch": 0.13, + "learning_rate": 2.644972492098794e-07, + "logits/chosen": -3.1227328777313232, + "logits/rejected": -3.675710678100586, + "logps/chosen": -127.75099182128906, + "logps/rejected": -276.4294128417969, + "loss": 0.4435, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00015526264905929565, + "rewards/margins": 1.424871802330017, + "rewards/rejected": -1.4247164726257324, + "step": 1142 + }, + { + "epoch": 0.13, + "learning_rate": 2.6446213274025516e-07, + "logits/chosen": -2.8072571754455566, + "logits/rejected": -2.714836359024048, + "logps/chosen": -183.17105102539062, + "logps/rejected": -163.0018310546875, + "loss": 0.4076, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1967604011297226, + "rewards/margins": 0.74781733751297, + "rewards/rejected": -0.5510568618774414, + "step": 1143 + }, + { + "epoch": 0.13, + "learning_rate": 2.644270162706309e-07, + "logits/chosen": -3.483196258544922, + "logits/rejected": -3.71005916595459, + "logps/chosen": -211.03248596191406, + "logps/rejected": -228.783203125, + "loss": 0.2445, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005968952551484108, + "rewards/margins": 2.059837818145752, + "rewards/rejected": -2.0538690090179443, + "step": 1144 + }, + { + "epoch": 0.13, + "learning_rate": 2.6439189980100667e-07, + "logits/chosen": -3.1735541820526123, + "logits/rejected": -3.2536001205444336, + "logps/chosen": -313.92059326171875, + "logps/rejected": -222.73507690429688, + "loss": 0.5248, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.028326570987701416, + "rewards/margins": 1.348564624786377, + "rewards/rejected": -1.3202379941940308, + "step": 1145 + }, + { + "epoch": 0.13, + "learning_rate": 2.643567833313824e-07, + "logits/chosen": -3.06083607673645, + "logits/rejected": -3.1605420112609863, + "logps/chosen": -458.60009765625, + "logps/rejected": -479.2228088378906, + "loss": 0.2074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3232947885990143, + "rewards/margins": 2.071357011795044, + "rewards/rejected": -1.7480623722076416, + "step": 1146 + }, + { + "epoch": 0.13, + "learning_rate": 2.643216668617581e-07, + "logits/chosen": -2.5362708568573, + "logits/rejected": -2.726102352142334, + "logps/chosen": -332.25128173828125, + "logps/rejected": -321.62591552734375, + "loss": 0.3376, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05548195540904999, + "rewards/margins": 1.3431968688964844, + "rewards/rejected": -1.287714958190918, + "step": 1147 + }, + { + "epoch": 0.13, + "learning_rate": 2.642865503921339e-07, + "logits/chosen": -3.7037458419799805, + "logits/rejected": -3.8020386695861816, + "logps/chosen": -265.3013916015625, + "logps/rejected": -235.034423828125, + "loss": 0.5098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32024362683296204, + "rewards/margins": 0.7788153290748596, + "rewards/rejected": -1.0990588665008545, + "step": 1148 + }, + { + "epoch": 0.13, + "learning_rate": 2.642514339225097e-07, + "logits/chosen": -2.742373466491699, + "logits/rejected": -2.7074027061462402, + "logps/chosen": -208.57275390625, + "logps/rejected": -198.40682983398438, + "loss": 0.4931, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12414845824241638, + "rewards/margins": 0.7232298254966736, + "rewards/rejected": -0.5990813374519348, + "step": 1149 + }, + { + "epoch": 0.13, + "learning_rate": 2.642163174528854e-07, + "logits/chosen": -3.285632610321045, + "logits/rejected": -3.0850794315338135, + "logps/chosen": -183.4230194091797, + "logps/rejected": -170.808837890625, + "loss": 0.6424, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1261528879404068, + "rewards/margins": 0.5317463278770447, + "rewards/rejected": -0.4055934250354767, + "step": 1150 + }, + { + "epoch": 0.13, + "learning_rate": 2.6418120098326114e-07, + "logits/chosen": -2.9554035663604736, + "logits/rejected": -2.7266845703125, + "logps/chosen": -163.44700622558594, + "logps/rejected": -205.77630615234375, + "loss": 0.4633, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.044466301798820496, + "rewards/margins": 0.8958410620689392, + "rewards/rejected": -0.8513747453689575, + "step": 1151 + }, + { + "epoch": 0.13, + "learning_rate": 2.641460845136369e-07, + "logits/chosen": -3.2066683769226074, + "logits/rejected": -2.8116397857666016, + "logps/chosen": -165.21005249023438, + "logps/rejected": -190.33555603027344, + "loss": 0.3289, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28757211565971375, + "rewards/margins": 1.0682761669158936, + "rewards/rejected": -0.780704140663147, + "step": 1152 + }, + { + "epoch": 0.13, + "learning_rate": 2.6411096804401264e-07, + "logits/chosen": -3.3010871410369873, + "logits/rejected": -3.458136558532715, + "logps/chosen": -199.72857666015625, + "logps/rejected": -227.2401123046875, + "loss": 0.4695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2094159573316574, + "rewards/margins": 0.8545058965682983, + "rewards/rejected": -1.0639218091964722, + "step": 1153 + }, + { + "epoch": 0.13, + "learning_rate": 2.640758515743884e-07, + "logits/chosen": -3.487640142440796, + "logits/rejected": -3.188237428665161, + "logps/chosen": -269.1490478515625, + "logps/rejected": -241.99160766601562, + "loss": 0.6235, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23445971310138702, + "rewards/margins": 0.6877169609069824, + "rewards/rejected": -0.4532572031021118, + "step": 1154 + }, + { + "epoch": 0.13, + "learning_rate": 2.640407351047641e-07, + "logits/chosen": -2.62355637550354, + "logits/rejected": -2.727726459503174, + "logps/chosen": -291.8756103515625, + "logps/rejected": -201.43405151367188, + "loss": 0.2658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4572782516479492, + "rewards/margins": 1.4590744972229004, + "rewards/rejected": -1.0017961263656616, + "step": 1155 + }, + { + "epoch": 0.13, + "learning_rate": 2.6400561863513985e-07, + "logits/chosen": -2.646324396133423, + "logits/rejected": -2.8772103786468506, + "logps/chosen": -131.6094512939453, + "logps/rejected": -212.9236297607422, + "loss": 0.3245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2445332407951355, + "rewards/margins": 1.5212008953094482, + "rewards/rejected": -1.765734076499939, + "step": 1156 + }, + { + "epoch": 0.13, + "learning_rate": 2.639705021655156e-07, + "logits/chosen": -3.822502613067627, + "logits/rejected": -3.9203057289123535, + "logps/chosen": -281.3533630371094, + "logps/rejected": -298.17938232421875, + "loss": 0.2597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14304950833320618, + "rewards/margins": 1.909485936164856, + "rewards/rejected": -2.05253529548645, + "step": 1157 + }, + { + "epoch": 0.13, + "learning_rate": 2.6393538569589136e-07, + "logits/chosen": -3.2148208618164062, + "logits/rejected": -3.26446533203125, + "logps/chosen": -431.0948181152344, + "logps/rejected": -238.06103515625, + "loss": 0.3958, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2417408525943756, + "rewards/margins": 1.1710360050201416, + "rewards/rejected": -1.4127769470214844, + "step": 1158 + }, + { + "epoch": 0.13, + "learning_rate": 2.639002692262671e-07, + "logits/chosen": -3.327244281768799, + "logits/rejected": -3.8737828731536865, + "logps/chosen": -216.22923278808594, + "logps/rejected": -252.5621337890625, + "loss": 0.4256, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5626663565635681, + "rewards/margins": 1.4025487899780273, + "rewards/rejected": -0.839882493019104, + "step": 1159 + }, + { + "epoch": 0.13, + "learning_rate": 2.638651527566428e-07, + "logits/chosen": -2.4803550243377686, + "logits/rejected": -2.5593791007995605, + "logps/chosen": -202.14480590820312, + "logps/rejected": -303.0001220703125, + "loss": 0.5275, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03640022873878479, + "rewards/margins": 1.2050468921661377, + "rewards/rejected": -1.2414470911026, + "step": 1160 + }, + { + "epoch": 0.13, + "learning_rate": 2.638300362870186e-07, + "logits/chosen": -2.804436683654785, + "logits/rejected": -2.898144483566284, + "logps/chosen": -365.42987060546875, + "logps/rejected": -222.9817657470703, + "loss": 0.9608, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6515011787414551, + "rewards/margins": 0.8350887298583984, + "rewards/rejected": -1.486589789390564, + "step": 1161 + }, + { + "epoch": 0.13, + "learning_rate": 2.637949198173944e-07, + "logits/chosen": -2.5278637409210205, + "logits/rejected": -2.7136406898498535, + "logps/chosen": -274.7254638671875, + "logps/rejected": -264.2034606933594, + "loss": 0.5115, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3426119387149811, + "rewards/margins": 0.9534878730773926, + "rewards/rejected": -1.2960999011993408, + "step": 1162 + }, + { + "epoch": 0.13, + "learning_rate": 2.637598033477701e-07, + "logits/chosen": -3.1660380363464355, + "logits/rejected": -3.440300464630127, + "logps/chosen": -152.2317352294922, + "logps/rejected": -265.6800537109375, + "loss": 0.4085, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12988163530826569, + "rewards/margins": 2.6952085494995117, + "rewards/rejected": -2.565326690673828, + "step": 1163 + }, + { + "epoch": 0.13, + "learning_rate": 2.6372468687814583e-07, + "logits/chosen": -3.953016757965088, + "logits/rejected": -4.006649017333984, + "logps/chosen": -146.94737243652344, + "logps/rejected": -227.32015991210938, + "loss": 0.3026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07558516412973404, + "rewards/margins": 1.61501944065094, + "rewards/rejected": -1.690604567527771, + "step": 1164 + }, + { + "epoch": 0.13, + "learning_rate": 2.636895704085216e-07, + "logits/chosen": -3.59645938873291, + "logits/rejected": -3.3608999252319336, + "logps/chosen": -252.2806854248047, + "logps/rejected": -200.71478271484375, + "loss": 0.3504, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.029424652457237244, + "rewards/margins": 1.357444167137146, + "rewards/rejected": -1.3280194997787476, + "step": 1165 + }, + { + "epoch": 0.13, + "learning_rate": 2.6365445393889734e-07, + "logits/chosen": -3.1203033924102783, + "logits/rejected": -2.9929466247558594, + "logps/chosen": -206.53280639648438, + "logps/rejected": -180.3179168701172, + "loss": 0.3933, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.036619167774915695, + "rewards/margins": 0.8788002729415894, + "rewards/rejected": -0.9154193997383118, + "step": 1166 + }, + { + "epoch": 0.13, + "learning_rate": 2.636193374692731e-07, + "logits/chosen": -3.5457210540771484, + "logits/rejected": -3.2167935371398926, + "logps/chosen": -252.66542053222656, + "logps/rejected": -160.60671997070312, + "loss": 0.4133, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2558909058570862, + "rewards/margins": 1.053233027458191, + "rewards/rejected": -0.7973421216011047, + "step": 1167 + }, + { + "epoch": 0.13, + "learning_rate": 2.635842209996488e-07, + "logits/chosen": -3.5444822311401367, + "logits/rejected": -3.1382288932800293, + "logps/chosen": -464.6197509765625, + "logps/rejected": -265.07501220703125, + "loss": 0.3242, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2711925804615021, + "rewards/margins": 1.4047973155975342, + "rewards/rejected": -1.6759898662567139, + "step": 1168 + }, + { + "epoch": 0.13, + "learning_rate": 2.6354910453002455e-07, + "logits/chosen": -2.88092041015625, + "logits/rejected": -2.92826509475708, + "logps/chosen": -300.991943359375, + "logps/rejected": -177.91783142089844, + "loss": 0.4204, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45847827196121216, + "rewards/margins": 0.976760983467102, + "rewards/rejected": -0.5182826519012451, + "step": 1169 + }, + { + "epoch": 0.13, + "learning_rate": 2.6351398806040035e-07, + "logits/chosen": -3.030751943588257, + "logits/rejected": -2.9094910621643066, + "logps/chosen": -257.4676513671875, + "logps/rejected": -175.7664031982422, + "loss": 0.4556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.022378571331501007, + "rewards/margins": 0.94871985912323, + "rewards/rejected": -0.9710984230041504, + "step": 1170 + }, + { + "epoch": 0.13, + "learning_rate": 2.6347887159077605e-07, + "logits/chosen": -3.7276697158813477, + "logits/rejected": -3.109769344329834, + "logps/chosen": -336.2955322265625, + "logps/rejected": -313.4561767578125, + "loss": 0.5864, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1274830847978592, + "rewards/margins": 0.44493913650512695, + "rewards/rejected": -0.5724222660064697, + "step": 1171 + }, + { + "epoch": 0.14, + "learning_rate": 2.634437551211518e-07, + "logits/chosen": -3.2787599563598633, + "logits/rejected": -2.883723497390747, + "logps/chosen": -323.1746826171875, + "logps/rejected": -269.3150634765625, + "loss": 0.4465, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26428043842315674, + "rewards/margins": 1.0590006113052368, + "rewards/rejected": -0.7947201728820801, + "step": 1172 + }, + { + "epoch": 0.14, + "learning_rate": 2.6340863865152756e-07, + "logits/chosen": -3.4941961765289307, + "logits/rejected": -3.412862777709961, + "logps/chosen": -101.77346801757812, + "logps/rejected": -161.71405029296875, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1525445580482483, + "rewards/margins": 1.894465446472168, + "rewards/rejected": -2.0470099449157715, + "step": 1173 + }, + { + "epoch": 0.14, + "learning_rate": 2.633735221819033e-07, + "logits/chosen": -3.2377138137817383, + "logits/rejected": -3.7454915046691895, + "logps/chosen": -191.99063110351562, + "logps/rejected": -236.19329833984375, + "loss": 0.3064, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.226438969373703, + "rewards/margins": 1.5194885730743408, + "rewards/rejected": -1.7459276914596558, + "step": 1174 + }, + { + "epoch": 0.14, + "learning_rate": 2.6333840571227907e-07, + "logits/chosen": -3.5576388835906982, + "logits/rejected": -3.320265293121338, + "logps/chosen": -347.0915832519531, + "logps/rejected": -266.872802734375, + "loss": 0.5172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.529593288898468, + "rewards/margins": 0.8514747023582458, + "rewards/rejected": -1.3810679912567139, + "step": 1175 + }, + { + "epoch": 0.14, + "learning_rate": 2.6330328924265477e-07, + "logits/chosen": -3.101223945617676, + "logits/rejected": -3.2820241451263428, + "logps/chosen": -386.18341064453125, + "logps/rejected": -237.38047790527344, + "loss": 0.4948, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23741579055786133, + "rewards/margins": 0.8363604545593262, + "rewards/rejected": -0.5989446640014648, + "step": 1176 + }, + { + "epoch": 0.14, + "learning_rate": 2.632681727730305e-07, + "logits/chosen": -3.100520372390747, + "logits/rejected": -3.142585277557373, + "logps/chosen": -249.1744842529297, + "logps/rejected": -230.4940643310547, + "loss": 0.4263, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1742902398109436, + "rewards/margins": 1.793858528137207, + "rewards/rejected": -1.9681488275527954, + "step": 1177 + }, + { + "epoch": 0.14, + "learning_rate": 2.632330563034063e-07, + "logits/chosen": -3.573251962661743, + "logits/rejected": -3.4033362865448, + "logps/chosen": -87.0963363647461, + "logps/rejected": -143.77487182617188, + "loss": 0.5074, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14743776619434357, + "rewards/margins": 0.6411831378936768, + "rewards/rejected": -0.4937452971935272, + "step": 1178 + }, + { + "epoch": 0.14, + "learning_rate": 2.6319793983378203e-07, + "logits/chosen": -2.7285749912261963, + "logits/rejected": -2.8543498516082764, + "logps/chosen": -259.2212829589844, + "logps/rejected": -320.10748291015625, + "loss": 0.34, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06968726217746735, + "rewards/margins": 1.613879680633545, + "rewards/rejected": -1.6835670471191406, + "step": 1179 + }, + { + "epoch": 0.14, + "learning_rate": 2.631628233641578e-07, + "logits/chosen": -2.950324535369873, + "logits/rejected": -3.041490077972412, + "logps/chosen": -315.1473083496094, + "logps/rejected": -242.81930541992188, + "loss": 0.5288, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1525687724351883, + "rewards/margins": 0.6952160000801086, + "rewards/rejected": -0.5426473021507263, + "step": 1180 + }, + { + "epoch": 0.14, + "learning_rate": 2.631277068945335e-07, + "logits/chosen": -3.0589451789855957, + "logits/rejected": -2.7982590198516846, + "logps/chosen": -448.7673034667969, + "logps/rejected": -301.37115478515625, + "loss": 0.1305, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.030834436416626, + "rewards/margins": 2.1853466033935547, + "rewards/rejected": -1.1545119285583496, + "step": 1181 + }, + { + "epoch": 0.14, + "learning_rate": 2.6309259042490924e-07, + "logits/chosen": -2.3934059143066406, + "logits/rejected": -2.337977647781372, + "logps/chosen": -215.0846405029297, + "logps/rejected": -180.48843383789062, + "loss": 0.4268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07917890697717667, + "rewards/margins": 0.8075821995735168, + "rewards/rejected": -0.7284033298492432, + "step": 1182 + }, + { + "epoch": 0.14, + "learning_rate": 2.6305747395528505e-07, + "logits/chosen": -2.7609660625457764, + "logits/rejected": -2.9934773445129395, + "logps/chosen": -311.5391540527344, + "logps/rejected": -224.38735961914062, + "loss": 0.4512, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37749576568603516, + "rewards/margins": 0.8909459114074707, + "rewards/rejected": -0.5134501457214355, + "step": 1183 + }, + { + "epoch": 0.14, + "learning_rate": 2.6302235748566075e-07, + "logits/chosen": -3.3895435333251953, + "logits/rejected": -3.5497915744781494, + "logps/chosen": -128.94158935546875, + "logps/rejected": -125.99014282226562, + "loss": 0.5472, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2376810610294342, + "rewards/margins": 0.65836501121521, + "rewards/rejected": -0.42068392038345337, + "step": 1184 + }, + { + "epoch": 0.14, + "learning_rate": 2.629872410160365e-07, + "logits/chosen": -2.526494026184082, + "logits/rejected": -2.765414237976074, + "logps/chosen": -207.33148193359375, + "logps/rejected": -300.87548828125, + "loss": 0.4863, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19567319750785828, + "rewards/margins": 1.3545764684677124, + "rewards/rejected": -1.5502498149871826, + "step": 1185 + }, + { + "epoch": 0.14, + "learning_rate": 2.6295212454641226e-07, + "logits/chosen": -2.939251661300659, + "logits/rejected": -2.755117177963257, + "logps/chosen": -183.38392639160156, + "logps/rejected": -202.21408081054688, + "loss": 0.3868, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009866468608379364, + "rewards/margins": 1.3239699602127075, + "rewards/rejected": -1.3141034841537476, + "step": 1186 + }, + { + "epoch": 0.14, + "learning_rate": 2.62917008076788e-07, + "logits/chosen": -3.3001906871795654, + "logits/rejected": -3.465632915496826, + "logps/chosen": -287.233642578125, + "logps/rejected": -273.9935302734375, + "loss": 0.3337, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3053727447986603, + "rewards/margins": 1.4368269443511963, + "rewards/rejected": -1.7421997785568237, + "step": 1187 + }, + { + "epoch": 0.14, + "learning_rate": 2.6288189160716376e-07, + "logits/chosen": -3.176413059234619, + "logits/rejected": -3.1112475395202637, + "logps/chosen": -185.18736267089844, + "logps/rejected": -236.84011840820312, + "loss": 0.2548, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02018701285123825, + "rewards/margins": 1.5580048561096191, + "rewards/rejected": -1.5378179550170898, + "step": 1188 + }, + { + "epoch": 0.14, + "learning_rate": 2.6284677513753947e-07, + "logits/chosen": -3.434058666229248, + "logits/rejected": -3.3798446655273438, + "logps/chosen": -280.39312744140625, + "logps/rejected": -304.622802734375, + "loss": 0.5577, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39674603939056396, + "rewards/margins": 0.48376649618148804, + "rewards/rejected": -0.880512535572052, + "step": 1189 + }, + { + "epoch": 0.14, + "learning_rate": 2.628116586679152e-07, + "logits/chosen": -3.294607639312744, + "logits/rejected": -3.136362075805664, + "logps/chosen": -232.37252807617188, + "logps/rejected": -138.50704956054688, + "loss": 0.6187, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0002674981951713562, + "rewards/margins": 0.31020474433898926, + "rewards/rejected": -0.3104722499847412, + "step": 1190 + }, + { + "epoch": 0.14, + "learning_rate": 2.6277654219829097e-07, + "logits/chosen": -3.156045436859131, + "logits/rejected": -3.253831624984741, + "logps/chosen": -296.0826416015625, + "logps/rejected": -253.27841186523438, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5889577865600586, + "rewards/margins": 2.6336612701416016, + "rewards/rejected": -2.044703245162964, + "step": 1191 + }, + { + "epoch": 0.14, + "learning_rate": 2.6274142572866673e-07, + "logits/chosen": -2.7620837688446045, + "logits/rejected": -2.728872060775757, + "logps/chosen": -103.6409912109375, + "logps/rejected": -97.89027404785156, + "loss": 0.7472, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25599542260169983, + "rewards/margins": 0.030623771250247955, + "rewards/rejected": -0.2866191864013672, + "step": 1192 + }, + { + "epoch": 0.14, + "learning_rate": 2.627063092590425e-07, + "logits/chosen": -3.0368871688842773, + "logits/rejected": -2.8645498752593994, + "logps/chosen": -206.26072692871094, + "logps/rejected": -187.91006469726562, + "loss": 0.5278, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17441660165786743, + "rewards/margins": 0.44798439741134644, + "rewards/rejected": -0.6224009990692139, + "step": 1193 + }, + { + "epoch": 0.14, + "learning_rate": 2.6267119278941823e-07, + "logits/chosen": -2.9795985221862793, + "logits/rejected": -3.218921661376953, + "logps/chosen": -410.1129455566406, + "logps/rejected": -461.47027587890625, + "loss": 0.2911, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5375205874443054, + "rewards/margins": 1.5954796075820923, + "rewards/rejected": -1.0579591989517212, + "step": 1194 + }, + { + "epoch": 0.14, + "learning_rate": 2.62636076319794e-07, + "logits/chosen": -3.753042697906494, + "logits/rejected": -3.6333470344543457, + "logps/chosen": -187.22178649902344, + "logps/rejected": -187.4674530029297, + "loss": 0.5789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.57789146900177, + "rewards/margins": 0.8073467016220093, + "rewards/rejected": -1.3852381706237793, + "step": 1195 + }, + { + "epoch": 0.14, + "learning_rate": 2.6260095985016974e-07, + "logits/chosen": -2.9603824615478516, + "logits/rejected": -3.2064013481140137, + "logps/chosen": -277.5440673828125, + "logps/rejected": -266.9850769042969, + "loss": 0.7151, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5664582848548889, + "rewards/margins": 0.5947904586791992, + "rewards/rejected": -1.1612486839294434, + "step": 1196 + }, + { + "epoch": 0.14, + "learning_rate": 2.6256584338054544e-07, + "logits/chosen": -3.846341848373413, + "logits/rejected": -3.5898423194885254, + "logps/chosen": -194.23770141601562, + "logps/rejected": -304.7276611328125, + "loss": 0.4886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.453357994556427, + "rewards/margins": 0.8248500227928162, + "rewards/rejected": -1.2782080173492432, + "step": 1197 + }, + { + "epoch": 0.14, + "learning_rate": 2.625307269109212e-07, + "logits/chosen": -3.616652488708496, + "logits/rejected": -3.5786354541778564, + "logps/chosen": -305.54388427734375, + "logps/rejected": -181.3466339111328, + "loss": 0.4595, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08782371878623962, + "rewards/margins": 1.290541410446167, + "rewards/rejected": -1.202717661857605, + "step": 1198 + }, + { + "epoch": 0.14, + "learning_rate": 2.6249561044129695e-07, + "logits/chosen": -3.1046082973480225, + "logits/rejected": -3.1202735900878906, + "logps/chosen": -167.1373291015625, + "logps/rejected": -262.22149658203125, + "loss": 0.7328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23059387505054474, + "rewards/margins": 0.7052990198135376, + "rewards/rejected": -0.9358928799629211, + "step": 1199 + }, + { + "epoch": 0.14, + "learning_rate": 2.624604939716727e-07, + "logits/chosen": -3.6484487056732178, + "logits/rejected": -3.410665988922119, + "logps/chosen": -196.5653076171875, + "logps/rejected": -286.01800537109375, + "loss": 0.3786, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17695683240890503, + "rewards/margins": 2.156592845916748, + "rewards/rejected": -2.333549976348877, + "step": 1200 + }, + { + "epoch": 0.14, + "learning_rate": 2.6242537750204846e-07, + "logits/chosen": -3.497046947479248, + "logits/rejected": -3.4260759353637695, + "logps/chosen": -103.20014953613281, + "logps/rejected": -167.691162109375, + "loss": 0.4766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.020122356712818146, + "rewards/margins": 0.8744657635688782, + "rewards/rejected": -0.8945881128311157, + "step": 1201 + }, + { + "epoch": 0.14, + "learning_rate": 2.623902610324242e-07, + "logits/chosen": -3.442551612854004, + "logits/rejected": -3.2935521602630615, + "logps/chosen": -403.45306396484375, + "logps/rejected": -298.0394287109375, + "loss": 0.8709, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41956818103790283, + "rewards/margins": 0.3612107038497925, + "rewards/rejected": -0.7807788848876953, + "step": 1202 + }, + { + "epoch": 0.14, + "learning_rate": 2.623551445627999e-07, + "logits/chosen": -3.6135380268096924, + "logits/rejected": -3.618638038635254, + "logps/chosen": -296.8923645019531, + "logps/rejected": -347.677001953125, + "loss": 0.3003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4914509057998657, + "rewards/margins": 1.7996132373809814, + "rewards/rejected": -1.3081622123718262, + "step": 1203 + }, + { + "epoch": 0.14, + "learning_rate": 2.623200280931757e-07, + "logits/chosen": -2.6543173789978027, + "logits/rejected": -2.757397413253784, + "logps/chosen": -208.95167541503906, + "logps/rejected": -248.06454467773438, + "loss": 0.2899, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2177644670009613, + "rewards/margins": 1.8779759407043457, + "rewards/rejected": -2.09574031829834, + "step": 1204 + }, + { + "epoch": 0.14, + "learning_rate": 2.622849116235514e-07, + "logits/chosen": -3.0110507011413574, + "logits/rejected": -2.8109912872314453, + "logps/chosen": -454.27880859375, + "logps/rejected": -273.0755920410156, + "loss": 0.3737, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04671168327331543, + "rewards/margins": 1.133287787437439, + "rewards/rejected": -1.0865761041641235, + "step": 1205 + }, + { + "epoch": 0.14, + "learning_rate": 2.622497951539272e-07, + "logits/chosen": -2.9705848693847656, + "logits/rejected": -3.4963574409484863, + "logps/chosen": -184.06619262695312, + "logps/rejected": -175.2967529296875, + "loss": 0.6394, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29350727796554565, + "rewards/margins": 1.7692946195602417, + "rewards/rejected": -2.0628018379211426, + "step": 1206 + }, + { + "epoch": 0.14, + "learning_rate": 2.6221467868430293e-07, + "logits/chosen": -3.502842426300049, + "logits/rejected": -3.630740165710449, + "logps/chosen": -295.95281982421875, + "logps/rejected": -211.37509155273438, + "loss": 0.4251, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12740890681743622, + "rewards/margins": 0.9196792840957642, + "rewards/rejected": -0.7922704219818115, + "step": 1207 + }, + { + "epoch": 0.14, + "learning_rate": 2.621795622146787e-07, + "logits/chosen": -3.059675455093384, + "logits/rejected": -3.207529067993164, + "logps/chosen": -296.3916015625, + "logps/rejected": -233.34600830078125, + "loss": 0.3382, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5595195889472961, + "rewards/margins": 1.4842607975006104, + "rewards/rejected": -0.9247411489486694, + "step": 1208 + }, + { + "epoch": 0.14, + "learning_rate": 2.6214444574505444e-07, + "logits/chosen": -3.7902140617370605, + "logits/rejected": -3.2357122898101807, + "logps/chosen": -267.97381591796875, + "logps/rejected": -255.15994262695312, + "loss": 0.8247, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2194626331329346, + "rewards/margins": -0.16171997785568237, + "rewards/rejected": -1.057742714881897, + "step": 1209 + }, + { + "epoch": 0.14, + "learning_rate": 2.621093292754302e-07, + "logits/chosen": -3.1220099925994873, + "logits/rejected": -3.432051181793213, + "logps/chosen": -213.610107421875, + "logps/rejected": -271.35150146484375, + "loss": 0.7012, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20678411424160004, + "rewards/margins": 1.4082266092300415, + "rewards/rejected": -1.6150107383728027, + "step": 1210 + }, + { + "epoch": 0.14, + "learning_rate": 2.620742128058059e-07, + "logits/chosen": -3.6795268058776855, + "logits/rejected": -3.4376368522644043, + "logps/chosen": -392.11285400390625, + "logps/rejected": -221.34605407714844, + "loss": 0.5037, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05437291041016579, + "rewards/margins": 0.8223720788955688, + "rewards/rejected": -0.8767449259757996, + "step": 1211 + }, + { + "epoch": 0.14, + "learning_rate": 2.6203909633618164e-07, + "logits/chosen": -3.2899718284606934, + "logits/rejected": -3.053548574447632, + "logps/chosen": -211.36944580078125, + "logps/rejected": -204.5604705810547, + "loss": 0.4386, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1047859638929367, + "rewards/margins": 1.0950453281402588, + "rewards/rejected": -1.199831247329712, + "step": 1212 + }, + { + "epoch": 0.14, + "learning_rate": 2.620039798665574e-07, + "logits/chosen": -2.5597352981567383, + "logits/rejected": -2.5827295780181885, + "logps/chosen": -89.36774444580078, + "logps/rejected": -145.798095703125, + "loss": 0.4773, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14074450731277466, + "rewards/margins": 1.1211738586425781, + "rewards/rejected": -1.261918306350708, + "step": 1213 + }, + { + "epoch": 0.14, + "learning_rate": 2.6196886339693315e-07, + "logits/chosen": -2.7932703495025635, + "logits/rejected": -2.922492504119873, + "logps/chosen": -181.97044372558594, + "logps/rejected": -191.1028594970703, + "loss": 0.3136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01868841052055359, + "rewards/margins": 1.501734972000122, + "rewards/rejected": -1.520423412322998, + "step": 1214 + }, + { + "epoch": 0.14, + "learning_rate": 2.619337469273089e-07, + "logits/chosen": -2.409871816635132, + "logits/rejected": -2.48637056350708, + "logps/chosen": -335.9388732910156, + "logps/rejected": -239.02120971679688, + "loss": 0.5348, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17619505524635315, + "rewards/margins": 1.254032850265503, + "rewards/rejected": -1.4302278757095337, + "step": 1215 + }, + { + "epoch": 0.14, + "learning_rate": 2.618986304576846e-07, + "logits/chosen": -2.8044636249542236, + "logits/rejected": -3.308803081512451, + "logps/chosen": -207.7530059814453, + "logps/rejected": -290.6954040527344, + "loss": 0.1834, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40450558066368103, + "rewards/margins": 3.2002768516540527, + "rewards/rejected": -2.7957711219787598, + "step": 1216 + }, + { + "epoch": 0.14, + "learning_rate": 2.618635139880604e-07, + "logits/chosen": -2.6024394035339355, + "logits/rejected": -2.6902215480804443, + "logps/chosen": -385.6015625, + "logps/rejected": -362.0187072753906, + "loss": 0.2161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27261805534362793, + "rewards/margins": 1.67147958278656, + "rewards/rejected": -1.944097638130188, + "step": 1217 + }, + { + "epoch": 0.14, + "learning_rate": 2.6182839751843617e-07, + "logits/chosen": -2.7859675884246826, + "logits/rejected": -2.707383871078491, + "logps/chosen": -257.12530517578125, + "logps/rejected": -177.07656860351562, + "loss": 0.4865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47198402881622314, + "rewards/margins": 0.918461799621582, + "rewards/rejected": -1.3904458284378052, + "step": 1218 + }, + { + "epoch": 0.14, + "learning_rate": 2.6179328104881187e-07, + "logits/chosen": -3.0397071838378906, + "logits/rejected": -2.8149397373199463, + "logps/chosen": -207.83535766601562, + "logps/rejected": -202.79832458496094, + "loss": 0.4612, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6916698217391968, + "rewards/margins": 0.7458405494689941, + "rewards/rejected": -0.054170697927474976, + "step": 1219 + }, + { + "epoch": 0.14, + "learning_rate": 2.617581645791876e-07, + "logits/chosen": -3.9591684341430664, + "logits/rejected": -3.9738426208496094, + "logps/chosen": -276.22320556640625, + "logps/rejected": -251.22706604003906, + "loss": 0.3134, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03408125787973404, + "rewards/margins": 1.5596749782562256, + "rewards/rejected": -1.5937564373016357, + "step": 1220 + }, + { + "epoch": 0.14, + "learning_rate": 2.617230481095634e-07, + "logits/chosen": -2.9041268825531006, + "logits/rejected": -2.7624189853668213, + "logps/chosen": -316.84149169921875, + "logps/rejected": -446.8443603515625, + "loss": 0.2016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21041324734687805, + "rewards/margins": 2.1312544345855713, + "rewards/rejected": -1.9208412170410156, + "step": 1221 + }, + { + "epoch": 0.14, + "learning_rate": 2.6168793163993913e-07, + "logits/chosen": -3.041705369949341, + "logits/rejected": -2.8295531272888184, + "logps/chosen": -207.0461883544922, + "logps/rejected": -236.07821655273438, + "loss": 0.3421, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1854623556137085, + "rewards/margins": 1.3783029317855835, + "rewards/rejected": -1.192840576171875, + "step": 1222 + }, + { + "epoch": 0.14, + "learning_rate": 2.616528151703149e-07, + "logits/chosen": -2.881880044937134, + "logits/rejected": -3.0564446449279785, + "logps/chosen": -298.6236877441406, + "logps/rejected": -205.32493591308594, + "loss": 0.4108, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20007996261119843, + "rewards/margins": 2.0181522369384766, + "rewards/rejected": -2.2182321548461914, + "step": 1223 + }, + { + "epoch": 0.14, + "learning_rate": 2.616176987006906e-07, + "logits/chosen": -3.3401763439178467, + "logits/rejected": -3.094081401824951, + "logps/chosen": -264.14288330078125, + "logps/rejected": -306.1895751953125, + "loss": 0.3563, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1618640422821045, + "rewards/margins": 1.0272811651229858, + "rewards/rejected": -0.8654171824455261, + "step": 1224 + }, + { + "epoch": 0.14, + "learning_rate": 2.6158258223106634e-07, + "logits/chosen": -3.0743939876556396, + "logits/rejected": -2.9931015968322754, + "logps/chosen": -329.1996154785156, + "logps/rejected": -326.04376220703125, + "loss": 0.2952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05222554877400398, + "rewards/margins": 1.5065926313400269, + "rewards/rejected": -1.5588182210922241, + "step": 1225 + }, + { + "epoch": 0.14, + "learning_rate": 2.6154746576144215e-07, + "logits/chosen": -3.040006637573242, + "logits/rejected": -3.3644235134124756, + "logps/chosen": -195.9557342529297, + "logps/rejected": -341.0067443847656, + "loss": 0.4675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021116290241479874, + "rewards/margins": 1.4113513231277466, + "rewards/rejected": -1.4324675798416138, + "step": 1226 + }, + { + "epoch": 0.14, + "learning_rate": 2.6151234929181785e-07, + "logits/chosen": -2.9008522033691406, + "logits/rejected": -2.878602981567383, + "logps/chosen": -239.6218719482422, + "logps/rejected": -266.8584289550781, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0775146633386612, + "rewards/margins": 1.4534400701522827, + "rewards/rejected": -1.3759254217147827, + "step": 1227 + }, + { + "epoch": 0.14, + "learning_rate": 2.614772328221936e-07, + "logits/chosen": -2.6953284740448, + "logits/rejected": -2.8165056705474854, + "logps/chosen": -199.06704711914062, + "logps/rejected": -193.72781372070312, + "loss": 0.5636, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.062103547155857086, + "rewards/margins": 0.5349035263061523, + "rewards/rejected": -0.47279998660087585, + "step": 1228 + }, + { + "epoch": 0.14, + "learning_rate": 2.6144211635256935e-07, + "logits/chosen": -3.118995428085327, + "logits/rejected": -3.1838812828063965, + "logps/chosen": -196.59376525878906, + "logps/rejected": -340.3057556152344, + "loss": 0.3924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27144718170166016, + "rewards/margins": 1.4590797424316406, + "rewards/rejected": -1.7305269241333008, + "step": 1229 + }, + { + "epoch": 0.14, + "learning_rate": 2.614069998829451e-07, + "logits/chosen": -2.4227356910705566, + "logits/rejected": -2.388801097869873, + "logps/chosen": -270.2369689941406, + "logps/rejected": -399.65130615234375, + "loss": 0.3958, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.102063849568367, + "rewards/margins": 1.264222502708435, + "rewards/rejected": -1.162158727645874, + "step": 1230 + }, + { + "epoch": 0.14, + "learning_rate": 2.6137188341332086e-07, + "logits/chosen": -2.8920364379882812, + "logits/rejected": -2.704340696334839, + "logps/chosen": -325.8037414550781, + "logps/rejected": -313.0450134277344, + "loss": 0.5686, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1999027281999588, + "rewards/margins": 0.6791297793388367, + "rewards/rejected": -0.8790324926376343, + "step": 1231 + }, + { + "epoch": 0.14, + "learning_rate": 2.6133676694369656e-07, + "logits/chosen": -3.270562171936035, + "logits/rejected": -3.500222682952881, + "logps/chosen": -230.20037841796875, + "logps/rejected": -201.35707092285156, + "loss": 0.5523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18336951732635498, + "rewards/margins": 0.8029341101646423, + "rewards/rejected": -0.6195645928382874, + "step": 1232 + }, + { + "epoch": 0.14, + "learning_rate": 2.613016504740723e-07, + "logits/chosen": -2.02183198928833, + "logits/rejected": -2.1559906005859375, + "logps/chosen": -188.16981506347656, + "logps/rejected": -255.16818237304688, + "loss": 0.6533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26096677780151367, + "rewards/margins": 0.3351914882659912, + "rewards/rejected": -0.5961582064628601, + "step": 1233 + }, + { + "epoch": 0.14, + "learning_rate": 2.6126653400444807e-07, + "logits/chosen": -2.315589427947998, + "logits/rejected": -2.302199363708496, + "logps/chosen": -316.6741027832031, + "logps/rejected": -372.4480285644531, + "loss": 0.6047, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22616541385650635, + "rewards/margins": 1.6129578351974487, + "rewards/rejected": -1.839123249053955, + "step": 1234 + }, + { + "epoch": 0.14, + "learning_rate": 2.612314175348238e-07, + "logits/chosen": -3.4977946281433105, + "logits/rejected": -3.2461538314819336, + "logps/chosen": -214.875244140625, + "logps/rejected": -199.24325561523438, + "loss": 0.9586, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1848311722278595, + "rewards/margins": -0.42164939641952515, + "rewards/rejected": 0.23681819438934326, + "step": 1235 + }, + { + "epoch": 0.14, + "learning_rate": 2.611963010651996e-07, + "logits/chosen": -3.2507569789886475, + "logits/rejected": -3.3427178859710693, + "logps/chosen": -204.02667236328125, + "logps/rejected": -211.50567626953125, + "loss": 0.3064, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24388472735881805, + "rewards/margins": 1.5830415487289429, + "rewards/rejected": -1.3391568660736084, + "step": 1236 + }, + { + "epoch": 0.14, + "learning_rate": 2.611611845955753e-07, + "logits/chosen": -2.399153709411621, + "logits/rejected": -2.4923818111419678, + "logps/chosen": -231.37274169921875, + "logps/rejected": -296.00885009765625, + "loss": 0.4762, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04505304992198944, + "rewards/margins": 1.7013713121414185, + "rewards/rejected": -1.656318187713623, + "step": 1237 + }, + { + "epoch": 0.14, + "learning_rate": 2.611260681259511e-07, + "logits/chosen": -2.4996585845947266, + "logits/rejected": -2.37514328956604, + "logps/chosen": -433.9339599609375, + "logps/rejected": -389.885498046875, + "loss": 0.225, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4906667470932007, + "rewards/margins": 1.8463754653930664, + "rewards/rejected": -1.3557087182998657, + "step": 1238 + }, + { + "epoch": 0.14, + "learning_rate": 2.6109095165632684e-07, + "logits/chosen": -2.565741539001465, + "logits/rejected": -2.809699535369873, + "logps/chosen": -286.50823974609375, + "logps/rejected": -229.80722045898438, + "loss": 0.2866, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.455525279045105, + "rewards/margins": 1.3931748867034912, + "rewards/rejected": -0.9376495480537415, + "step": 1239 + }, + { + "epoch": 0.14, + "learning_rate": 2.6105583518670254e-07, + "logits/chosen": -3.428104877471924, + "logits/rejected": -3.4822864532470703, + "logps/chosen": -229.9440460205078, + "logps/rejected": -224.49395751953125, + "loss": 0.4375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12973244488239288, + "rewards/margins": 1.756459355354309, + "rewards/rejected": -1.8861918449401855, + "step": 1240 + }, + { + "epoch": 0.14, + "learning_rate": 2.610207187170783e-07, + "logits/chosen": -2.95914626121521, + "logits/rejected": -2.8100621700286865, + "logps/chosen": -301.2416687011719, + "logps/rejected": -291.8368835449219, + "loss": 0.3733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016038358211517334, + "rewards/margins": 0.9308395385742188, + "rewards/rejected": -0.9468779563903809, + "step": 1241 + }, + { + "epoch": 0.14, + "learning_rate": 2.6098560224745405e-07, + "logits/chosen": -3.0953927040100098, + "logits/rejected": -3.2632038593292236, + "logps/chosen": -72.46780395507812, + "logps/rejected": -128.88055419921875, + "loss": 0.3498, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19745410978794098, + "rewards/margins": 1.226307988166809, + "rewards/rejected": -1.0288538932800293, + "step": 1242 + }, + { + "epoch": 0.14, + "learning_rate": 2.609504857778298e-07, + "logits/chosen": -2.893667697906494, + "logits/rejected": -3.084047555923462, + "logps/chosen": -192.5306396484375, + "logps/rejected": -212.5204620361328, + "loss": 0.3174, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16227740049362183, + "rewards/margins": 2.066342353820801, + "rewards/rejected": -1.9040648937225342, + "step": 1243 + }, + { + "epoch": 0.14, + "learning_rate": 2.6091536930820556e-07, + "logits/chosen": -3.1496381759643555, + "logits/rejected": -3.1395297050476074, + "logps/chosen": -405.1414489746094, + "logps/rejected": -362.4064025878906, + "loss": 0.3606, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10727101564407349, + "rewards/margins": 1.2385269403457642, + "rewards/rejected": -1.3457978963851929, + "step": 1244 + }, + { + "epoch": 0.14, + "learning_rate": 2.6088025283858126e-07, + "logits/chosen": -3.751851797103882, + "logits/rejected": -3.6648006439208984, + "logps/chosen": -351.31402587890625, + "logps/rejected": -265.43951416015625, + "loss": 0.5905, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2989659905433655, + "rewards/margins": 0.8734144568443298, + "rewards/rejected": -0.5744484663009644, + "step": 1245 + }, + { + "epoch": 0.14, + "learning_rate": 2.60845136368957e-07, + "logits/chosen": -2.638249635696411, + "logits/rejected": -2.515268087387085, + "logps/chosen": -122.7892074584961, + "logps/rejected": -190.04763793945312, + "loss": 0.3774, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11463932693004608, + "rewards/margins": 1.002360224723816, + "rewards/rejected": -0.8877209424972534, + "step": 1246 + }, + { + "epoch": 0.14, + "learning_rate": 2.6081001989933277e-07, + "logits/chosen": -2.860322952270508, + "logits/rejected": -3.108102798461914, + "logps/chosen": -283.07989501953125, + "logps/rejected": -455.01971435546875, + "loss": 0.4285, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06789693981409073, + "rewards/margins": 1.4148297309875488, + "rewards/rejected": -1.4827266931533813, + "step": 1247 + }, + { + "epoch": 0.14, + "learning_rate": 2.607749034297085e-07, + "logits/chosen": -3.7937490940093994, + "logits/rejected": -3.579148054122925, + "logps/chosen": -134.30584716796875, + "logps/rejected": -144.80386352539062, + "loss": 0.65, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28740403056144714, + "rewards/margins": 1.2118487358093262, + "rewards/rejected": -0.9244446754455566, + "step": 1248 + }, + { + "epoch": 0.14, + "learning_rate": 2.6073978696008427e-07, + "logits/chosen": -2.1343436241149902, + "logits/rejected": -2.459807872772217, + "logps/chosen": -299.5708312988281, + "logps/rejected": -276.59307861328125, + "loss": 0.3958, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04353836178779602, + "rewards/margins": 1.7347865104675293, + "rewards/rejected": -1.6912481784820557, + "step": 1249 + }, + { + "epoch": 0.14, + "learning_rate": 2.6070467049046e-07, + "logits/chosen": -3.473194122314453, + "logits/rejected": -3.427959442138672, + "logps/chosen": -139.63751220703125, + "logps/rejected": -196.62179565429688, + "loss": 0.2858, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17462079226970673, + "rewards/margins": 1.8477861881256104, + "rewards/rejected": -1.6731654405593872, + "step": 1250 + }, + { + "epoch": 0.14, + "learning_rate": 2.606695540208358e-07, + "logits/chosen": -2.88488507270813, + "logits/rejected": -3.114806652069092, + "logps/chosen": -192.2626190185547, + "logps/rejected": -242.4342041015625, + "loss": 0.571, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15763500332832336, + "rewards/margins": 1.1480674743652344, + "rewards/rejected": -0.9904325008392334, + "step": 1251 + }, + { + "epoch": 0.14, + "learning_rate": 2.6063443755121153e-07, + "logits/chosen": -3.074521064758301, + "logits/rejected": -3.0216445922851562, + "logps/chosen": -241.5642547607422, + "logps/rejected": -258.04193115234375, + "loss": 0.3361, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6394314169883728, + "rewards/margins": 1.4470374584197998, + "rewards/rejected": -0.8076058626174927, + "step": 1252 + }, + { + "epoch": 0.14, + "learning_rate": 2.6059932108158724e-07, + "logits/chosen": -3.411302328109741, + "logits/rejected": -3.1982076168060303, + "logps/chosen": -252.36891174316406, + "logps/rejected": -222.7611541748047, + "loss": 0.3041, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1281997710466385, + "rewards/margins": 2.6057016849517822, + "rewards/rejected": -2.477501630783081, + "step": 1253 + }, + { + "epoch": 0.14, + "learning_rate": 2.60564204611963e-07, + "logits/chosen": -2.6756086349487305, + "logits/rejected": -2.9818902015686035, + "logps/chosen": -335.9529724121094, + "logps/rejected": -279.1679382324219, + "loss": 0.415, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04776123911142349, + "rewards/margins": 1.0915879011154175, + "rewards/rejected": -1.043826699256897, + "step": 1254 + }, + { + "epoch": 0.14, + "learning_rate": 2.6052908814233874e-07, + "logits/chosen": -2.853520154953003, + "logits/rejected": -2.442591667175293, + "logps/chosen": -322.25640869140625, + "logps/rejected": -362.79144287109375, + "loss": 0.422, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4186449944972992, + "rewards/margins": 1.2126481533050537, + "rewards/rejected": -0.7940032482147217, + "step": 1255 + }, + { + "epoch": 0.14, + "learning_rate": 2.604939716727145e-07, + "logits/chosen": -2.764824628829956, + "logits/rejected": -2.6589951515197754, + "logps/chosen": -292.8194580078125, + "logps/rejected": -321.891845703125, + "loss": 0.8181, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3818066716194153, + "rewards/margins": 1.3206005096435547, + "rewards/rejected": -1.7024071216583252, + "step": 1256 + }, + { + "epoch": 0.14, + "learning_rate": 2.6045885520309025e-07, + "logits/chosen": -2.8895866870880127, + "logits/rejected": -2.912273406982422, + "logps/chosen": -376.0956726074219, + "logps/rejected": -214.6659393310547, + "loss": 0.7383, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23697806894779205, + "rewards/margins": 0.4268755316734314, + "rewards/rejected": -0.663853645324707, + "step": 1257 + }, + { + "epoch": 0.15, + "learning_rate": 2.6042373873346595e-07, + "logits/chosen": -3.118597984313965, + "logits/rejected": -3.0233066082000732, + "logps/chosen": -234.56399536132812, + "logps/rejected": -260.0751953125, + "loss": 0.9664, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4169316291809082, + "rewards/margins": -0.28791970014572144, + "rewards/rejected": -0.12901189923286438, + "step": 1258 + }, + { + "epoch": 0.15, + "learning_rate": 2.603886222638417e-07, + "logits/chosen": -4.106902599334717, + "logits/rejected": -3.9498696327209473, + "logps/chosen": -203.10264587402344, + "logps/rejected": -171.9250946044922, + "loss": 0.5098, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06892501562833786, + "rewards/margins": 0.9735456109046936, + "rewards/rejected": -0.9046206474304199, + "step": 1259 + }, + { + "epoch": 0.15, + "learning_rate": 2.603535057942175e-07, + "logits/chosen": -3.2452073097229004, + "logits/rejected": -2.8832969665527344, + "logps/chosen": -532.1001586914062, + "logps/rejected": -245.87144470214844, + "loss": 0.6972, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21020887792110443, + "rewards/margins": 1.137298822402954, + "rewards/rejected": -1.3475077152252197, + "step": 1260 + }, + { + "epoch": 0.15, + "learning_rate": 2.603183893245932e-07, + "logits/chosen": -4.053818702697754, + "logits/rejected": -3.4232513904571533, + "logps/chosen": -293.766845703125, + "logps/rejected": -209.13522338867188, + "loss": 0.458, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4996156692504883, + "rewards/margins": 0.8672517538070679, + "rewards/rejected": -1.3668673038482666, + "step": 1261 + }, + { + "epoch": 0.15, + "learning_rate": 2.6028327285496897e-07, + "logits/chosen": -2.867202043533325, + "logits/rejected": -2.9406702518463135, + "logps/chosen": -255.1750030517578, + "logps/rejected": -320.5224914550781, + "loss": 0.3698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3969117999076843, + "rewards/margins": 1.4228506088256836, + "rewards/rejected": -1.8197624683380127, + "step": 1262 + }, + { + "epoch": 0.15, + "learning_rate": 2.602481563853447e-07, + "logits/chosen": -3.0925660133361816, + "logits/rejected": -3.043455123901367, + "logps/chosen": -124.40054321289062, + "logps/rejected": -154.558837890625, + "loss": 0.7198, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5317804217338562, + "rewards/margins": 0.24838754534721375, + "rewards/rejected": -0.7801679968833923, + "step": 1263 + }, + { + "epoch": 0.15, + "learning_rate": 2.602130399157205e-07, + "logits/chosen": -3.5569663047790527, + "logits/rejected": -3.1190528869628906, + "logps/chosen": -251.91822814941406, + "logps/rejected": -229.67767333984375, + "loss": 0.563, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0971226766705513, + "rewards/margins": 0.9962552785873413, + "rewards/rejected": -0.8991326093673706, + "step": 1264 + }, + { + "epoch": 0.15, + "learning_rate": 2.6017792344609623e-07, + "logits/chosen": -3.1028783321380615, + "logits/rejected": -3.1372337341308594, + "logps/chosen": -263.86541748046875, + "logps/rejected": -294.6171875, + "loss": 0.3522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3497154414653778, + "rewards/margins": 1.131418228149414, + "rewards/rejected": -1.4811336994171143, + "step": 1265 + }, + { + "epoch": 0.15, + "learning_rate": 2.6014280697647193e-07, + "logits/chosen": -3.4317240715026855, + "logits/rejected": -2.9669342041015625, + "logps/chosen": -421.86822509765625, + "logps/rejected": -480.8500671386719, + "loss": 0.3302, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16947278380393982, + "rewards/margins": 2.216059684753418, + "rewards/rejected": -2.0465869903564453, + "step": 1266 + }, + { + "epoch": 0.15, + "learning_rate": 2.601076905068477e-07, + "logits/chosen": -2.837587594985962, + "logits/rejected": -2.6420607566833496, + "logps/chosen": -315.09417724609375, + "logps/rejected": -285.3146057128906, + "loss": 0.3883, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1792723685503006, + "rewards/margins": 1.0624361038208008, + "rewards/rejected": -0.8831638097763062, + "step": 1267 + }, + { + "epoch": 0.15, + "learning_rate": 2.6007257403722344e-07, + "logits/chosen": -3.2752652168273926, + "logits/rejected": -3.2611289024353027, + "logps/chosen": -343.8131103515625, + "logps/rejected": -227.1166534423828, + "loss": 0.4165, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2622881829738617, + "rewards/margins": 1.7003253698349, + "rewards/rejected": -1.962613582611084, + "step": 1268 + }, + { + "epoch": 0.15, + "learning_rate": 2.600374575675992e-07, + "logits/chosen": -3.229914665222168, + "logits/rejected": -3.509070634841919, + "logps/chosen": -512.1622314453125, + "logps/rejected": -375.2642822265625, + "loss": 0.2016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.022244591265916824, + "rewards/margins": 2.8704724311828613, + "rewards/rejected": -2.8482279777526855, + "step": 1269 + }, + { + "epoch": 0.15, + "learning_rate": 2.6000234109797495e-07, + "logits/chosen": -2.4674558639526367, + "logits/rejected": -2.614603042602539, + "logps/chosen": -404.96197509765625, + "logps/rejected": -350.16046142578125, + "loss": 0.5131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17047154903411865, + "rewards/margins": 0.685597836971283, + "rewards/rejected": -0.8560694456100464, + "step": 1270 + }, + { + "epoch": 0.15, + "learning_rate": 2.599672246283507e-07, + "logits/chosen": -3.1197214126586914, + "logits/rejected": -3.0060253143310547, + "logps/chosen": -366.09332275390625, + "logps/rejected": -249.36624145507812, + "loss": 0.3422, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1908625066280365, + "rewards/margins": 1.33160400390625, + "rewards/rejected": -1.1407414674758911, + "step": 1271 + }, + { + "epoch": 0.15, + "learning_rate": 2.5993210815872645e-07, + "logits/chosen": -2.741081476211548, + "logits/rejected": -2.368004322052002, + "logps/chosen": -334.4292297363281, + "logps/rejected": -326.1259765625, + "loss": 0.5526, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09303249418735504, + "rewards/margins": 0.712805986404419, + "rewards/rejected": -0.8058385252952576, + "step": 1272 + }, + { + "epoch": 0.15, + "learning_rate": 2.598969916891022e-07, + "logits/chosen": -2.9212374687194824, + "logits/rejected": -2.659376621246338, + "logps/chosen": -282.4862060546875, + "logps/rejected": -204.09007263183594, + "loss": 0.4624, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15247690677642822, + "rewards/margins": 1.0850062370300293, + "rewards/rejected": -0.9325292110443115, + "step": 1273 + }, + { + "epoch": 0.15, + "learning_rate": 2.598618752194779e-07, + "logits/chosen": -2.710968255996704, + "logits/rejected": -2.8165411949157715, + "logps/chosen": -232.6937255859375, + "logps/rejected": -423.9354553222656, + "loss": 0.3082, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05913534015417099, + "rewards/margins": 2.411250114440918, + "rewards/rejected": -2.352114677429199, + "step": 1274 + }, + { + "epoch": 0.15, + "learning_rate": 2.5982675874985366e-07, + "logits/chosen": -3.1423983573913574, + "logits/rejected": -3.17585825920105, + "logps/chosen": -117.81981658935547, + "logps/rejected": -206.79238891601562, + "loss": 0.3499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3020673394203186, + "rewards/margins": 1.8536884784698486, + "rewards/rejected": -2.1557559967041016, + "step": 1275 + }, + { + "epoch": 0.15, + "learning_rate": 2.597916422802294e-07, + "logits/chosen": -2.9921231269836426, + "logits/rejected": -3.17488956451416, + "logps/chosen": -207.14071655273438, + "logps/rejected": -308.4000549316406, + "loss": 0.2779, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03297881782054901, + "rewards/margins": 1.7886940240859985, + "rewards/rejected": -1.8216726779937744, + "step": 1276 + }, + { + "epoch": 0.15, + "learning_rate": 2.5975652581060517e-07, + "logits/chosen": -3.2946152687072754, + "logits/rejected": -2.874213218688965, + "logps/chosen": -382.6255798339844, + "logps/rejected": -251.70358276367188, + "loss": 0.4156, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16489487886428833, + "rewards/margins": 1.3828957080841064, + "rewards/rejected": -1.218000888824463, + "step": 1277 + }, + { + "epoch": 0.15, + "learning_rate": 2.597214093409809e-07, + "logits/chosen": -3.8209757804870605, + "logits/rejected": -3.94968843460083, + "logps/chosen": -166.04222106933594, + "logps/rejected": -219.08106994628906, + "loss": 0.3817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0673372820019722, + "rewards/margins": 1.873173475265503, + "rewards/rejected": -1.940510630607605, + "step": 1278 + }, + { + "epoch": 0.15, + "learning_rate": 2.596862928713566e-07, + "logits/chosen": -2.888946533203125, + "logits/rejected": -2.8783323764801025, + "logps/chosen": -151.70892333984375, + "logps/rejected": -185.6434783935547, + "loss": 0.5683, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04054642841219902, + "rewards/margins": 0.4204131066799164, + "rewards/rejected": -0.37986665964126587, + "step": 1279 + }, + { + "epoch": 0.15, + "learning_rate": 2.596511764017324e-07, + "logits/chosen": -2.5820536613464355, + "logits/rejected": -2.842998504638672, + "logps/chosen": -345.49261474609375, + "logps/rejected": -290.9452819824219, + "loss": 0.3511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11564323306083679, + "rewards/margins": 1.0789257287979126, + "rewards/rejected": -1.1945688724517822, + "step": 1280 + }, + { + "epoch": 0.15, + "learning_rate": 2.5961605993210813e-07, + "logits/chosen": -3.3668510913848877, + "logits/rejected": -3.2866597175598145, + "logps/chosen": -383.4378662109375, + "logps/rejected": -355.94366455078125, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30375710129737854, + "rewards/margins": 4.16273832321167, + "rewards/rejected": -3.8589813709259033, + "step": 1281 + }, + { + "epoch": 0.15, + "learning_rate": 2.595809434624839e-07, + "logits/chosen": -3.7810850143432617, + "logits/rejected": -3.6944327354431152, + "logps/chosen": -299.6083984375, + "logps/rejected": -158.12698364257812, + "loss": 0.9182, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8842620849609375, + "rewards/margins": 0.2196952998638153, + "rewards/rejected": -1.1039574146270752, + "step": 1282 + }, + { + "epoch": 0.15, + "learning_rate": 2.5954582699285964e-07, + "logits/chosen": -3.3617372512817383, + "logits/rejected": -3.576263666152954, + "logps/chosen": -111.85790252685547, + "logps/rejected": -132.98446655273438, + "loss": 0.4583, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1069284975528717, + "rewards/margins": 0.9012032747268677, + "rewards/rejected": -0.7942748069763184, + "step": 1283 + }, + { + "epoch": 0.15, + "learning_rate": 2.595107105232354e-07, + "logits/chosen": -3.6058006286621094, + "logits/rejected": -3.1365981101989746, + "logps/chosen": -241.54397583007812, + "logps/rejected": -213.15835571289062, + "loss": 0.6164, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35670149326324463, + "rewards/margins": 1.0320377349853516, + "rewards/rejected": -1.3887391090393066, + "step": 1284 + }, + { + "epoch": 0.15, + "learning_rate": 2.5947559405361115e-07, + "logits/chosen": -2.5054614543914795, + "logits/rejected": -2.9064981937408447, + "logps/chosen": -282.3585205078125, + "logps/rejected": -191.29638671875, + "loss": 0.1967, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6501674056053162, + "rewards/margins": 2.5472280979156494, + "rewards/rejected": -1.897060751914978, + "step": 1285 + }, + { + "epoch": 0.15, + "learning_rate": 2.594404775839869e-07, + "logits/chosen": -3.532264232635498, + "logits/rejected": -3.5364441871643066, + "logps/chosen": -206.72787475585938, + "logps/rejected": -194.4606170654297, + "loss": 0.5487, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23328472673892975, + "rewards/margins": 0.772308886051178, + "rewards/rejected": -1.0055935382843018, + "step": 1286 + }, + { + "epoch": 0.15, + "learning_rate": 2.594053611143626e-07, + "logits/chosen": -3.2990620136260986, + "logits/rejected": -3.459531307220459, + "logps/chosen": -245.9703369140625, + "logps/rejected": -217.69302368164062, + "loss": 0.4682, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08416178077459335, + "rewards/margins": 0.6740147471427917, + "rewards/rejected": -0.5898529887199402, + "step": 1287 + }, + { + "epoch": 0.15, + "learning_rate": 2.5937024464473836e-07, + "logits/chosen": -4.074586868286133, + "logits/rejected": -4.008978843688965, + "logps/chosen": -164.36434936523438, + "logps/rejected": -130.31381225585938, + "loss": 0.5328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17911836504936218, + "rewards/margins": 1.4981013536453247, + "rewards/rejected": -1.6772196292877197, + "step": 1288 + }, + { + "epoch": 0.15, + "learning_rate": 2.593351281751141e-07, + "logits/chosen": -2.689720392227173, + "logits/rejected": -2.728484630584717, + "logps/chosen": -291.118896484375, + "logps/rejected": -225.03164672851562, + "loss": 0.718, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2973710894584656, + "rewards/margins": 0.5966485738754272, + "rewards/rejected": -0.8940197825431824, + "step": 1289 + }, + { + "epoch": 0.15, + "learning_rate": 2.5930001170548986e-07, + "logits/chosen": -3.393251657485962, + "logits/rejected": -3.3632326126098633, + "logps/chosen": -271.45233154296875, + "logps/rejected": -283.763916015625, + "loss": 0.3478, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07456254959106445, + "rewards/margins": 1.062697410583496, + "rewards/rejected": -0.9881348609924316, + "step": 1290 + }, + { + "epoch": 0.15, + "learning_rate": 2.592648952358656e-07, + "logits/chosen": -3.6126067638397217, + "logits/rejected": -3.651167392730713, + "logps/chosen": -255.2904052734375, + "logps/rejected": -241.7370147705078, + "loss": 0.4147, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2138913869857788, + "rewards/margins": 2.8744282722473145, + "rewards/rejected": -2.660537004470825, + "step": 1291 + }, + { + "epoch": 0.15, + "learning_rate": 2.5922977876624137e-07, + "logits/chosen": -2.827080726623535, + "logits/rejected": -3.1792807579040527, + "logps/chosen": -296.0008239746094, + "logps/rejected": -284.73541259765625, + "loss": 0.3339, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0038594603538513184, + "rewards/margins": 1.9282746315002441, + "rewards/rejected": -1.9321341514587402, + "step": 1292 + }, + { + "epoch": 0.15, + "learning_rate": 2.5919466229661707e-07, + "logits/chosen": -2.8907952308654785, + "logits/rejected": -2.895859718322754, + "logps/chosen": -211.64186096191406, + "logps/rejected": -241.32791137695312, + "loss": 0.5372, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3302392363548279, + "rewards/margins": 0.5769890546798706, + "rewards/rejected": -0.9072282910346985, + "step": 1293 + }, + { + "epoch": 0.15, + "learning_rate": 2.591595458269929e-07, + "logits/chosen": -3.444784641265869, + "logits/rejected": -3.0962746143341064, + "logps/chosen": -291.75152587890625, + "logps/rejected": -136.48199462890625, + "loss": 0.8326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4753166139125824, + "rewards/margins": 0.36070555448532104, + "rewards/rejected": -0.8360220789909363, + "step": 1294 + }, + { + "epoch": 0.15, + "learning_rate": 2.591244293573686e-07, + "logits/chosen": -3.5744259357452393, + "logits/rejected": -3.6351099014282227, + "logps/chosen": -230.36766052246094, + "logps/rejected": -158.3636932373047, + "loss": 0.5953, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2603738307952881, + "rewards/margins": 0.6703643798828125, + "rewards/rejected": -0.9307381510734558, + "step": 1295 + }, + { + "epoch": 0.15, + "learning_rate": 2.5908931288774433e-07, + "logits/chosen": -2.756112813949585, + "logits/rejected": -2.728919267654419, + "logps/chosen": -330.05670166015625, + "logps/rejected": -325.7234191894531, + "loss": 0.2788, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4362429082393646, + "rewards/margins": 1.570178508758545, + "rewards/rejected": -1.133935570716858, + "step": 1296 + }, + { + "epoch": 0.15, + "learning_rate": 2.590541964181201e-07, + "logits/chosen": -3.059100866317749, + "logits/rejected": -3.200026512145996, + "logps/chosen": -214.97756958007812, + "logps/rejected": -276.8621826171875, + "loss": 0.4629, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15328067541122437, + "rewards/margins": 0.8713489770889282, + "rewards/rejected": -1.0246297121047974, + "step": 1297 + }, + { + "epoch": 0.15, + "learning_rate": 2.5901907994849584e-07, + "logits/chosen": -3.480208396911621, + "logits/rejected": -3.6882548332214355, + "logps/chosen": -293.0123291015625, + "logps/rejected": -277.376220703125, + "loss": 0.7636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4127768874168396, + "rewards/margins": 1.7253131866455078, + "rewards/rejected": -2.138090133666992, + "step": 1298 + }, + { + "epoch": 0.15, + "learning_rate": 2.589839634788716e-07, + "logits/chosen": -2.7288150787353516, + "logits/rejected": -2.754058361053467, + "logps/chosen": -272.82305908203125, + "logps/rejected": -338.906494140625, + "loss": 0.5749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5219551920890808, + "rewards/margins": 1.0228451490402222, + "rewards/rejected": -1.5448004007339478, + "step": 1299 + }, + { + "epoch": 0.15, + "learning_rate": 2.5894884700924735e-07, + "logits/chosen": -3.9196786880493164, + "logits/rejected": -4.041879653930664, + "logps/chosen": -107.51551055908203, + "logps/rejected": -183.47714233398438, + "loss": 0.6468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.284287691116333, + "rewards/margins": 1.2184343338012695, + "rewards/rejected": -1.502721905708313, + "step": 1300 + }, + { + "epoch": 0.15, + "learning_rate": 2.5891373053962305e-07, + "logits/chosen": -2.800198793411255, + "logits/rejected": -2.808199644088745, + "logps/chosen": -212.77622985839844, + "logps/rejected": -249.2012939453125, + "loss": 0.7444, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19236822426319122, + "rewards/margins": 0.7992057204246521, + "rewards/rejected": -0.9915739297866821, + "step": 1301 + }, + { + "epoch": 0.15, + "learning_rate": 2.588786140699988e-07, + "logits/chosen": -3.0747416019439697, + "logits/rejected": -3.10546612739563, + "logps/chosen": -263.40899658203125, + "logps/rejected": -244.4541015625, + "loss": 0.3898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15884999930858612, + "rewards/margins": 1.0069949626922607, + "rewards/rejected": -1.1658449172973633, + "step": 1302 + }, + { + "epoch": 0.15, + "learning_rate": 2.5884349760037456e-07, + "logits/chosen": -2.9573512077331543, + "logits/rejected": -3.1573257446289062, + "logps/chosen": -255.2872314453125, + "logps/rejected": -378.9390869140625, + "loss": 0.5537, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1679532527923584, + "rewards/margins": 1.058525800704956, + "rewards/rejected": -1.2264790534973145, + "step": 1303 + }, + { + "epoch": 0.15, + "learning_rate": 2.588083811307503e-07, + "logits/chosen": -3.092365264892578, + "logits/rejected": -2.8742856979370117, + "logps/chosen": -253.04208374023438, + "logps/rejected": -148.779296875, + "loss": 0.4893, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06802567839622498, + "rewards/margins": 1.276638388633728, + "rewards/rejected": -1.2086126804351807, + "step": 1304 + }, + { + "epoch": 0.15, + "learning_rate": 2.5877326466112607e-07, + "logits/chosen": -3.113018751144409, + "logits/rejected": -3.186005115509033, + "logps/chosen": -138.67953491210938, + "logps/rejected": -145.1586456298828, + "loss": 0.478, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09670032560825348, + "rewards/margins": 0.9930595755577087, + "rewards/rejected": -0.8963592648506165, + "step": 1305 + }, + { + "epoch": 0.15, + "learning_rate": 2.587381481915018e-07, + "logits/chosen": -3.447582483291626, + "logits/rejected": -3.4858503341674805, + "logps/chosen": -192.93130493164062, + "logps/rejected": -187.5643310546875, + "loss": 0.3989, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3504765033721924, + "rewards/margins": 0.9960402250289917, + "rewards/rejected": -0.6455637216567993, + "step": 1306 + }, + { + "epoch": 0.15, + "learning_rate": 2.5870303172187757e-07, + "logits/chosen": -3.5059256553649902, + "logits/rejected": -3.4606306552886963, + "logps/chosen": -305.87811279296875, + "logps/rejected": -251.9505157470703, + "loss": 0.4216, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41335949301719666, + "rewards/margins": 1.1022999286651611, + "rewards/rejected": -1.5156594514846802, + "step": 1307 + }, + { + "epoch": 0.15, + "learning_rate": 2.5866791525225333e-07, + "logits/chosen": -3.2620432376861572, + "logits/rejected": -3.291475296020508, + "logps/chosen": -356.01129150390625, + "logps/rejected": -222.39328002929688, + "loss": 0.4457, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15751652419567108, + "rewards/margins": 0.9991155862808228, + "rewards/rejected": -0.8415990471839905, + "step": 1308 + }, + { + "epoch": 0.15, + "learning_rate": 2.5863279878262903e-07, + "logits/chosen": -2.914588212966919, + "logits/rejected": -2.7374255657196045, + "logps/chosen": -341.7193603515625, + "logps/rejected": -304.9005432128906, + "loss": 0.4779, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10665670782327652, + "rewards/margins": 1.4474871158599854, + "rewards/rejected": -1.3408302068710327, + "step": 1309 + }, + { + "epoch": 0.15, + "learning_rate": 2.585976823130048e-07, + "logits/chosen": -3.4403414726257324, + "logits/rejected": -3.1371870040893555, + "logps/chosen": -153.07095336914062, + "logps/rejected": -145.9384002685547, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6596205234527588, + "rewards/margins": 0.7322850823402405, + "rewards/rejected": -1.3919057846069336, + "step": 1310 + }, + { + "epoch": 0.15, + "learning_rate": 2.5856256584338054e-07, + "logits/chosen": -3.120992422103882, + "logits/rejected": -3.071319341659546, + "logps/chosen": -242.12005615234375, + "logps/rejected": -289.5521240234375, + "loss": 0.5491, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10733366012573242, + "rewards/margins": 0.7503787279129028, + "rewards/rejected": -0.6430450081825256, + "step": 1311 + }, + { + "epoch": 0.15, + "learning_rate": 2.585274493737563e-07, + "logits/chosen": -2.70572829246521, + "logits/rejected": -2.6100759506225586, + "logps/chosen": -345.95416259765625, + "logps/rejected": -226.0701446533203, + "loss": 0.2302, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.551724910736084, + "rewards/margins": 1.677099585533142, + "rewards/rejected": -1.1253745555877686, + "step": 1312 + }, + { + "epoch": 0.15, + "learning_rate": 2.5849233290413204e-07, + "logits/chosen": -2.434321641921997, + "logits/rejected": -2.4609122276306152, + "logps/chosen": -336.62823486328125, + "logps/rejected": -291.9024658203125, + "loss": 0.3314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06507334858179092, + "rewards/margins": 1.4492993354797363, + "rewards/rejected": -1.51437246799469, + "step": 1313 + }, + { + "epoch": 0.15, + "learning_rate": 2.5845721643450774e-07, + "logits/chosen": -3.1006526947021484, + "logits/rejected": -2.8798556327819824, + "logps/chosen": -343.8378601074219, + "logps/rejected": -280.94281005859375, + "loss": 0.4671, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.025163792073726654, + "rewards/margins": 1.2496585845947266, + "rewards/rejected": -1.2244948148727417, + "step": 1314 + }, + { + "epoch": 0.15, + "learning_rate": 2.584220999648835e-07, + "logits/chosen": -2.8396191596984863, + "logits/rejected": -3.412388801574707, + "logps/chosen": -134.468505859375, + "logps/rejected": -190.02305603027344, + "loss": 0.4279, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3174448609352112, + "rewards/margins": 1.9064357280731201, + "rewards/rejected": -1.5889910459518433, + "step": 1315 + }, + { + "epoch": 0.15, + "learning_rate": 2.583869834952593e-07, + "logits/chosen": -3.92814302444458, + "logits/rejected": -3.6312851905822754, + "logps/chosen": -235.47775268554688, + "logps/rejected": -195.09982299804688, + "loss": 0.4433, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1444067656993866, + "rewards/margins": 0.8122756481170654, + "rewards/rejected": -0.6678689122200012, + "step": 1316 + }, + { + "epoch": 0.15, + "learning_rate": 2.58351867025635e-07, + "logits/chosen": -2.1425931453704834, + "logits/rejected": -2.3161802291870117, + "logps/chosen": -238.81643676757812, + "logps/rejected": -198.32586669921875, + "loss": 0.4758, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1310850977897644, + "rewards/margins": 1.0565693378448486, + "rewards/rejected": -1.1876544952392578, + "step": 1317 + }, + { + "epoch": 0.15, + "learning_rate": 2.5831675055601076e-07, + "logits/chosen": -3.299617290496826, + "logits/rejected": -3.3087098598480225, + "logps/chosen": -144.30484008789062, + "logps/rejected": -235.2721405029297, + "loss": 0.2335, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09746107459068298, + "rewards/margins": 2.4311635494232178, + "rewards/rejected": -2.333702325820923, + "step": 1318 + }, + { + "epoch": 0.15, + "learning_rate": 2.582816340863865e-07, + "logits/chosen": -3.2929329872131348, + "logits/rejected": -3.379911422729492, + "logps/chosen": -116.9150619506836, + "logps/rejected": -212.86988830566406, + "loss": 0.5589, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15315155684947968, + "rewards/margins": 1.0505399703979492, + "rewards/rejected": -0.8973883986473083, + "step": 1319 + }, + { + "epoch": 0.15, + "learning_rate": 2.5824651761676227e-07, + "logits/chosen": -3.469623565673828, + "logits/rejected": -3.279270648956299, + "logps/chosen": -231.5189208984375, + "logps/rejected": -184.39744567871094, + "loss": 0.5347, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04548273980617523, + "rewards/margins": 0.6871821880340576, + "rewards/rejected": -0.6416994333267212, + "step": 1320 + }, + { + "epoch": 0.15, + "learning_rate": 2.58211401147138e-07, + "logits/chosen": -3.0359675884246826, + "logits/rejected": -3.128007650375366, + "logps/chosen": -432.75286865234375, + "logps/rejected": -339.6465148925781, + "loss": 0.5621, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.188251331448555, + "rewards/margins": 0.9831944108009338, + "rewards/rejected": -0.79494309425354, + "step": 1321 + }, + { + "epoch": 0.15, + "learning_rate": 2.581762846775137e-07, + "logits/chosen": -3.544379234313965, + "logits/rejected": -3.136430263519287, + "logps/chosen": -318.9771728515625, + "logps/rejected": -327.9295654296875, + "loss": 0.6057, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24966707825660706, + "rewards/margins": 1.5483887195587158, + "rewards/rejected": -1.7980557680130005, + "step": 1322 + }, + { + "epoch": 0.15, + "learning_rate": 2.581411682078895e-07, + "logits/chosen": -3.1763789653778076, + "logits/rejected": -3.3069419860839844, + "logps/chosen": -227.46932983398438, + "logps/rejected": -166.3903045654297, + "loss": 0.4, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08789289742708206, + "rewards/margins": 1.0674495697021484, + "rewards/rejected": -0.9795567393302917, + "step": 1323 + }, + { + "epoch": 0.15, + "learning_rate": 2.5810605173826523e-07, + "logits/chosen": -2.833911895751953, + "logits/rejected": -2.856973648071289, + "logps/chosen": -213.52345275878906, + "logps/rejected": -294.8507080078125, + "loss": 0.354, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01629640907049179, + "rewards/margins": 1.7067673206329346, + "rewards/rejected": -1.6904709339141846, + "step": 1324 + }, + { + "epoch": 0.15, + "learning_rate": 2.58070935268641e-07, + "logits/chosen": -2.3931326866149902, + "logits/rejected": -2.4397151470184326, + "logps/chosen": -218.00160217285156, + "logps/rejected": -239.77919006347656, + "loss": 0.4171, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08324562013149261, + "rewards/margins": 1.0585038661956787, + "rewards/rejected": -0.9752583503723145, + "step": 1325 + }, + { + "epoch": 0.15, + "learning_rate": 2.5803581879901674e-07, + "logits/chosen": -2.6797802448272705, + "logits/rejected": -2.593536853790283, + "logps/chosen": -241.52352905273438, + "logps/rejected": -298.5032653808594, + "loss": 0.3037, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1565411239862442, + "rewards/margins": 1.6411848068237305, + "rewards/rejected": -1.484643578529358, + "step": 1326 + }, + { + "epoch": 0.15, + "learning_rate": 2.5800070232939244e-07, + "logits/chosen": -3.2821755409240723, + "logits/rejected": -3.0782220363616943, + "logps/chosen": -233.2435302734375, + "logps/rejected": -292.4256591796875, + "loss": 0.449, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0727543756365776, + "rewards/margins": 1.4862308502197266, + "rewards/rejected": -1.5589852333068848, + "step": 1327 + }, + { + "epoch": 0.15, + "learning_rate": 2.5796558585976825e-07, + "logits/chosen": -3.5607621669769287, + "logits/rejected": -3.2520854473114014, + "logps/chosen": -451.402587890625, + "logps/rejected": -259.1007385253906, + "loss": 0.653, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12275351583957672, + "rewards/margins": 0.33525529503822327, + "rewards/rejected": -0.4580088257789612, + "step": 1328 + }, + { + "epoch": 0.15, + "learning_rate": 2.57930469390144e-07, + "logits/chosen": -3.107207775115967, + "logits/rejected": -3.1797850131988525, + "logps/chosen": -374.5860595703125, + "logps/rejected": -537.3178100585938, + "loss": 0.3995, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0880943238735199, + "rewards/margins": 1.243980884552002, + "rewards/rejected": -1.1558865308761597, + "step": 1329 + }, + { + "epoch": 0.15, + "learning_rate": 2.578953529205197e-07, + "logits/chosen": -2.8288464546203613, + "logits/rejected": -2.8298370838165283, + "logps/chosen": -220.2200927734375, + "logps/rejected": -378.2925720214844, + "loss": 0.4168, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34553563594818115, + "rewards/margins": 1.497823715209961, + "rewards/rejected": -1.843359351158142, + "step": 1330 + }, + { + "epoch": 0.15, + "learning_rate": 2.5786023645089545e-07, + "logits/chosen": -3.011458396911621, + "logits/rejected": -2.9832024574279785, + "logps/chosen": -194.14791870117188, + "logps/rejected": -300.98291015625, + "loss": 0.2893, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.008231490850448608, + "rewards/margins": 2.0071983337402344, + "rewards/rejected": -2.015429735183716, + "step": 1331 + }, + { + "epoch": 0.15, + "learning_rate": 2.578251199812712e-07, + "logits/chosen": -3.0211377143859863, + "logits/rejected": -2.7029144763946533, + "logps/chosen": -299.80572509765625, + "logps/rejected": -187.5726318359375, + "loss": 0.6022, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20919884741306305, + "rewards/margins": 0.7625776529312134, + "rewards/rejected": -0.9717764854431152, + "step": 1332 + }, + { + "epoch": 0.15, + "learning_rate": 2.5779000351164696e-07, + "logits/chosen": -2.910921573638916, + "logits/rejected": -2.8228819370269775, + "logps/chosen": -320.490234375, + "logps/rejected": -363.3335266113281, + "loss": 0.4639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23510269820690155, + "rewards/margins": 1.20047128200531, + "rewards/rejected": -1.435573935508728, + "step": 1333 + }, + { + "epoch": 0.15, + "learning_rate": 2.577548870420227e-07, + "logits/chosen": -1.9972152709960938, + "logits/rejected": -1.8992042541503906, + "logps/chosen": -279.6734924316406, + "logps/rejected": -266.1451721191406, + "loss": 0.6477, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17433536052703857, + "rewards/margins": 0.2880229949951172, + "rewards/rejected": -0.46235838532447815, + "step": 1334 + }, + { + "epoch": 0.15, + "learning_rate": 2.577197705723984e-07, + "logits/chosen": -2.8397796154022217, + "logits/rejected": -2.9166674613952637, + "logps/chosen": -172.4813690185547, + "logps/rejected": -178.2071533203125, + "loss": 0.5685, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.49427711963653564, + "rewards/margins": 0.9155290126800537, + "rewards/rejected": -1.4098061323165894, + "step": 1335 + }, + { + "epoch": 0.15, + "learning_rate": 2.5768465410277417e-07, + "logits/chosen": -3.04011607170105, + "logits/rejected": -3.047320604324341, + "logps/chosen": -141.34194946289062, + "logps/rejected": -185.8659210205078, + "loss": 0.3501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23633472621440887, + "rewards/margins": 1.0943012237548828, + "rewards/rejected": -1.330635905265808, + "step": 1336 + }, + { + "epoch": 0.15, + "learning_rate": 2.576495376331499e-07, + "logits/chosen": -2.802640914916992, + "logits/rejected": -2.7107882499694824, + "logps/chosen": -378.320068359375, + "logps/rejected": -393.7737731933594, + "loss": 0.3839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1115812212228775, + "rewards/margins": 1.0793211460113525, + "rewards/rejected": -1.1909023523330688, + "step": 1337 + }, + { + "epoch": 0.15, + "learning_rate": 2.576144211635257e-07, + "logits/chosen": -3.5441951751708984, + "logits/rejected": -3.316993474960327, + "logps/chosen": -253.17745971679688, + "logps/rejected": -195.74221801757812, + "loss": 0.4647, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14122381806373596, + "rewards/margins": 0.9724863171577454, + "rewards/rejected": -0.831262469291687, + "step": 1338 + }, + { + "epoch": 0.15, + "learning_rate": 2.5757930469390143e-07, + "logits/chosen": -3.1588149070739746, + "logits/rejected": -3.020139694213867, + "logps/chosen": -300.21502685546875, + "logps/rejected": -357.209228515625, + "loss": 0.7251, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.279959499835968, + "rewards/margins": 0.26815205812454224, + "rewards/rejected": -0.5481115579605103, + "step": 1339 + }, + { + "epoch": 0.15, + "learning_rate": 2.575441882242772e-07, + "logits/chosen": -2.766725778579712, + "logits/rejected": -2.8605597019195557, + "logps/chosen": -348.01904296875, + "logps/rejected": -205.75148010253906, + "loss": 0.6724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19295647740364075, + "rewards/margins": 0.4833073318004608, + "rewards/rejected": -0.6762638092041016, + "step": 1340 + }, + { + "epoch": 0.15, + "learning_rate": 2.5750907175465294e-07, + "logits/chosen": -2.8689775466918945, + "logits/rejected": -2.532484292984009, + "logps/chosen": -243.0447540283203, + "logps/rejected": -359.3069763183594, + "loss": 0.2083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5818072557449341, + "rewards/margins": 2.804609775543213, + "rewards/rejected": -2.2228024005889893, + "step": 1341 + }, + { + "epoch": 0.15, + "learning_rate": 2.574739552850287e-07, + "logits/chosen": -3.046572208404541, + "logits/rejected": -3.5317859649658203, + "logps/chosen": -173.205078125, + "logps/rejected": -150.58621215820312, + "loss": 0.4521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17993669211864471, + "rewards/margins": 0.9181038737297058, + "rewards/rejected": -1.0980406999588013, + "step": 1342 + }, + { + "epoch": 0.15, + "learning_rate": 2.574388388154044e-07, + "logits/chosen": -2.9279863834381104, + "logits/rejected": -2.8664026260375977, + "logps/chosen": -386.1767883300781, + "logps/rejected": -312.8895263671875, + "loss": 0.6385, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5790375471115112, + "rewards/margins": 0.6932671070098877, + "rewards/rejected": -1.2723045349121094, + "step": 1343 + }, + { + "epoch": 0.15, + "learning_rate": 2.5740372234578015e-07, + "logits/chosen": -3.529600143432617, + "logits/rejected": -3.6541333198547363, + "logps/chosen": -190.43984985351562, + "logps/rejected": -189.32638549804688, + "loss": 0.3922, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2642107307910919, + "rewards/margins": 1.0616402626037598, + "rewards/rejected": -0.7974294424057007, + "step": 1344 + }, + { + "epoch": 0.16, + "learning_rate": 2.573686058761559e-07, + "logits/chosen": -3.0529565811157227, + "logits/rejected": -3.256930351257324, + "logps/chosen": -415.5158996582031, + "logps/rejected": -347.5145263671875, + "loss": 0.3115, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01732298731803894, + "rewards/margins": 2.555565357208252, + "rewards/rejected": -2.5382423400878906, + "step": 1345 + }, + { + "epoch": 0.16, + "learning_rate": 2.5733348940653166e-07, + "logits/chosen": -3.524427890777588, + "logits/rejected": -3.2087621688842773, + "logps/chosen": -521.156494140625, + "logps/rejected": -338.38818359375, + "loss": 0.2586, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24943920969963074, + "rewards/margins": 2.0311365127563477, + "rewards/rejected": -1.781697392463684, + "step": 1346 + }, + { + "epoch": 0.16, + "learning_rate": 2.572983729369074e-07, + "logits/chosen": -2.212906837463379, + "logits/rejected": -2.129861831665039, + "logps/chosen": -281.44464111328125, + "logps/rejected": -254.72792053222656, + "loss": 0.683, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02011007070541382, + "rewards/margins": 0.32506948709487915, + "rewards/rejected": -0.30495941638946533, + "step": 1347 + }, + { + "epoch": 0.16, + "learning_rate": 2.572632564672831e-07, + "logits/chosen": -3.854924201965332, + "logits/rejected": -3.8617465496063232, + "logps/chosen": -362.65655517578125, + "logps/rejected": -331.388671875, + "loss": 0.4386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3759136199951172, + "rewards/margins": 1.5254466533660889, + "rewards/rejected": -1.901360273361206, + "step": 1348 + }, + { + "epoch": 0.16, + "learning_rate": 2.5722813999765886e-07, + "logits/chosen": -2.695035457611084, + "logits/rejected": -2.9419705867767334, + "logps/chosen": -213.92608642578125, + "logps/rejected": -175.310546875, + "loss": 0.8734, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2719343304634094, + "rewards/margins": -0.2346378117799759, + "rewards/rejected": -0.037296511232852936, + "step": 1349 + }, + { + "epoch": 0.16, + "learning_rate": 2.5719302352803467e-07, + "logits/chosen": -2.6823413372039795, + "logits/rejected": -2.6569836139678955, + "logps/chosen": -324.4682922363281, + "logps/rejected": -293.76092529296875, + "loss": 0.5095, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12054458260536194, + "rewards/margins": 0.8754315376281738, + "rewards/rejected": -0.7548869848251343, + "step": 1350 + }, + { + "epoch": 0.16, + "learning_rate": 2.5715790705841037e-07, + "logits/chosen": -3.385639190673828, + "logits/rejected": -3.45224666595459, + "logps/chosen": -265.8811340332031, + "logps/rejected": -216.66966247558594, + "loss": 0.3896, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.006525054574012756, + "rewards/margins": 1.8008615970611572, + "rewards/rejected": -1.7943366765975952, + "step": 1351 + }, + { + "epoch": 0.16, + "learning_rate": 2.571227905887861e-07, + "logits/chosen": -3.4179399013519287, + "logits/rejected": -3.3528499603271484, + "logps/chosen": -196.7360076904297, + "logps/rejected": -205.28848266601562, + "loss": 0.4716, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18109527230262756, + "rewards/margins": 1.005918264389038, + "rewards/rejected": -1.1870136260986328, + "step": 1352 + }, + { + "epoch": 0.16, + "learning_rate": 2.570876741191619e-07, + "logits/chosen": -3.1486754417419434, + "logits/rejected": -3.3651914596557617, + "logps/chosen": -84.2898178100586, + "logps/rejected": -138.9302215576172, + "loss": 0.3656, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0016013383865356445, + "rewards/margins": 1.6735140085220337, + "rewards/rejected": -1.671912670135498, + "step": 1353 + }, + { + "epoch": 0.16, + "learning_rate": 2.5705255764953763e-07, + "logits/chosen": -3.6131789684295654, + "logits/rejected": -3.488199472427368, + "logps/chosen": -144.23800659179688, + "logps/rejected": -162.8486785888672, + "loss": 0.4406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.029750511050224304, + "rewards/margins": 0.875946044921875, + "rewards/rejected": -0.9056965708732605, + "step": 1354 + }, + { + "epoch": 0.16, + "learning_rate": 2.570174411799134e-07, + "logits/chosen": -3.6672186851501465, + "logits/rejected": -3.3276636600494385, + "logps/chosen": -161.36087036132812, + "logps/rejected": -195.42633056640625, + "loss": 0.3021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19010359048843384, + "rewards/margins": 2.0184736251831055, + "rewards/rejected": -2.2085771560668945, + "step": 1355 + }, + { + "epoch": 0.16, + "learning_rate": 2.569823247102891e-07, + "logits/chosen": -2.987417697906494, + "logits/rejected": -3.123345375061035, + "logps/chosen": -342.34649658203125, + "logps/rejected": -223.80052185058594, + "loss": 0.4614, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09144477546215057, + "rewards/margins": 0.9815898537635803, + "rewards/rejected": -1.073034644126892, + "step": 1356 + }, + { + "epoch": 0.16, + "learning_rate": 2.5694720824066484e-07, + "logits/chosen": -2.8168628215789795, + "logits/rejected": -2.63747501373291, + "logps/chosen": -356.0416259765625, + "logps/rejected": -284.3099060058594, + "loss": 0.826, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.673096776008606, + "rewards/margins": 0.2346608191728592, + "rewards/rejected": -0.9077576398849487, + "step": 1357 + }, + { + "epoch": 0.16, + "learning_rate": 2.569120917710406e-07, + "logits/chosen": -2.88814640045166, + "logits/rejected": -2.9364426136016846, + "logps/chosen": -263.3108215332031, + "logps/rejected": -333.1740417480469, + "loss": 0.4675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2529527246952057, + "rewards/margins": 1.2739853858947754, + "rewards/rejected": -1.5269379615783691, + "step": 1358 + }, + { + "epoch": 0.16, + "learning_rate": 2.5687697530141635e-07, + "logits/chosen": -3.1437525749206543, + "logits/rejected": -2.7502827644348145, + "logps/chosen": -289.53216552734375, + "logps/rejected": -283.1186828613281, + "loss": 0.3312, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28042498230934143, + "rewards/margins": 1.9493902921676636, + "rewards/rejected": -2.2298152446746826, + "step": 1359 + }, + { + "epoch": 0.16, + "learning_rate": 2.568418588317921e-07, + "logits/chosen": -2.395545482635498, + "logits/rejected": -2.6632871627807617, + "logps/chosen": -352.8138427734375, + "logps/rejected": -426.9617919921875, + "loss": 0.4107, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2137509435415268, + "rewards/margins": 1.4155316352844238, + "rewards/rejected": -1.2017806768417358, + "step": 1360 + }, + { + "epoch": 0.16, + "learning_rate": 2.5680674236216786e-07, + "logits/chosen": -2.7834105491638184, + "logits/rejected": -2.9712319374084473, + "logps/chosen": -308.0777282714844, + "logps/rejected": -240.0155029296875, + "loss": 0.3269, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1949208676815033, + "rewards/margins": 1.8474559783935547, + "rewards/rejected": -1.6525352001190186, + "step": 1361 + }, + { + "epoch": 0.16, + "learning_rate": 2.567716258925436e-07, + "logits/chosen": -2.6352343559265137, + "logits/rejected": -2.6359567642211914, + "logps/chosen": -190.03099060058594, + "logps/rejected": -298.51959228515625, + "loss": 0.2553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2650943696498871, + "rewards/margins": 2.1437110900878906, + "rewards/rejected": -1.8786166906356812, + "step": 1362 + }, + { + "epoch": 0.16, + "learning_rate": 2.5673650942291937e-07, + "logits/chosen": -3.2436914443969727, + "logits/rejected": -3.6064529418945312, + "logps/chosen": -197.88385009765625, + "logps/rejected": -304.5501403808594, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3506697118282318, + "rewards/margins": 2.064213991165161, + "rewards/rejected": -2.414883852005005, + "step": 1363 + }, + { + "epoch": 0.16, + "learning_rate": 2.5670139295329507e-07, + "logits/chosen": -3.2190346717834473, + "logits/rejected": -3.0140655040740967, + "logps/chosen": -260.16802978515625, + "logps/rejected": -297.23809814453125, + "loss": 0.3128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005299568176269531, + "rewards/margins": 1.3440093994140625, + "rewards/rejected": -1.349308967590332, + "step": 1364 + }, + { + "epoch": 0.16, + "learning_rate": 2.566662764836708e-07, + "logits/chosen": -3.5546624660491943, + "logits/rejected": -3.093675374984741, + "logps/chosen": -228.9453887939453, + "logps/rejected": -128.35955810546875, + "loss": 0.454, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2293323278427124, + "rewards/margins": 0.9732205271720886, + "rewards/rejected": -0.743888258934021, + "step": 1365 + }, + { + "epoch": 0.16, + "learning_rate": 2.566311600140466e-07, + "logits/chosen": -2.571366310119629, + "logits/rejected": -2.435089111328125, + "logps/chosen": -289.49371337890625, + "logps/rejected": -245.40585327148438, + "loss": 0.6215, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09940657019615173, + "rewards/margins": 0.5679407119750977, + "rewards/rejected": -0.4685341715812683, + "step": 1366 + }, + { + "epoch": 0.16, + "learning_rate": 2.5659604354442233e-07, + "logits/chosen": -3.2948272228240967, + "logits/rejected": -3.4599037170410156, + "logps/chosen": -281.29180908203125, + "logps/rejected": -277.0350646972656, + "loss": 0.6666, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6319332122802734, + "rewards/margins": 0.886909008026123, + "rewards/rejected": -1.5188422203063965, + "step": 1367 + }, + { + "epoch": 0.16, + "learning_rate": 2.565609270747981e-07, + "logits/chosen": -3.389592170715332, + "logits/rejected": -3.4308433532714844, + "logps/chosen": -132.67897033691406, + "logps/rejected": -189.1661376953125, + "loss": 0.4584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030227214097976685, + "rewards/margins": 1.4981093406677246, + "rewards/rejected": -1.528336524963379, + "step": 1368 + }, + { + "epoch": 0.16, + "learning_rate": 2.5652581060517384e-07, + "logits/chosen": -2.6943979263305664, + "logits/rejected": -3.1007564067840576, + "logps/chosen": -334.94122314453125, + "logps/rejected": -265.3455810546875, + "loss": 0.24, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45796167850494385, + "rewards/margins": 1.633500576019287, + "rewards/rejected": -1.1755388975143433, + "step": 1369 + }, + { + "epoch": 0.16, + "learning_rate": 2.5649069413554954e-07, + "logits/chosen": -2.9270825386047363, + "logits/rejected": -2.8515071868896484, + "logps/chosen": -159.76052856445312, + "logps/rejected": -262.5685729980469, + "loss": 0.5884, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23473826050758362, + "rewards/margins": 1.6722174882888794, + "rewards/rejected": -1.906955599784851, + "step": 1370 + }, + { + "epoch": 0.16, + "learning_rate": 2.564555776659253e-07, + "logits/chosen": -2.6052656173706055, + "logits/rejected": -2.7678017616271973, + "logps/chosen": -104.9153060913086, + "logps/rejected": -196.5333709716797, + "loss": 0.5037, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01072102040052414, + "rewards/margins": 0.6722900867462158, + "rewards/rejected": -0.6830111145973206, + "step": 1371 + }, + { + "epoch": 0.16, + "learning_rate": 2.5642046119630104e-07, + "logits/chosen": -2.592020273208618, + "logits/rejected": -2.6717891693115234, + "logps/chosen": -463.33685302734375, + "logps/rejected": -404.526611328125, + "loss": 0.3905, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6120036244392395, + "rewards/margins": 1.602557897567749, + "rewards/rejected": -0.9905542135238647, + "step": 1372 + }, + { + "epoch": 0.16, + "learning_rate": 2.563853447266768e-07, + "logits/chosen": -3.421031951904297, + "logits/rejected": -3.3631749153137207, + "logps/chosen": -300.27581787109375, + "logps/rejected": -216.16343688964844, + "loss": 0.2445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.009352780878543854, + "rewards/margins": 1.9669498205184937, + "rewards/rejected": -1.9763026237487793, + "step": 1373 + }, + { + "epoch": 0.16, + "learning_rate": 2.5635022825705255e-07, + "logits/chosen": -3.7147321701049805, + "logits/rejected": -3.2772140502929688, + "logps/chosen": -410.2956237792969, + "logps/rejected": -355.19927978515625, + "loss": 0.457, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.031219899654388428, + "rewards/margins": 1.1931451559066772, + "rewards/rejected": -1.1619253158569336, + "step": 1374 + }, + { + "epoch": 0.16, + "learning_rate": 2.563151117874283e-07, + "logits/chosen": -3.7261199951171875, + "logits/rejected": -3.2402570247650146, + "logps/chosen": -402.9207763671875, + "logps/rejected": -289.322509765625, + "loss": 0.3243, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16271284222602844, + "rewards/margins": 1.8227170705795288, + "rewards/rejected": -1.6600042581558228, + "step": 1375 + }, + { + "epoch": 0.16, + "learning_rate": 2.5627999531780406e-07, + "logits/chosen": -2.8411757946014404, + "logits/rejected": -2.626554012298584, + "logps/chosen": -246.9441680908203, + "logps/rejected": -326.43756103515625, + "loss": 0.4107, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03230428695678711, + "rewards/margins": 1.521988868713379, + "rewards/rejected": -1.4896845817565918, + "step": 1376 + }, + { + "epoch": 0.16, + "learning_rate": 2.5624487884817976e-07, + "logits/chosen": -2.673450469970703, + "logits/rejected": -2.6322360038757324, + "logps/chosen": -346.960693359375, + "logps/rejected": -301.95562744140625, + "loss": 0.5785, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05377506464719772, + "rewards/margins": 0.8062514662742615, + "rewards/rejected": -0.8600265383720398, + "step": 1377 + }, + { + "epoch": 0.16, + "learning_rate": 2.562097623785555e-07, + "logits/chosen": -3.582223653793335, + "logits/rejected": -3.5398154258728027, + "logps/chosen": -180.41880798339844, + "logps/rejected": -205.47853088378906, + "loss": 0.3274, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.022180452942848206, + "rewards/margins": 1.2614927291870117, + "rewards/rejected": -1.2836731672286987, + "step": 1378 + }, + { + "epoch": 0.16, + "learning_rate": 2.5617464590893127e-07, + "logits/chosen": -2.4894216060638428, + "logits/rejected": -2.6623215675354004, + "logps/chosen": -287.93218994140625, + "logps/rejected": -248.9035186767578, + "loss": 0.4239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24348139762878418, + "rewards/margins": 1.2941224575042725, + "rewards/rejected": -1.5376038551330566, + "step": 1379 + }, + { + "epoch": 0.16, + "learning_rate": 2.56139529439307e-07, + "logits/chosen": -3.425917148590088, + "logits/rejected": -3.5656769275665283, + "logps/chosen": -221.22520446777344, + "logps/rejected": -235.19049072265625, + "loss": 0.2744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03283189237117767, + "rewards/margins": 2.3579044342041016, + "rewards/rejected": -2.3907361030578613, + "step": 1380 + }, + { + "epoch": 0.16, + "learning_rate": 2.561044129696828e-07, + "logits/chosen": -2.9858784675598145, + "logits/rejected": -3.2091174125671387, + "logps/chosen": -186.5463104248047, + "logps/rejected": -304.19207763671875, + "loss": 0.1565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.238559752702713, + "rewards/margins": 2.2824764251708984, + "rewards/rejected": -2.521036386489868, + "step": 1381 + }, + { + "epoch": 0.16, + "learning_rate": 2.5606929650005853e-07, + "logits/chosen": -3.080803394317627, + "logits/rejected": -2.9514644145965576, + "logps/chosen": -269.7321472167969, + "logps/rejected": -209.8255157470703, + "loss": 0.3882, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35495150089263916, + "rewards/margins": 0.9708800315856934, + "rewards/rejected": -0.615928590297699, + "step": 1382 + }, + { + "epoch": 0.16, + "learning_rate": 2.5603418003043423e-07, + "logits/chosen": -3.1469600200653076, + "logits/rejected": -3.211188793182373, + "logps/chosen": -190.2865447998047, + "logps/rejected": -237.42807006835938, + "loss": 0.2587, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47352489829063416, + "rewards/margins": 2.0233206748962402, + "rewards/rejected": -1.5497956275939941, + "step": 1383 + }, + { + "epoch": 0.16, + "learning_rate": 2.5599906356081004e-07, + "logits/chosen": -3.057302713394165, + "logits/rejected": -2.9975523948669434, + "logps/chosen": -182.8653564453125, + "logps/rejected": -154.04212951660156, + "loss": 0.5416, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11364522576332092, + "rewards/margins": 0.730546772480011, + "rewards/rejected": -0.6169015169143677, + "step": 1384 + }, + { + "epoch": 0.16, + "learning_rate": 2.5596394709118574e-07, + "logits/chosen": -3.468611717224121, + "logits/rejected": -3.1657190322875977, + "logps/chosen": -240.2515869140625, + "logps/rejected": -177.2841033935547, + "loss": 0.6151, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10170875489711761, + "rewards/margins": 0.6754162907600403, + "rewards/rejected": -0.5737075805664062, + "step": 1385 + }, + { + "epoch": 0.16, + "learning_rate": 2.559288306215615e-07, + "logits/chosen": -3.4515485763549805, + "logits/rejected": -2.928544521331787, + "logps/chosen": -249.31964111328125, + "logps/rejected": -181.41744995117188, + "loss": 0.7096, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08688148856163025, + "rewards/margins": 1.3099149465560913, + "rewards/rejected": -1.3967963457107544, + "step": 1386 + }, + { + "epoch": 0.16, + "learning_rate": 2.5589371415193725e-07, + "logits/chosen": -3.1115310192108154, + "logits/rejected": -3.0429539680480957, + "logps/chosen": -348.6588134765625, + "logps/rejected": -289.6445617675781, + "loss": 0.4574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10212290287017822, + "rewards/margins": 1.2949789762496948, + "rewards/rejected": -1.3971019983291626, + "step": 1387 + }, + { + "epoch": 0.16, + "learning_rate": 2.55858597682313e-07, + "logits/chosen": -2.8216207027435303, + "logits/rejected": -2.57527756690979, + "logps/chosen": -409.5839538574219, + "logps/rejected": -304.3885498046875, + "loss": 0.3275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19464054703712463, + "rewards/margins": 2.1957032680511475, + "rewards/rejected": -2.0010628700256348, + "step": 1388 + }, + { + "epoch": 0.16, + "learning_rate": 2.5582348121268875e-07, + "logits/chosen": -3.105970859527588, + "logits/rejected": -3.1891326904296875, + "logps/chosen": -144.0658416748047, + "logps/rejected": -191.33673095703125, + "loss": 0.3267, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10052652657032013, + "rewards/margins": 1.432431697845459, + "rewards/rejected": -1.3319051265716553, + "step": 1389 + }, + { + "epoch": 0.16, + "learning_rate": 2.557883647430645e-07, + "logits/chosen": -2.6900882720947266, + "logits/rejected": -2.7059788703918457, + "logps/chosen": -532.0977172851562, + "logps/rejected": -358.5438537597656, + "loss": 0.5714, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.209239199757576, + "rewards/margins": 0.9423686265945435, + "rewards/rejected": -1.151607871055603, + "step": 1390 + }, + { + "epoch": 0.16, + "learning_rate": 2.557532482734402e-07, + "logits/chosen": -3.71512508392334, + "logits/rejected": -3.428868293762207, + "logps/chosen": -340.1872253417969, + "logps/rejected": -208.19244384765625, + "loss": 0.3845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6652637720108032, + "rewards/margins": 1.5700204372406006, + "rewards/rejected": -2.2352843284606934, + "step": 1391 + }, + { + "epoch": 0.16, + "learning_rate": 2.5571813180381596e-07, + "logits/chosen": -3.448676347732544, + "logits/rejected": -3.607452392578125, + "logps/chosen": -230.37644958496094, + "logps/rejected": -322.3721923828125, + "loss": 0.4822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22931000590324402, + "rewards/margins": 1.4019787311553955, + "rewards/rejected": -1.6312886476516724, + "step": 1392 + }, + { + "epoch": 0.16, + "learning_rate": 2.556830153341917e-07, + "logits/chosen": -2.775630474090576, + "logits/rejected": -2.743398666381836, + "logps/chosen": -304.58099365234375, + "logps/rejected": -229.9409637451172, + "loss": 0.322, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3270447552204132, + "rewards/margins": 1.822581171989441, + "rewards/rejected": -2.149625778198242, + "step": 1393 + }, + { + "epoch": 0.16, + "learning_rate": 2.5564789886456747e-07, + "logits/chosen": -2.7719762325286865, + "logits/rejected": -2.8218207359313965, + "logps/chosen": -264.3139343261719, + "logps/rejected": -243.90940856933594, + "loss": 0.3573, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3033883273601532, + "rewards/margins": 1.013213038444519, + "rewards/rejected": -1.316601276397705, + "step": 1394 + }, + { + "epoch": 0.16, + "learning_rate": 2.556127823949432e-07, + "logits/chosen": -2.7200276851654053, + "logits/rejected": -2.755641460418701, + "logps/chosen": -317.62371826171875, + "logps/rejected": -245.68638610839844, + "loss": 0.4419, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21496382355690002, + "rewards/margins": 1.0160181522369385, + "rewards/rejected": -0.8010542392730713, + "step": 1395 + }, + { + "epoch": 0.16, + "learning_rate": 2.55577665925319e-07, + "logits/chosen": -3.2336153984069824, + "logits/rejected": -3.109405517578125, + "logps/chosen": -284.75213623046875, + "logps/rejected": -149.0267333984375, + "loss": 0.5451, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14286932349205017, + "rewards/margins": 0.6663054823875427, + "rewards/rejected": -0.8091747164726257, + "step": 1396 + }, + { + "epoch": 0.16, + "learning_rate": 2.5554254945569473e-07, + "logits/chosen": -3.294424295425415, + "logits/rejected": -3.430720329284668, + "logps/chosen": -334.2586669921875, + "logps/rejected": -166.92367553710938, + "loss": 0.7215, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22337263822555542, + "rewards/margins": 1.0314642190933228, + "rewards/rejected": -1.254836916923523, + "step": 1397 + }, + { + "epoch": 0.16, + "learning_rate": 2.555074329860705e-07, + "logits/chosen": -3.60394024848938, + "logits/rejected": -3.6011621952056885, + "logps/chosen": -105.44680786132812, + "logps/rejected": -185.07354736328125, + "loss": 0.2512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06786474585533142, + "rewards/margins": 1.970583200454712, + "rewards/rejected": -2.038447856903076, + "step": 1398 + }, + { + "epoch": 0.16, + "learning_rate": 2.554723165164462e-07, + "logits/chosen": -2.903747081756592, + "logits/rejected": -3.1693594455718994, + "logps/chosen": -297.9914855957031, + "logps/rejected": -372.04876708984375, + "loss": 0.4474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.012513790279626846, + "rewards/margins": 1.2624541521072388, + "rewards/rejected": -1.274968147277832, + "step": 1399 + }, + { + "epoch": 0.16, + "learning_rate": 2.5543720004682194e-07, + "logits/chosen": -2.9480085372924805, + "logits/rejected": -2.8699285984039307, + "logps/chosen": -218.75706481933594, + "logps/rejected": -193.74713134765625, + "loss": 0.2141, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09799454361200333, + "rewards/margins": 2.174708366394043, + "rewards/rejected": -2.076713800430298, + "step": 1400 + }, + { + "epoch": 0.16, + "learning_rate": 2.554020835771977e-07, + "logits/chosen": -3.033289909362793, + "logits/rejected": -3.278651714324951, + "logps/chosen": -326.92120361328125, + "logps/rejected": -324.2908020019531, + "loss": 0.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1905927062034607, + "rewards/margins": 0.9285632371902466, + "rewards/rejected": -0.7379705309867859, + "step": 1401 + }, + { + "epoch": 0.16, + "learning_rate": 2.5536696710757345e-07, + "logits/chosen": -2.9420859813690186, + "logits/rejected": -3.3301327228546143, + "logps/chosen": -331.79266357421875, + "logps/rejected": -210.67750549316406, + "loss": 0.2849, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07497105002403259, + "rewards/margins": 1.332480549812317, + "rewards/rejected": -1.257509469985962, + "step": 1402 + }, + { + "epoch": 0.16, + "learning_rate": 2.553318506379492e-07, + "logits/chosen": -2.7265026569366455, + "logits/rejected": -2.6109001636505127, + "logps/chosen": -210.85317993164062, + "logps/rejected": -267.5634765625, + "loss": 0.5189, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.012741759419441223, + "rewards/margins": 0.6571580767631531, + "rewards/rejected": -0.644416332244873, + "step": 1403 + }, + { + "epoch": 0.16, + "learning_rate": 2.552967341683249e-07, + "logits/chosen": -2.76473331451416, + "logits/rejected": -2.766204357147217, + "logps/chosen": -160.80166625976562, + "logps/rejected": -169.257568359375, + "loss": 0.3156, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5081496834754944, + "rewards/margins": 1.5454492568969727, + "rewards/rejected": -1.0372995138168335, + "step": 1404 + }, + { + "epoch": 0.16, + "learning_rate": 2.5526161769870066e-07, + "logits/chosen": -3.2509255409240723, + "logits/rejected": -3.56001615524292, + "logps/chosen": -299.73321533203125, + "logps/rejected": -215.5147705078125, + "loss": 0.2385, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25787806510925293, + "rewards/margins": 2.0118966102600098, + "rewards/rejected": -1.7540185451507568, + "step": 1405 + }, + { + "epoch": 0.16, + "learning_rate": 2.5522650122907646e-07, + "logits/chosen": -3.0658836364746094, + "logits/rejected": -2.988920211791992, + "logps/chosen": -254.1191864013672, + "logps/rejected": -330.7019348144531, + "loss": 0.4076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.567616879940033, + "rewards/margins": 1.8362107276916504, + "rewards/rejected": -2.403827428817749, + "step": 1406 + }, + { + "epoch": 0.16, + "learning_rate": 2.5519138475945216e-07, + "logits/chosen": -3.9214041233062744, + "logits/rejected": -3.9448089599609375, + "logps/chosen": -190.67385864257812, + "logps/rejected": -207.74990844726562, + "loss": 0.2945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37404966354370117, + "rewards/margins": 1.955817699432373, + "rewards/rejected": -1.5817679166793823, + "step": 1407 + }, + { + "epoch": 0.16, + "learning_rate": 2.551562682898279e-07, + "logits/chosen": -3.058922290802002, + "logits/rejected": -2.9888410568237305, + "logps/chosen": -302.7970275878906, + "logps/rejected": -224.65713500976562, + "loss": 0.4919, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08770906925201416, + "rewards/margins": 0.7677137851715088, + "rewards/rejected": -0.8554227948188782, + "step": 1408 + }, + { + "epoch": 0.16, + "learning_rate": 2.5512115182020367e-07, + "logits/chosen": -3.2733347415924072, + "logits/rejected": -3.214801788330078, + "logps/chosen": -383.8865966796875, + "logps/rejected": -323.6093444824219, + "loss": 0.3678, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04087008535861969, + "rewards/margins": 2.200723171234131, + "rewards/rejected": -2.159853219985962, + "step": 1409 + }, + { + "epoch": 0.16, + "learning_rate": 2.550860353505794e-07, + "logits/chosen": -2.900692939758301, + "logits/rejected": -2.9764909744262695, + "logps/chosen": -353.63385009765625, + "logps/rejected": -258.88299560546875, + "loss": 0.4229, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0929201990365982, + "rewards/margins": 1.258988857269287, + "rewards/rejected": -1.1660685539245605, + "step": 1410 + }, + { + "epoch": 0.16, + "learning_rate": 2.550509188809552e-07, + "logits/chosen": -2.7306056022644043, + "logits/rejected": -2.653247833251953, + "logps/chosen": -282.013427734375, + "logps/rejected": -299.36767578125, + "loss": 0.696, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5762251615524292, + "rewards/margins": 0.4028221070766449, + "rewards/rejected": -0.9790472388267517, + "step": 1411 + }, + { + "epoch": 0.16, + "learning_rate": 2.550158024113309e-07, + "logits/chosen": -3.175196647644043, + "logits/rejected": -3.1390161514282227, + "logps/chosen": -208.0045166015625, + "logps/rejected": -255.71461486816406, + "loss": 0.401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2842245101928711, + "rewards/margins": 1.2009432315826416, + "rewards/rejected": -1.4851677417755127, + "step": 1412 + }, + { + "epoch": 0.16, + "learning_rate": 2.5498068594170663e-07, + "logits/chosen": -3.1854653358459473, + "logits/rejected": -3.4655566215515137, + "logps/chosen": -262.5896301269531, + "logps/rejected": -235.31613159179688, + "loss": 0.3636, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.053534265607595444, + "rewards/margins": 1.620290756225586, + "rewards/rejected": -1.5667563676834106, + "step": 1413 + }, + { + "epoch": 0.16, + "learning_rate": 2.549455694720824e-07, + "logits/chosen": -2.4620165824890137, + "logits/rejected": -2.2541635036468506, + "logps/chosen": -262.27032470703125, + "logps/rejected": -199.74594116210938, + "loss": 0.5566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.217573881149292, + "rewards/margins": 0.7734484672546387, + "rewards/rejected": -0.9910223484039307, + "step": 1414 + }, + { + "epoch": 0.16, + "learning_rate": 2.5491045300245814e-07, + "logits/chosen": -2.7298824787139893, + "logits/rejected": -2.798719644546509, + "logps/chosen": -363.1792907714844, + "logps/rejected": -425.41326904296875, + "loss": 0.6555, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29643192887306213, + "rewards/margins": 0.6651434898376465, + "rewards/rejected": -0.9615753889083862, + "step": 1415 + }, + { + "epoch": 0.16, + "learning_rate": 2.548753365328339e-07, + "logits/chosen": -3.4093003273010254, + "logits/rejected": -3.226437568664551, + "logps/chosen": -284.97686767578125, + "logps/rejected": -256.1290283203125, + "loss": 0.4435, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25546640157699585, + "rewards/margins": 1.2007815837860107, + "rewards/rejected": -0.9453150629997253, + "step": 1416 + }, + { + "epoch": 0.16, + "learning_rate": 2.548402200632096e-07, + "logits/chosen": -3.138772487640381, + "logits/rejected": -3.6539604663848877, + "logps/chosen": -140.26368713378906, + "logps/rejected": -221.04527282714844, + "loss": 0.1278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35735249519348145, + "rewards/margins": 2.7113747596740723, + "rewards/rejected": -2.354022264480591, + "step": 1417 + }, + { + "epoch": 0.16, + "learning_rate": 2.548051035935854e-07, + "logits/chosen": -2.9735805988311768, + "logits/rejected": -3.2216403484344482, + "logps/chosen": -257.8381652832031, + "logps/rejected": -175.92898559570312, + "loss": 0.3158, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1546740084886551, + "rewards/margins": 1.818570613861084, + "rewards/rejected": -1.6638966798782349, + "step": 1418 + }, + { + "epoch": 0.16, + "learning_rate": 2.5476998712396116e-07, + "logits/chosen": -2.5199999809265137, + "logits/rejected": -2.3516764640808105, + "logps/chosen": -257.1839294433594, + "logps/rejected": -159.08517456054688, + "loss": 0.7009, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1417919099330902, + "rewards/margins": 0.11230640113353729, + "rewards/rejected": -0.2540982961654663, + "step": 1419 + }, + { + "epoch": 0.16, + "learning_rate": 2.5473487065433686e-07, + "logits/chosen": -3.119842767715454, + "logits/rejected": -2.8308792114257812, + "logps/chosen": -281.4576416015625, + "logps/rejected": -308.8079833984375, + "loss": 0.3824, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20565614104270935, + "rewards/margins": 1.4788422584533691, + "rewards/rejected": -1.2731860876083374, + "step": 1420 + }, + { + "epoch": 0.16, + "learning_rate": 2.546997541847126e-07, + "logits/chosen": -2.6137266159057617, + "logits/rejected": -2.513880968093872, + "logps/chosen": -244.74778747558594, + "logps/rejected": -360.1038818359375, + "loss": 0.8429, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1371239274740219, + "rewards/margins": 0.37114816904067993, + "rewards/rejected": -0.5082720518112183, + "step": 1421 + }, + { + "epoch": 0.16, + "learning_rate": 2.5466463771508837e-07, + "logits/chosen": -3.0677576065063477, + "logits/rejected": -3.2199602127075195, + "logps/chosen": -193.3337860107422, + "logps/rejected": -176.3008270263672, + "loss": 0.5845, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20253866910934448, + "rewards/margins": 1.1353598833084106, + "rewards/rejected": -1.3378983736038208, + "step": 1422 + }, + { + "epoch": 0.16, + "learning_rate": 2.546295212454641e-07, + "logits/chosen": -2.728231906890869, + "logits/rejected": -2.893001079559326, + "logps/chosen": -326.0724792480469, + "logps/rejected": -267.9109802246094, + "loss": 0.606, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14530105888843536, + "rewards/margins": 0.9579471349716187, + "rewards/rejected": -0.8126461505889893, + "step": 1423 + }, + { + "epoch": 0.16, + "learning_rate": 2.545944047758399e-07, + "logits/chosen": -3.5356459617614746, + "logits/rejected": -3.059080123901367, + "logps/chosen": -323.202392578125, + "logps/rejected": -152.30972290039062, + "loss": 0.6849, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6670114994049072, + "rewards/margins": 0.565125048160553, + "rewards/rejected": -1.2321364879608154, + "step": 1424 + }, + { + "epoch": 0.16, + "learning_rate": 2.545592883062156e-07, + "logits/chosen": -3.024059772491455, + "logits/rejected": -3.453648090362549, + "logps/chosen": -111.16154479980469, + "logps/rejected": -215.12680053710938, + "loss": 0.3988, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35778146982192993, + "rewards/margins": 1.6886993646621704, + "rewards/rejected": -1.3309178352355957, + "step": 1425 + }, + { + "epoch": 0.16, + "learning_rate": 2.5452417183659133e-07, + "logits/chosen": -2.4465439319610596, + "logits/rejected": -2.389099597930908, + "logps/chosen": -256.2645263671875, + "logps/rejected": -239.9393310546875, + "loss": 0.6257, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6622249484062195, + "rewards/margins": 0.5436898469924927, + "rewards/rejected": -1.2059147357940674, + "step": 1426 + }, + { + "epoch": 0.16, + "learning_rate": 2.5448905536696714e-07, + "logits/chosen": -3.1473026275634766, + "logits/rejected": -2.9449586868286133, + "logps/chosen": -312.6834716796875, + "logps/rejected": -275.94464111328125, + "loss": 0.3285, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.425985187292099, + "rewards/margins": 1.3570674657821655, + "rewards/rejected": -0.9310822486877441, + "step": 1427 + }, + { + "epoch": 0.16, + "learning_rate": 2.5445393889734284e-07, + "logits/chosen": -3.9885454177856445, + "logits/rejected": -3.911512613296509, + "logps/chosen": -236.85885620117188, + "logps/rejected": -263.17303466796875, + "loss": 0.5588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16725441813468933, + "rewards/margins": 0.5852547287940979, + "rewards/rejected": -0.7525091767311096, + "step": 1428 + }, + { + "epoch": 0.16, + "learning_rate": 2.544188224277186e-07, + "logits/chosen": -2.882204055786133, + "logits/rejected": -2.9953107833862305, + "logps/chosen": -308.7751159667969, + "logps/rejected": -226.295654296875, + "loss": 0.6722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14277642965316772, + "rewards/margins": 0.21266648173332214, + "rewards/rejected": -0.35544294118881226, + "step": 1429 + }, + { + "epoch": 0.16, + "learning_rate": 2.5438370595809434e-07, + "logits/chosen": -3.1365585327148438, + "logits/rejected": -2.8381552696228027, + "logps/chosen": -310.3899841308594, + "logps/rejected": -230.5426483154297, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18804927170276642, + "rewards/margins": 1.2387969493865967, + "rewards/rejected": -1.0507477521896362, + "step": 1430 + }, + { + "epoch": 0.16, + "learning_rate": 2.543485894884701e-07, + "logits/chosen": -3.2969491481781006, + "logits/rejected": -3.455859899520874, + "logps/chosen": -303.99932861328125, + "logps/rejected": -277.1585388183594, + "loss": 0.2286, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09842848777770996, + "rewards/margins": 1.839847207069397, + "rewards/rejected": -1.7414188385009766, + "step": 1431 + }, + { + "epoch": 0.17, + "learning_rate": 2.5431347301884585e-07, + "logits/chosen": -2.6270406246185303, + "logits/rejected": -2.8720760345458984, + "logps/chosen": -243.7661895751953, + "logps/rejected": -311.8367004394531, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3001318573951721, + "rewards/margins": 1.8832015991210938, + "rewards/rejected": -1.5830698013305664, + "step": 1432 + }, + { + "epoch": 0.17, + "learning_rate": 2.5427835654922155e-07, + "logits/chosen": -3.280094861984253, + "logits/rejected": -3.1904406547546387, + "logps/chosen": -158.10226440429688, + "logps/rejected": -151.3723907470703, + "loss": 0.3439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05821910500526428, + "rewards/margins": 1.0154948234558105, + "rewards/rejected": -1.073714017868042, + "step": 1433 + }, + { + "epoch": 0.17, + "learning_rate": 2.542432400795973e-07, + "logits/chosen": -3.441411256790161, + "logits/rejected": -2.9636435508728027, + "logps/chosen": -157.55398559570312, + "logps/rejected": -308.5328674316406, + "loss": 0.9159, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7791717052459717, + "rewards/margins": -0.015420570969581604, + "rewards/rejected": -0.7637511491775513, + "step": 1434 + }, + { + "epoch": 0.17, + "learning_rate": 2.5420812360997306e-07, + "logits/chosen": -3.434446334838867, + "logits/rejected": -3.107944965362549, + "logps/chosen": -281.3332214355469, + "logps/rejected": -186.76759338378906, + "loss": 0.5194, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21181698143482208, + "rewards/margins": 0.7621076107025146, + "rewards/rejected": -0.9739246368408203, + "step": 1435 + }, + { + "epoch": 0.17, + "learning_rate": 2.541730071403488e-07, + "logits/chosen": -2.982088327407837, + "logits/rejected": -3.2290596961975098, + "logps/chosen": -326.74676513671875, + "logps/rejected": -215.49935913085938, + "loss": 0.8542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18189701437950134, + "rewards/margins": 0.786063551902771, + "rewards/rejected": -0.9679606556892395, + "step": 1436 + }, + { + "epoch": 0.17, + "learning_rate": 2.5413789067072457e-07, + "logits/chosen": -2.580385684967041, + "logits/rejected": -2.457716464996338, + "logps/chosen": -265.2939147949219, + "logps/rejected": -193.81846618652344, + "loss": 0.9095, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011305384337902069, + "rewards/margins": -0.21677416563034058, + "rewards/rejected": 0.22807952761650085, + "step": 1437 + }, + { + "epoch": 0.17, + "learning_rate": 2.5410277420110027e-07, + "logits/chosen": -2.7149603366851807, + "logits/rejected": -2.843262195587158, + "logps/chosen": -161.65170288085938, + "logps/rejected": -234.25845336914062, + "loss": 0.4125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15995177626609802, + "rewards/margins": 1.0177949666976929, + "rewards/rejected": -1.1777467727661133, + "step": 1438 + }, + { + "epoch": 0.17, + "learning_rate": 2.54067657731476e-07, + "logits/chosen": -3.739898920059204, + "logits/rejected": -3.83449649810791, + "logps/chosen": -183.050048828125, + "logps/rejected": -194.6539764404297, + "loss": 0.4051, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14488141238689423, + "rewards/margins": 0.9840406179428101, + "rewards/rejected": -0.8391591310501099, + "step": 1439 + }, + { + "epoch": 0.17, + "learning_rate": 2.5403254126185183e-07, + "logits/chosen": -3.525508403778076, + "logits/rejected": -3.204705238342285, + "logps/chosen": -222.77731323242188, + "logps/rejected": -170.7025146484375, + "loss": 0.3879, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08949099481105804, + "rewards/margins": 1.09969961643219, + "rewards/rejected": -1.0102087259292603, + "step": 1440 + }, + { + "epoch": 0.17, + "learning_rate": 2.5399742479222753e-07, + "logits/chosen": -3.348600387573242, + "logits/rejected": -3.1928982734680176, + "logps/chosen": -303.0113830566406, + "logps/rejected": -239.31076049804688, + "loss": 0.6129, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17562898993492126, + "rewards/margins": 0.29520368576049805, + "rewards/rejected": -0.119574636220932, + "step": 1441 + }, + { + "epoch": 0.17, + "learning_rate": 2.539623083226033e-07, + "logits/chosen": -3.233574867248535, + "logits/rejected": -3.428771734237671, + "logps/chosen": -117.05130767822266, + "logps/rejected": -156.03131103515625, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07057924568653107, + "rewards/margins": 0.6292253732681274, + "rewards/rejected": -0.6998046636581421, + "step": 1442 + }, + { + "epoch": 0.17, + "learning_rate": 2.5392719185297904e-07, + "logits/chosen": -2.8919119834899902, + "logits/rejected": -2.881925344467163, + "logps/chosen": -226.90806579589844, + "logps/rejected": -209.27633666992188, + "loss": 0.7011, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.37659284472465515, + "rewards/margins": 0.14072071015834808, + "rewards/rejected": -0.517313539981842, + "step": 1443 + }, + { + "epoch": 0.17, + "learning_rate": 2.538920753833548e-07, + "logits/chosen": -3.191077947616577, + "logits/rejected": -3.1942694187164307, + "logps/chosen": -264.18048095703125, + "logps/rejected": -271.5841064453125, + "loss": 0.3131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05281936004757881, + "rewards/margins": 2.2398056983947754, + "rewards/rejected": -2.2926251888275146, + "step": 1444 + }, + { + "epoch": 0.17, + "learning_rate": 2.5385695891373055e-07, + "logits/chosen": -2.6050026416778564, + "logits/rejected": -2.6897764205932617, + "logps/chosen": -167.01034545898438, + "logps/rejected": -239.47137451171875, + "loss": 0.5228, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10167495906352997, + "rewards/margins": 0.7236835360527039, + "rewards/rejected": -0.8253585696220398, + "step": 1445 + }, + { + "epoch": 0.17, + "learning_rate": 2.5382184244410625e-07, + "logits/chosen": -2.561619520187378, + "logits/rejected": -2.4731087684631348, + "logps/chosen": -387.9241943359375, + "logps/rejected": -223.4948272705078, + "loss": 0.3081, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2910845875740051, + "rewards/margins": 1.4601305723190308, + "rewards/rejected": -1.1690459251403809, + "step": 1446 + }, + { + "epoch": 0.17, + "learning_rate": 2.53786725974482e-07, + "logits/chosen": -2.88962721824646, + "logits/rejected": -2.7959156036376953, + "logps/chosen": -206.5690460205078, + "logps/rejected": -147.1153106689453, + "loss": 0.596, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21089939773082733, + "rewards/margins": 1.0483102798461914, + "rewards/rejected": -1.2592097520828247, + "step": 1447 + }, + { + "epoch": 0.17, + "learning_rate": 2.5375160950485775e-07, + "logits/chosen": -3.0684139728546143, + "logits/rejected": -3.082451581954956, + "logps/chosen": -327.32110595703125, + "logps/rejected": -494.4891357421875, + "loss": 0.3459, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15136921405792236, + "rewards/margins": 1.9616177082061768, + "rewards/rejected": -1.8102483749389648, + "step": 1448 + }, + { + "epoch": 0.17, + "learning_rate": 2.537164930352335e-07, + "logits/chosen": -3.188560724258423, + "logits/rejected": -3.1578826904296875, + "logps/chosen": -261.8722229003906, + "logps/rejected": -248.90887451171875, + "loss": 0.37, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.142405703663826, + "rewards/margins": 1.1259419918060303, + "rewards/rejected": -0.9835363030433655, + "step": 1449 + }, + { + "epoch": 0.17, + "learning_rate": 2.5368137656560926e-07, + "logits/chosen": -2.3176498413085938, + "logits/rejected": -2.564669132232666, + "logps/chosen": -324.75689697265625, + "logps/rejected": -237.09190368652344, + "loss": 0.3161, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5812521576881409, + "rewards/margins": 1.5060019493103027, + "rewards/rejected": -0.9247497916221619, + "step": 1450 + }, + { + "epoch": 0.17, + "learning_rate": 2.53646260095985e-07, + "logits/chosen": -3.1454029083251953, + "logits/rejected": -2.87129282951355, + "logps/chosen": -278.41424560546875, + "logps/rejected": -228.3530731201172, + "loss": 0.6634, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04729343205690384, + "rewards/margins": 0.9725439548492432, + "rewards/rejected": -1.0198372602462769, + "step": 1451 + }, + { + "epoch": 0.17, + "learning_rate": 2.5361114362636077e-07, + "logits/chosen": -3.744586229324341, + "logits/rejected": -3.8065309524536133, + "logps/chosen": -210.700927734375, + "logps/rejected": -191.57992553710938, + "loss": 0.6531, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3940339684486389, + "rewards/margins": 1.0979719161987305, + "rewards/rejected": -1.4920058250427246, + "step": 1452 + }, + { + "epoch": 0.17, + "learning_rate": 2.535760271567365e-07, + "logits/chosen": -3.4310383796691895, + "logits/rejected": -3.348679780960083, + "logps/chosen": -249.92538452148438, + "logps/rejected": -609.4087524414062, + "loss": 0.458, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0054101720452308655, + "rewards/margins": 0.9296847581863403, + "rewards/rejected": -0.9242745637893677, + "step": 1453 + }, + { + "epoch": 0.17, + "learning_rate": 2.535409106871122e-07, + "logits/chosen": -2.9084067344665527, + "logits/rejected": -2.95662784576416, + "logps/chosen": -182.98326110839844, + "logps/rejected": -299.76788330078125, + "loss": 0.3571, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11952178925275803, + "rewards/margins": 1.6894251108169556, + "rewards/rejected": -1.5699032545089722, + "step": 1454 + }, + { + "epoch": 0.17, + "learning_rate": 2.53505794217488e-07, + "logits/chosen": -2.547166109085083, + "logits/rejected": -2.9583826065063477, + "logps/chosen": -255.48046875, + "logps/rejected": -160.6226806640625, + "loss": 0.6756, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6849334836006165, + "rewards/margins": 0.16748090088367462, + "rewards/rejected": -0.8524143695831299, + "step": 1455 + }, + { + "epoch": 0.17, + "learning_rate": 2.5347067774786373e-07, + "logits/chosen": -2.5268592834472656, + "logits/rejected": -2.4770936965942383, + "logps/chosen": -210.41299438476562, + "logps/rejected": -273.1662292480469, + "loss": 0.6845, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4524649679660797, + "rewards/margins": 0.1581394523382187, + "rewards/rejected": -0.6106044054031372, + "step": 1456 + }, + { + "epoch": 0.17, + "learning_rate": 2.534355612782395e-07, + "logits/chosen": -2.7171380519866943, + "logits/rejected": -2.9614906311035156, + "logps/chosen": -387.09613037109375, + "logps/rejected": -233.01150512695312, + "loss": 0.3176, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20289286971092224, + "rewards/margins": 1.8997175693511963, + "rewards/rejected": -1.6968247890472412, + "step": 1457 + }, + { + "epoch": 0.17, + "learning_rate": 2.5340044480861524e-07, + "logits/chosen": -3.22169828414917, + "logits/rejected": -3.0274715423583984, + "logps/chosen": -156.74111938476562, + "logps/rejected": -160.19729614257812, + "loss": 0.3713, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21668705344200134, + "rewards/margins": 1.3162301778793335, + "rewards/rejected": -1.5329172611236572, + "step": 1458 + }, + { + "epoch": 0.17, + "learning_rate": 2.53365328338991e-07, + "logits/chosen": -3.1705307960510254, + "logits/rejected": -2.92834734916687, + "logps/chosen": -345.17236328125, + "logps/rejected": -196.11795043945312, + "loss": 0.3713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1067657470703125, + "rewards/margins": 0.9882016181945801, + "rewards/rejected": -0.8814358711242676, + "step": 1459 + }, + { + "epoch": 0.17, + "learning_rate": 2.533302118693667e-07, + "logits/chosen": -2.959587812423706, + "logits/rejected": -3.300440549850464, + "logps/chosen": -182.3204345703125, + "logps/rejected": -232.6237030029297, + "loss": 0.308, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5622125864028931, + "rewards/margins": 1.6167045831680298, + "rewards/rejected": -1.0544921159744263, + "step": 1460 + }, + { + "epoch": 0.17, + "learning_rate": 2.532950953997425e-07, + "logits/chosen": -2.762432098388672, + "logits/rejected": -3.162893772125244, + "logps/chosen": -171.29266357421875, + "logps/rejected": -312.14019775390625, + "loss": 0.2784, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14879512786865234, + "rewards/margins": 2.1880366802215576, + "rewards/rejected": -2.0392415523529053, + "step": 1461 + }, + { + "epoch": 0.17, + "learning_rate": 2.532599789301182e-07, + "logits/chosen": -3.7785749435424805, + "logits/rejected": -3.501652717590332, + "logps/chosen": -280.18634033203125, + "logps/rejected": -282.50616455078125, + "loss": 0.4051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2677035331726074, + "rewards/margins": 1.1350148916244507, + "rewards/rejected": -1.402718424797058, + "step": 1462 + }, + { + "epoch": 0.17, + "learning_rate": 2.5322486246049396e-07, + "logits/chosen": -2.3671164512634277, + "logits/rejected": -2.6628472805023193, + "logps/chosen": -370.0285949707031, + "logps/rejected": -200.931884765625, + "loss": 0.4575, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2552938759326935, + "rewards/margins": 0.9206752777099609, + "rewards/rejected": -0.6653813719749451, + "step": 1463 + }, + { + "epoch": 0.17, + "learning_rate": 2.531897459908697e-07, + "logits/chosen": -3.276956796646118, + "logits/rejected": -3.230988025665283, + "logps/chosen": -354.437744140625, + "logps/rejected": -293.67919921875, + "loss": 0.4872, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.009841442108154297, + "rewards/margins": 1.6163280010223389, + "rewards/rejected": -1.6261694431304932, + "step": 1464 + }, + { + "epoch": 0.17, + "learning_rate": 2.5315462952124546e-07, + "logits/chosen": -3.958280563354492, + "logits/rejected": -3.4684972763061523, + "logps/chosen": -142.178955078125, + "logps/rejected": -198.68429565429688, + "loss": 0.6072, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5083469152450562, + "rewards/margins": 0.3178911805152893, + "rewards/rejected": -0.8262380361557007, + "step": 1465 + }, + { + "epoch": 0.17, + "learning_rate": 2.531195130516212e-07, + "logits/chosen": -2.6625940799713135, + "logits/rejected": -2.6984243392944336, + "logps/chosen": -166.5047607421875, + "logps/rejected": -184.11871337890625, + "loss": 0.6604, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3204686641693115, + "rewards/margins": 1.0151715278625488, + "rewards/rejected": -1.3356401920318604, + "step": 1466 + }, + { + "epoch": 0.17, + "learning_rate": 2.5308439658199697e-07, + "logits/chosen": -2.70451021194458, + "logits/rejected": -3.227217674255371, + "logps/chosen": -301.150390625, + "logps/rejected": -261.32763671875, + "loss": 0.3475, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0415346622467041, + "rewards/margins": 1.9166679382324219, + "rewards/rejected": -1.8751332759857178, + "step": 1467 + }, + { + "epoch": 0.17, + "learning_rate": 2.5304928011237267e-07, + "logits/chosen": -2.750596761703491, + "logits/rejected": -2.7246763706207275, + "logps/chosen": -352.0214538574219, + "logps/rejected": -309.0485534667969, + "loss": 0.5166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40644171833992004, + "rewards/margins": 0.7567801475524902, + "rewards/rejected": -1.163221836090088, + "step": 1468 + }, + { + "epoch": 0.17, + "learning_rate": 2.5301416364274843e-07, + "logits/chosen": -3.0502450466156006, + "logits/rejected": -2.9883627891540527, + "logps/chosen": -279.7407531738281, + "logps/rejected": -242.40512084960938, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14638537168502808, + "rewards/margins": 1.4199833869934082, + "rewards/rejected": -1.2735979557037354, + "step": 1469 + }, + { + "epoch": 0.17, + "learning_rate": 2.529790471731242e-07, + "logits/chosen": -3.2775676250457764, + "logits/rejected": -3.4606127738952637, + "logps/chosen": -172.9986572265625, + "logps/rejected": -204.53424072265625, + "loss": 0.4707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17027579247951508, + "rewards/margins": 1.6353858709335327, + "rewards/rejected": -1.805661678314209, + "step": 1470 + }, + { + "epoch": 0.17, + "learning_rate": 2.5294393070349993e-07, + "logits/chosen": -3.2827870845794678, + "logits/rejected": -3.176438093185425, + "logps/chosen": -423.6960754394531, + "logps/rejected": -329.7424621582031, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3224334716796875, + "rewards/margins": 2.721797227859497, + "rewards/rejected": -2.3993639945983887, + "step": 1471 + }, + { + "epoch": 0.17, + "learning_rate": 2.529088142338757e-07, + "logits/chosen": -2.4656100273132324, + "logits/rejected": -2.5939559936523438, + "logps/chosen": -354.7267150878906, + "logps/rejected": -391.6355895996094, + "loss": 0.3407, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38906872272491455, + "rewards/margins": 1.0885554552078247, + "rewards/rejected": -0.6994867324829102, + "step": 1472 + }, + { + "epoch": 0.17, + "learning_rate": 2.528736977642514e-07, + "logits/chosen": -3.2631592750549316, + "logits/rejected": -3.3675734996795654, + "logps/chosen": -224.52090454101562, + "logps/rejected": -180.0391845703125, + "loss": 1.1499, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28892096877098083, + "rewards/margins": 0.10618922114372253, + "rewards/rejected": -0.39511021971702576, + "step": 1473 + }, + { + "epoch": 0.17, + "learning_rate": 2.528385812946272e-07, + "logits/chosen": -2.901008367538452, + "logits/rejected": -3.1624083518981934, + "logps/chosen": -264.330810546875, + "logps/rejected": -174.38119506835938, + "loss": 0.4709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17445456981658936, + "rewards/margins": 0.693864643573761, + "rewards/rejected": -0.8683191537857056, + "step": 1474 + }, + { + "epoch": 0.17, + "learning_rate": 2.528034648250029e-07, + "logits/chosen": -3.1642580032348633, + "logits/rejected": -3.069148540496826, + "logps/chosen": -305.0733947753906, + "logps/rejected": -207.73834228515625, + "loss": 0.2639, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8478806614875793, + "rewards/margins": 1.9835484027862549, + "rewards/rejected": -1.1356676816940308, + "step": 1475 + }, + { + "epoch": 0.17, + "learning_rate": 2.5276834835537865e-07, + "logits/chosen": -3.209831714630127, + "logits/rejected": -3.1313014030456543, + "logps/chosen": -225.40476989746094, + "logps/rejected": -267.27935791015625, + "loss": 0.1668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25886791944503784, + "rewards/margins": 1.9014215469360352, + "rewards/rejected": -1.6425535678863525, + "step": 1476 + }, + { + "epoch": 0.17, + "learning_rate": 2.527332318857544e-07, + "logits/chosen": -2.7364768981933594, + "logits/rejected": -2.755094528198242, + "logps/chosen": -369.8105163574219, + "logps/rejected": -227.78607177734375, + "loss": 0.358, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.030137568712234497, + "rewards/margins": 1.3893910646438599, + "rewards/rejected": -1.419528603553772, + "step": 1477 + }, + { + "epoch": 0.17, + "learning_rate": 2.5269811541613016e-07, + "logits/chosen": -3.484485626220703, + "logits/rejected": -3.40386962890625, + "logps/chosen": -319.8224792480469, + "logps/rejected": -183.54689025878906, + "loss": 0.4416, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03464818000793457, + "rewards/margins": 1.0196079015731812, + "rewards/rejected": -1.0542562007904053, + "step": 1478 + }, + { + "epoch": 0.17, + "learning_rate": 2.526629989465059e-07, + "logits/chosen": -2.840170383453369, + "logits/rejected": -2.5846152305603027, + "logps/chosen": -478.67913818359375, + "logps/rejected": -347.0202331542969, + "loss": 0.5693, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47054025530815125, + "rewards/margins": 0.8161214590072632, + "rewards/rejected": -1.2866617441177368, + "step": 1479 + }, + { + "epoch": 0.17, + "learning_rate": 2.5262788247688167e-07, + "logits/chosen": -3.4284145832061768, + "logits/rejected": -3.521925926208496, + "logps/chosen": -190.2987823486328, + "logps/rejected": -235.26800537109375, + "loss": 0.3757, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24903905391693115, + "rewards/margins": 1.112391471862793, + "rewards/rejected": -0.8633524179458618, + "step": 1480 + }, + { + "epoch": 0.17, + "learning_rate": 2.5259276600725737e-07, + "logits/chosen": -2.452449083328247, + "logits/rejected": -2.6403400897979736, + "logps/chosen": -220.6059112548828, + "logps/rejected": -292.6126403808594, + "loss": 0.3999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.013939021155238152, + "rewards/margins": 0.9825767874717712, + "rewards/rejected": -0.9965158104896545, + "step": 1481 + }, + { + "epoch": 0.17, + "learning_rate": 2.525576495376331e-07, + "logits/chosen": -2.3533377647399902, + "logits/rejected": -2.173398494720459, + "logps/chosen": -285.12921142578125, + "logps/rejected": -332.5311279296875, + "loss": 0.6385, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12080040574073792, + "rewards/margins": 0.3772051930427551, + "rewards/rejected": -0.2564047574996948, + "step": 1482 + }, + { + "epoch": 0.17, + "learning_rate": 2.525225330680089e-07, + "logits/chosen": -2.3709564208984375, + "logits/rejected": -2.4902408123016357, + "logps/chosen": -276.57794189453125, + "logps/rejected": -315.3499755859375, + "loss": 0.5572, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33589598536491394, + "rewards/margins": 0.8976808786392212, + "rewards/rejected": -1.233577013015747, + "step": 1483 + }, + { + "epoch": 0.17, + "learning_rate": 2.5248741659838463e-07, + "logits/chosen": -2.551868438720703, + "logits/rejected": -2.747617483139038, + "logps/chosen": -227.4838409423828, + "logps/rejected": -255.47607421875, + "loss": 0.445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.038092032074928284, + "rewards/margins": 0.9756458401679993, + "rewards/rejected": -1.0137379169464111, + "step": 1484 + }, + { + "epoch": 0.17, + "learning_rate": 2.524523001287604e-07, + "logits/chosen": -3.8182642459869385, + "logits/rejected": -3.6585323810577393, + "logps/chosen": -233.0343780517578, + "logps/rejected": -254.59454345703125, + "loss": 0.6396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42380088567733765, + "rewards/margins": 0.45865052938461304, + "rewards/rejected": -0.8824514150619507, + "step": 1485 + }, + { + "epoch": 0.17, + "learning_rate": 2.5241718365913614e-07, + "logits/chosen": -2.9236905574798584, + "logits/rejected": -2.729034185409546, + "logps/chosen": -253.08409118652344, + "logps/rejected": -233.7904815673828, + "loss": 0.5589, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5912806391716003, + "rewards/margins": 1.1578669548034668, + "rewards/rejected": -1.749147653579712, + "step": 1486 + }, + { + "epoch": 0.17, + "learning_rate": 2.523820671895119e-07, + "logits/chosen": -2.9666190147399902, + "logits/rejected": -3.2817306518554688, + "logps/chosen": -329.497314453125, + "logps/rejected": -278.8468017578125, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007200382649898529, + "rewards/margins": 1.053473949432373, + "rewards/rejected": -1.0462735891342163, + "step": 1487 + }, + { + "epoch": 0.17, + "learning_rate": 2.5234695071988764e-07, + "logits/chosen": -3.2970314025878906, + "logits/rejected": -3.1642990112304688, + "logps/chosen": -418.9378967285156, + "logps/rejected": -285.11322021484375, + "loss": 0.4103, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.033888038247823715, + "rewards/margins": 2.2888152599334717, + "rewards/rejected": -2.254927158355713, + "step": 1488 + }, + { + "epoch": 0.17, + "learning_rate": 2.5231183425026335e-07, + "logits/chosen": -2.9056015014648438, + "logits/rejected": -2.822819948196411, + "logps/chosen": -144.73825073242188, + "logps/rejected": -194.91416931152344, + "loss": 0.5054, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21557393670082092, + "rewards/margins": 0.5858629941940308, + "rewards/rejected": -0.8014369606971741, + "step": 1489 + }, + { + "epoch": 0.17, + "learning_rate": 2.522767177806391e-07, + "logits/chosen": -3.5405848026275635, + "logits/rejected": -3.176156520843506, + "logps/chosen": -420.5745849609375, + "logps/rejected": -404.83551025390625, + "loss": 1.2178, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17814184725284576, + "rewards/margins": 0.3955309987068176, + "rewards/rejected": -0.5736728310585022, + "step": 1490 + }, + { + "epoch": 0.17, + "learning_rate": 2.5224160131101485e-07, + "logits/chosen": -2.5740370750427246, + "logits/rejected": -2.3131885528564453, + "logps/chosen": -362.47308349609375, + "logps/rejected": -329.52801513671875, + "loss": 0.5806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0076027363538742065, + "rewards/margins": 0.439958781003952, + "rewards/rejected": -0.4475615620613098, + "step": 1491 + }, + { + "epoch": 0.17, + "learning_rate": 2.522064848413906e-07, + "logits/chosen": -3.3274459838867188, + "logits/rejected": -3.0136053562164307, + "logps/chosen": -185.93385314941406, + "logps/rejected": -217.38868713378906, + "loss": 0.292, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2784789800643921, + "rewards/margins": 1.7974997758865356, + "rewards/rejected": -1.519020915031433, + "step": 1492 + }, + { + "epoch": 0.17, + "learning_rate": 2.5217136837176636e-07, + "logits/chosen": -3.760270357131958, + "logits/rejected": -3.2762935161590576, + "logps/chosen": -251.96646118164062, + "logps/rejected": -295.4446716308594, + "loss": 0.4716, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04217401146888733, + "rewards/margins": 1.0856494903564453, + "rewards/rejected": -1.1278234720230103, + "step": 1493 + }, + { + "epoch": 0.17, + "learning_rate": 2.5213625190214206e-07, + "logits/chosen": -3.0328121185302734, + "logits/rejected": -2.936918258666992, + "logps/chosen": -201.1170196533203, + "logps/rejected": -168.11474609375, + "loss": 0.8659, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6281031966209412, + "rewards/margins": 0.08696767687797546, + "rewards/rejected": -0.7150708436965942, + "step": 1494 + }, + { + "epoch": 0.17, + "learning_rate": 2.5210113543251787e-07, + "logits/chosen": -3.338139533996582, + "logits/rejected": -3.2716612815856934, + "logps/chosen": -144.10745239257812, + "logps/rejected": -215.35394287109375, + "loss": 0.3836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05990919470787048, + "rewards/margins": 1.7733186483383179, + "rewards/rejected": -1.713409423828125, + "step": 1495 + }, + { + "epoch": 0.17, + "learning_rate": 2.520660189628936e-07, + "logits/chosen": -3.7239742279052734, + "logits/rejected": -3.289851427078247, + "logps/chosen": -302.9237060546875, + "logps/rejected": -237.79861450195312, + "loss": 0.2464, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38266369700431824, + "rewards/margins": 1.9189233779907227, + "rewards/rejected": -1.5362597703933716, + "step": 1496 + }, + { + "epoch": 0.17, + "learning_rate": 2.520309024932693e-07, + "logits/chosen": -3.18709135055542, + "logits/rejected": -3.2826099395751953, + "logps/chosen": -145.81912231445312, + "logps/rejected": -116.89448547363281, + "loss": 0.5348, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004350811243057251, + "rewards/margins": 0.8543806672096252, + "rewards/rejected": -0.8587315082550049, + "step": 1497 + }, + { + "epoch": 0.17, + "learning_rate": 2.519957860236451e-07, + "logits/chosen": -3.356926679611206, + "logits/rejected": -3.1862244606018066, + "logps/chosen": -416.5458068847656, + "logps/rejected": -238.13705444335938, + "loss": 0.3507, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08060942590236664, + "rewards/margins": 1.4154481887817383, + "rewards/rejected": -1.4960577487945557, + "step": 1498 + }, + { + "epoch": 0.17, + "learning_rate": 2.5196066955402083e-07, + "logits/chosen": -3.050638198852539, + "logits/rejected": -3.012503147125244, + "logps/chosen": -145.2467041015625, + "logps/rejected": -184.32815551757812, + "loss": 0.7082, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6995250582695007, + "rewards/margins": 0.56214439868927, + "rewards/rejected": -1.2616695165634155, + "step": 1499 + }, + { + "epoch": 0.17, + "learning_rate": 2.519255530843966e-07, + "logits/chosen": -3.6339104175567627, + "logits/rejected": -3.552980422973633, + "logps/chosen": -240.80221557617188, + "logps/rejected": -224.62625122070312, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19340506196022034, + "rewards/margins": 1.5050827264785767, + "rewards/rejected": -1.6984878778457642, + "step": 1500 + }, + { + "epoch": 0.17, + "learning_rate": 2.5189043661477234e-07, + "logits/chosen": -3.8133316040039062, + "logits/rejected": -3.6130716800689697, + "logps/chosen": -183.78750610351562, + "logps/rejected": -259.74945068359375, + "loss": 0.3945, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07362888008356094, + "rewards/margins": 2.4486570358276367, + "rewards/rejected": -2.5222856998443604, + "step": 1501 + }, + { + "epoch": 0.17, + "learning_rate": 2.5185532014514804e-07, + "logits/chosen": -3.3617734909057617, + "logits/rejected": -3.133798837661743, + "logps/chosen": -216.087646484375, + "logps/rejected": -271.9454345703125, + "loss": 0.4731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1340225785970688, + "rewards/margins": 1.750115156173706, + "rewards/rejected": -1.8841376304626465, + "step": 1502 + }, + { + "epoch": 0.17, + "learning_rate": 2.518202036755238e-07, + "logits/chosen": -3.1169919967651367, + "logits/rejected": -2.9825000762939453, + "logps/chosen": -283.0966796875, + "logps/rejected": -231.33950805664062, + "loss": 0.418, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24275217950344086, + "rewards/margins": 1.3899757862091064, + "rewards/rejected": -1.632727861404419, + "step": 1503 + }, + { + "epoch": 0.17, + "learning_rate": 2.5178508720589955e-07, + "logits/chosen": -2.987081527709961, + "logits/rejected": -2.6397030353546143, + "logps/chosen": -219.1697998046875, + "logps/rejected": -131.4944610595703, + "loss": 0.541, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1636643409729004, + "rewards/margins": 0.5931396484375, + "rewards/rejected": -0.7568040490150452, + "step": 1504 + }, + { + "epoch": 0.17, + "learning_rate": 2.517499707362753e-07, + "logits/chosen": -3.062993288040161, + "logits/rejected": -2.8243839740753174, + "logps/chosen": -172.07723999023438, + "logps/rejected": -155.2359619140625, + "loss": 0.7437, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5870838761329651, + "rewards/margins": 0.43352046608924866, + "rewards/rejected": -1.0206043720245361, + "step": 1505 + }, + { + "epoch": 0.17, + "learning_rate": 2.5171485426665105e-07, + "logits/chosen": -3.0968070030212402, + "logits/rejected": -3.1918082237243652, + "logps/chosen": -312.1986999511719, + "logps/rejected": -229.95018005371094, + "loss": 0.4018, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03913099318742752, + "rewards/margins": 1.4451513290405273, + "rewards/rejected": -1.4060204029083252, + "step": 1506 + }, + { + "epoch": 0.17, + "learning_rate": 2.5167973779702676e-07, + "logits/chosen": -3.5944979190826416, + "logits/rejected": -3.679166793823242, + "logps/chosen": -146.9656219482422, + "logps/rejected": -180.70065307617188, + "loss": 0.3643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20001515746116638, + "rewards/margins": 1.4685659408569336, + "rewards/rejected": -1.6685811281204224, + "step": 1507 + }, + { + "epoch": 0.17, + "learning_rate": 2.5164462132740256e-07, + "logits/chosen": -3.5475285053253174, + "logits/rejected": -3.7423617839813232, + "logps/chosen": -297.0629577636719, + "logps/rejected": -312.3211975097656, + "loss": 0.2129, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06770460307598114, + "rewards/margins": 2.316929340362549, + "rewards/rejected": -2.2492244243621826, + "step": 1508 + }, + { + "epoch": 0.17, + "learning_rate": 2.516095048577783e-07, + "logits/chosen": -3.592390537261963, + "logits/rejected": -3.5594072341918945, + "logps/chosen": -379.09405517578125, + "logps/rejected": -354.7581481933594, + "loss": 0.2075, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30030566453933716, + "rewards/margins": 1.8552958965301514, + "rewards/rejected": -1.554990291595459, + "step": 1509 + }, + { + "epoch": 0.17, + "learning_rate": 2.51574388388154e-07, + "logits/chosen": -2.481785774230957, + "logits/rejected": -2.6133971214294434, + "logps/chosen": -364.5009765625, + "logps/rejected": -284.3797912597656, + "loss": 0.5773, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30536380410194397, + "rewards/margins": 1.714145302772522, + "rewards/rejected": -2.0195090770721436, + "step": 1510 + }, + { + "epoch": 0.17, + "learning_rate": 2.5153927191852977e-07, + "logits/chosen": -3.0965898036956787, + "logits/rejected": -2.941537380218506, + "logps/chosen": -162.57894897460938, + "logps/rejected": -212.2760772705078, + "loss": 0.4261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0826662927865982, + "rewards/margins": 1.461862564086914, + "rewards/rejected": -1.544528841972351, + "step": 1511 + }, + { + "epoch": 0.17, + "learning_rate": 2.515041554489055e-07, + "logits/chosen": -3.766932964324951, + "logits/rejected": -3.260315418243408, + "logps/chosen": -161.52146911621094, + "logps/rejected": -216.46792602539062, + "loss": 0.394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07360772788524628, + "rewards/margins": 1.493163824081421, + "rewards/rejected": -1.5667716264724731, + "step": 1512 + }, + { + "epoch": 0.17, + "learning_rate": 2.514690389792813e-07, + "logits/chosen": -3.773805618286133, + "logits/rejected": -3.721752882003784, + "logps/chosen": -260.4869384765625, + "logps/rejected": -601.2622680664062, + "loss": 0.5625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17422962188720703, + "rewards/margins": 1.3251670598983765, + "rewards/rejected": -1.4993966817855835, + "step": 1513 + }, + { + "epoch": 0.17, + "learning_rate": 2.5143392250965703e-07, + "logits/chosen": -3.7214431762695312, + "logits/rejected": -3.800624370574951, + "logps/chosen": -208.68771362304688, + "logps/rejected": -178.89479064941406, + "loss": 0.2989, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2097526341676712, + "rewards/margins": 1.836012840270996, + "rewards/rejected": -1.6262602806091309, + "step": 1514 + }, + { + "epoch": 0.17, + "learning_rate": 2.5139880604003273e-07, + "logits/chosen": -2.7074971199035645, + "logits/rejected": -2.684040069580078, + "logps/chosen": -292.5515441894531, + "logps/rejected": -375.0087585449219, + "loss": 0.3097, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.033129312098026276, + "rewards/margins": 1.973036289215088, + "rewards/rejected": -1.9399070739746094, + "step": 1515 + }, + { + "epoch": 0.17, + "learning_rate": 2.513636895704085e-07, + "logits/chosen": -2.9524431228637695, + "logits/rejected": -2.5515966415405273, + "logps/chosen": -151.3790283203125, + "logps/rejected": -129.77256774902344, + "loss": 0.531, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.036276623606681824, + "rewards/margins": 0.5077760815620422, + "rewards/rejected": -0.5440527200698853, + "step": 1516 + }, + { + "epoch": 0.17, + "learning_rate": 2.513285731007843e-07, + "logits/chosen": -2.5420925617218018, + "logits/rejected": -2.2470641136169434, + "logps/chosen": -178.55606079101562, + "logps/rejected": -325.8009033203125, + "loss": 0.4894, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24279308319091797, + "rewards/margins": 0.7724299430847168, + "rewards/rejected": -1.0152230262756348, + "step": 1517 + }, + { + "epoch": 0.17, + "learning_rate": 2.5129345663116e-07, + "logits/chosen": -2.867743730545044, + "logits/rejected": -3.0709424018859863, + "logps/chosen": -355.54925537109375, + "logps/rejected": -321.88739013671875, + "loss": 0.2218, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3604843020439148, + "rewards/margins": 2.753653049468994, + "rewards/rejected": -2.3931686878204346, + "step": 1518 + }, + { + "epoch": 0.18, + "learning_rate": 2.5125834016153575e-07, + "logits/chosen": -3.983919382095337, + "logits/rejected": -3.6354708671569824, + "logps/chosen": -249.71902465820312, + "logps/rejected": -216.30906677246094, + "loss": 0.5517, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17475387454032898, + "rewards/margins": 1.0399212837219238, + "rewards/rejected": -0.8651673793792725, + "step": 1519 + }, + { + "epoch": 0.18, + "learning_rate": 2.512232236919115e-07, + "logits/chosen": -2.983504295349121, + "logits/rejected": -3.1349196434020996, + "logps/chosen": -358.331787109375, + "logps/rejected": -359.50201416015625, + "loss": 0.6995, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.90455561876297, + "rewards/margins": 0.47947126626968384, + "rewards/rejected": -1.3840270042419434, + "step": 1520 + }, + { + "epoch": 0.18, + "learning_rate": 2.5118810722228726e-07, + "logits/chosen": -3.6364874839782715, + "logits/rejected": -3.4873604774475098, + "logps/chosen": -223.56915283203125, + "logps/rejected": -215.62892150878906, + "loss": 0.8344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16205745935440063, + "rewards/margins": 0.4507281184196472, + "rewards/rejected": -0.6127855777740479, + "step": 1521 + }, + { + "epoch": 0.18, + "learning_rate": 2.51152990752663e-07, + "logits/chosen": -2.818822145462036, + "logits/rejected": -2.511584520339966, + "logps/chosen": -177.04193115234375, + "logps/rejected": -202.612060546875, + "loss": 0.6728, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10911092162132263, + "rewards/margins": 0.38139206171035767, + "rewards/rejected": -0.4905029535293579, + "step": 1522 + }, + { + "epoch": 0.18, + "learning_rate": 2.511178742830387e-07, + "logits/chosen": -3.6642038822174072, + "logits/rejected": -3.6444716453552246, + "logps/chosen": -151.7933807373047, + "logps/rejected": -222.90638732910156, + "loss": 0.5246, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12398136407136917, + "rewards/margins": 0.6475263237953186, + "rewards/rejected": -0.7715076208114624, + "step": 1523 + }, + { + "epoch": 0.18, + "learning_rate": 2.5108275781341447e-07, + "logits/chosen": -2.939218282699585, + "logits/rejected": -2.7400050163269043, + "logps/chosen": -351.4151306152344, + "logps/rejected": -290.5722351074219, + "loss": 0.3827, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11940590292215347, + "rewards/margins": 1.3269720077514648, + "rewards/rejected": -1.207566261291504, + "step": 1524 + }, + { + "epoch": 0.18, + "learning_rate": 2.510476413437902e-07, + "logits/chosen": -3.1605799198150635, + "logits/rejected": -2.9916059970855713, + "logps/chosen": -201.02120971679688, + "logps/rejected": -244.4366455078125, + "loss": 0.6615, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.341769814491272, + "rewards/margins": 0.650853157043457, + "rewards/rejected": -0.992622971534729, + "step": 1525 + }, + { + "epoch": 0.18, + "learning_rate": 2.5101252487416597e-07, + "logits/chosen": -3.2518396377563477, + "logits/rejected": -2.992037296295166, + "logps/chosen": -319.9474182128906, + "logps/rejected": -218.11830139160156, + "loss": 0.4058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44514983892440796, + "rewards/margins": 0.8046483993530273, + "rewards/rejected": -1.2497981786727905, + "step": 1526 + }, + { + "epoch": 0.18, + "learning_rate": 2.5097740840454173e-07, + "logits/chosen": -3.189215898513794, + "logits/rejected": -3.2065792083740234, + "logps/chosen": -262.51348876953125, + "logps/rejected": -177.74945068359375, + "loss": 0.4016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0813589096069336, + "rewards/margins": 1.019492268562317, + "rewards/rejected": -1.100851058959961, + "step": 1527 + }, + { + "epoch": 0.18, + "learning_rate": 2.5094229193491743e-07, + "logits/chosen": -3.1080856323242188, + "logits/rejected": -3.2889695167541504, + "logps/chosen": -206.8444061279297, + "logps/rejected": -307.3691101074219, + "loss": 0.1961, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.054033853113651276, + "rewards/margins": 2.93294358253479, + "rewards/rejected": -2.8789095878601074, + "step": 1528 + }, + { + "epoch": 0.18, + "learning_rate": 2.5090717546529323e-07, + "logits/chosen": -2.9138283729553223, + "logits/rejected": -3.11415958404541, + "logps/chosen": -203.69384765625, + "logps/rejected": -273.83673095703125, + "loss": 0.3313, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5592360496520996, + "rewards/margins": 1.452459692955017, + "rewards/rejected": -0.8932234644889832, + "step": 1529 + }, + { + "epoch": 0.18, + "learning_rate": 2.50872058995669e-07, + "logits/chosen": -3.6793699264526367, + "logits/rejected": -3.726012706756592, + "logps/chosen": -249.152587890625, + "logps/rejected": -315.2601013183594, + "loss": 0.1541, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.61383056640625, + "rewards/margins": 3.3448610305786133, + "rewards/rejected": -2.7310304641723633, + "step": 1530 + }, + { + "epoch": 0.18, + "learning_rate": 2.508369425260447e-07, + "logits/chosen": -3.5303189754486084, + "logits/rejected": -3.0636942386627197, + "logps/chosen": -327.10516357421875, + "logps/rejected": -171.04747009277344, + "loss": 0.158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6687989234924316, + "rewards/margins": 2.985759735107422, + "rewards/rejected": -2.316960334777832, + "step": 1531 + }, + { + "epoch": 0.18, + "learning_rate": 2.5080182605642044e-07, + "logits/chosen": -3.8624749183654785, + "logits/rejected": -3.7587456703186035, + "logps/chosen": -182.19215393066406, + "logps/rejected": -177.85023498535156, + "loss": 0.472, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23013362288475037, + "rewards/margins": 0.9002361297607422, + "rewards/rejected": -0.6701024770736694, + "step": 1532 + }, + { + "epoch": 0.18, + "learning_rate": 2.507667095867962e-07, + "logits/chosen": -2.931903839111328, + "logits/rejected": -2.8240795135498047, + "logps/chosen": -230.96624755859375, + "logps/rejected": -189.26007080078125, + "loss": 0.4907, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.034889012575149536, + "rewards/margins": 1.2451589107513428, + "rewards/rejected": -1.28004789352417, + "step": 1533 + }, + { + "epoch": 0.18, + "learning_rate": 2.5073159311717195e-07, + "logits/chosen": -3.2453575134277344, + "logits/rejected": -3.176525115966797, + "logps/chosen": -328.6094665527344, + "logps/rejected": -371.0775146484375, + "loss": 0.5481, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15590377151966095, + "rewards/margins": 0.8421227931976318, + "rewards/rejected": -0.9980265498161316, + "step": 1534 + }, + { + "epoch": 0.18, + "learning_rate": 2.506964766475477e-07, + "logits/chosen": -3.648616313934326, + "logits/rejected": -3.693873405456543, + "logps/chosen": -206.76675415039062, + "logps/rejected": -203.88589477539062, + "loss": 0.3627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25549715757369995, + "rewards/margins": 1.6640931367874146, + "rewards/rejected": -1.9195902347564697, + "step": 1535 + }, + { + "epoch": 0.18, + "learning_rate": 2.506613601779234e-07, + "logits/chosen": -2.862778902053833, + "logits/rejected": -3.0783872604370117, + "logps/chosen": -160.05374145507812, + "logps/rejected": -199.77247619628906, + "loss": 0.3634, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11497992277145386, + "rewards/margins": 1.8958029747009277, + "rewards/rejected": -1.7808231115341187, + "step": 1536 + }, + { + "epoch": 0.18, + "learning_rate": 2.5062624370829916e-07, + "logits/chosen": -2.9934234619140625, + "logits/rejected": -3.3008384704589844, + "logps/chosen": -265.7355651855469, + "logps/rejected": -254.08668518066406, + "loss": 0.341, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06950540840625763, + "rewards/margins": 1.3061575889587402, + "rewards/rejected": -1.236652135848999, + "step": 1537 + }, + { + "epoch": 0.18, + "learning_rate": 2.505911272386749e-07, + "logits/chosen": -3.3784804344177246, + "logits/rejected": -3.5589330196380615, + "logps/chosen": -198.95965576171875, + "logps/rejected": -218.23545837402344, + "loss": 0.414, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016104191541671753, + "rewards/margins": 1.0998241901397705, + "rewards/rejected": -1.0837198495864868, + "step": 1538 + }, + { + "epoch": 0.18, + "learning_rate": 2.5055601076905067e-07, + "logits/chosen": -3.2002041339874268, + "logits/rejected": -2.983340263366699, + "logps/chosen": -346.22802734375, + "logps/rejected": -166.5175323486328, + "loss": 0.4991, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21396848559379578, + "rewards/margins": 0.7792195081710815, + "rewards/rejected": -0.5652509927749634, + "step": 1539 + }, + { + "epoch": 0.18, + "learning_rate": 2.505208942994264e-07, + "logits/chosen": -2.4443418979644775, + "logits/rejected": -2.3749611377716064, + "logps/chosen": -231.0518035888672, + "logps/rejected": -248.08737182617188, + "loss": 0.3561, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1508564054965973, + "rewards/margins": 1.5054519176483154, + "rewards/rejected": -1.354595422744751, + "step": 1540 + }, + { + "epoch": 0.18, + "learning_rate": 2.504857778298022e-07, + "logits/chosen": -2.6610379219055176, + "logits/rejected": -3.072960615158081, + "logps/chosen": -233.3808135986328, + "logps/rejected": -246.25082397460938, + "loss": 0.3089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012616094201803207, + "rewards/margins": 1.4887852668762207, + "rewards/rejected": -1.5014013051986694, + "step": 1541 + }, + { + "epoch": 0.18, + "learning_rate": 2.5045066136017793e-07, + "logits/chosen": -2.637578248977661, + "logits/rejected": -2.898172378540039, + "logps/chosen": -214.5455322265625, + "logps/rejected": -280.34515380859375, + "loss": 0.3984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4203321635723114, + "rewards/margins": 1.1077349185943604, + "rewards/rejected": -1.5280669927597046, + "step": 1542 + }, + { + "epoch": 0.18, + "learning_rate": 2.504155448905537e-07, + "logits/chosen": -2.4137964248657227, + "logits/rejected": -2.6297972202301025, + "logps/chosen": -272.5074462890625, + "logps/rejected": -201.68695068359375, + "loss": 0.4451, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027765393257141113, + "rewards/margins": 1.4940557479858398, + "rewards/rejected": -1.4662903547286987, + "step": 1543 + }, + { + "epoch": 0.18, + "learning_rate": 2.503804284209294e-07, + "logits/chosen": -3.3724944591522217, + "logits/rejected": -3.337322235107422, + "logps/chosen": -229.9352264404297, + "logps/rejected": -249.21282958984375, + "loss": 0.418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5513677597045898, + "rewards/margins": 1.8722014427185059, + "rewards/rejected": -2.4235692024230957, + "step": 1544 + }, + { + "epoch": 0.18, + "learning_rate": 2.5034531195130514e-07, + "logits/chosen": -3.412881374359131, + "logits/rejected": -3.261493682861328, + "logps/chosen": -129.72894287109375, + "logps/rejected": -200.30181884765625, + "loss": 0.4424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1342257261276245, + "rewards/margins": 1.9664995670318604, + "rewards/rejected": -2.1007254123687744, + "step": 1545 + }, + { + "epoch": 0.18, + "learning_rate": 2.503101954816809e-07, + "logits/chosen": -2.6883387565612793, + "logits/rejected": -2.8215625286102295, + "logps/chosen": -202.64227294921875, + "logps/rejected": -136.78631591796875, + "loss": 0.5223, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06284372508525848, + "rewards/margins": 0.5342840552330017, + "rewards/rejected": -0.4714403748512268, + "step": 1546 + }, + { + "epoch": 0.18, + "learning_rate": 2.5027507901205665e-07, + "logits/chosen": -3.4476959705352783, + "logits/rejected": -3.2603530883789062, + "logps/chosen": -205.8194122314453, + "logps/rejected": -305.54925537109375, + "loss": 0.3538, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11508862674236298, + "rewards/margins": 1.584865689277649, + "rewards/rejected": -1.46977698802948, + "step": 1547 + }, + { + "epoch": 0.18, + "learning_rate": 2.502399625424324e-07, + "logits/chosen": -3.1585943698883057, + "logits/rejected": -3.5449790954589844, + "logps/chosen": -209.16317749023438, + "logps/rejected": -332.84423828125, + "loss": 0.26, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2629750967025757, + "rewards/margins": 1.6706701517105103, + "rewards/rejected": -1.4076950550079346, + "step": 1548 + }, + { + "epoch": 0.18, + "learning_rate": 2.5020484607280815e-07, + "logits/chosen": -3.0686416625976562, + "logits/rejected": -3.214515447616577, + "logps/chosen": -169.7054443359375, + "logps/rejected": -135.6467742919922, + "loss": 0.7111, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7438569068908691, + "rewards/margins": 0.21290910243988037, + "rewards/rejected": -0.9567660689353943, + "step": 1549 + }, + { + "epoch": 0.18, + "learning_rate": 2.5016972960318385e-07, + "logits/chosen": -3.47174334526062, + "logits/rejected": -3.284346342086792, + "logps/chosen": -297.818359375, + "logps/rejected": -281.1500549316406, + "loss": 0.8368, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06337091326713562, + "rewards/margins": 0.37006694078445435, + "rewards/rejected": -0.3066960871219635, + "step": 1550 + }, + { + "epoch": 0.18, + "learning_rate": 2.5013461313355966e-07, + "logits/chosen": -3.2129576206207275, + "logits/rejected": -3.3759443759918213, + "logps/chosen": -330.897216796875, + "logps/rejected": -324.5155334472656, + "loss": 0.2553, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7761394381523132, + "rewards/margins": 2.4174435138702393, + "rewards/rejected": -1.6413041353225708, + "step": 1551 + }, + { + "epoch": 0.18, + "learning_rate": 2.5009949666393536e-07, + "logits/chosen": -3.4097652435302734, + "logits/rejected": -3.7561657428741455, + "logps/chosen": -240.09689331054688, + "logps/rejected": -292.41033935546875, + "loss": 0.6568, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19328394532203674, + "rewards/margins": 0.9810360670089722, + "rewards/rejected": -1.1743199825286865, + "step": 1552 + }, + { + "epoch": 0.18, + "learning_rate": 2.500643801943111e-07, + "logits/chosen": -3.154953956604004, + "logits/rejected": -3.1746201515197754, + "logps/chosen": -223.4775390625, + "logps/rejected": -233.44090270996094, + "loss": 0.6542, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03741864860057831, + "rewards/margins": 0.4985285997390747, + "rewards/rejected": -0.4611099362373352, + "step": 1553 + }, + { + "epoch": 0.18, + "learning_rate": 2.5002926372468687e-07, + "logits/chosen": -2.7029380798339844, + "logits/rejected": -3.0414600372314453, + "logps/chosen": -222.06947326660156, + "logps/rejected": -333.65850830078125, + "loss": 0.4251, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06763535737991333, + "rewards/margins": 1.301175832748413, + "rewards/rejected": -1.233540415763855, + "step": 1554 + }, + { + "epoch": 0.18, + "learning_rate": 2.499941472550626e-07, + "logits/chosen": -2.7445449829101562, + "logits/rejected": -2.816120147705078, + "logps/chosen": -183.89080810546875, + "logps/rejected": -212.0049285888672, + "loss": 0.4289, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00840894877910614, + "rewards/margins": 1.3367893695831299, + "rewards/rejected": -1.3283804655075073, + "step": 1555 + }, + { + "epoch": 0.18, + "learning_rate": 2.499590307854384e-07, + "logits/chosen": -3.7259578704833984, + "logits/rejected": -3.061728000640869, + "logps/chosen": -440.28936767578125, + "logps/rejected": -236.62127685546875, + "loss": 0.4553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2671167254447937, + "rewards/margins": 1.394223690032959, + "rewards/rejected": -1.661340594291687, + "step": 1556 + }, + { + "epoch": 0.18, + "learning_rate": 2.4992391431581413e-07, + "logits/chosen": -3.3398337364196777, + "logits/rejected": -3.5374815464019775, + "logps/chosen": -210.3216552734375, + "logps/rejected": -361.6911315917969, + "loss": 0.2462, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21619391441345215, + "rewards/margins": 2.6669905185699463, + "rewards/rejected": -2.450796365737915, + "step": 1557 + }, + { + "epoch": 0.18, + "learning_rate": 2.4988879784618983e-07, + "logits/chosen": -3.350187301635742, + "logits/rejected": -3.329085111618042, + "logps/chosen": -248.35025024414062, + "logps/rejected": -478.1139831542969, + "loss": 0.6189, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20270265638828278, + "rewards/margins": 1.2441428899765015, + "rewards/rejected": -1.446845531463623, + "step": 1558 + }, + { + "epoch": 0.18, + "learning_rate": 2.498536813765656e-07, + "logits/chosen": -2.966024160385132, + "logits/rejected": -2.862165689468384, + "logps/chosen": -133.44998168945312, + "logps/rejected": -272.40924072265625, + "loss": 0.3241, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35100653767585754, + "rewards/margins": 1.519694447517395, + "rewards/rejected": -1.1686878204345703, + "step": 1559 + }, + { + "epoch": 0.18, + "learning_rate": 2.4981856490694134e-07, + "logits/chosen": -2.8258790969848633, + "logits/rejected": -2.789914131164551, + "logps/chosen": -287.9441223144531, + "logps/rejected": -319.75823974609375, + "loss": 0.3991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08661016821861267, + "rewards/margins": 1.0961226224899292, + "rewards/rejected": -1.1827328205108643, + "step": 1560 + }, + { + "epoch": 0.18, + "learning_rate": 2.497834484373171e-07, + "logits/chosen": -2.6846508979797363, + "logits/rejected": -3.1744747161865234, + "logps/chosen": -316.5074157714844, + "logps/rejected": -177.9369354248047, + "loss": 0.6516, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5177503824234009, + "rewards/margins": 0.621163010597229, + "rewards/rejected": -1.1389133930206299, + "step": 1561 + }, + { + "epoch": 0.18, + "learning_rate": 2.4974833196769285e-07, + "logits/chosen": -3.4028801918029785, + "logits/rejected": -3.491983413696289, + "logps/chosen": -88.90687561035156, + "logps/rejected": -194.2624969482422, + "loss": 0.3695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.047424376010894775, + "rewards/margins": 1.8772265911102295, + "rewards/rejected": -1.8298022747039795, + "step": 1562 + }, + { + "epoch": 0.18, + "learning_rate": 2.4971321549806855e-07, + "logits/chosen": -3.5973734855651855, + "logits/rejected": -3.220582962036133, + "logps/chosen": -420.0931396484375, + "logps/rejected": -308.4820861816406, + "loss": 0.4165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1363658308982849, + "rewards/margins": 1.0620437860488892, + "rewards/rejected": -1.1984095573425293, + "step": 1563 + }, + { + "epoch": 0.18, + "learning_rate": 2.4967809902844435e-07, + "logits/chosen": -3.2502803802490234, + "logits/rejected": -3.1472179889678955, + "logps/chosen": -392.24847412109375, + "logps/rejected": -310.6517333984375, + "loss": 0.4379, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2914094924926758, + "rewards/margins": 1.9295068979263306, + "rewards/rejected": -1.6380972862243652, + "step": 1564 + }, + { + "epoch": 0.18, + "learning_rate": 2.4964298255882006e-07, + "logits/chosen": -3.1605935096740723, + "logits/rejected": -2.9460926055908203, + "logps/chosen": -319.314208984375, + "logps/rejected": -148.277587890625, + "loss": 0.3422, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2379521131515503, + "rewards/margins": 1.936168909072876, + "rewards/rejected": -1.6982166767120361, + "step": 1565 + }, + { + "epoch": 0.18, + "learning_rate": 2.496078660891958e-07, + "logits/chosen": -3.3417553901672363, + "logits/rejected": -3.0829310417175293, + "logps/chosen": -193.19308471679688, + "logps/rejected": -207.80809020996094, + "loss": 0.7751, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3709440231323242, + "rewards/margins": 1.8440539836883545, + "rewards/rejected": -2.214998245239258, + "step": 1566 + }, + { + "epoch": 0.18, + "learning_rate": 2.4957274961957156e-07, + "logits/chosen": -2.997227668762207, + "logits/rejected": -2.9109063148498535, + "logps/chosen": -351.82177734375, + "logps/rejected": -340.80035400390625, + "loss": 0.5469, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24442750215530396, + "rewards/margins": 0.815355658531189, + "rewards/rejected": -1.0597832202911377, + "step": 1567 + }, + { + "epoch": 0.18, + "learning_rate": 2.495376331499473e-07, + "logits/chosen": -2.496757745742798, + "logits/rejected": -2.7690396308898926, + "logps/chosen": -313.14971923828125, + "logps/rejected": -284.92803955078125, + "loss": 0.6893, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0612785667181015, + "rewards/margins": 0.6400415897369385, + "rewards/rejected": -0.5787630081176758, + "step": 1568 + }, + { + "epoch": 0.18, + "learning_rate": 2.4950251668032307e-07, + "logits/chosen": -2.715089797973633, + "logits/rejected": -2.7075071334838867, + "logps/chosen": -352.3373718261719, + "logps/rejected": -322.6280517578125, + "loss": 0.671, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25847962498664856, + "rewards/margins": 0.31642913818359375, + "rewards/rejected": -0.5749087929725647, + "step": 1569 + }, + { + "epoch": 0.18, + "learning_rate": 2.494674002106988e-07, + "logits/chosen": -3.4108870029449463, + "logits/rejected": -3.565894365310669, + "logps/chosen": -241.74029541015625, + "logps/rejected": -273.7139587402344, + "loss": 0.5391, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24049139022827148, + "rewards/margins": 1.1604478359222412, + "rewards/rejected": -1.4009393453598022, + "step": 1570 + }, + { + "epoch": 0.18, + "learning_rate": 2.494322837410745e-07, + "logits/chosen": -2.145329475402832, + "logits/rejected": -2.1749789714813232, + "logps/chosen": -238.5030517578125, + "logps/rejected": -188.67239379882812, + "loss": 0.6515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09605341404676437, + "rewards/margins": 0.7354627251625061, + "rewards/rejected": -0.8315160274505615, + "step": 1571 + }, + { + "epoch": 0.18, + "learning_rate": 2.493971672714503e-07, + "logits/chosen": -2.934159755706787, + "logits/rejected": -2.9304256439208984, + "logps/chosen": -255.47142028808594, + "logps/rejected": -272.83392333984375, + "loss": 0.3056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.037884414196014404, + "rewards/margins": 1.394587516784668, + "rewards/rejected": -1.4324719905853271, + "step": 1572 + }, + { + "epoch": 0.18, + "learning_rate": 2.4936205080182603e-07, + "logits/chosen": -2.2936620712280273, + "logits/rejected": -2.2601540088653564, + "logps/chosen": -346.5345153808594, + "logps/rejected": -327.0447998046875, + "loss": 0.2829, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5475779175758362, + "rewards/margins": 1.8047429323196411, + "rewards/rejected": -1.2571649551391602, + "step": 1573 + }, + { + "epoch": 0.18, + "learning_rate": 2.493269343322018e-07, + "logits/chosen": -2.484318971633911, + "logits/rejected": -2.7793354988098145, + "logps/chosen": -204.45579528808594, + "logps/rejected": -228.13340759277344, + "loss": 0.4569, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0884358137845993, + "rewards/margins": 0.9228543639183044, + "rewards/rejected": -0.8344185948371887, + "step": 1574 + }, + { + "epoch": 0.18, + "learning_rate": 2.4929181786257754e-07, + "logits/chosen": -2.2387490272521973, + "logits/rejected": -2.1321918964385986, + "logps/chosen": -168.62783813476562, + "logps/rejected": -255.7255859375, + "loss": 0.4917, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05357038602232933, + "rewards/margins": 0.6330357193946838, + "rewards/rejected": -0.5794653296470642, + "step": 1575 + }, + { + "epoch": 0.18, + "learning_rate": 2.492567013929533e-07, + "logits/chosen": -3.2566893100738525, + "logits/rejected": -3.1014766693115234, + "logps/chosen": -192.5931854248047, + "logps/rejected": -303.3433532714844, + "loss": 0.2376, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3248193562030792, + "rewards/margins": 1.9190062284469604, + "rewards/rejected": -1.5941870212554932, + "step": 1576 + }, + { + "epoch": 0.18, + "learning_rate": 2.4922158492332905e-07, + "logits/chosen": -3.010411500930786, + "logits/rejected": -2.990661144256592, + "logps/chosen": -134.6187286376953, + "logps/rejected": -259.3593444824219, + "loss": 0.43, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23420998454093933, + "rewards/margins": 1.313028335571289, + "rewards/rejected": -1.0788183212280273, + "step": 1577 + }, + { + "epoch": 0.18, + "learning_rate": 2.491864684537048e-07, + "logits/chosen": -2.957808017730713, + "logits/rejected": -2.910841226577759, + "logps/chosen": -189.14065551757812, + "logps/rejected": -182.82687377929688, + "loss": 0.6245, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5441092848777771, + "rewards/margins": 1.0646005868911743, + "rewards/rejected": -1.6087098121643066, + "step": 1578 + }, + { + "epoch": 0.18, + "learning_rate": 2.491513519840805e-07, + "logits/chosen": -3.1919941902160645, + "logits/rejected": -2.9025208950042725, + "logps/chosen": -315.1257019042969, + "logps/rejected": -395.47076416015625, + "loss": 0.598, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09761975705623627, + "rewards/margins": 1.0324183702468872, + "rewards/rejected": -0.934798538684845, + "step": 1579 + }, + { + "epoch": 0.18, + "learning_rate": 2.4911623551445626e-07, + "logits/chosen": -4.053675651550293, + "logits/rejected": -4.055758476257324, + "logps/chosen": -166.01144409179688, + "logps/rejected": -144.49032592773438, + "loss": 0.385, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32841819524765015, + "rewards/margins": 1.7198209762573242, + "rewards/rejected": -1.3914028406143188, + "step": 1580 + }, + { + "epoch": 0.18, + "learning_rate": 2.49081119044832e-07, + "logits/chosen": -2.8934061527252197, + "logits/rejected": -3.2023355960845947, + "logps/chosen": -162.78872680664062, + "logps/rejected": -136.22914123535156, + "loss": 0.5985, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34314340353012085, + "rewards/margins": 1.3626636266708374, + "rewards/rejected": -1.705807089805603, + "step": 1581 + }, + { + "epoch": 0.18, + "learning_rate": 2.4904600257520777e-07, + "logits/chosen": -3.2262425422668457, + "logits/rejected": -3.333815097808838, + "logps/chosen": -275.0162353515625, + "logps/rejected": -268.2195129394531, + "loss": 0.5603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4277450442314148, + "rewards/margins": 1.2416092157363892, + "rewards/rejected": -1.6693542003631592, + "step": 1582 + }, + { + "epoch": 0.18, + "learning_rate": 2.490108861055835e-07, + "logits/chosen": -3.454385757446289, + "logits/rejected": -2.9647676944732666, + "logps/chosen": -381.7331848144531, + "logps/rejected": -275.11383056640625, + "loss": 0.3108, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1144372969865799, + "rewards/margins": 1.9118257761001587, + "rewards/rejected": -2.0262629985809326, + "step": 1583 + }, + { + "epoch": 0.18, + "learning_rate": 2.489757696359592e-07, + "logits/chosen": -2.8361501693725586, + "logits/rejected": -3.033151149749756, + "logps/chosen": -205.57240295410156, + "logps/rejected": -195.34246826171875, + "loss": 0.4334, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1300015151500702, + "rewards/margins": 1.1216909885406494, + "rewards/rejected": -0.9916895031929016, + "step": 1584 + }, + { + "epoch": 0.18, + "learning_rate": 2.4894065316633503e-07, + "logits/chosen": -3.0783472061157227, + "logits/rejected": -2.981541395187378, + "logps/chosen": -463.94000244140625, + "logps/rejected": -295.4822692871094, + "loss": 0.2753, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05068501830101013, + "rewards/margins": 1.8550400733947754, + "rewards/rejected": -1.905725121498108, + "step": 1585 + }, + { + "epoch": 0.18, + "learning_rate": 2.489055366967108e-07, + "logits/chosen": -3.156273126602173, + "logits/rejected": -3.4516024589538574, + "logps/chosen": -339.36834716796875, + "logps/rejected": -225.49777221679688, + "loss": 0.341, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06556139141321182, + "rewards/margins": 1.5363905429840088, + "rewards/rejected": -1.4708291292190552, + "step": 1586 + }, + { + "epoch": 0.18, + "learning_rate": 2.488704202270865e-07, + "logits/chosen": -2.662827253341675, + "logits/rejected": -2.6474499702453613, + "logps/chosen": -272.6754150390625, + "logps/rejected": -264.35333251953125, + "loss": 0.5074, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8834242224693298, + "rewards/margins": 0.5176845192909241, + "rewards/rejected": 0.3657396137714386, + "step": 1587 + }, + { + "epoch": 0.18, + "learning_rate": 2.4883530375746224e-07, + "logits/chosen": -3.398542881011963, + "logits/rejected": -3.3827009201049805, + "logps/chosen": -369.9556579589844, + "logps/rejected": -328.391845703125, + "loss": 0.432, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17286589741706848, + "rewards/margins": 0.9410245418548584, + "rewards/rejected": -0.7681586742401123, + "step": 1588 + }, + { + "epoch": 0.18, + "learning_rate": 2.48800187287838e-07, + "logits/chosen": -3.1807615756988525, + "logits/rejected": -3.209909439086914, + "logps/chosen": -256.842529296875, + "logps/rejected": -231.21083068847656, + "loss": 0.3386, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3647618293762207, + "rewards/margins": 2.1404671669006348, + "rewards/rejected": -1.775705099105835, + "step": 1589 + }, + { + "epoch": 0.18, + "learning_rate": 2.4876507081821374e-07, + "logits/chosen": -3.210204839706421, + "logits/rejected": -3.1936206817626953, + "logps/chosen": -160.5758056640625, + "logps/rejected": -138.31202697753906, + "loss": 0.3473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6779247522354126, + "rewards/margins": 1.426550030708313, + "rewards/rejected": -0.7486251592636108, + "step": 1590 + }, + { + "epoch": 0.18, + "learning_rate": 2.487299543485895e-07, + "logits/chosen": -2.3738036155700684, + "logits/rejected": -2.468855142593384, + "logps/chosen": -379.32012939453125, + "logps/rejected": -332.70819091796875, + "loss": 0.4349, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32497167587280273, + "rewards/margins": 0.8997296094894409, + "rewards/rejected": -0.5747578740119934, + "step": 1591 + }, + { + "epoch": 0.18, + "learning_rate": 2.486948378789652e-07, + "logits/chosen": -3.562161684036255, + "logits/rejected": -3.0826175212860107, + "logps/chosen": -251.93624877929688, + "logps/rejected": -192.786865234375, + "loss": 0.5886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6212844848632812, + "rewards/margins": 0.7748706340789795, + "rewards/rejected": -1.3961551189422607, + "step": 1592 + }, + { + "epoch": 0.18, + "learning_rate": 2.4865972140934095e-07, + "logits/chosen": -2.4381299018859863, + "logits/rejected": -2.9638969898223877, + "logps/chosen": -302.59332275390625, + "logps/rejected": -299.6400451660156, + "loss": 0.3321, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3786783218383789, + "rewards/margins": 2.049281120300293, + "rewards/rejected": -1.670602798461914, + "step": 1593 + }, + { + "epoch": 0.18, + "learning_rate": 2.486246049397167e-07, + "logits/chosen": -2.9776320457458496, + "logits/rejected": -2.75177264213562, + "logps/chosen": -225.4220428466797, + "logps/rejected": -150.24493408203125, + "loss": 0.4986, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08923830091953278, + "rewards/margins": 0.8605561256408691, + "rewards/rejected": -0.9497944712638855, + "step": 1594 + }, + { + "epoch": 0.18, + "learning_rate": 2.4858948847009246e-07, + "logits/chosen": -2.8370304107666016, + "logits/rejected": -2.55305552482605, + "logps/chosen": -417.40911865234375, + "logps/rejected": -214.65200805664062, + "loss": 0.2867, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12078927457332611, + "rewards/margins": 1.4924275875091553, + "rewards/rejected": -1.613216757774353, + "step": 1595 + }, + { + "epoch": 0.18, + "learning_rate": 2.485543720004682e-07, + "logits/chosen": -2.8922176361083984, + "logits/rejected": -2.8392605781555176, + "logps/chosen": -179.53965759277344, + "logps/rejected": -193.11843872070312, + "loss": 0.6272, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3021852970123291, + "rewards/margins": 1.0617811679840088, + "rewards/rejected": -1.363966464996338, + "step": 1596 + }, + { + "epoch": 0.18, + "learning_rate": 2.485192555308439e-07, + "logits/chosen": -2.915086269378662, + "logits/rejected": -3.243215560913086, + "logps/chosen": -136.1141357421875, + "logps/rejected": -206.26516723632812, + "loss": 0.2515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.028124883770942688, + "rewards/margins": 1.6708011627197266, + "rewards/rejected": -1.6426763534545898, + "step": 1597 + }, + { + "epoch": 0.18, + "learning_rate": 2.484841390612197e-07, + "logits/chosen": -2.5279502868652344, + "logits/rejected": -2.3170130252838135, + "logps/chosen": -281.1407165527344, + "logps/rejected": -283.10491943359375, + "loss": 0.6542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5464170575141907, + "rewards/margins": 0.8834316730499268, + "rewards/rejected": -1.4298487901687622, + "step": 1598 + }, + { + "epoch": 0.18, + "learning_rate": 2.484490225915955e-07, + "logits/chosen": -3.3838205337524414, + "logits/rejected": -3.777003288269043, + "logps/chosen": -140.44464111328125, + "logps/rejected": -234.48382568359375, + "loss": 0.4196, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33067017793655396, + "rewards/margins": 1.441359281539917, + "rewards/rejected": -1.1106890439987183, + "step": 1599 + }, + { + "epoch": 0.18, + "learning_rate": 2.484139061219712e-07, + "logits/chosen": -2.4268360137939453, + "logits/rejected": -2.7971158027648926, + "logps/chosen": -474.1029052734375, + "logps/rejected": -311.8132019042969, + "loss": 0.4138, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.019878730177879333, + "rewards/margins": 1.0621018409729004, + "rewards/rejected": -1.0422230958938599, + "step": 1600 + }, + { + "epoch": 0.18, + "learning_rate": 2.4837878965234693e-07, + "logits/chosen": -2.8265023231506348, + "logits/rejected": -2.715358018875122, + "logps/chosen": -271.5411071777344, + "logps/rejected": -215.6396484375, + "loss": 0.6791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40635865926742554, + "rewards/margins": 0.5961260795593262, + "rewards/rejected": -1.0024847984313965, + "step": 1601 + }, + { + "epoch": 0.18, + "learning_rate": 2.483436731827227e-07, + "logits/chosen": -2.425666570663452, + "logits/rejected": -2.435767650604248, + "logps/chosen": -401.4701232910156, + "logps/rejected": -383.2276306152344, + "loss": 0.1583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.491716593503952, + "rewards/margins": 2.392782688140869, + "rewards/rejected": -1.9010661840438843, + "step": 1602 + }, + { + "epoch": 0.18, + "learning_rate": 2.4830855671309844e-07, + "logits/chosen": -3.0405819416046143, + "logits/rejected": -3.0422515869140625, + "logps/chosen": -208.3442840576172, + "logps/rejected": -192.49871826171875, + "loss": 0.3668, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30194300413131714, + "rewards/margins": 1.4936113357543945, + "rewards/rejected": -1.191668152809143, + "step": 1603 + }, + { + "epoch": 0.18, + "learning_rate": 2.482734402434742e-07, + "logits/chosen": -2.9584836959838867, + "logits/rejected": -2.805340051651001, + "logps/chosen": -303.8201904296875, + "logps/rejected": -221.5235595703125, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2331658899784088, + "rewards/margins": 1.1537485122680664, + "rewards/rejected": -0.92058265209198, + "step": 1604 + }, + { + "epoch": 0.19, + "learning_rate": 2.482383237738499e-07, + "logits/chosen": -2.7143802642822266, + "logits/rejected": -2.929252862930298, + "logps/chosen": -256.7149963378906, + "logps/rejected": -242.93460083007812, + "loss": 0.6775, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.491890549659729, + "rewards/margins": 0.7323378324508667, + "rewards/rejected": -1.2242283821105957, + "step": 1605 + }, + { + "epoch": 0.19, + "learning_rate": 2.4820320730422565e-07, + "logits/chosen": -3.0357983112335205, + "logits/rejected": -3.186969041824341, + "logps/chosen": -343.0533142089844, + "logps/rejected": -258.09930419921875, + "loss": 0.4322, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07278376817703247, + "rewards/margins": 0.7776103019714355, + "rewards/rejected": -0.8503940105438232, + "step": 1606 + }, + { + "epoch": 0.19, + "learning_rate": 2.4816809083460145e-07, + "logits/chosen": -2.9929580688476562, + "logits/rejected": -2.909958839416504, + "logps/chosen": -318.1580810546875, + "logps/rejected": -201.66400146484375, + "loss": 0.3284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11313905566930771, + "rewards/margins": 1.3388831615447998, + "rewards/rejected": -1.4520221948623657, + "step": 1607 + }, + { + "epoch": 0.19, + "learning_rate": 2.4813297436497715e-07, + "logits/chosen": -3.3802247047424316, + "logits/rejected": -3.0135278701782227, + "logps/chosen": -208.70184326171875, + "logps/rejected": -133.7020721435547, + "loss": 0.5389, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3157225549221039, + "rewards/margins": 0.6978271007537842, + "rewards/rejected": -1.0135496854782104, + "step": 1608 + }, + { + "epoch": 0.19, + "learning_rate": 2.480978578953529e-07, + "logits/chosen": -3.0474600791931152, + "logits/rejected": -3.0875656604766846, + "logps/chosen": -204.23968505859375, + "logps/rejected": -286.5731201171875, + "loss": 0.1808, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3811301290988922, + "rewards/margins": 2.51344633102417, + "rewards/rejected": -2.1323161125183105, + "step": 1609 + }, + { + "epoch": 0.19, + "learning_rate": 2.4806274142572866e-07, + "logits/chosen": -3.1642093658447266, + "logits/rejected": -3.080878257751465, + "logps/chosen": -285.5237121582031, + "logps/rejected": -211.30711364746094, + "loss": 0.4949, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16524535417556763, + "rewards/margins": 1.6269410848617554, + "rewards/rejected": -1.461695671081543, + "step": 1610 + }, + { + "epoch": 0.19, + "learning_rate": 2.480276249561044e-07, + "logits/chosen": -2.4304065704345703, + "logits/rejected": -2.5123777389526367, + "logps/chosen": -356.9730529785156, + "logps/rejected": -323.14324951171875, + "loss": 0.7664, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04421408474445343, + "rewards/margins": 0.09974081814289093, + "rewards/rejected": -0.14395490288734436, + "step": 1611 + }, + { + "epoch": 0.19, + "learning_rate": 2.4799250848648017e-07, + "logits/chosen": -3.4611644744873047, + "logits/rejected": -3.5673651695251465, + "logps/chosen": -200.72555541992188, + "logps/rejected": -238.0050811767578, + "loss": 0.1976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.037610769271850586, + "rewards/margins": 1.9399579763412476, + "rewards/rejected": -1.9775688648223877, + "step": 1612 + }, + { + "epoch": 0.19, + "learning_rate": 2.4795739201685587e-07, + "logits/chosen": -2.1919658184051514, + "logits/rejected": -2.4560458660125732, + "logps/chosen": -412.3431701660156, + "logps/rejected": -345.5562744140625, + "loss": 0.4905, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0391206294298172, + "rewards/margins": 0.879487156867981, + "rewards/rejected": -0.8403664827346802, + "step": 1613 + }, + { + "epoch": 0.19, + "learning_rate": 2.479222755472316e-07, + "logits/chosen": -3.2937936782836914, + "logits/rejected": -3.07669997215271, + "logps/chosen": -165.2415008544922, + "logps/rejected": -90.95587158203125, + "loss": 0.7672, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4400079846382141, + "rewards/margins": 0.053300559520721436, + "rewards/rejected": -0.49330854415893555, + "step": 1614 + }, + { + "epoch": 0.19, + "learning_rate": 2.478871590776074e-07, + "logits/chosen": -3.26701283454895, + "logits/rejected": -3.3003578186035156, + "logps/chosen": -262.5455322265625, + "logps/rejected": -307.493408203125, + "loss": 0.2377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2662431597709656, + "rewards/margins": 1.801184892654419, + "rewards/rejected": -1.5349416732788086, + "step": 1615 + }, + { + "epoch": 0.19, + "learning_rate": 2.4785204260798313e-07, + "logits/chosen": -3.234891891479492, + "logits/rejected": -3.027536630630493, + "logps/chosen": -121.38128662109375, + "logps/rejected": -157.8800506591797, + "loss": 1.0565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5487164258956909, + "rewards/margins": 0.36075013875961304, + "rewards/rejected": -0.9094666242599487, + "step": 1616 + }, + { + "epoch": 0.19, + "learning_rate": 2.478169261383589e-07, + "logits/chosen": -3.284700393676758, + "logits/rejected": -3.5850830078125, + "logps/chosen": -282.3678894042969, + "logps/rejected": -434.3486328125, + "loss": 0.2356, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47401413321495056, + "rewards/margins": 2.741239070892334, + "rewards/rejected": -3.2152533531188965, + "step": 1617 + }, + { + "epoch": 0.19, + "learning_rate": 2.477818096687346e-07, + "logits/chosen": -3.302093029022217, + "logits/rejected": -3.140270709991455, + "logps/chosen": -431.5946960449219, + "logps/rejected": -240.39224243164062, + "loss": 0.5559, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08787223696708679, + "rewards/margins": 2.0257503986358643, + "rewards/rejected": -1.937877893447876, + "step": 1618 + }, + { + "epoch": 0.19, + "learning_rate": 2.477466931991104e-07, + "logits/chosen": -3.3976333141326904, + "logits/rejected": -3.103829860687256, + "logps/chosen": -361.5479736328125, + "logps/rejected": -147.52084350585938, + "loss": 0.7588, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7584477663040161, + "rewards/margins": 0.8833099603652954, + "rewards/rejected": -1.641757607460022, + "step": 1619 + }, + { + "epoch": 0.19, + "learning_rate": 2.4771157672948615e-07, + "logits/chosen": -2.461674451828003, + "logits/rejected": -2.426726818084717, + "logps/chosen": -335.3467712402344, + "logps/rejected": -316.70782470703125, + "loss": 0.3147, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4429143965244293, + "rewards/margins": 1.821640968322754, + "rewards/rejected": -1.3787267208099365, + "step": 1620 + }, + { + "epoch": 0.19, + "learning_rate": 2.4767646025986185e-07, + "logits/chosen": -3.7233800888061523, + "logits/rejected": -3.445781707763672, + "logps/chosen": -326.6361389160156, + "logps/rejected": -241.48277282714844, + "loss": 0.2796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06145267188549042, + "rewards/margins": 1.5817797183990479, + "rewards/rejected": -1.6432322263717651, + "step": 1621 + }, + { + "epoch": 0.19, + "learning_rate": 2.476413437902376e-07, + "logits/chosen": -3.1440072059631348, + "logits/rejected": -2.9725191593170166, + "logps/chosen": -344.57000732421875, + "logps/rejected": -251.28086853027344, + "loss": 0.3917, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05121784657239914, + "rewards/margins": 0.8979679346084595, + "rewards/rejected": -0.9491857886314392, + "step": 1622 + }, + { + "epoch": 0.19, + "learning_rate": 2.4760622732061336e-07, + "logits/chosen": -3.5760037899017334, + "logits/rejected": -3.423732280731201, + "logps/chosen": -229.76626586914062, + "logps/rejected": -239.6583251953125, + "loss": 0.2293, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24821406602859497, + "rewards/margins": 2.1366474628448486, + "rewards/rejected": -1.8884334564208984, + "step": 1623 + }, + { + "epoch": 0.19, + "learning_rate": 2.475711108509891e-07, + "logits/chosen": -3.2058005332946777, + "logits/rejected": -3.03995943069458, + "logps/chosen": -243.10418701171875, + "logps/rejected": -143.76434326171875, + "loss": 0.9873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8367437124252319, + "rewards/margins": 0.39725565910339355, + "rewards/rejected": -1.2339993715286255, + "step": 1624 + }, + { + "epoch": 0.19, + "learning_rate": 2.4753599438136486e-07, + "logits/chosen": -3.5770716667175293, + "logits/rejected": -3.1531360149383545, + "logps/chosen": -275.5059814453125, + "logps/rejected": -303.7064208984375, + "loss": 0.1833, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35189539194107056, + "rewards/margins": 2.042738914489746, + "rewards/rejected": -1.6908434629440308, + "step": 1625 + }, + { + "epoch": 0.19, + "learning_rate": 2.4750087791174056e-07, + "logits/chosen": -3.15144681930542, + "logits/rejected": -3.310819149017334, + "logps/chosen": -241.8475341796875, + "logps/rejected": -401.38739013671875, + "loss": 0.4702, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6157566905021667, + "rewards/margins": 0.8701713681221008, + "rewards/rejected": -1.4859280586242676, + "step": 1626 + }, + { + "epoch": 0.19, + "learning_rate": 2.474657614421163e-07, + "logits/chosen": -2.487565517425537, + "logits/rejected": -2.5232656002044678, + "logps/chosen": -169.6397705078125, + "logps/rejected": -403.33953857421875, + "loss": 0.4673, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20190362632274628, + "rewards/margins": 0.8333655595779419, + "rewards/rejected": -0.6314619183540344, + "step": 1627 + }, + { + "epoch": 0.19, + "learning_rate": 2.4743064497249207e-07, + "logits/chosen": -3.3822743892669678, + "logits/rejected": -3.4034886360168457, + "logps/chosen": -266.6377258300781, + "logps/rejected": -218.320556640625, + "loss": 0.3814, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02176293544471264, + "rewards/margins": 1.3964648246765137, + "rewards/rejected": -1.374701976776123, + "step": 1628 + }, + { + "epoch": 0.19, + "learning_rate": 2.473955285028678e-07, + "logits/chosen": -2.823197603225708, + "logits/rejected": -2.691889762878418, + "logps/chosen": -130.83242797851562, + "logps/rejected": -304.0140380859375, + "loss": 0.309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12175226211547852, + "rewards/margins": 1.6047595739364624, + "rewards/rejected": -1.7265119552612305, + "step": 1629 + }, + { + "epoch": 0.19, + "learning_rate": 2.473604120332436e-07, + "logits/chosen": -3.1767563819885254, + "logits/rejected": -2.942979335784912, + "logps/chosen": -255.70855712890625, + "logps/rejected": -243.71224975585938, + "loss": 0.3013, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18955880403518677, + "rewards/margins": 2.641592025756836, + "rewards/rejected": -2.452033281326294, + "step": 1630 + }, + { + "epoch": 0.19, + "learning_rate": 2.4732529556361933e-07, + "logits/chosen": -2.874816417694092, + "logits/rejected": -3.161055088043213, + "logps/chosen": -351.501953125, + "logps/rejected": -309.83587646484375, + "loss": 0.3744, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5034058690071106, + "rewards/margins": 1.476503610610962, + "rewards/rejected": -0.9730978012084961, + "step": 1631 + }, + { + "epoch": 0.19, + "learning_rate": 2.472901790939951e-07, + "logits/chosen": -2.97882080078125, + "logits/rejected": -2.8736541271209717, + "logps/chosen": -361.94427490234375, + "logps/rejected": -316.7669372558594, + "loss": 0.3838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3193373680114746, + "rewards/margins": 0.9276111125946045, + "rewards/rejected": -1.246948480606079, + "step": 1632 + }, + { + "epoch": 0.19, + "learning_rate": 2.4725506262437084e-07, + "logits/chosen": -3.559875011444092, + "logits/rejected": -2.8241186141967773, + "logps/chosen": -384.7473449707031, + "logps/rejected": -192.31300354003906, + "loss": 1.1148, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.364538550376892, + "rewards/margins": -0.20309188961982727, + "rewards/rejected": -1.1614465713500977, + "step": 1633 + }, + { + "epoch": 0.19, + "learning_rate": 2.4721994615474654e-07, + "logits/chosen": -3.332775115966797, + "logits/rejected": -3.0801808834075928, + "logps/chosen": -124.4903793334961, + "logps/rejected": -156.7494354248047, + "loss": 0.376, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1827080249786377, + "rewards/margins": 1.346949815750122, + "rewards/rejected": -1.1642416715621948, + "step": 1634 + }, + { + "epoch": 0.19, + "learning_rate": 2.471848296851223e-07, + "logits/chosen": -3.0425779819488525, + "logits/rejected": -3.290282726287842, + "logps/chosen": -132.42999267578125, + "logps/rejected": -190.01869201660156, + "loss": 0.3994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3373163342475891, + "rewards/margins": 1.3747307062149048, + "rewards/rejected": -1.7120471000671387, + "step": 1635 + }, + { + "epoch": 0.19, + "learning_rate": 2.4714971321549805e-07, + "logits/chosen": -2.9455347061157227, + "logits/rejected": -3.003751516342163, + "logps/chosen": -214.91864013671875, + "logps/rejected": -134.51930236816406, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.111922986805439, + "rewards/margins": 0.8513867259025574, + "rewards/rejected": -0.7394638061523438, + "step": 1636 + }, + { + "epoch": 0.19, + "learning_rate": 2.471145967458738e-07, + "logits/chosen": -2.219520330429077, + "logits/rejected": -2.2625210285186768, + "logps/chosen": -683.4371337890625, + "logps/rejected": -396.0511474609375, + "loss": 0.3544, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44750234484672546, + "rewards/margins": 1.6360344886779785, + "rewards/rejected": -1.1885322332382202, + "step": 1637 + }, + { + "epoch": 0.19, + "learning_rate": 2.4707948027624956e-07, + "logits/chosen": -2.7788584232330322, + "logits/rejected": -2.84614896774292, + "logps/chosen": -158.1094207763672, + "logps/rejected": -207.4463653564453, + "loss": 0.3494, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20430941879749298, + "rewards/margins": 1.6551119089126587, + "rewards/rejected": -1.4508025646209717, + "step": 1638 + }, + { + "epoch": 0.19, + "learning_rate": 2.470443638066253e-07, + "logits/chosen": -3.3876399993896484, + "logits/rejected": -3.3464629650115967, + "logps/chosen": -280.7225036621094, + "logps/rejected": -251.35427856445312, + "loss": 0.3093, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08294618874788284, + "rewards/margins": 1.8079230785369873, + "rewards/rejected": -1.7249767780303955, + "step": 1639 + }, + { + "epoch": 0.19, + "learning_rate": 2.47009247337001e-07, + "logits/chosen": -3.1355056762695312, + "logits/rejected": -2.62129545211792, + "logps/chosen": -331.9334411621094, + "logps/rejected": -186.84597778320312, + "loss": 0.3948, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15465782582759857, + "rewards/margins": 1.5128577947616577, + "rewards/rejected": -1.3581998348236084, + "step": 1640 + }, + { + "epoch": 0.19, + "learning_rate": 2.469741308673768e-07, + "logits/chosen": -3.623434066772461, + "logits/rejected": -3.3550500869750977, + "logps/chosen": -249.16636657714844, + "logps/rejected": -232.155517578125, + "loss": 0.328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09672051668167114, + "rewards/margins": 1.829206943511963, + "rewards/rejected": -1.9259274005889893, + "step": 1641 + }, + { + "epoch": 0.19, + "learning_rate": 2.469390143977525e-07, + "logits/chosen": -3.4303834438323975, + "logits/rejected": -3.0111262798309326, + "logps/chosen": -236.13787841796875, + "logps/rejected": -266.9262390136719, + "loss": 0.8809, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5209572315216064, + "rewards/margins": -0.024635910987854004, + "rewards/rejected": -0.4963212311267853, + "step": 1642 + }, + { + "epoch": 0.19, + "learning_rate": 2.469038979281283e-07, + "logits/chosen": -2.565387725830078, + "logits/rejected": -2.8425614833831787, + "logps/chosen": -409.16015625, + "logps/rejected": -297.3311462402344, + "loss": 0.5821, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21906308829784393, + "rewards/margins": 0.6376460790634155, + "rewards/rejected": -0.8567091822624207, + "step": 1643 + }, + { + "epoch": 0.19, + "learning_rate": 2.4686878145850403e-07, + "logits/chosen": -3.0870227813720703, + "logits/rejected": -2.839993476867676, + "logps/chosen": -328.0035400390625, + "logps/rejected": -280.96343994140625, + "loss": 0.8185, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7000449299812317, + "rewards/margins": -0.14338122308254242, + "rewards/rejected": -0.5566637516021729, + "step": 1644 + }, + { + "epoch": 0.19, + "learning_rate": 2.468336649888798e-07, + "logits/chosen": -2.892493724822998, + "logits/rejected": -3.0386791229248047, + "logps/chosen": -384.6796875, + "logps/rejected": -214.46194458007812, + "loss": 0.5413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2288389652967453, + "rewards/margins": 0.7741386890411377, + "rewards/rejected": -1.002977728843689, + "step": 1645 + }, + { + "epoch": 0.19, + "learning_rate": 2.4679854851925554e-07, + "logits/chosen": -3.484879970550537, + "logits/rejected": -3.3117287158966064, + "logps/chosen": -94.59874725341797, + "logps/rejected": -121.7508544921875, + "loss": 0.5459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32910534739494324, + "rewards/margins": 0.854915976524353, + "rewards/rejected": -1.184021234512329, + "step": 1646 + }, + { + "epoch": 0.19, + "learning_rate": 2.467634320496313e-07, + "logits/chosen": -2.977233409881592, + "logits/rejected": -2.946171283721924, + "logps/chosen": -269.04193115234375, + "logps/rejected": -274.8897705078125, + "loss": 0.5668, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020627934485673904, + "rewards/margins": 0.8076647520065308, + "rewards/rejected": -0.7870368361473083, + "step": 1647 + }, + { + "epoch": 0.19, + "learning_rate": 2.46728315580007e-07, + "logits/chosen": -2.4336202144622803, + "logits/rejected": -2.4982075691223145, + "logps/chosen": -405.12652587890625, + "logps/rejected": -240.6546173095703, + "loss": 0.5605, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09702956676483154, + "rewards/margins": 0.6522217392921448, + "rewards/rejected": -0.555192232131958, + "step": 1648 + }, + { + "epoch": 0.19, + "learning_rate": 2.4669319911038274e-07, + "logits/chosen": -2.708063840866089, + "logits/rejected": -2.780207633972168, + "logps/chosen": -476.9808349609375, + "logps/rejected": -361.608154296875, + "loss": 0.3275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2493128478527069, + "rewards/margins": 2.3930797576904297, + "rewards/rejected": -2.1437668800354004, + "step": 1649 + }, + { + "epoch": 0.19, + "learning_rate": 2.466580826407585e-07, + "logits/chosen": -3.2944164276123047, + "logits/rejected": -2.7720751762390137, + "logps/chosen": -247.1857452392578, + "logps/rejected": -195.83348083496094, + "loss": 0.5702, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16384786367416382, + "rewards/margins": 0.4309402406215668, + "rewards/rejected": -0.594788134098053, + "step": 1650 + }, + { + "epoch": 0.19, + "learning_rate": 2.4662296617113425e-07, + "logits/chosen": -3.4813716411590576, + "logits/rejected": -3.4087066650390625, + "logps/chosen": -381.19873046875, + "logps/rejected": -309.81585693359375, + "loss": 0.376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01619626022875309, + "rewards/margins": 1.618360996246338, + "rewards/rejected": -1.6345571279525757, + "step": 1651 + }, + { + "epoch": 0.19, + "learning_rate": 2.4658784970151e-07, + "logits/chosen": -2.406052827835083, + "logits/rejected": -2.5095372200012207, + "logps/chosen": -281.9344482421875, + "logps/rejected": -227.5969696044922, + "loss": 0.5148, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1824236363172531, + "rewards/margins": 0.706309974193573, + "rewards/rejected": -0.5238863229751587, + "step": 1652 + }, + { + "epoch": 0.19, + "learning_rate": 2.4655273323188576e-07, + "logits/chosen": -3.4611401557922363, + "logits/rejected": -3.5146594047546387, + "logps/chosen": -179.00900268554688, + "logps/rejected": -212.01885986328125, + "loss": 0.5409, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22202692925930023, + "rewards/margins": 1.269743800163269, + "rewards/rejected": -1.4917707443237305, + "step": 1653 + }, + { + "epoch": 0.19, + "learning_rate": 2.465176167622615e-07, + "logits/chosen": -2.8392858505249023, + "logits/rejected": -3.058464765548706, + "logps/chosen": -178.03025817871094, + "logps/rejected": -181.70912170410156, + "loss": 0.4875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15759813785552979, + "rewards/margins": 1.230353832244873, + "rewards/rejected": -1.3879520893096924, + "step": 1654 + }, + { + "epoch": 0.19, + "learning_rate": 2.4648250029263727e-07, + "logits/chosen": -3.1389269828796387, + "logits/rejected": -3.3271584510803223, + "logps/chosen": -234.2293701171875, + "logps/rejected": -287.00726318359375, + "loss": 0.3277, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035910964012145996, + "rewards/margins": 1.3347375392913818, + "rewards/rejected": -1.2988265752792358, + "step": 1655 + }, + { + "epoch": 0.19, + "learning_rate": 2.4644738382301297e-07, + "logits/chosen": -2.915689706802368, + "logits/rejected": -2.8571135997772217, + "logps/chosen": -208.79710388183594, + "logps/rejected": -190.15957641601562, + "loss": 0.7201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18519210815429688, + "rewards/margins": 0.19241377711296082, + "rewards/rejected": -0.3776058554649353, + "step": 1656 + }, + { + "epoch": 0.19, + "learning_rate": 2.464122673533887e-07, + "logits/chosen": -3.790785312652588, + "logits/rejected": -3.59560489654541, + "logps/chosen": -170.98670959472656, + "logps/rejected": -183.72364807128906, + "loss": 0.3532, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1265781819820404, + "rewards/margins": 1.4742681980133057, + "rewards/rejected": -1.3476901054382324, + "step": 1657 + }, + { + "epoch": 0.19, + "learning_rate": 2.463771508837645e-07, + "logits/chosen": -3.4610581398010254, + "logits/rejected": -3.166311502456665, + "logps/chosen": -238.10324096679688, + "logps/rejected": -184.92642211914062, + "loss": 0.3836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35447806119918823, + "rewards/margins": 0.9791409969329834, + "rewards/rejected": -1.3336191177368164, + "step": 1658 + }, + { + "epoch": 0.19, + "learning_rate": 2.4634203441414023e-07, + "logits/chosen": -3.5480051040649414, + "logits/rejected": -3.7581186294555664, + "logps/chosen": -177.24977111816406, + "logps/rejected": -183.33998107910156, + "loss": 0.2825, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17221404612064362, + "rewards/margins": 1.644298791885376, + "rewards/rejected": -1.4720847606658936, + "step": 1659 + }, + { + "epoch": 0.19, + "learning_rate": 2.46306917944516e-07, + "logits/chosen": -3.974099636077881, + "logits/rejected": -3.930417537689209, + "logps/chosen": -110.23602294921875, + "logps/rejected": -145.34014892578125, + "loss": 0.5658, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3136996626853943, + "rewards/margins": 1.275432825088501, + "rewards/rejected": -0.9617331624031067, + "step": 1660 + }, + { + "epoch": 0.19, + "learning_rate": 2.462718014748917e-07, + "logits/chosen": -2.651881694793701, + "logits/rejected": -2.5829646587371826, + "logps/chosen": -360.2419738769531, + "logps/rejected": -173.4150390625, + "loss": 0.6656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13943196833133698, + "rewards/margins": 0.27662673592567444, + "rewards/rejected": -0.4160586893558502, + "step": 1661 + }, + { + "epoch": 0.19, + "learning_rate": 2.4623668500526744e-07, + "logits/chosen": -3.396226167678833, + "logits/rejected": -3.121123790740967, + "logps/chosen": -244.54550170898438, + "logps/rejected": -232.8888702392578, + "loss": 0.5882, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10487784445285797, + "rewards/margins": 0.7730016708374023, + "rewards/rejected": -0.6681237816810608, + "step": 1662 + }, + { + "epoch": 0.19, + "learning_rate": 2.462015685356432e-07, + "logits/chosen": -3.088339328765869, + "logits/rejected": -3.0391621589660645, + "logps/chosen": -164.5384979248047, + "logps/rejected": -203.97634887695312, + "loss": 0.4186, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3430846035480499, + "rewards/margins": 1.0082159042358398, + "rewards/rejected": -1.3513004779815674, + "step": 1663 + }, + { + "epoch": 0.19, + "learning_rate": 2.4616645206601895e-07, + "logits/chosen": -2.111482858657837, + "logits/rejected": -2.158494472503662, + "logps/chosen": -326.66357421875, + "logps/rejected": -313.67120361328125, + "loss": 0.2229, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2633962333202362, + "rewards/margins": 1.6975619792938232, + "rewards/rejected": -1.4341658353805542, + "step": 1664 + }, + { + "epoch": 0.19, + "learning_rate": 2.461313355963947e-07, + "logits/chosen": -3.597100019454956, + "logits/rejected": -3.314906120300293, + "logps/chosen": -319.1592102050781, + "logps/rejected": -219.0665283203125, + "loss": 0.4606, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4545048773288727, + "rewards/margins": 1.2645498514175415, + "rewards/rejected": -0.8100448250770569, + "step": 1665 + }, + { + "epoch": 0.19, + "learning_rate": 2.4609621912677045e-07, + "logits/chosen": -2.8511533737182617, + "logits/rejected": -2.468804121017456, + "logps/chosen": -193.9290008544922, + "logps/rejected": -281.9854431152344, + "loss": 0.3209, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08376835286617279, + "rewards/margins": 1.9418946504592896, + "rewards/rejected": -1.8581262826919556, + "step": 1666 + }, + { + "epoch": 0.19, + "learning_rate": 2.460611026571462e-07, + "logits/chosen": -3.6707606315612793, + "logits/rejected": -3.9513099193573, + "logps/chosen": -247.72445678710938, + "logps/rejected": -239.5735626220703, + "loss": 0.2644, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.054384276270866394, + "rewards/margins": 1.82422935962677, + "rewards/rejected": -1.7698450088500977, + "step": 1667 + }, + { + "epoch": 0.19, + "learning_rate": 2.4602598618752196e-07, + "logits/chosen": -2.993516683578491, + "logits/rejected": -3.0110347270965576, + "logps/chosen": -246.90931701660156, + "logps/rejected": -226.88943481445312, + "loss": 0.3784, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09634879231452942, + "rewards/margins": 0.9972802400588989, + "rewards/rejected": -0.9009314775466919, + "step": 1668 + }, + { + "epoch": 0.19, + "learning_rate": 2.4599086971789766e-07, + "logits/chosen": -2.969608783721924, + "logits/rejected": -2.9063379764556885, + "logps/chosen": -429.904541015625, + "logps/rejected": -353.58367919921875, + "loss": 0.523, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24493397772312164, + "rewards/margins": 0.780466616153717, + "rewards/rejected": -0.5355326533317566, + "step": 1669 + }, + { + "epoch": 0.19, + "learning_rate": 2.459557532482734e-07, + "logits/chosen": -3.3365840911865234, + "logits/rejected": -2.877293348312378, + "logps/chosen": -253.65997314453125, + "logps/rejected": -122.17088317871094, + "loss": 0.395, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2868715524673462, + "rewards/margins": 1.0084301233291626, + "rewards/rejected": -0.7215585708618164, + "step": 1670 + }, + { + "epoch": 0.19, + "learning_rate": 2.4592063677864917e-07, + "logits/chosen": -3.4404802322387695, + "logits/rejected": -2.9636712074279785, + "logps/chosen": -287.96759033203125, + "logps/rejected": -350.8348083496094, + "loss": 0.2991, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31366777420043945, + "rewards/margins": 1.5818394422531128, + "rewards/rejected": -1.268171787261963, + "step": 1671 + }, + { + "epoch": 0.19, + "learning_rate": 2.458855203090249e-07, + "logits/chosen": -2.742400884628296, + "logits/rejected": -2.623107433319092, + "logps/chosen": -309.1057434082031, + "logps/rejected": -261.9565124511719, + "loss": 0.3543, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03688700497150421, + "rewards/margins": 1.1435816287994385, + "rewards/rejected": -1.1804687976837158, + "step": 1672 + }, + { + "epoch": 0.19, + "learning_rate": 2.458504038394007e-07, + "logits/chosen": -3.473637819290161, + "logits/rejected": -3.3117377758026123, + "logps/chosen": -359.1307678222656, + "logps/rejected": -293.23370361328125, + "loss": 0.3229, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12932097911834717, + "rewards/margins": 1.6799559593200684, + "rewards/rejected": -1.8092769384384155, + "step": 1673 + }, + { + "epoch": 0.19, + "learning_rate": 2.458152873697764e-07, + "logits/chosen": -2.7435302734375, + "logits/rejected": -2.567420721054077, + "logps/chosen": -303.9747009277344, + "logps/rejected": -349.2685241699219, + "loss": 0.2237, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05257215350866318, + "rewards/margins": 2.1518969535827637, + "rewards/rejected": -2.0993247032165527, + "step": 1674 + }, + { + "epoch": 0.19, + "learning_rate": 2.457801709001522e-07, + "logits/chosen": -3.03609561920166, + "logits/rejected": -3.044161558151245, + "logps/chosen": -300.92315673828125, + "logps/rejected": -237.2432861328125, + "loss": 0.5763, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5415917038917542, + "rewards/margins": 0.6007184982299805, + "rewards/rejected": -1.1423102617263794, + "step": 1675 + }, + { + "epoch": 0.19, + "learning_rate": 2.4574505443052794e-07, + "logits/chosen": -3.8496711254119873, + "logits/rejected": -3.8292016983032227, + "logps/chosen": -229.2249755859375, + "logps/rejected": -220.6937255859375, + "loss": 0.1224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7747334241867065, + "rewards/margins": 2.4123823642730713, + "rewards/rejected": -1.6376488208770752, + "step": 1676 + }, + { + "epoch": 0.19, + "learning_rate": 2.4570993796090364e-07, + "logits/chosen": -3.068362236022949, + "logits/rejected": -3.0622189044952393, + "logps/chosen": -218.0693359375, + "logps/rejected": -265.5788269042969, + "loss": 0.461, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35873308777809143, + "rewards/margins": 1.80496346950531, + "rewards/rejected": -1.446230411529541, + "step": 1677 + }, + { + "epoch": 0.19, + "learning_rate": 2.456748214912794e-07, + "logits/chosen": -3.4159510135650635, + "logits/rejected": -3.345960855484009, + "logps/chosen": -264.47808837890625, + "logps/rejected": -382.75909423828125, + "loss": 0.584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06448528170585632, + "rewards/margins": 1.444566011428833, + "rewards/rejected": -1.3800805807113647, + "step": 1678 + }, + { + "epoch": 0.19, + "learning_rate": 2.4563970502165515e-07, + "logits/chosen": -2.7105515003204346, + "logits/rejected": -2.5486345291137695, + "logps/chosen": -242.34515380859375, + "logps/rejected": -214.1011505126953, + "loss": 0.7554, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8463466763496399, + "rewards/margins": -0.007047310471534729, + "rewards/rejected": -0.8392993807792664, + "step": 1679 + }, + { + "epoch": 0.19, + "learning_rate": 2.456045885520309e-07, + "logits/chosen": -3.4149301052093506, + "logits/rejected": -3.4928789138793945, + "logps/chosen": -435.8739013671875, + "logps/rejected": -348.6937255859375, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26740455627441406, + "rewards/margins": 1.4628195762634277, + "rewards/rejected": -1.7302241325378418, + "step": 1680 + }, + { + "epoch": 0.19, + "learning_rate": 2.4556947208240666e-07, + "logits/chosen": -3.772813558578491, + "logits/rejected": -3.413177490234375, + "logps/chosen": -217.1726837158203, + "logps/rejected": -241.602294921875, + "loss": 0.3121, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10687769949436188, + "rewards/margins": 1.5374398231506348, + "rewards/rejected": -1.4305620193481445, + "step": 1681 + }, + { + "epoch": 0.19, + "learning_rate": 2.4553435561278236e-07, + "logits/chosen": -2.9016683101654053, + "logits/rejected": -3.014634370803833, + "logps/chosen": -204.0310516357422, + "logps/rejected": -267.7081298828125, + "loss": 0.8202, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09997561573982239, + "rewards/margins": 0.43439704179763794, + "rewards/rejected": -0.33442142605781555, + "step": 1682 + }, + { + "epoch": 0.19, + "learning_rate": 2.454992391431581e-07, + "logits/chosen": -3.5605297088623047, + "logits/rejected": -3.615485191345215, + "logps/chosen": -351.8419494628906, + "logps/rejected": -258.89398193359375, + "loss": 0.4648, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14443156123161316, + "rewards/margins": 1.2567256689071655, + "rewards/rejected": -1.1122941970825195, + "step": 1683 + }, + { + "epoch": 0.19, + "learning_rate": 2.4546412267353386e-07, + "logits/chosen": -2.4477345943450928, + "logits/rejected": -2.362621307373047, + "logps/chosen": -360.66473388671875, + "logps/rejected": -328.2407531738281, + "loss": 0.2151, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4126198887825012, + "rewards/margins": 2.459287166595459, + "rewards/rejected": -2.0466673374176025, + "step": 1684 + }, + { + "epoch": 0.19, + "learning_rate": 2.454290062039096e-07, + "logits/chosen": -2.911421775817871, + "logits/rejected": -2.9039158821105957, + "logps/chosen": -290.2357482910156, + "logps/rejected": -322.86077880859375, + "loss": 0.4581, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45506545901298523, + "rewards/margins": 1.1540563106536865, + "rewards/rejected": -0.6989908218383789, + "step": 1685 + }, + { + "epoch": 0.19, + "learning_rate": 2.4539388973428537e-07, + "logits/chosen": -3.6172256469726562, + "logits/rejected": -3.466050863265991, + "logps/chosen": -274.277587890625, + "logps/rejected": -239.30740356445312, + "loss": 0.3698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17416085302829742, + "rewards/margins": 1.41618013381958, + "rewards/rejected": -1.5903409719467163, + "step": 1686 + }, + { + "epoch": 0.19, + "learning_rate": 2.453587732646611e-07, + "logits/chosen": -3.0522522926330566, + "logits/rejected": -2.8346526622772217, + "logps/chosen": -237.89743041992188, + "logps/rejected": -269.82427978515625, + "loss": 0.1814, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2388889491558075, + "rewards/margins": 2.0487163066864014, + "rewards/rejected": -1.809827446937561, + "step": 1687 + }, + { + "epoch": 0.19, + "learning_rate": 2.453236567950369e-07, + "logits/chosen": -3.385484457015991, + "logits/rejected": -3.126699447631836, + "logps/chosen": -373.0326843261719, + "logps/rejected": -381.6529541015625, + "loss": 0.4801, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25851303339004517, + "rewards/margins": 0.924397349357605, + "rewards/rejected": -0.665884256362915, + "step": 1688 + }, + { + "epoch": 0.19, + "learning_rate": 2.4528854032541263e-07, + "logits/chosen": -2.7377686500549316, + "logits/rejected": -3.031897783279419, + "logps/chosen": -166.29563903808594, + "logps/rejected": -186.349853515625, + "loss": 0.3233, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19604967534542084, + "rewards/margins": 1.9597545862197876, + "rewards/rejected": -1.7637050151824951, + "step": 1689 + }, + { + "epoch": 0.19, + "learning_rate": 2.4525342385578833e-07, + "logits/chosen": -2.6053786277770996, + "logits/rejected": -2.7166953086853027, + "logps/chosen": -226.43463134765625, + "logps/rejected": -382.5266418457031, + "loss": 0.5322, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12152568250894547, + "rewards/margins": 1.2979834079742432, + "rewards/rejected": -1.1764576435089111, + "step": 1690 + }, + { + "epoch": 0.19, + "learning_rate": 2.452183073861641e-07, + "logits/chosen": -2.9455792903900146, + "logits/rejected": -3.1394710540771484, + "logps/chosen": -270.2583312988281, + "logps/rejected": -280.0306091308594, + "loss": 0.287, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41307783126831055, + "rewards/margins": 2.348109245300293, + "rewards/rejected": -1.9350314140319824, + "step": 1691 + }, + { + "epoch": 0.2, + "learning_rate": 2.4518319091653984e-07, + "logits/chosen": -3.4704463481903076, + "logits/rejected": -3.4533281326293945, + "logps/chosen": -146.36410522460938, + "logps/rejected": -296.54022216796875, + "loss": 0.3781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3706258535385132, + "rewards/margins": 1.2620245218276978, + "rewards/rejected": -1.6326501369476318, + "step": 1692 + }, + { + "epoch": 0.2, + "learning_rate": 2.451480744469156e-07, + "logits/chosen": -3.0328855514526367, + "logits/rejected": -3.118960380554199, + "logps/chosen": -194.57818603515625, + "logps/rejected": -155.09303283691406, + "loss": 0.5282, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05055350065231323, + "rewards/margins": 0.6075702905654907, + "rewards/rejected": -0.5570167899131775, + "step": 1693 + }, + { + "epoch": 0.2, + "learning_rate": 2.4511295797729135e-07, + "logits/chosen": -3.051926851272583, + "logits/rejected": -2.7508809566497803, + "logps/chosen": -231.03656005859375, + "logps/rejected": -224.9603729248047, + "loss": 0.4777, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32188910245895386, + "rewards/margins": 1.3823513984680176, + "rewards/rejected": -1.7042404413223267, + "step": 1694 + }, + { + "epoch": 0.2, + "learning_rate": 2.4507784150766705e-07, + "logits/chosen": -3.5282094478607178, + "logits/rejected": -3.548593759536743, + "logps/chosen": -113.14512634277344, + "logps/rejected": -176.37330627441406, + "loss": 0.7163, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07644618302583694, + "rewards/margins": 0.49860531091690063, + "rewards/rejected": -0.5750515460968018, + "step": 1695 + }, + { + "epoch": 0.2, + "learning_rate": 2.450427250380428e-07, + "logits/chosen": -3.172929286956787, + "logits/rejected": -3.1298131942749023, + "logps/chosen": -335.30352783203125, + "logps/rejected": -267.81561279296875, + "loss": 0.2931, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6214680075645447, + "rewards/margins": 1.3313868045806885, + "rewards/rejected": -0.7099188566207886, + "step": 1696 + }, + { + "epoch": 0.2, + "learning_rate": 2.450076085684186e-07, + "logits/chosen": -3.5110409259796143, + "logits/rejected": -3.172805070877075, + "logps/chosen": -215.44664001464844, + "logps/rejected": -164.82928466796875, + "loss": 0.5507, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10024100542068481, + "rewards/margins": 1.1242871284484863, + "rewards/rejected": -1.0240461826324463, + "step": 1697 + }, + { + "epoch": 0.2, + "learning_rate": 2.449724920987943e-07, + "logits/chosen": -2.1990904808044434, + "logits/rejected": -2.3453094959259033, + "logps/chosen": -334.96484375, + "logps/rejected": -214.06854248046875, + "loss": 0.3733, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04938840866088867, + "rewards/margins": 1.071897268295288, + "rewards/rejected": -1.022508978843689, + "step": 1698 + }, + { + "epoch": 0.2, + "learning_rate": 2.4493737562917007e-07, + "logits/chosen": -3.865786075592041, + "logits/rejected": -3.9234697818756104, + "logps/chosen": -198.1676025390625, + "logps/rejected": -221.02821350097656, + "loss": 0.4109, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20778770744800568, + "rewards/margins": 1.049099087715149, + "rewards/rejected": -1.2568868398666382, + "step": 1699 + }, + { + "epoch": 0.2, + "learning_rate": 2.449022591595458e-07, + "logits/chosen": -3.484004259109497, + "logits/rejected": -3.4127964973449707, + "logps/chosen": -383.62982177734375, + "logps/rejected": -201.284912109375, + "loss": 0.4026, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25389915704727173, + "rewards/margins": 1.5683817863464355, + "rewards/rejected": -1.3144826889038086, + "step": 1700 + }, + { + "epoch": 0.2, + "learning_rate": 2.448671426899216e-07, + "logits/chosen": -3.958258867263794, + "logits/rejected": -3.5568466186523438, + "logps/chosen": -222.71250915527344, + "logps/rejected": -252.0777587890625, + "loss": 0.4138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46280574798583984, + "rewards/margins": 1.4746692180633545, + "rewards/rejected": -1.9374749660491943, + "step": 1701 + }, + { + "epoch": 0.2, + "learning_rate": 2.4483202622029733e-07, + "logits/chosen": -3.428539752960205, + "logits/rejected": -3.9102351665496826, + "logps/chosen": -199.1168212890625, + "logps/rejected": -260.75555419921875, + "loss": 0.2859, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4801642596721649, + "rewards/margins": 2.066012144088745, + "rewards/rejected": -2.5461764335632324, + "step": 1702 + }, + { + "epoch": 0.2, + "learning_rate": 2.4479690975067303e-07, + "logits/chosen": -3.283670663833618, + "logits/rejected": -2.9816439151763916, + "logps/chosen": -335.1788330078125, + "logps/rejected": -328.33154296875, + "loss": 0.3861, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3351360559463501, + "rewards/margins": 1.7112128734588623, + "rewards/rejected": -1.3760769367218018, + "step": 1703 + }, + { + "epoch": 0.2, + "learning_rate": 2.447617932810488e-07, + "logits/chosen": -3.535792589187622, + "logits/rejected": -3.7741522789001465, + "logps/chosen": -179.83258056640625, + "logps/rejected": -217.40878295898438, + "loss": 0.498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30200010538101196, + "rewards/margins": 1.3488916158676147, + "rewards/rejected": -1.6508917808532715, + "step": 1704 + }, + { + "epoch": 0.2, + "learning_rate": 2.4472667681142454e-07, + "logits/chosen": -3.020387887954712, + "logits/rejected": -2.9802629947662354, + "logps/chosen": -201.28756713867188, + "logps/rejected": -322.14471435546875, + "loss": 0.6051, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46486616134643555, + "rewards/margins": 2.1346938610076904, + "rewards/rejected": -2.599560022354126, + "step": 1705 + }, + { + "epoch": 0.2, + "learning_rate": 2.446915603418003e-07, + "logits/chosen": -2.835179328918457, + "logits/rejected": -2.9476020336151123, + "logps/chosen": -166.61573791503906, + "logps/rejected": -185.72549438476562, + "loss": 0.1763, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.581467866897583, + "rewards/margins": 2.1834447383880615, + "rewards/rejected": -1.6019768714904785, + "step": 1706 + }, + { + "epoch": 0.2, + "learning_rate": 2.4465644387217604e-07, + "logits/chosen": -3.1465251445770264, + "logits/rejected": -3.119692325592041, + "logps/chosen": -336.3077697753906, + "logps/rejected": -339.3916015625, + "loss": 0.4005, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5570852160453796, + "rewards/margins": 1.4699203968048096, + "rewards/rejected": -0.9128351807594299, + "step": 1707 + }, + { + "epoch": 0.2, + "learning_rate": 2.4462132740255175e-07, + "logits/chosen": -2.6089582443237305, + "logits/rejected": -2.737736940383911, + "logps/chosen": -306.18524169921875, + "logps/rejected": -177.68496704101562, + "loss": 0.4207, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39408519864082336, + "rewards/margins": 1.044629454612732, + "rewards/rejected": -1.4387147426605225, + "step": 1708 + }, + { + "epoch": 0.2, + "learning_rate": 2.4458621093292755e-07, + "logits/chosen": -3.9313507080078125, + "logits/rejected": -3.967329502105713, + "logps/chosen": -193.96939086914062, + "logps/rejected": -202.03741455078125, + "loss": 0.5294, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7465137243270874, + "rewards/margins": 0.8238089680671692, + "rewards/rejected": -1.5703226327896118, + "step": 1709 + }, + { + "epoch": 0.2, + "learning_rate": 2.445510944633033e-07, + "logits/chosen": -3.155323028564453, + "logits/rejected": -2.9358575344085693, + "logps/chosen": -260.6053161621094, + "logps/rejected": -324.3551330566406, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2441752403974533, + "rewards/margins": 1.0690574645996094, + "rewards/rejected": -1.313232660293579, + "step": 1710 + }, + { + "epoch": 0.2, + "learning_rate": 2.44515977993679e-07, + "logits/chosen": -3.089785099029541, + "logits/rejected": -2.845656394958496, + "logps/chosen": -315.4617919921875, + "logps/rejected": -249.2813720703125, + "loss": 0.4313, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2886127829551697, + "rewards/margins": 1.4697961807250977, + "rewards/rejected": -1.1811834573745728, + "step": 1711 + }, + { + "epoch": 0.2, + "learning_rate": 2.4448086152405476e-07, + "logits/chosen": -2.4240732192993164, + "logits/rejected": -2.5394866466522217, + "logps/chosen": -318.2694396972656, + "logps/rejected": -220.41360473632812, + "loss": 0.7321, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08090024441480637, + "rewards/margins": 0.4448111355304718, + "rewards/rejected": -0.36391085386276245, + "step": 1712 + }, + { + "epoch": 0.2, + "learning_rate": 2.444457450544305e-07, + "logits/chosen": -2.9358439445495605, + "logits/rejected": -2.8569648265838623, + "logps/chosen": -264.7394104003906, + "logps/rejected": -341.2603759765625, + "loss": 0.3664, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36451125144958496, + "rewards/margins": 1.431419849395752, + "rewards/rejected": -1.066908597946167, + "step": 1713 + }, + { + "epoch": 0.2, + "learning_rate": 2.4441062858480627e-07, + "logits/chosen": -2.8864569664001465, + "logits/rejected": -2.8941972255706787, + "logps/chosen": -390.8206481933594, + "logps/rejected": -242.85182189941406, + "loss": 0.4984, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29410824179649353, + "rewards/margins": 0.7023544311523438, + "rewards/rejected": -0.9964627027511597, + "step": 1714 + }, + { + "epoch": 0.2, + "learning_rate": 2.44375512115182e-07, + "logits/chosen": -3.104605197906494, + "logits/rejected": -3.1454367637634277, + "logps/chosen": -270.9232177734375, + "logps/rejected": -248.38262939453125, + "loss": 0.3654, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26806002855300903, + "rewards/margins": 1.5399221181869507, + "rewards/rejected": -1.8079822063446045, + "step": 1715 + }, + { + "epoch": 0.2, + "learning_rate": 2.443403956455577e-07, + "logits/chosen": -2.3233776092529297, + "logits/rejected": -2.3871142864227295, + "logps/chosen": -317.3779602050781, + "logps/rejected": -330.9116516113281, + "loss": 0.2406, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.837905764579773, + "rewards/margins": 2.1452691555023193, + "rewards/rejected": -1.3073632717132568, + "step": 1716 + }, + { + "epoch": 0.2, + "learning_rate": 2.443052791759335e-07, + "logits/chosen": -2.879929542541504, + "logits/rejected": -2.6622700691223145, + "logps/chosen": -258.1183776855469, + "logps/rejected": -275.150146484375, + "loss": 0.4213, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16815334558486938, + "rewards/margins": 1.212031364440918, + "rewards/rejected": -1.0438780784606934, + "step": 1717 + }, + { + "epoch": 0.2, + "learning_rate": 2.4427016270630923e-07, + "logits/chosen": -2.9937429428100586, + "logits/rejected": -2.935084819793701, + "logps/chosen": -116.81980895996094, + "logps/rejected": -149.3265380859375, + "loss": 0.4749, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32410258054733276, + "rewards/margins": 0.6127547025680542, + "rewards/rejected": -0.9368573427200317, + "step": 1718 + }, + { + "epoch": 0.2, + "learning_rate": 2.44235046236685e-07, + "logits/chosen": -2.908172369003296, + "logits/rejected": -2.993743419647217, + "logps/chosen": -167.93698120117188, + "logps/rejected": -131.21994018554688, + "loss": 0.438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24395276606082916, + "rewards/margins": 0.6851309537887573, + "rewards/rejected": -0.9290837049484253, + "step": 1719 + }, + { + "epoch": 0.2, + "learning_rate": 2.4419992976706074e-07, + "logits/chosen": -2.7098655700683594, + "logits/rejected": -2.7113876342773438, + "logps/chosen": -374.839111328125, + "logps/rejected": -212.05490112304688, + "loss": 0.7125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7810652256011963, + "rewards/margins": 0.5353448987007141, + "rewards/rejected": -1.3164100646972656, + "step": 1720 + }, + { + "epoch": 0.2, + "learning_rate": 2.441648132974365e-07, + "logits/chosen": -3.228445529937744, + "logits/rejected": -3.542778491973877, + "logps/chosen": -333.4078063964844, + "logps/rejected": -213.998046875, + "loss": 0.378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22177620232105255, + "rewards/margins": 1.0722817182540894, + "rewards/rejected": -1.294057846069336, + "step": 1721 + }, + { + "epoch": 0.2, + "learning_rate": 2.4412969682781225e-07, + "logits/chosen": -2.91353702545166, + "logits/rejected": -2.7158563137054443, + "logps/chosen": -313.9520263671875, + "logps/rejected": -291.3470458984375, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30379754304885864, + "rewards/margins": 1.3417613506317139, + "rewards/rejected": -1.6455589532852173, + "step": 1722 + }, + { + "epoch": 0.2, + "learning_rate": 2.44094580358188e-07, + "logits/chosen": -3.5439860820770264, + "logits/rejected": -3.260101318359375, + "logps/chosen": -221.77609252929688, + "logps/rejected": -125.55340576171875, + "loss": 0.4755, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20629605650901794, + "rewards/margins": 0.8871363401412964, + "rewards/rejected": -0.680840253829956, + "step": 1723 + }, + { + "epoch": 0.2, + "learning_rate": 2.440594638885637e-07, + "logits/chosen": -3.0384671688079834, + "logits/rejected": -3.3780710697174072, + "logps/chosen": -154.1116485595703, + "logps/rejected": -139.92042541503906, + "loss": 0.5778, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0461917370557785, + "rewards/margins": 1.5937776565551758, + "rewards/rejected": -1.5475858449935913, + "step": 1724 + }, + { + "epoch": 0.2, + "learning_rate": 2.4402434741893945e-07, + "logits/chosen": -3.8238131999969482, + "logits/rejected": -3.6845932006835938, + "logps/chosen": -184.49453735351562, + "logps/rejected": -186.496826171875, + "loss": 0.2961, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.544116199016571, + "rewards/margins": 2.154240608215332, + "rewards/rejected": -1.6101243495941162, + "step": 1725 + }, + { + "epoch": 0.2, + "learning_rate": 2.439892309493152e-07, + "logits/chosen": -3.196857452392578, + "logits/rejected": -2.6698999404907227, + "logps/chosen": -425.947509765625, + "logps/rejected": -417.09698486328125, + "loss": 0.1993, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38281041383743286, + "rewards/margins": 1.9237964153289795, + "rewards/rejected": -1.5409860610961914, + "step": 1726 + }, + { + "epoch": 0.2, + "learning_rate": 2.4395411447969096e-07, + "logits/chosen": -2.7583677768707275, + "logits/rejected": -2.70349383354187, + "logps/chosen": -247.20159912109375, + "logps/rejected": -234.02919006347656, + "loss": 0.2426, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05214317888021469, + "rewards/margins": 1.598447322845459, + "rewards/rejected": -1.5463039875030518, + "step": 1727 + }, + { + "epoch": 0.2, + "learning_rate": 2.439189980100667e-07, + "logits/chosen": -2.710408926010132, + "logits/rejected": -2.939810037612915, + "logps/chosen": -278.5091247558594, + "logps/rejected": -244.74429321289062, + "loss": 0.4702, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04341694712638855, + "rewards/margins": 1.023653268814087, + "rewards/rejected": -1.0670702457427979, + "step": 1728 + }, + { + "epoch": 0.2, + "learning_rate": 2.4388388154044247e-07, + "logits/chosen": -3.2340686321258545, + "logits/rejected": -2.949521541595459, + "logps/chosen": -463.640625, + "logps/rejected": -356.5784912109375, + "loss": 0.2296, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4406130015850067, + "rewards/margins": 2.4163620471954346, + "rewards/rejected": -1.975749135017395, + "step": 1729 + }, + { + "epoch": 0.2, + "learning_rate": 2.4384876507081817e-07, + "logits/chosen": -3.5759692192077637, + "logits/rejected": -3.338381052017212, + "logps/chosen": -153.74571228027344, + "logps/rejected": -142.6878204345703, + "loss": 0.5895, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4222008287906647, + "rewards/margins": 0.6961211562156677, + "rewards/rejected": -1.1183220148086548, + "step": 1730 + }, + { + "epoch": 0.2, + "learning_rate": 2.43813648601194e-07, + "logits/chosen": -3.119556188583374, + "logits/rejected": -3.105013608932495, + "logps/chosen": -300.7023010253906, + "logps/rejected": -431.5294494628906, + "loss": 0.4579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029314160346984863, + "rewards/margins": 1.0032093524932861, + "rewards/rejected": -1.0325233936309814, + "step": 1731 + }, + { + "epoch": 0.2, + "learning_rate": 2.437785321315697e-07, + "logits/chosen": -2.997035026550293, + "logits/rejected": -2.7745275497436523, + "logps/chosen": -202.52383422851562, + "logps/rejected": -222.5989227294922, + "loss": 0.1695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06041629612445831, + "rewards/margins": 2.066164255142212, + "rewards/rejected": -2.0057480335235596, + "step": 1732 + }, + { + "epoch": 0.2, + "learning_rate": 2.4374341566194543e-07, + "logits/chosen": -3.2654011249542236, + "logits/rejected": -3.137331962585449, + "logps/chosen": -197.5450439453125, + "logps/rejected": -220.6737060546875, + "loss": 0.3962, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37919241189956665, + "rewards/margins": 1.2437317371368408, + "rewards/rejected": -1.6229242086410522, + "step": 1733 + }, + { + "epoch": 0.2, + "learning_rate": 2.437082991923212e-07, + "logits/chosen": -3.7312777042388916, + "logits/rejected": -3.562410354614258, + "logps/chosen": -129.6446533203125, + "logps/rejected": -143.5083770751953, + "loss": 0.8129, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3476855754852295, + "rewards/margins": 0.4746500253677368, + "rewards/rejected": -0.8223356008529663, + "step": 1734 + }, + { + "epoch": 0.2, + "learning_rate": 2.4367318272269694e-07, + "logits/chosen": -3.0206260681152344, + "logits/rejected": -3.1526198387145996, + "logps/chosen": -281.50970458984375, + "logps/rejected": -304.1826171875, + "loss": 0.1389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17513632774353027, + "rewards/margins": 2.536421775817871, + "rewards/rejected": -2.7115581035614014, + "step": 1735 + }, + { + "epoch": 0.2, + "learning_rate": 2.436380662530727e-07, + "logits/chosen": -3.179959297180176, + "logits/rejected": -3.2266836166381836, + "logps/chosen": -245.88204956054688, + "logps/rejected": -242.5856475830078, + "loss": 0.527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14690721035003662, + "rewards/margins": 1.217486023902893, + "rewards/rejected": -1.3643931150436401, + "step": 1736 + }, + { + "epoch": 0.2, + "learning_rate": 2.4360294978344845e-07, + "logits/chosen": -3.715092420578003, + "logits/rejected": -3.394807815551758, + "logps/chosen": -283.05084228515625, + "logps/rejected": -348.5787658691406, + "loss": 0.4673, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21136708557605743, + "rewards/margins": 0.8256571292877197, + "rewards/rejected": -1.0370242595672607, + "step": 1737 + }, + { + "epoch": 0.2, + "learning_rate": 2.4356783331382415e-07, + "logits/chosen": -3.5574638843536377, + "logits/rejected": -3.2605228424072266, + "logps/chosen": -156.2772674560547, + "logps/rejected": -130.5850830078125, + "loss": 0.2812, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44039177894592285, + "rewards/margins": 1.5975128412246704, + "rewards/rejected": -1.1571210622787476, + "step": 1738 + }, + { + "epoch": 0.2, + "learning_rate": 2.435327168441999e-07, + "logits/chosen": -3.084960460662842, + "logits/rejected": -3.178116798400879, + "logps/chosen": -142.37596130371094, + "logps/rejected": -140.02943420410156, + "loss": 0.5473, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10475844889879227, + "rewards/margins": 1.0007109642028809, + "rewards/rejected": -1.1054693460464478, + "step": 1739 + }, + { + "epoch": 0.2, + "learning_rate": 2.4349760037457566e-07, + "logits/chosen": -3.067857265472412, + "logits/rejected": -3.519385814666748, + "logps/chosen": -247.3993377685547, + "logps/rejected": -305.21099853515625, + "loss": 0.1719, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04354758560657501, + "rewards/margins": 2.7898519039154053, + "rewards/rejected": -2.833399772644043, + "step": 1740 + }, + { + "epoch": 0.2, + "learning_rate": 2.434624839049514e-07, + "logits/chosen": -3.057948112487793, + "logits/rejected": -3.0198819637298584, + "logps/chosen": -141.63870239257812, + "logps/rejected": -297.5664978027344, + "loss": 0.3248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10936189442873001, + "rewards/margins": 2.323464870452881, + "rewards/rejected": -2.4328267574310303, + "step": 1741 + }, + { + "epoch": 0.2, + "learning_rate": 2.4342736743532716e-07, + "logits/chosen": -3.2447316646575928, + "logits/rejected": -3.6341748237609863, + "logps/chosen": -145.13330078125, + "logps/rejected": -252.46206665039062, + "loss": 0.4078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12699754536151886, + "rewards/margins": 1.1056121587753296, + "rewards/rejected": -1.2326096296310425, + "step": 1742 + }, + { + "epoch": 0.2, + "learning_rate": 2.433922509657029e-07, + "logits/chosen": -3.209233283996582, + "logits/rejected": -3.231194496154785, + "logps/chosen": -150.76914978027344, + "logps/rejected": -193.64834594726562, + "loss": 0.3733, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17111843824386597, + "rewards/margins": 1.3984018564224243, + "rewards/rejected": -1.2272834777832031, + "step": 1743 + }, + { + "epoch": 0.2, + "learning_rate": 2.4335713449607867e-07, + "logits/chosen": -3.407757043838501, + "logits/rejected": -3.283406972885132, + "logps/chosen": -152.8195037841797, + "logps/rejected": -162.044677734375, + "loss": 0.474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1427876055240631, + "rewards/margins": 1.247788667678833, + "rewards/rejected": -1.3905761241912842, + "step": 1744 + }, + { + "epoch": 0.2, + "learning_rate": 2.433220180264544e-07, + "logits/chosen": -3.3044116497039795, + "logits/rejected": -3.735154867172241, + "logps/chosen": -173.23471069335938, + "logps/rejected": -206.9846649169922, + "loss": 0.4777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08235792070627213, + "rewards/margins": 1.034445881843567, + "rewards/rejected": -0.9520879983901978, + "step": 1745 + }, + { + "epoch": 0.2, + "learning_rate": 2.4328690155683013e-07, + "logits/chosen": -3.3846535682678223, + "logits/rejected": -3.1004958152770996, + "logps/chosen": -254.22317504882812, + "logps/rejected": -308.4259033203125, + "loss": 0.3208, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39039522409439087, + "rewards/margins": 1.439897060394287, + "rewards/rejected": -1.8302921056747437, + "step": 1746 + }, + { + "epoch": 0.2, + "learning_rate": 2.432517850872059e-07, + "logits/chosen": -3.0105745792388916, + "logits/rejected": -2.939602851867676, + "logps/chosen": -201.98741149902344, + "logps/rejected": -240.71258544921875, + "loss": 0.3625, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20477108657360077, + "rewards/margins": 1.049839735031128, + "rewards/rejected": -0.8450685739517212, + "step": 1747 + }, + { + "epoch": 0.2, + "learning_rate": 2.4321666861758163e-07, + "logits/chosen": -3.1395387649536133, + "logits/rejected": -3.0131208896636963, + "logps/chosen": -410.95562744140625, + "logps/rejected": -329.695556640625, + "loss": 0.1892, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15863315761089325, + "rewards/margins": 1.9727343320846558, + "rewards/rejected": -1.814100980758667, + "step": 1748 + }, + { + "epoch": 0.2, + "learning_rate": 2.431815521479574e-07, + "logits/chosen": -3.1851792335510254, + "logits/rejected": -3.1301703453063965, + "logps/chosen": -287.33148193359375, + "logps/rejected": -360.2151794433594, + "loss": 0.3618, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7716159820556641, + "rewards/margins": 1.8083436489105225, + "rewards/rejected": -2.5799596309661865, + "step": 1749 + }, + { + "epoch": 0.2, + "learning_rate": 2.4314643567833314e-07, + "logits/chosen": -3.543879508972168, + "logits/rejected": -3.985924482345581, + "logps/chosen": -221.46536254882812, + "logps/rejected": -311.0965881347656, + "loss": 0.1786, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8416170477867126, + "rewards/margins": 2.7182304859161377, + "rewards/rejected": -1.8766134977340698, + "step": 1750 + }, + { + "epoch": 0.2, + "learning_rate": 2.4311131920870884e-07, + "logits/chosen": -3.168619394302368, + "logits/rejected": -3.195664405822754, + "logps/chosen": -148.01206970214844, + "logps/rejected": -181.02862548828125, + "loss": 0.511, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20444738864898682, + "rewards/margins": 0.570600688457489, + "rewards/rejected": -0.3661532402038574, + "step": 1751 + }, + { + "epoch": 0.2, + "learning_rate": 2.430762027390846e-07, + "logits/chosen": -2.8880393505096436, + "logits/rejected": -2.936306953430176, + "logps/chosen": -169.31515502929688, + "logps/rejected": -223.66928100585938, + "loss": 0.7079, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06547676026821136, + "rewards/margins": 0.41497230529785156, + "rewards/rejected": -0.3494955003261566, + "step": 1752 + }, + { + "epoch": 0.2, + "learning_rate": 2.430410862694604e-07, + "logits/chosen": -2.6722052097320557, + "logits/rejected": -2.9412734508514404, + "logps/chosen": -224.75271606445312, + "logps/rejected": -232.43492126464844, + "loss": 0.5417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3493640124797821, + "rewards/margins": 1.1181914806365967, + "rewards/rejected": -1.4675554037094116, + "step": 1753 + }, + { + "epoch": 0.2, + "learning_rate": 2.430059697998361e-07, + "logits/chosen": -3.102931499481201, + "logits/rejected": -3.0654520988464355, + "logps/chosen": -219.181640625, + "logps/rejected": -202.31582641601562, + "loss": 0.3232, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05130919814109802, + "rewards/margins": 1.838759183883667, + "rewards/rejected": -1.787449836730957, + "step": 1754 + }, + { + "epoch": 0.2, + "learning_rate": 2.4297085333021186e-07, + "logits/chosen": -3.968606472015381, + "logits/rejected": -3.7249157428741455, + "logps/chosen": -326.8946228027344, + "logps/rejected": -296.5503234863281, + "loss": 0.4936, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0016686655580997467, + "rewards/margins": 1.3206899166107178, + "rewards/rejected": -1.3223586082458496, + "step": 1755 + }, + { + "epoch": 0.2, + "learning_rate": 2.429357368605876e-07, + "logits/chosen": -2.62625789642334, + "logits/rejected": -2.566938877105713, + "logps/chosen": -366.4497985839844, + "logps/rejected": -326.11541748046875, + "loss": 0.1458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12545126676559448, + "rewards/margins": 2.252603054046631, + "rewards/rejected": -2.1271517276763916, + "step": 1756 + }, + { + "epoch": 0.2, + "learning_rate": 2.4290062039096337e-07, + "logits/chosen": -2.5912160873413086, + "logits/rejected": -2.966123104095459, + "logps/chosen": -415.5066223144531, + "logps/rejected": -333.67755126953125, + "loss": 0.6167, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2567613124847412, + "rewards/margins": 0.9944959878921509, + "rewards/rejected": -1.251257300376892, + "step": 1757 + }, + { + "epoch": 0.2, + "learning_rate": 2.428655039213391e-07, + "logits/chosen": -3.5497848987579346, + "logits/rejected": -3.349867820739746, + "logps/chosen": -205.57315063476562, + "logps/rejected": -155.82171630859375, + "loss": 0.5414, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25648921728134155, + "rewards/margins": 0.8320026397705078, + "rewards/rejected": -1.0884917974472046, + "step": 1758 + }, + { + "epoch": 0.2, + "learning_rate": 2.428303874517148e-07, + "logits/chosen": -3.1462998390197754, + "logits/rejected": -3.194227457046509, + "logps/chosen": -282.48895263671875, + "logps/rejected": -287.45025634765625, + "loss": 0.6516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.273932546377182, + "rewards/margins": 0.48718565702438354, + "rewards/rejected": -0.7611181735992432, + "step": 1759 + }, + { + "epoch": 0.2, + "learning_rate": 2.427952709820906e-07, + "logits/chosen": -2.7691938877105713, + "logits/rejected": -2.6362013816833496, + "logps/chosen": -198.17068481445312, + "logps/rejected": -255.22598266601562, + "loss": 0.1862, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3451490104198456, + "rewards/margins": 2.164900779724121, + "rewards/rejected": -1.8197517395019531, + "step": 1760 + }, + { + "epoch": 0.2, + "learning_rate": 2.4276015451246633e-07, + "logits/chosen": -3.1371588706970215, + "logits/rejected": -2.9938597679138184, + "logps/chosen": -301.0254211425781, + "logps/rejected": -191.77513122558594, + "loss": 0.5222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07813544571399689, + "rewards/margins": 0.5883846879005432, + "rewards/rejected": -0.6665201187133789, + "step": 1761 + }, + { + "epoch": 0.2, + "learning_rate": 2.427250380428421e-07, + "logits/chosen": -3.436466932296753, + "logits/rejected": -3.3969531059265137, + "logps/chosen": -234.4161376953125, + "logps/rejected": -255.92367553710938, + "loss": 0.2263, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10461831092834473, + "rewards/margins": 2.507021188735962, + "rewards/rejected": -2.402402639389038, + "step": 1762 + }, + { + "epoch": 0.2, + "learning_rate": 2.4268992157321784e-07, + "logits/chosen": -2.299417018890381, + "logits/rejected": -2.2770919799804688, + "logps/chosen": -356.7138366699219, + "logps/rejected": -264.33868408203125, + "loss": 0.4145, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.055784545838832855, + "rewards/margins": 1.0526140928268433, + "rewards/rejected": -0.9968295097351074, + "step": 1763 + }, + { + "epoch": 0.2, + "learning_rate": 2.4265480510359354e-07, + "logits/chosen": -3.2793633937835693, + "logits/rejected": -3.563754081726074, + "logps/chosen": -231.0777587890625, + "logps/rejected": -268.3797302246094, + "loss": 0.2034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2699204087257385, + "rewards/margins": 2.083373785018921, + "rewards/rejected": -2.3532941341400146, + "step": 1764 + }, + { + "epoch": 0.2, + "learning_rate": 2.4261968863396934e-07, + "logits/chosen": -2.8059375286102295, + "logits/rejected": -2.6914925575256348, + "logps/chosen": -362.2037353515625, + "logps/rejected": -276.865478515625, + "loss": 0.5461, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03437815606594086, + "rewards/margins": 0.8932045698165894, + "rewards/rejected": -0.8588263988494873, + "step": 1765 + }, + { + "epoch": 0.2, + "learning_rate": 2.425845721643451e-07, + "logits/chosen": -2.980407238006592, + "logits/rejected": -2.9198200702667236, + "logps/chosen": -241.3873291015625, + "logps/rejected": -203.42881774902344, + "loss": 0.3265, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1854814887046814, + "rewards/margins": 1.3325400352478027, + "rewards/rejected": -1.1470584869384766, + "step": 1766 + }, + { + "epoch": 0.2, + "learning_rate": 2.425494556947208e-07, + "logits/chosen": -2.8503692150115967, + "logits/rejected": -2.576404333114624, + "logps/chosen": -250.96826171875, + "logps/rejected": -249.5489959716797, + "loss": 0.6724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2590703070163727, + "rewards/margins": 0.9337478280067444, + "rewards/rejected": -1.1928181648254395, + "step": 1767 + }, + { + "epoch": 0.2, + "learning_rate": 2.4251433922509655e-07, + "logits/chosen": -3.231877326965332, + "logits/rejected": -3.326113700866699, + "logps/chosen": -263.3506164550781, + "logps/rejected": -364.3380126953125, + "loss": 0.2686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1327046900987625, + "rewards/margins": 2.1899170875549316, + "rewards/rejected": -2.3226218223571777, + "step": 1768 + }, + { + "epoch": 0.2, + "learning_rate": 2.424792227554723e-07, + "logits/chosen": -3.074404001235962, + "logits/rejected": -3.400165319442749, + "logps/chosen": -196.53921508789062, + "logps/rejected": -223.18572998046875, + "loss": 0.4385, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08249306678771973, + "rewards/margins": 1.394784927368164, + "rewards/rejected": -1.3122918605804443, + "step": 1769 + }, + { + "epoch": 0.2, + "learning_rate": 2.4244410628584806e-07, + "logits/chosen": -3.3985276222229004, + "logits/rejected": -3.268855094909668, + "logps/chosen": -205.62542724609375, + "logps/rejected": -243.33216857910156, + "loss": 0.1672, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0461481511592865, + "rewards/margins": 2.5665283203125, + "rewards/rejected": -2.5203800201416016, + "step": 1770 + }, + { + "epoch": 0.2, + "learning_rate": 2.424089898162238e-07, + "logits/chosen": -3.0864343643188477, + "logits/rejected": -3.0680220127105713, + "logps/chosen": -234.63291931152344, + "logps/rejected": -234.44036865234375, + "loss": 0.1305, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7403955459594727, + "rewards/margins": 2.4730758666992188, + "rewards/rejected": -1.7326805591583252, + "step": 1771 + }, + { + "epoch": 0.2, + "learning_rate": 2.423738733465995e-07, + "logits/chosen": -3.602097749710083, + "logits/rejected": -3.4309592247009277, + "logps/chosen": -254.64076232910156, + "logps/rejected": -185.6962432861328, + "loss": 0.3006, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10131794214248657, + "rewards/margins": 1.7651445865631104, + "rewards/rejected": -1.8664624691009521, + "step": 1772 + }, + { + "epoch": 0.2, + "learning_rate": 2.4233875687697527e-07, + "logits/chosen": -3.531369924545288, + "logits/rejected": -3.3974971771240234, + "logps/chosen": -229.64987182617188, + "logps/rejected": -189.36959838867188, + "loss": 0.6074, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5496854782104492, + "rewards/margins": 0.9148344993591309, + "rewards/rejected": -1.46451997756958, + "step": 1773 + }, + { + "epoch": 0.2, + "learning_rate": 2.423036404073511e-07, + "logits/chosen": -2.960831880569458, + "logits/rejected": -2.939600944519043, + "logps/chosen": -314.62371826171875, + "logps/rejected": -248.36761474609375, + "loss": 0.3668, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06180811673402786, + "rewards/margins": 1.4093976020812988, + "rewards/rejected": -1.3475894927978516, + "step": 1774 + }, + { + "epoch": 0.2, + "learning_rate": 2.422685239377268e-07, + "logits/chosen": -3.187558889389038, + "logits/rejected": -2.793586015701294, + "logps/chosen": -311.04852294921875, + "logps/rejected": -264.7544250488281, + "loss": 0.7614, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.66237473487854, + "rewards/margins": 0.4096294343471527, + "rewards/rejected": -1.0720041990280151, + "step": 1775 + }, + { + "epoch": 0.2, + "learning_rate": 2.4223340746810253e-07, + "logits/chosen": -3.724465847015381, + "logits/rejected": -3.553802251815796, + "logps/chosen": -337.3489990234375, + "logps/rejected": -285.360107421875, + "loss": 0.9469, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.595371425151825, + "rewards/margins": 0.12990513443946838, + "rewards/rejected": -0.725276529788971, + "step": 1776 + }, + { + "epoch": 0.2, + "learning_rate": 2.421982909984783e-07, + "logits/chosen": -3.092574119567871, + "logits/rejected": -2.7128357887268066, + "logps/chosen": -335.2371826171875, + "logps/rejected": -387.388671875, + "loss": 0.2613, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02709682285785675, + "rewards/margins": 2.2923853397369385, + "rewards/rejected": -2.2652883529663086, + "step": 1777 + }, + { + "epoch": 0.2, + "learning_rate": 2.4216317452885404e-07, + "logits/chosen": -3.393723964691162, + "logits/rejected": -3.2259602546691895, + "logps/chosen": -458.0068664550781, + "logps/rejected": -289.3234558105469, + "loss": 0.5024, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.060724783688783646, + "rewards/margins": 0.9477744698524475, + "rewards/rejected": -0.887049674987793, + "step": 1778 + }, + { + "epoch": 0.21, + "learning_rate": 2.421280580592298e-07, + "logits/chosen": -2.087739944458008, + "logits/rejected": -2.4449591636657715, + "logps/chosen": -708.1103515625, + "logps/rejected": -219.73358154296875, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1941252052783966, + "rewards/margins": 1.3678836822509766, + "rewards/rejected": -1.5620088577270508, + "step": 1779 + }, + { + "epoch": 0.21, + "learning_rate": 2.420929415896055e-07, + "logits/chosen": -2.605710983276367, + "logits/rejected": -3.0942349433898926, + "logps/chosen": -277.34613037109375, + "logps/rejected": -361.6842041015625, + "loss": 0.2663, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7183920741081238, + "rewards/margins": 2.0134165287017822, + "rewards/rejected": -1.2950243949890137, + "step": 1780 + }, + { + "epoch": 0.21, + "learning_rate": 2.4205782511998125e-07, + "logits/chosen": -2.2303647994995117, + "logits/rejected": -2.428788185119629, + "logps/chosen": -570.49560546875, + "logps/rejected": -409.5047607421875, + "loss": 0.606, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18222299218177795, + "rewards/margins": 0.8466431498527527, + "rewards/rejected": -1.0288660526275635, + "step": 1781 + }, + { + "epoch": 0.21, + "learning_rate": 2.42022708650357e-07, + "logits/chosen": -3.7756333351135254, + "logits/rejected": -4.063043594360352, + "logps/chosen": -313.5496520996094, + "logps/rejected": -283.4927978515625, + "loss": 0.4209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08129837363958359, + "rewards/margins": 2.766404628753662, + "rewards/rejected": -2.847702980041504, + "step": 1782 + }, + { + "epoch": 0.21, + "learning_rate": 2.4198759218073276e-07, + "logits/chosen": -2.6750128269195557, + "logits/rejected": -2.6356160640716553, + "logps/chosen": -285.88848876953125, + "logps/rejected": -291.33819580078125, + "loss": 0.6745, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08191380649805069, + "rewards/margins": 0.2532702088356018, + "rewards/rejected": -0.1713564246892929, + "step": 1783 + }, + { + "epoch": 0.21, + "learning_rate": 2.419524757111085e-07, + "logits/chosen": -3.120246410369873, + "logits/rejected": -3.0747880935668945, + "logps/chosen": -249.4274444580078, + "logps/rejected": -186.0854034423828, + "loss": 0.6817, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2302771508693695, + "rewards/margins": 0.22528696060180664, + "rewards/rejected": -0.45556414127349854, + "step": 1784 + }, + { + "epoch": 0.21, + "learning_rate": 2.419173592414842e-07, + "logits/chosen": -3.5686984062194824, + "logits/rejected": -3.3988704681396484, + "logps/chosen": -272.94580078125, + "logps/rejected": -291.3069763183594, + "loss": 0.2544, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19512201845645905, + "rewards/margins": 1.856284737586975, + "rewards/rejected": -1.6611627340316772, + "step": 1785 + }, + { + "epoch": 0.21, + "learning_rate": 2.4188224277185996e-07, + "logits/chosen": -2.2110629081726074, + "logits/rejected": -2.4659082889556885, + "logps/chosen": -506.2607421875, + "logps/rejected": -282.80792236328125, + "loss": 0.6111, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5762144327163696, + "rewards/margins": 0.8014272451400757, + "rewards/rejected": -1.3776417970657349, + "step": 1786 + }, + { + "epoch": 0.21, + "learning_rate": 2.4184712630223577e-07, + "logits/chosen": -2.9010021686553955, + "logits/rejected": -2.916661024093628, + "logps/chosen": -250.02883911132812, + "logps/rejected": -255.36077880859375, + "loss": 0.3921, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5074890851974487, + "rewards/margins": 1.1834965944290161, + "rewards/rejected": -1.6909856796264648, + "step": 1787 + }, + { + "epoch": 0.21, + "learning_rate": 2.4181200983261147e-07, + "logits/chosen": -2.786046028137207, + "logits/rejected": -3.1116459369659424, + "logps/chosen": -268.6700744628906, + "logps/rejected": -234.49667358398438, + "loss": 0.6217, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.220791757106781, + "rewards/margins": 0.3611282706260681, + "rewards/rejected": -0.5819200277328491, + "step": 1788 + }, + { + "epoch": 0.21, + "learning_rate": 2.417768933629872e-07, + "logits/chosen": -3.4463820457458496, + "logits/rejected": -3.401262044906616, + "logps/chosen": -364.42596435546875, + "logps/rejected": -306.0172119140625, + "loss": 0.2977, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18457360565662384, + "rewards/margins": 1.403897762298584, + "rewards/rejected": -1.2193242311477661, + "step": 1789 + }, + { + "epoch": 0.21, + "learning_rate": 2.41741776893363e-07, + "logits/chosen": -2.675659656524658, + "logits/rejected": -2.673980712890625, + "logps/chosen": -243.4378204345703, + "logps/rejected": -305.2392883300781, + "loss": 0.6264, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28893381357192993, + "rewards/margins": 1.033085823059082, + "rewards/rejected": -0.7441520690917969, + "step": 1790 + }, + { + "epoch": 0.21, + "learning_rate": 2.4170666042373873e-07, + "logits/chosen": -3.7508912086486816, + "logits/rejected": -3.563750743865967, + "logps/chosen": -174.86825561523438, + "logps/rejected": -190.00830078125, + "loss": 0.4222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0638531893491745, + "rewards/margins": 1.1816695928573608, + "rewards/rejected": -1.2455228567123413, + "step": 1791 + }, + { + "epoch": 0.21, + "learning_rate": 2.416715439541145e-07, + "logits/chosen": -3.052027702331543, + "logits/rejected": -3.2551157474517822, + "logps/chosen": -415.6919860839844, + "logps/rejected": -411.8841552734375, + "loss": 0.6046, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.43954360485076904, + "rewards/margins": 1.5440583229064941, + "rewards/rejected": -1.1045148372650146, + "step": 1792 + }, + { + "epoch": 0.21, + "learning_rate": 2.416364274844902e-07, + "logits/chosen": -2.98317289352417, + "logits/rejected": -2.884835958480835, + "logps/chosen": -298.021484375, + "logps/rejected": -192.96871948242188, + "loss": 0.4656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15477770566940308, + "rewards/margins": 1.5699098110198975, + "rewards/rejected": -1.7246874570846558, + "step": 1793 + }, + { + "epoch": 0.21, + "learning_rate": 2.4160131101486594e-07, + "logits/chosen": -3.640326499938965, + "logits/rejected": -3.507678985595703, + "logps/chosen": -203.41212463378906, + "logps/rejected": -143.59127807617188, + "loss": 0.5265, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2634912431240082, + "rewards/margins": 1.0048516988754272, + "rewards/rejected": -1.2683429718017578, + "step": 1794 + }, + { + "epoch": 0.21, + "learning_rate": 2.415661945452417e-07, + "logits/chosen": -2.677821636199951, + "logits/rejected": -2.4985947608947754, + "logps/chosen": -368.5274658203125, + "logps/rejected": -215.2764129638672, + "loss": 0.2058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5737054347991943, + "rewards/margins": 2.0907516479492188, + "rewards/rejected": -1.5170462131500244, + "step": 1795 + }, + { + "epoch": 0.21, + "learning_rate": 2.4153107807561745e-07, + "logits/chosen": -3.1011600494384766, + "logits/rejected": -2.89839506149292, + "logps/chosen": -129.42120361328125, + "logps/rejected": -184.5872039794922, + "loss": 0.4552, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.023982517421245575, + "rewards/margins": 0.815593957901001, + "rewards/rejected": -0.7916114330291748, + "step": 1796 + }, + { + "epoch": 0.21, + "learning_rate": 2.414959616059932e-07, + "logits/chosen": -2.995195150375366, + "logits/rejected": -2.937110424041748, + "logps/chosen": -414.18896484375, + "logps/rejected": -450.25115966796875, + "loss": 0.5461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10179929435253143, + "rewards/margins": 0.9477005004882812, + "rewards/rejected": -1.0494998693466187, + "step": 1797 + }, + { + "epoch": 0.21, + "learning_rate": 2.4146084513636896e-07, + "logits/chosen": -3.33390212059021, + "logits/rejected": -3.1867642402648926, + "logps/chosen": -207.8813018798828, + "logps/rejected": -312.7217102050781, + "loss": 0.2633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1668022871017456, + "rewards/margins": 2.1615781784057617, + "rewards/rejected": -2.328380584716797, + "step": 1798 + }, + { + "epoch": 0.21, + "learning_rate": 2.414257286667447e-07, + "logits/chosen": -3.5038247108459473, + "logits/rejected": -3.532654047012329, + "logps/chosen": -163.7206573486328, + "logps/rejected": -285.15093994140625, + "loss": 0.3881, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49594974517822266, + "rewards/margins": 1.4398472309112549, + "rewards/rejected": -0.9438974857330322, + "step": 1799 + }, + { + "epoch": 0.21, + "learning_rate": 2.4139061219712046e-07, + "logits/chosen": -3.6769142150878906, + "logits/rejected": -3.689378023147583, + "logps/chosen": -215.927978515625, + "logps/rejected": -210.45753479003906, + "loss": 0.3028, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24080538749694824, + "rewards/margins": 1.33527410030365, + "rewards/rejected": -1.0944687128067017, + "step": 1800 + }, + { + "epoch": 0.21, + "learning_rate": 2.4135549572749617e-07, + "logits/chosen": -2.9768176078796387, + "logits/rejected": -2.789314031600952, + "logps/chosen": -262.72369384765625, + "logps/rejected": -140.95094299316406, + "loss": 0.7002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15552113950252533, + "rewards/margins": 0.8602361679077148, + "rewards/rejected": -1.0157573223114014, + "step": 1801 + }, + { + "epoch": 0.21, + "learning_rate": 2.413203792578719e-07, + "logits/chosen": -3.4500019550323486, + "logits/rejected": -3.491673231124878, + "logps/chosen": -341.31494140625, + "logps/rejected": -317.71319580078125, + "loss": 0.2155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15010561048984528, + "rewards/margins": 2.0146870613098145, + "rewards/rejected": -1.8645814657211304, + "step": 1802 + }, + { + "epoch": 0.21, + "learning_rate": 2.4128526278824767e-07, + "logits/chosen": -2.9675090312957764, + "logits/rejected": -2.915804862976074, + "logps/chosen": -296.2667236328125, + "logps/rejected": -222.76336669921875, + "loss": 0.3732, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07646234333515167, + "rewards/margins": 1.3029041290283203, + "rewards/rejected": -1.379366397857666, + "step": 1803 + }, + { + "epoch": 0.21, + "learning_rate": 2.4125014631862343e-07, + "logits/chosen": -3.206181526184082, + "logits/rejected": -3.2034802436828613, + "logps/chosen": -90.62210845947266, + "logps/rejected": -165.38406372070312, + "loss": 0.5434, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2525031566619873, + "rewards/margins": 0.9475864768028259, + "rewards/rejected": -1.200089693069458, + "step": 1804 + }, + { + "epoch": 0.21, + "learning_rate": 2.412150298489992e-07, + "logits/chosen": -3.146010160446167, + "logits/rejected": -3.005582809448242, + "logps/chosen": -281.2833557128906, + "logps/rejected": -298.551025390625, + "loss": 0.3157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08163248002529144, + "rewards/margins": 1.2042107582092285, + "rewards/rejected": -1.2858431339263916, + "step": 1805 + }, + { + "epoch": 0.21, + "learning_rate": 2.411799133793749e-07, + "logits/chosen": -2.9047060012817383, + "logits/rejected": -2.8392202854156494, + "logps/chosen": -197.58004760742188, + "logps/rejected": -340.43170166015625, + "loss": 0.3509, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16672877967357635, + "rewards/margins": 1.4568045139312744, + "rewards/rejected": -1.6235331296920776, + "step": 1806 + }, + { + "epoch": 0.21, + "learning_rate": 2.4114479690975064e-07, + "logits/chosen": -2.1913044452667236, + "logits/rejected": -2.2105109691619873, + "logps/chosen": -245.36485290527344, + "logps/rejected": -233.07164001464844, + "loss": 0.5469, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.504901111125946, + "rewards/margins": 1.3426004648208618, + "rewards/rejected": -1.8475016355514526, + "step": 1807 + }, + { + "epoch": 0.21, + "learning_rate": 2.4110968044012644e-07, + "logits/chosen": -2.7578930854797363, + "logits/rejected": -2.7092695236206055, + "logps/chosen": -233.57481384277344, + "logps/rejected": -266.0731201171875, + "loss": 0.3252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5472968816757202, + "rewards/margins": 1.5048975944519043, + "rewards/rejected": -2.052194595336914, + "step": 1808 + }, + { + "epoch": 0.21, + "learning_rate": 2.4107456397050214e-07, + "logits/chosen": -2.4891245365142822, + "logits/rejected": -2.4622931480407715, + "logps/chosen": -384.259765625, + "logps/rejected": -318.8699951171875, + "loss": 0.2145, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35938915610313416, + "rewards/margins": 2.066375494003296, + "rewards/rejected": -1.7069863080978394, + "step": 1809 + }, + { + "epoch": 0.21, + "learning_rate": 2.410394475008779e-07, + "logits/chosen": -3.166712999343872, + "logits/rejected": -3.004619836807251, + "logps/chosen": -267.0263977050781, + "logps/rejected": -197.66006469726562, + "loss": 0.7152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46882912516593933, + "rewards/margins": 0.5559472441673279, + "rewards/rejected": -1.0247764587402344, + "step": 1810 + }, + { + "epoch": 0.21, + "learning_rate": 2.4100433103125365e-07, + "logits/chosen": -3.61323618888855, + "logits/rejected": -3.492032766342163, + "logps/chosen": -282.75213623046875, + "logps/rejected": -256.89862060546875, + "loss": 0.3741, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12818875908851624, + "rewards/margins": 1.2939426898956299, + "rewards/rejected": -1.1657540798187256, + "step": 1811 + }, + { + "epoch": 0.21, + "learning_rate": 2.409692145616294e-07, + "logits/chosen": -2.9817934036254883, + "logits/rejected": -3.2869231700897217, + "logps/chosen": -239.7997589111328, + "logps/rejected": -186.92274475097656, + "loss": 0.3028, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2774461805820465, + "rewards/margins": 1.5824965238571167, + "rewards/rejected": -1.305050253868103, + "step": 1812 + }, + { + "epoch": 0.21, + "learning_rate": 2.4093409809200516e-07, + "logits/chosen": -2.571326732635498, + "logits/rejected": -2.466388702392578, + "logps/chosen": -340.9006652832031, + "logps/rejected": -415.9167175292969, + "loss": 0.2401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17888230085372925, + "rewards/margins": 2.3979411125183105, + "rewards/rejected": -2.5768234729766846, + "step": 1813 + }, + { + "epoch": 0.21, + "learning_rate": 2.4089898162238086e-07, + "logits/chosen": -2.92287015914917, + "logits/rejected": -2.8410611152648926, + "logps/chosen": -268.87286376953125, + "logps/rejected": -192.04782104492188, + "loss": 0.2925, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29529693722724915, + "rewards/margins": 1.6396294832229614, + "rewards/rejected": -1.3443325757980347, + "step": 1814 + }, + { + "epoch": 0.21, + "learning_rate": 2.408638651527566e-07, + "logits/chosen": -2.401008129119873, + "logits/rejected": -2.5115413665771484, + "logps/chosen": -259.52630615234375, + "logps/rejected": -272.1788330078125, + "loss": 0.6156, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.027551118284463882, + "rewards/margins": 0.5791054964065552, + "rewards/rejected": -0.5515543222427368, + "step": 1815 + }, + { + "epoch": 0.21, + "learning_rate": 2.4082874868313237e-07, + "logits/chosen": -3.01741361618042, + "logits/rejected": -2.624587059020996, + "logps/chosen": -318.7940979003906, + "logps/rejected": -301.8634033203125, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.017174065113067627, + "rewards/margins": 0.9977447986602783, + "rewards/rejected": -1.0149188041687012, + "step": 1816 + }, + { + "epoch": 0.21, + "learning_rate": 2.407936322135081e-07, + "logits/chosen": -2.6454830169677734, + "logits/rejected": -2.621281623840332, + "logps/chosen": -373.0331726074219, + "logps/rejected": -221.32321166992188, + "loss": 0.4632, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1358024924993515, + "rewards/margins": 1.1383870840072632, + "rewards/rejected": -1.0025845766067505, + "step": 1817 + }, + { + "epoch": 0.21, + "learning_rate": 2.407585157438839e-07, + "logits/chosen": -2.74299955368042, + "logits/rejected": -2.533597946166992, + "logps/chosen": -437.7091064453125, + "logps/rejected": -197.56243896484375, + "loss": 0.5653, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1875694990158081, + "rewards/margins": 0.5917748212814331, + "rewards/rejected": -0.4042053520679474, + "step": 1818 + }, + { + "epoch": 0.21, + "learning_rate": 2.4072339927425963e-07, + "logits/chosen": -3.31374454498291, + "logits/rejected": -3.2132441997528076, + "logps/chosen": -355.820556640625, + "logps/rejected": -280.9417419433594, + "loss": 0.4856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18488293886184692, + "rewards/margins": 1.6677491664886475, + "rewards/rejected": -1.8526321649551392, + "step": 1819 + }, + { + "epoch": 0.21, + "learning_rate": 2.4068828280463533e-07, + "logits/chosen": -2.425079107284546, + "logits/rejected": -2.5127720832824707, + "logps/chosen": -351.137939453125, + "logps/rejected": -416.93988037109375, + "loss": 0.3254, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34431570768356323, + "rewards/margins": 1.907684326171875, + "rewards/rejected": -1.563368558883667, + "step": 1820 + }, + { + "epoch": 0.21, + "learning_rate": 2.4065316633501114e-07, + "logits/chosen": -2.5490379333496094, + "logits/rejected": -2.9291326999664307, + "logps/chosen": -403.2880554199219, + "logps/rejected": -343.7261962890625, + "loss": 0.2818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10454273223876953, + "rewards/margins": 1.8806912899017334, + "rewards/rejected": -1.9852339029312134, + "step": 1821 + }, + { + "epoch": 0.21, + "learning_rate": 2.4061804986538684e-07, + "logits/chosen": -2.7972593307495117, + "logits/rejected": -3.167428970336914, + "logps/chosen": -323.6942138671875, + "logps/rejected": -258.0151062011719, + "loss": 0.4117, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07030066847801208, + "rewards/margins": 0.844513475894928, + "rewards/rejected": -0.7742128372192383, + "step": 1822 + }, + { + "epoch": 0.21, + "learning_rate": 2.405829333957626e-07, + "logits/chosen": -3.4452574253082275, + "logits/rejected": -3.116353750228882, + "logps/chosen": -304.1138000488281, + "logps/rejected": -317.852294921875, + "loss": 0.251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0007074326276779175, + "rewards/margins": 1.9118893146514893, + "rewards/rejected": -1.911181926727295, + "step": 1823 + }, + { + "epoch": 0.21, + "learning_rate": 2.4054781692613835e-07, + "logits/chosen": -3.1927690505981445, + "logits/rejected": -3.075185775756836, + "logps/chosen": -103.44029998779297, + "logps/rejected": -163.81784057617188, + "loss": 0.4502, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30610334873199463, + "rewards/margins": 0.6559350490570068, + "rewards/rejected": -0.349831759929657, + "step": 1824 + }, + { + "epoch": 0.21, + "learning_rate": 2.405127004565141e-07, + "logits/chosen": -3.270399808883667, + "logits/rejected": -3.2742762565612793, + "logps/chosen": -150.1024932861328, + "logps/rejected": -242.41928100585938, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5639007091522217, + "rewards/margins": 2.171006679534912, + "rewards/rejected": -1.6071059703826904, + "step": 1825 + }, + { + "epoch": 0.21, + "learning_rate": 2.4047758398688985e-07, + "logits/chosen": -2.6011955738067627, + "logits/rejected": -2.495676040649414, + "logps/chosen": -331.3778076171875, + "logps/rejected": -354.8858642578125, + "loss": 0.4295, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2116556167602539, + "rewards/margins": 1.261932373046875, + "rewards/rejected": -1.473587989807129, + "step": 1826 + }, + { + "epoch": 0.21, + "learning_rate": 2.404424675172656e-07, + "logits/chosen": -3.067300319671631, + "logits/rejected": -3.2956361770629883, + "logps/chosen": -246.4269561767578, + "logps/rejected": -292.41375732421875, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5071303844451904, + "rewards/margins": 3.2434682846069336, + "rewards/rejected": -2.736337900161743, + "step": 1827 + }, + { + "epoch": 0.21, + "learning_rate": 2.404073510476413e-07, + "logits/chosen": -3.154561996459961, + "logits/rejected": -3.0340800285339355, + "logps/chosen": -103.849365234375, + "logps/rejected": -179.877197265625, + "loss": 0.3974, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20260120928287506, + "rewards/margins": 1.3019003868103027, + "rewards/rejected": -1.504501461982727, + "step": 1828 + }, + { + "epoch": 0.21, + "learning_rate": 2.4037223457801706e-07, + "logits/chosen": -3.667299747467041, + "logits/rejected": -3.7130157947540283, + "logps/chosen": -198.6332244873047, + "logps/rejected": -236.09280395507812, + "loss": 0.6574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42137086391448975, + "rewards/margins": 0.6130428314208984, + "rewards/rejected": -1.0344136953353882, + "step": 1829 + }, + { + "epoch": 0.21, + "learning_rate": 2.403371181083928e-07, + "logits/chosen": -3.575718879699707, + "logits/rejected": -3.613237142562866, + "logps/chosen": -203.17555236816406, + "logps/rejected": -190.31185913085938, + "loss": 0.3735, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14820124208927155, + "rewards/margins": 1.7944077253341675, + "rewards/rejected": -1.6462064981460571, + "step": 1830 + }, + { + "epoch": 0.21, + "learning_rate": 2.4030200163876857e-07, + "logits/chosen": -3.8135619163513184, + "logits/rejected": -3.779196262359619, + "logps/chosen": -187.3614959716797, + "logps/rejected": -241.08596801757812, + "loss": 0.7252, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2534208297729492, + "rewards/margins": 1.5497138500213623, + "rewards/rejected": -1.803134799003601, + "step": 1831 + }, + { + "epoch": 0.21, + "learning_rate": 2.402668851691443e-07, + "logits/chosen": -3.3541243076324463, + "logits/rejected": -3.7305502891540527, + "logps/chosen": -163.34739685058594, + "logps/rejected": -271.3277587890625, + "loss": 0.2919, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2380848377943039, + "rewards/margins": 1.9160436391830444, + "rewards/rejected": -1.6779589653015137, + "step": 1832 + }, + { + "epoch": 0.21, + "learning_rate": 2.402317686995201e-07, + "logits/chosen": -3.0275893211364746, + "logits/rejected": -3.245628833770752, + "logps/chosen": -399.3750915527344, + "logps/rejected": -312.2113342285156, + "loss": 0.4461, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.201968252658844, + "rewards/margins": 2.2023918628692627, + "rewards/rejected": -2.0004236698150635, + "step": 1833 + }, + { + "epoch": 0.21, + "learning_rate": 2.4019665222989583e-07, + "logits/chosen": -2.775270700454712, + "logits/rejected": -2.5878751277923584, + "logps/chosen": -206.10665893554688, + "logps/rejected": -223.36859130859375, + "loss": 0.6056, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5078791379928589, + "rewards/margins": 0.5788403749465942, + "rewards/rejected": -0.07096125185489655, + "step": 1834 + }, + { + "epoch": 0.21, + "learning_rate": 2.401615357602716e-07, + "logits/chosen": -3.1379141807556152, + "logits/rejected": -2.928715944290161, + "logps/chosen": -225.46835327148438, + "logps/rejected": -395.69549560546875, + "loss": 0.4616, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0617222785949707, + "rewards/margins": 1.119102954864502, + "rewards/rejected": -1.1808249950408936, + "step": 1835 + }, + { + "epoch": 0.21, + "learning_rate": 2.401264192906473e-07, + "logits/chosen": -2.3342316150665283, + "logits/rejected": -2.241162061691284, + "logps/chosen": -264.0896301269531, + "logps/rejected": -215.17123413085938, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.026220127940177917, + "rewards/margins": 0.5930976867675781, + "rewards/rejected": -0.5668774843215942, + "step": 1836 + }, + { + "epoch": 0.21, + "learning_rate": 2.4009130282102304e-07, + "logits/chosen": -3.4868226051330566, + "logits/rejected": -3.579270839691162, + "logps/chosen": -132.08517456054688, + "logps/rejected": -195.231201171875, + "loss": 0.901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43552449345588684, + "rewards/margins": 0.39090240001678467, + "rewards/rejected": -0.8264269232749939, + "step": 1837 + }, + { + "epoch": 0.21, + "learning_rate": 2.400561863513988e-07, + "logits/chosen": -3.0242526531219482, + "logits/rejected": -3.15548038482666, + "logps/chosen": -339.38232421875, + "logps/rejected": -273.70025634765625, + "loss": 0.1951, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18326154351234436, + "rewards/margins": 2.3434157371520996, + "rewards/rejected": -2.160154342651367, + "step": 1838 + }, + { + "epoch": 0.21, + "learning_rate": 2.4002106988177455e-07, + "logits/chosen": -2.6112544536590576, + "logits/rejected": -2.499181032180786, + "logps/chosen": -347.30743408203125, + "logps/rejected": -227.7771453857422, + "loss": 0.4463, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.267295777797699, + "rewards/margins": 0.9837774634361267, + "rewards/rejected": -0.7164817452430725, + "step": 1839 + }, + { + "epoch": 0.21, + "learning_rate": 2.399859534121503e-07, + "logits/chosen": -2.925374984741211, + "logits/rejected": -3.1526970863342285, + "logps/chosen": -311.2057189941406, + "logps/rejected": -311.64654541015625, + "loss": 0.8433, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9040330648422241, + "rewards/margins": 0.21446117758750916, + "rewards/rejected": -1.1184941530227661, + "step": 1840 + }, + { + "epoch": 0.21, + "learning_rate": 2.39950836942526e-07, + "logits/chosen": -3.2247226238250732, + "logits/rejected": -3.1036572456359863, + "logps/chosen": -169.89695739746094, + "logps/rejected": -182.72891235351562, + "loss": 0.4789, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3294336795806885, + "rewards/margins": 1.4052774906158447, + "rewards/rejected": -1.0758436918258667, + "step": 1841 + }, + { + "epoch": 0.21, + "learning_rate": 2.399157204729018e-07, + "logits/chosen": -3.5640523433685303, + "logits/rejected": -3.4269490242004395, + "logps/chosen": -176.86865234375, + "logps/rejected": -167.06423950195312, + "loss": 0.3912, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09696812927722931, + "rewards/margins": 1.6871545314788818, + "rewards/rejected": -1.590186357498169, + "step": 1842 + }, + { + "epoch": 0.21, + "learning_rate": 2.3988060400327756e-07, + "logits/chosen": -3.465142250061035, + "logits/rejected": -3.2102460861206055, + "logps/chosen": -360.8543395996094, + "logps/rejected": -277.5147705078125, + "loss": 0.287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2871372699737549, + "rewards/margins": 1.8692712783813477, + "rewards/rejected": -2.1564083099365234, + "step": 1843 + }, + { + "epoch": 0.21, + "learning_rate": 2.3984548753365326e-07, + "logits/chosen": -3.891622304916382, + "logits/rejected": -3.6739211082458496, + "logps/chosen": -260.8900146484375, + "logps/rejected": -242.9789581298828, + "loss": 0.3143, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4440181851387024, + "rewards/margins": 1.4399336576461792, + "rewards/rejected": -1.8839519023895264, + "step": 1844 + }, + { + "epoch": 0.21, + "learning_rate": 2.39810371064029e-07, + "logits/chosen": -2.6025052070617676, + "logits/rejected": -2.844527244567871, + "logps/chosen": -85.60796356201172, + "logps/rejected": -203.6422119140625, + "loss": 0.4577, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3265523910522461, + "rewards/margins": 1.0658100843429565, + "rewards/rejected": -0.7392577528953552, + "step": 1845 + }, + { + "epoch": 0.21, + "learning_rate": 2.3977525459440477e-07, + "logits/chosen": -3.266981601715088, + "logits/rejected": -3.445608615875244, + "logps/chosen": -105.17025756835938, + "logps/rejected": -199.04888916015625, + "loss": 0.2707, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19062800705432892, + "rewards/margins": 1.6527048349380493, + "rewards/rejected": -1.4620769023895264, + "step": 1846 + }, + { + "epoch": 0.21, + "learning_rate": 2.397401381247805e-07, + "logits/chosen": -3.5979175567626953, + "logits/rejected": -3.4828290939331055, + "logps/chosen": -354.52301025390625, + "logps/rejected": -381.6019287109375, + "loss": 0.5469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.558978796005249, + "rewards/margins": 1.3851336240768433, + "rewards/rejected": -1.9441125392913818, + "step": 1847 + }, + { + "epoch": 0.21, + "learning_rate": 2.397050216551563e-07, + "logits/chosen": -2.402862310409546, + "logits/rejected": -2.5514373779296875, + "logps/chosen": -198.13714599609375, + "logps/rejected": -177.84286499023438, + "loss": 0.3056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0012534111738204956, + "rewards/margins": 1.4720139503479004, + "rewards/rejected": -1.4732673168182373, + "step": 1848 + }, + { + "epoch": 0.21, + "learning_rate": 2.39669905185532e-07, + "logits/chosen": -3.577563524246216, + "logits/rejected": -3.393393039703369, + "logps/chosen": -118.42765808105469, + "logps/rejected": -131.41741943359375, + "loss": 0.4426, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5391978025436401, + "rewards/margins": 0.9912474155426025, + "rewards/rejected": -0.45204958319664, + "step": 1849 + }, + { + "epoch": 0.21, + "learning_rate": 2.3963478871590773e-07, + "logits/chosen": -2.7539124488830566, + "logits/rejected": -3.013728618621826, + "logps/chosen": -168.27194213867188, + "logps/rejected": -282.2645263671875, + "loss": 0.3516, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14810240268707275, + "rewards/margins": 1.590822458267212, + "rewards/rejected": -1.4427201747894287, + "step": 1850 + }, + { + "epoch": 0.21, + "learning_rate": 2.395996722462835e-07, + "logits/chosen": -2.8367085456848145, + "logits/rejected": -3.052182912826538, + "logps/chosen": -207.71893310546875, + "logps/rejected": -289.6271057128906, + "loss": 0.341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37533995509147644, + "rewards/margins": 1.6185909509658813, + "rewards/rejected": -1.9939309358596802, + "step": 1851 + }, + { + "epoch": 0.21, + "learning_rate": 2.3956455577665924e-07, + "logits/chosen": -2.9096243381500244, + "logits/rejected": -2.9823153018951416, + "logps/chosen": -365.6651916503906, + "logps/rejected": -149.75221252441406, + "loss": 0.6267, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25319188833236694, + "rewards/margins": 1.0111883878707886, + "rewards/rejected": -0.7579965591430664, + "step": 1852 + }, + { + "epoch": 0.21, + "learning_rate": 2.39529439307035e-07, + "logits/chosen": -3.22668719291687, + "logits/rejected": -3.1928296089172363, + "logps/chosen": -265.7268371582031, + "logps/rejected": -198.2104034423828, + "loss": 0.582, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08171147853136063, + "rewards/margins": 0.861680269241333, + "rewards/rejected": -0.9433917999267578, + "step": 1853 + }, + { + "epoch": 0.21, + "learning_rate": 2.394943228374107e-07, + "logits/chosen": -3.2386631965637207, + "logits/rejected": -3.243152379989624, + "logps/chosen": -433.9160461425781, + "logps/rejected": -225.78988647460938, + "loss": 0.319, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29755592346191406, + "rewards/margins": 1.500485897064209, + "rewards/rejected": -1.202929973602295, + "step": 1854 + }, + { + "epoch": 0.21, + "learning_rate": 2.394592063677865e-07, + "logits/chosen": -3.030597448348999, + "logits/rejected": -2.704895496368408, + "logps/chosen": -535.2483520507812, + "logps/rejected": -387.57000732421875, + "loss": 0.4871, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07267952710390091, + "rewards/margins": 1.3345167636871338, + "rewards/rejected": -1.407196283340454, + "step": 1855 + }, + { + "epoch": 0.21, + "learning_rate": 2.3942408989816226e-07, + "logits/chosen": -3.2048325538635254, + "logits/rejected": -3.3020262718200684, + "logps/chosen": -335.4111022949219, + "logps/rejected": -333.7204895019531, + "loss": 0.3778, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.406870037317276, + "rewards/margins": 1.393203616142273, + "rewards/rejected": -0.9863336086273193, + "step": 1856 + }, + { + "epoch": 0.21, + "learning_rate": 2.3938897342853796e-07, + "logits/chosen": -3.2815136909484863, + "logits/rejected": -3.267902374267578, + "logps/chosen": -336.240966796875, + "logps/rejected": -300.3442077636719, + "loss": 0.275, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1589762568473816, + "rewards/margins": 1.7175028324127197, + "rewards/rejected": -1.5585265159606934, + "step": 1857 + }, + { + "epoch": 0.21, + "learning_rate": 2.393538569589137e-07, + "logits/chosen": -2.693025588989258, + "logits/rejected": -2.3875937461853027, + "logps/chosen": -210.38687133789062, + "logps/rejected": -232.27613830566406, + "loss": 0.3787, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10163797438144684, + "rewards/margins": 1.0402240753173828, + "rewards/rejected": -0.9385862350463867, + "step": 1858 + }, + { + "epoch": 0.21, + "learning_rate": 2.3931874048928947e-07, + "logits/chosen": -2.6612093448638916, + "logits/rejected": -2.688091516494751, + "logps/chosen": -183.9331512451172, + "logps/rejected": -155.62789916992188, + "loss": 0.5004, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.052919209003448486, + "rewards/margins": 0.9086102247238159, + "rewards/rejected": -0.9615294933319092, + "step": 1859 + }, + { + "epoch": 0.21, + "learning_rate": 2.392836240196652e-07, + "logits/chosen": -2.8398513793945312, + "logits/rejected": -2.814011573791504, + "logps/chosen": -285.54852294921875, + "logps/rejected": -322.54205322265625, + "loss": 0.3787, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35564279556274414, + "rewards/margins": 1.1657389402389526, + "rewards/rejected": -1.5213816165924072, + "step": 1860 + }, + { + "epoch": 0.21, + "learning_rate": 2.3924850755004097e-07, + "logits/chosen": -3.555253505706787, + "logits/rejected": -3.5419552326202393, + "logps/chosen": -316.2832946777344, + "logps/rejected": -189.86831665039062, + "loss": 0.2559, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24905207753181458, + "rewards/margins": 2.1671640872955322, + "rewards/rejected": -1.91811203956604, + "step": 1861 + }, + { + "epoch": 0.21, + "learning_rate": 2.392133910804167e-07, + "logits/chosen": -3.052665948867798, + "logits/rejected": -2.729041814804077, + "logps/chosen": -315.60882568359375, + "logps/rejected": -241.69515991210938, + "loss": 0.6393, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.304241418838501, + "rewards/margins": 0.6388493776321411, + "rewards/rejected": -0.9430907964706421, + "step": 1862 + }, + { + "epoch": 0.21, + "learning_rate": 2.3917827461079243e-07, + "logits/chosen": -2.755173921585083, + "logits/rejected": -2.790940523147583, + "logps/chosen": -305.3826904296875, + "logps/rejected": -304.3145751953125, + "loss": 0.4328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2234201729297638, + "rewards/margins": 2.2677574157714844, + "rewards/rejected": -2.491177558898926, + "step": 1863 + }, + { + "epoch": 0.21, + "learning_rate": 2.3914315814116823e-07, + "logits/chosen": -2.8260200023651123, + "logits/rejected": -3.0565593242645264, + "logps/chosen": -183.84423828125, + "logps/rejected": -411.9515380859375, + "loss": 0.368, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04081929102540016, + "rewards/margins": 2.799703598022461, + "rewards/rejected": -2.8405227661132812, + "step": 1864 + }, + { + "epoch": 0.21, + "learning_rate": 2.3910804167154394e-07, + "logits/chosen": -2.693484306335449, + "logits/rejected": -2.95511531829834, + "logps/chosen": -192.39453125, + "logps/rejected": -341.84967041015625, + "loss": 1.0245, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5181946754455566, + "rewards/margins": -0.006462007761001587, + "rewards/rejected": -0.5117326974868774, + "step": 1865 + }, + { + "epoch": 0.22, + "learning_rate": 2.390729252019197e-07, + "logits/chosen": -3.240333318710327, + "logits/rejected": -3.312986135482788, + "logps/chosen": -261.5600280761719, + "logps/rejected": -252.7403564453125, + "loss": 0.4648, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43903714418411255, + "rewards/margins": 1.3385133743286133, + "rewards/rejected": -1.777550458908081, + "step": 1866 + }, + { + "epoch": 0.22, + "learning_rate": 2.3903780873229544e-07, + "logits/chosen": -2.849130153656006, + "logits/rejected": -2.850721836090088, + "logps/chosen": -211.24868774414062, + "logps/rejected": -215.11126708984375, + "loss": 0.655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14812934398651123, + "rewards/margins": 0.6029497385025024, + "rewards/rejected": -0.7510790228843689, + "step": 1867 + }, + { + "epoch": 0.22, + "learning_rate": 2.390026922626712e-07, + "logits/chosen": -3.8219552040100098, + "logits/rejected": -3.4607245922088623, + "logps/chosen": -182.2537384033203, + "logps/rejected": -171.80551147460938, + "loss": 0.2674, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05184371769428253, + "rewards/margins": 1.4408807754516602, + "rewards/rejected": -1.3890371322631836, + "step": 1868 + }, + { + "epoch": 0.22, + "learning_rate": 2.3896757579304695e-07, + "logits/chosen": -3.5720267295837402, + "logits/rejected": -3.5305967330932617, + "logps/chosen": -174.22036743164062, + "logps/rejected": -220.58416748046875, + "loss": 0.4028, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012285813689231873, + "rewards/margins": 2.375335454940796, + "rewards/rejected": -2.3630495071411133, + "step": 1869 + }, + { + "epoch": 0.22, + "learning_rate": 2.3893245932342265e-07, + "logits/chosen": -2.7672641277313232, + "logits/rejected": -2.6977572441101074, + "logps/chosen": -247.30990600585938, + "logps/rejected": -171.23654174804688, + "loss": 0.7358, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2699260711669922, + "rewards/margins": 0.6120525598526001, + "rewards/rejected": -0.8819786310195923, + "step": 1870 + }, + { + "epoch": 0.22, + "learning_rate": 2.388973428537984e-07, + "logits/chosen": -2.9081223011016846, + "logits/rejected": -3.121767044067383, + "logps/chosen": -215.24801635742188, + "logps/rejected": -216.57276916503906, + "loss": 0.4661, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6541638374328613, + "rewards/margins": 1.0534257888793945, + "rewards/rejected": -1.7075896263122559, + "step": 1871 + }, + { + "epoch": 0.22, + "learning_rate": 2.3886222638417416e-07, + "logits/chosen": -2.7080483436584473, + "logits/rejected": -2.6535959243774414, + "logps/chosen": -315.1611328125, + "logps/rejected": -381.1353454589844, + "loss": 0.6026, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0642850399017334, + "rewards/margins": 0.6664404273033142, + "rewards/rejected": -0.6021553874015808, + "step": 1872 + }, + { + "epoch": 0.22, + "learning_rate": 2.388271099145499e-07, + "logits/chosen": -3.3447628021240234, + "logits/rejected": -3.316511392593384, + "logps/chosen": -284.9142761230469, + "logps/rejected": -236.10165405273438, + "loss": 0.3703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014459162950515747, + "rewards/margins": 1.2012072801589966, + "rewards/rejected": -1.21566641330719, + "step": 1873 + }, + { + "epoch": 0.22, + "learning_rate": 2.3879199344492567e-07, + "logits/chosen": -2.641317844390869, + "logits/rejected": -2.820796489715576, + "logps/chosen": -301.94207763671875, + "logps/rejected": -274.16156005859375, + "loss": 0.566, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12546077370643616, + "rewards/margins": 0.6516165733337402, + "rewards/rejected": -0.5261558294296265, + "step": 1874 + }, + { + "epoch": 0.22, + "learning_rate": 2.3875687697530137e-07, + "logits/chosen": -3.3212637901306152, + "logits/rejected": -3.4066638946533203, + "logps/chosen": -394.03936767578125, + "logps/rejected": -448.79095458984375, + "loss": 0.2709, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6851503849029541, + "rewards/margins": 2.550360679626465, + "rewards/rejected": -1.8652102947235107, + "step": 1875 + }, + { + "epoch": 0.22, + "learning_rate": 2.387217605056772e-07, + "logits/chosen": -2.2592625617980957, + "logits/rejected": -2.318779706954956, + "logps/chosen": -335.7944641113281, + "logps/rejected": -385.7993469238281, + "loss": 0.2751, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.033325958997011185, + "rewards/margins": 2.3046064376831055, + "rewards/rejected": -2.271280527114868, + "step": 1876 + }, + { + "epoch": 0.22, + "learning_rate": 2.3868664403605293e-07, + "logits/chosen": -2.441523551940918, + "logits/rejected": -2.626868724822998, + "logps/chosen": -235.89488220214844, + "logps/rejected": -311.6707458496094, + "loss": 0.5161, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1649269014596939, + "rewards/margins": 1.7457470893859863, + "rewards/rejected": -1.5808199644088745, + "step": 1877 + }, + { + "epoch": 0.22, + "learning_rate": 2.3865152756642863e-07, + "logits/chosen": -3.144813060760498, + "logits/rejected": -2.667576789855957, + "logps/chosen": -385.2821044921875, + "logps/rejected": -264.3667297363281, + "loss": 0.4031, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23157432675361633, + "rewards/margins": 0.8427044749259949, + "rewards/rejected": -0.6111301183700562, + "step": 1878 + }, + { + "epoch": 0.22, + "learning_rate": 2.386164110968044e-07, + "logits/chosen": -2.72945499420166, + "logits/rejected": -2.905499219894409, + "logps/chosen": -180.20306396484375, + "logps/rejected": -229.76187133789062, + "loss": 0.2942, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23968681693077087, + "rewards/margins": 2.631272792816162, + "rewards/rejected": -2.3915860652923584, + "step": 1879 + }, + { + "epoch": 0.22, + "learning_rate": 2.3858129462718014e-07, + "logits/chosen": -3.453127384185791, + "logits/rejected": -3.2610836029052734, + "logps/chosen": -190.06707763671875, + "logps/rejected": -156.92901611328125, + "loss": 0.3814, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15209154784679413, + "rewards/margins": 1.3634053468704224, + "rewards/rejected": -1.515496850013733, + "step": 1880 + }, + { + "epoch": 0.22, + "learning_rate": 2.385461781575559e-07, + "logits/chosen": -3.198052406311035, + "logits/rejected": -3.4438600540161133, + "logps/chosen": -452.38909912109375, + "logps/rejected": -444.28302001953125, + "loss": 0.27, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0633179098367691, + "rewards/margins": 1.9909108877182007, + "rewards/rejected": -2.0542285442352295, + "step": 1881 + }, + { + "epoch": 0.22, + "learning_rate": 2.3851106168793165e-07, + "logits/chosen": -3.2948079109191895, + "logits/rejected": -3.117093324661255, + "logps/chosen": -148.17599487304688, + "logps/rejected": -181.55645751953125, + "loss": 0.5527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8143954277038574, + "rewards/margins": 0.5694471597671509, + "rewards/rejected": -1.3838427066802979, + "step": 1882 + }, + { + "epoch": 0.22, + "learning_rate": 2.3847594521830735e-07, + "logits/chosen": -3.2947707176208496, + "logits/rejected": -3.4383034706115723, + "logps/chosen": -169.2095947265625, + "logps/rejected": -207.70339965820312, + "loss": 0.1376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05324330925941467, + "rewards/margins": 2.5381827354431152, + "rewards/rejected": -2.591426134109497, + "step": 1883 + }, + { + "epoch": 0.22, + "learning_rate": 2.384408287486831e-07, + "logits/chosen": -3.062617301940918, + "logits/rejected": -3.0640885829925537, + "logps/chosen": -269.67547607421875, + "logps/rejected": -336.88446044921875, + "loss": 0.6612, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20418275892734528, + "rewards/margins": 0.8373125791549683, + "rewards/rejected": -1.0414953231811523, + "step": 1884 + }, + { + "epoch": 0.22, + "learning_rate": 2.3840571227905888e-07, + "logits/chosen": -3.3146984577178955, + "logits/rejected": -2.659238815307617, + "logps/chosen": -380.74871826171875, + "logps/rejected": -359.3706970214844, + "loss": 0.498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5748739838600159, + "rewards/margins": 1.3990894556045532, + "rewards/rejected": -1.9739634990692139, + "step": 1885 + }, + { + "epoch": 0.22, + "learning_rate": 2.383705958094346e-07, + "logits/chosen": -2.795694351196289, + "logits/rejected": -2.9148809909820557, + "logps/chosen": -252.338134765625, + "logps/rejected": -270.98797607421875, + "loss": 0.5251, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33758652210235596, + "rewards/margins": 1.7036025524139404, + "rewards/rejected": -1.366016149520874, + "step": 1886 + }, + { + "epoch": 0.22, + "learning_rate": 2.3833547933981036e-07, + "logits/chosen": -3.4827117919921875, + "logits/rejected": -3.2771108150482178, + "logps/chosen": -343.9080810546875, + "logps/rejected": -180.65945434570312, + "loss": 0.3233, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2024131417274475, + "rewards/margins": 1.1869056224822998, + "rewards/rejected": -0.9844925403594971, + "step": 1887 + }, + { + "epoch": 0.22, + "learning_rate": 2.3830036287018612e-07, + "logits/chosen": -3.2256879806518555, + "logits/rejected": -3.0551669597625732, + "logps/chosen": -146.13330078125, + "logps/rejected": -245.65476989746094, + "loss": 0.288, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23634114861488342, + "rewards/margins": 1.736095905303955, + "rewards/rejected": -1.9724370241165161, + "step": 1888 + }, + { + "epoch": 0.22, + "learning_rate": 2.3826524640056184e-07, + "logits/chosen": -3.020913600921631, + "logits/rejected": -3.0713706016540527, + "logps/chosen": -292.6805114746094, + "logps/rejected": -441.4288635253906, + "loss": 0.4698, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19158652424812317, + "rewards/margins": 1.174302339553833, + "rewards/rejected": -0.9827158451080322, + "step": 1889 + }, + { + "epoch": 0.22, + "learning_rate": 2.382301299309376e-07, + "logits/chosen": -2.4694907665252686, + "logits/rejected": -2.43359637260437, + "logps/chosen": -76.58487701416016, + "logps/rejected": -108.47946166992188, + "loss": 0.8584, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.349437415599823, + "rewards/margins": -0.06080085039138794, + "rewards/rejected": -0.28863656520843506, + "step": 1890 + }, + { + "epoch": 0.22, + "learning_rate": 2.3819501346131332e-07, + "logits/chosen": -2.677908182144165, + "logits/rejected": -2.371974468231201, + "logps/chosen": -165.31201171875, + "logps/rejected": -229.9835205078125, + "loss": 0.6309, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37851524353027344, + "rewards/margins": 0.9979226589202881, + "rewards/rejected": -1.3764379024505615, + "step": 1891 + }, + { + "epoch": 0.22, + "learning_rate": 2.3815989699168908e-07, + "logits/chosen": -2.7455406188964844, + "logits/rejected": -2.7070841789245605, + "logps/chosen": -345.7112121582031, + "logps/rejected": -286.5813903808594, + "loss": 0.6149, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.625001072883606, + "rewards/margins": 0.3239609897136688, + "rewards/rejected": -0.9489621520042419, + "step": 1892 + }, + { + "epoch": 0.22, + "learning_rate": 2.3812478052206486e-07, + "logits/chosen": -3.1710972785949707, + "logits/rejected": -3.1996312141418457, + "logps/chosen": -128.76968383789062, + "logps/rejected": -126.00115966796875, + "loss": 0.3669, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1471884548664093, + "rewards/margins": 1.7755229473114014, + "rewards/rejected": -1.6283345222473145, + "step": 1893 + }, + { + "epoch": 0.22, + "learning_rate": 2.3808966405244059e-07, + "logits/chosen": -4.009587287902832, + "logits/rejected": -3.9257664680480957, + "logps/chosen": -401.2257385253906, + "logps/rejected": -315.4661560058594, + "loss": 0.166, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16800488531589508, + "rewards/margins": 2.6292805671691895, + "rewards/rejected": -2.461275815963745, + "step": 1894 + }, + { + "epoch": 0.22, + "learning_rate": 2.3805454758281634e-07, + "logits/chosen": -3.2193124294281006, + "logits/rejected": -3.392913341522217, + "logps/chosen": -174.849853515625, + "logps/rejected": -255.85043334960938, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3333899676799774, + "rewards/margins": 1.9591898918151855, + "rewards/rejected": -1.6257997751235962, + "step": 1895 + }, + { + "epoch": 0.22, + "learning_rate": 2.380194311131921e-07, + "logits/chosen": -2.5379769802093506, + "logits/rejected": -2.7388737201690674, + "logps/chosen": -231.49888610839844, + "logps/rejected": -238.67532348632812, + "loss": 0.4557, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2929534912109375, + "rewards/margins": 1.5845260620117188, + "rewards/rejected": -1.8774795532226562, + "step": 1896 + }, + { + "epoch": 0.22, + "learning_rate": 2.3798431464356782e-07, + "logits/chosen": -2.861204147338867, + "logits/rejected": -2.7918097972869873, + "logps/chosen": -419.77752685546875, + "logps/rejected": -140.5082244873047, + "loss": 0.662, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13766422867774963, + "rewards/margins": 0.7582541704177856, + "rewards/rejected": -0.8959183692932129, + "step": 1897 + }, + { + "epoch": 0.22, + "learning_rate": 2.3794919817394357e-07, + "logits/chosen": -2.3190271854400635, + "logits/rejected": -2.5565736293792725, + "logps/chosen": -295.4584045410156, + "logps/rejected": -282.631103515625, + "loss": 0.3132, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23769791424274445, + "rewards/margins": 2.139730215072632, + "rewards/rejected": -1.9020321369171143, + "step": 1898 + }, + { + "epoch": 0.22, + "learning_rate": 2.379140817043193e-07, + "logits/chosen": -3.7230165004730225, + "logits/rejected": -3.505910873413086, + "logps/chosen": -247.37513732910156, + "logps/rejected": -196.65969848632812, + "loss": 0.2829, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17114269733428955, + "rewards/margins": 1.8355562686920166, + "rewards/rejected": -1.664413571357727, + "step": 1899 + }, + { + "epoch": 0.22, + "learning_rate": 2.3787896523469506e-07, + "logits/chosen": -2.6964643001556396, + "logits/rejected": -2.463961124420166, + "logps/chosen": -245.89938354492188, + "logps/rejected": -452.70953369140625, + "loss": 0.2413, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06895250082015991, + "rewards/margins": 1.7071712017059326, + "rewards/rejected": -1.638218641281128, + "step": 1900 + }, + { + "epoch": 0.22, + "learning_rate": 2.378438487650708e-07, + "logits/chosen": -3.1125733852386475, + "logits/rejected": -3.0511443614959717, + "logps/chosen": -207.16773986816406, + "logps/rejected": -227.18804931640625, + "loss": 0.583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11963582038879395, + "rewards/margins": 0.7430042624473572, + "rewards/rejected": -0.8626401424407959, + "step": 1901 + }, + { + "epoch": 0.22, + "learning_rate": 2.3780873229544654e-07, + "logits/chosen": -3.546088933944702, + "logits/rejected": -4.047410011291504, + "logps/chosen": -177.18405151367188, + "logps/rejected": -211.7225341796875, + "loss": 0.3657, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09369050711393356, + "rewards/margins": 1.209911823272705, + "rewards/rejected": -1.3036023378372192, + "step": 1902 + }, + { + "epoch": 0.22, + "learning_rate": 2.3777361582582232e-07, + "logits/chosen": -2.426943778991699, + "logits/rejected": -2.498091220855713, + "logps/chosen": -248.98025512695312, + "logps/rejected": -222.96487426757812, + "loss": 0.3045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5036156177520752, + "rewards/margins": 1.497896432876587, + "rewards/rejected": -2.001512050628662, + "step": 1903 + }, + { + "epoch": 0.22, + "learning_rate": 2.3773849935619802e-07, + "logits/chosen": -2.845189094543457, + "logits/rejected": -2.9526124000549316, + "logps/chosen": -303.8992614746094, + "logps/rejected": -225.97897338867188, + "loss": 0.6146, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2985466718673706, + "rewards/margins": 0.5058640241622925, + "rewards/rejected": -0.8044107556343079, + "step": 1904 + }, + { + "epoch": 0.22, + "learning_rate": 2.377033828865738e-07, + "logits/chosen": -3.3253822326660156, + "logits/rejected": -3.160639762878418, + "logps/chosen": -332.3227844238281, + "logps/rejected": -278.9083251953125, + "loss": 0.5514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1049327477812767, + "rewards/margins": 1.0209028720855713, + "rewards/rejected": -1.125835657119751, + "step": 1905 + }, + { + "epoch": 0.22, + "learning_rate": 2.3766826641694955e-07, + "logits/chosen": -2.6788642406463623, + "logits/rejected": -2.6205129623413086, + "logps/chosen": -305.1359558105469, + "logps/rejected": -397.4089660644531, + "loss": 0.7512, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0500718355178833, + "rewards/margins": 0.1743241548538208, + "rewards/rejected": -0.2243960052728653, + "step": 1906 + }, + { + "epoch": 0.22, + "learning_rate": 2.3763314994732528e-07, + "logits/chosen": -3.5826706886291504, + "logits/rejected": -3.9390478134155273, + "logps/chosen": -132.31370544433594, + "logps/rejected": -204.89389038085938, + "loss": 0.4297, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23746031522750854, + "rewards/margins": 1.8942127227783203, + "rewards/rejected": -2.1316730976104736, + "step": 1907 + }, + { + "epoch": 0.22, + "learning_rate": 2.3759803347770103e-07, + "logits/chosen": -3.439944267272949, + "logits/rejected": -4.008194923400879, + "logps/chosen": -208.41029357910156, + "logps/rejected": -207.65365600585938, + "loss": 0.3293, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.008326005190610886, + "rewards/margins": 1.9840483665466309, + "rewards/rejected": -1.975722312927246, + "step": 1908 + }, + { + "epoch": 0.22, + "learning_rate": 2.375629170080768e-07, + "logits/chosen": -3.09859037399292, + "logits/rejected": -3.2638890743255615, + "logps/chosen": -169.08837890625, + "logps/rejected": -285.9677734375, + "loss": 0.4884, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1801491230726242, + "rewards/margins": 0.6366292238235474, + "rewards/rejected": -0.816778302192688, + "step": 1909 + }, + { + "epoch": 0.22, + "learning_rate": 2.3752780053845252e-07, + "logits/chosen": -2.9172348976135254, + "logits/rejected": -2.6605277061462402, + "logps/chosen": -485.97796630859375, + "logps/rejected": -276.85235595703125, + "loss": 0.3688, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.034717708826065063, + "rewards/margins": 1.3961790800094604, + "rewards/rejected": -1.4308967590332031, + "step": 1910 + }, + { + "epoch": 0.22, + "learning_rate": 2.3749268406882827e-07, + "logits/chosen": -2.18872332572937, + "logits/rejected": -2.3571431636810303, + "logps/chosen": -353.88616943359375, + "logps/rejected": -330.8863830566406, + "loss": 0.2935, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15640822052955627, + "rewards/margins": 1.881563425064087, + "rewards/rejected": -1.7251551151275635, + "step": 1911 + }, + { + "epoch": 0.22, + "learning_rate": 2.37457567599204e-07, + "logits/chosen": -3.1446750164031982, + "logits/rejected": -3.3271193504333496, + "logps/chosen": -301.9336853027344, + "logps/rejected": -335.8569030761719, + "loss": 0.4629, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0881853848695755, + "rewards/margins": 1.9179389476776123, + "rewards/rejected": -1.8297535181045532, + "step": 1912 + }, + { + "epoch": 0.22, + "learning_rate": 2.3742245112957975e-07, + "logits/chosen": -2.59894061088562, + "logits/rejected": -2.5261459350585938, + "logps/chosen": -423.3135070800781, + "logps/rejected": -260.0544128417969, + "loss": 0.3484, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5532470941543579, + "rewards/margins": 1.5248947143554688, + "rewards/rejected": -0.9716475605964661, + "step": 1913 + }, + { + "epoch": 0.22, + "learning_rate": 2.3738733465995553e-07, + "logits/chosen": -2.0945417881011963, + "logits/rejected": -2.2415428161621094, + "logps/chosen": -309.9722900390625, + "logps/rejected": -189.98486328125, + "loss": 0.5224, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38147151470184326, + "rewards/margins": 1.1462559700012207, + "rewards/rejected": -0.7647844552993774, + "step": 1914 + }, + { + "epoch": 0.22, + "learning_rate": 2.3735221819033123e-07, + "logits/chosen": -2.961062431335449, + "logits/rejected": -3.2115139961242676, + "logps/chosen": -356.75830078125, + "logps/rejected": -252.25650024414062, + "loss": 0.4849, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06073129177093506, + "rewards/margins": 1.453205943107605, + "rewards/rejected": -1.51393723487854, + "step": 1915 + }, + { + "epoch": 0.22, + "learning_rate": 2.37317101720707e-07, + "logits/chosen": -2.6856603622436523, + "logits/rejected": -2.725938320159912, + "logps/chosen": -467.9189758300781, + "logps/rejected": -457.5125732421875, + "loss": 0.3431, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3994789719581604, + "rewards/margins": 3.115227699279785, + "rewards/rejected": -2.7157485485076904, + "step": 1916 + }, + { + "epoch": 0.22, + "learning_rate": 2.3728198525108277e-07, + "logits/chosen": -3.0873265266418457, + "logits/rejected": -2.8918614387512207, + "logps/chosen": -236.23187255859375, + "logps/rejected": -99.74676513671875, + "loss": 0.3865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04841993749141693, + "rewards/margins": 1.121311068534851, + "rewards/rejected": -1.1697309017181396, + "step": 1917 + }, + { + "epoch": 0.22, + "learning_rate": 2.372468687814585e-07, + "logits/chosen": -3.5829591751098633, + "logits/rejected": -3.4007558822631836, + "logps/chosen": -338.7941589355469, + "logps/rejected": -321.5729675292969, + "loss": 0.4131, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11648986488580704, + "rewards/margins": 1.7642592191696167, + "rewards/rejected": -1.6477693319320679, + "step": 1918 + }, + { + "epoch": 0.22, + "learning_rate": 2.3721175231183425e-07, + "logits/chosen": -3.922886371612549, + "logits/rejected": -4.103003025054932, + "logps/chosen": -276.6469421386719, + "logps/rejected": -270.7803955078125, + "loss": 0.3732, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21608521044254303, + "rewards/margins": 1.3308022022247314, + "rewards/rejected": -1.11471688747406, + "step": 1919 + }, + { + "epoch": 0.22, + "learning_rate": 2.3717663584220997e-07, + "logits/chosen": -3.449526071548462, + "logits/rejected": -3.163794755935669, + "logps/chosen": -243.94845581054688, + "logps/rejected": -262.3933410644531, + "loss": 0.5065, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25724267959594727, + "rewards/margins": 0.9045336246490479, + "rewards/rejected": -1.1617763042449951, + "step": 1920 + }, + { + "epoch": 0.22, + "learning_rate": 2.3714151937258573e-07, + "logits/chosen": -2.6736531257629395, + "logits/rejected": -2.7727906703948975, + "logps/chosen": -289.97076416015625, + "logps/rejected": -183.34121704101562, + "loss": 0.4423, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09541648626327515, + "rewards/margins": 0.896037757396698, + "rewards/rejected": -0.8006212711334229, + "step": 1921 + }, + { + "epoch": 0.22, + "learning_rate": 2.3710640290296148e-07, + "logits/chosen": -3.780348300933838, + "logits/rejected": -3.5795912742614746, + "logps/chosen": -195.87997436523438, + "logps/rejected": -211.07737731933594, + "loss": 0.5017, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17038977146148682, + "rewards/margins": 1.1311023235321045, + "rewards/rejected": -1.3014919757843018, + "step": 1922 + }, + { + "epoch": 0.22, + "learning_rate": 2.370712864333372e-07, + "logits/chosen": -2.852374315261841, + "logits/rejected": -2.8214497566223145, + "logps/chosen": -117.99516296386719, + "logps/rejected": -104.95326232910156, + "loss": 0.3945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3886273205280304, + "rewards/margins": 0.9860265851020813, + "rewards/rejected": -0.5973993539810181, + "step": 1923 + }, + { + "epoch": 0.22, + "learning_rate": 2.3703616996371296e-07, + "logits/chosen": -3.5518434047698975, + "logits/rejected": -3.354944944381714, + "logps/chosen": -204.48284912109375, + "logps/rejected": -189.72694396972656, + "loss": 0.4995, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07105147838592529, + "rewards/margins": 0.5340943932533264, + "rewards/rejected": -0.4630429148674011, + "step": 1924 + }, + { + "epoch": 0.22, + "learning_rate": 2.3700105349408874e-07, + "logits/chosen": -3.3995602130889893, + "logits/rejected": -3.1734938621520996, + "logps/chosen": -157.18045043945312, + "logps/rejected": -223.75341796875, + "loss": 0.4146, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19136002659797668, + "rewards/margins": 0.9750504493713379, + "rewards/rejected": -1.1664104461669922, + "step": 1925 + }, + { + "epoch": 0.22, + "learning_rate": 2.3696593702446444e-07, + "logits/chosen": -2.7661330699920654, + "logits/rejected": -2.771693706512451, + "logps/chosen": -200.76788330078125, + "logps/rejected": -224.69912719726562, + "loss": 0.1915, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6854684948921204, + "rewards/margins": 2.594217538833618, + "rewards/rejected": -1.9087492227554321, + "step": 1926 + }, + { + "epoch": 0.22, + "learning_rate": 2.3693082055484022e-07, + "logits/chosen": -3.077664852142334, + "logits/rejected": -3.011803150177002, + "logps/chosen": -394.0582275390625, + "logps/rejected": -210.83856201171875, + "loss": 0.5072, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14685095846652985, + "rewards/margins": 1.029951572418213, + "rewards/rejected": -0.8831006288528442, + "step": 1927 + }, + { + "epoch": 0.22, + "learning_rate": 2.3689570408521595e-07, + "logits/chosen": -3.0255439281463623, + "logits/rejected": -2.714571475982666, + "logps/chosen": -162.053955078125, + "logps/rejected": -245.2805938720703, + "loss": 0.2434, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05062887817621231, + "rewards/margins": 1.7896405458450317, + "rewards/rejected": -1.7390116453170776, + "step": 1928 + }, + { + "epoch": 0.22, + "learning_rate": 2.368605876155917e-07, + "logits/chosen": -2.939450263977051, + "logits/rejected": -2.5983824729919434, + "logps/chosen": -235.44837951660156, + "logps/rejected": -317.5164794921875, + "loss": 0.2184, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5185390114784241, + "rewards/margins": 1.8647598028182983, + "rewards/rejected": -1.3462207317352295, + "step": 1929 + }, + { + "epoch": 0.22, + "learning_rate": 2.3682547114596746e-07, + "logits/chosen": -3.1611690521240234, + "logits/rejected": -3.0884628295898438, + "logps/chosen": -303.3222961425781, + "logps/rejected": -238.7315216064453, + "loss": 0.222, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43077534437179565, + "rewards/margins": 1.7002692222595215, + "rewards/rejected": -1.269493818283081, + "step": 1930 + }, + { + "epoch": 0.22, + "learning_rate": 2.367903546763432e-07, + "logits/chosen": -3.4758362770080566, + "logits/rejected": -3.3406496047973633, + "logps/chosen": -202.26531982421875, + "logps/rejected": -218.9394989013672, + "loss": 0.4252, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17453855276107788, + "rewards/margins": 1.8564305305480957, + "rewards/rejected": -1.681891918182373, + "step": 1931 + }, + { + "epoch": 0.22, + "learning_rate": 2.3675523820671894e-07, + "logits/chosen": -3.1129913330078125, + "logits/rejected": -3.1399240493774414, + "logps/chosen": -309.18353271484375, + "logps/rejected": -272.7609558105469, + "loss": 0.4118, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.027665115892887115, + "rewards/margins": 1.2992477416992188, + "rewards/rejected": -1.3269128799438477, + "step": 1932 + }, + { + "epoch": 0.22, + "learning_rate": 2.367201217370947e-07, + "logits/chosen": -2.9982151985168457, + "logits/rejected": -2.9184248447418213, + "logps/chosen": -246.80738830566406, + "logps/rejected": -261.47821044921875, + "loss": 0.3356, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37736326456069946, + "rewards/margins": 1.5595622062683105, + "rewards/rejected": -1.1821987628936768, + "step": 1933 + }, + { + "epoch": 0.22, + "learning_rate": 2.3668500526747042e-07, + "logits/chosen": -3.3420968055725098, + "logits/rejected": -3.270202875137329, + "logps/chosen": -297.59075927734375, + "logps/rejected": -332.28948974609375, + "loss": 0.2877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03211592137813568, + "rewards/margins": 2.0577778816223145, + "rewards/rejected": -2.0898938179016113, + "step": 1934 + }, + { + "epoch": 0.22, + "learning_rate": 2.3664988879784618e-07, + "logits/chosen": -3.3796701431274414, + "logits/rejected": -3.6025915145874023, + "logps/chosen": -240.79527282714844, + "logps/rejected": -204.16064453125, + "loss": 0.4075, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0855565071105957, + "rewards/margins": 1.2519049644470215, + "rewards/rejected": -1.1663485765457153, + "step": 1935 + }, + { + "epoch": 0.22, + "learning_rate": 2.366147723282219e-07, + "logits/chosen": -1.9516831636428833, + "logits/rejected": -2.189332962036133, + "logps/chosen": -359.30133056640625, + "logps/rejected": -249.66867065429688, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4304964244365692, + "rewards/margins": 1.93541419506073, + "rewards/rejected": -1.504917860031128, + "step": 1936 + }, + { + "epoch": 0.22, + "learning_rate": 2.3657965585859768e-07, + "logits/chosen": -2.6603569984436035, + "logits/rejected": -2.864053249359131, + "logps/chosen": -226.45791625976562, + "logps/rejected": -138.53054809570312, + "loss": 0.739, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.536213219165802, + "rewards/margins": 0.4281558394432068, + "rewards/rejected": -0.9643691182136536, + "step": 1937 + }, + { + "epoch": 0.22, + "learning_rate": 2.3654453938897344e-07, + "logits/chosen": -3.158496141433716, + "logits/rejected": -3.446389675140381, + "logps/chosen": -281.42120361328125, + "logps/rejected": -313.11962890625, + "loss": 0.1598, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04978714510798454, + "rewards/margins": 3.8393454551696777, + "rewards/rejected": -3.8891327381134033, + "step": 1938 + }, + { + "epoch": 0.22, + "learning_rate": 2.3650942291934917e-07, + "logits/chosen": -3.6872968673706055, + "logits/rejected": -3.6790711879730225, + "logps/chosen": -120.06697082519531, + "logps/rejected": -165.63514709472656, + "loss": 0.2851, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3381662666797638, + "rewards/margins": 2.2810964584350586, + "rewards/rejected": -1.942929983139038, + "step": 1939 + }, + { + "epoch": 0.22, + "learning_rate": 2.3647430644972492e-07, + "logits/chosen": -3.797347068786621, + "logits/rejected": -3.5110065937042236, + "logps/chosen": -228.1983642578125, + "logps/rejected": -155.682373046875, + "loss": 0.6128, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19041664898395538, + "rewards/margins": 0.8307810425758362, + "rewards/rejected": -1.02119779586792, + "step": 1940 + }, + { + "epoch": 0.22, + "learning_rate": 2.3643918998010067e-07, + "logits/chosen": -3.910299777984619, + "logits/rejected": -4.0295305252075195, + "logps/chosen": -161.05770874023438, + "logps/rejected": -221.011474609375, + "loss": 0.4901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009858936071395874, + "rewards/margins": 1.878171682357788, + "rewards/rejected": -1.8880305290222168, + "step": 1941 + }, + { + "epoch": 0.22, + "learning_rate": 2.364040735104764e-07, + "logits/chosen": -2.938612937927246, + "logits/rejected": -3.0558581352233887, + "logps/chosen": -347.83355712890625, + "logps/rejected": -206.13336181640625, + "loss": 0.7616, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5964158773422241, + "rewards/margins": 0.07491900026798248, + "rewards/rejected": -0.6713349223136902, + "step": 1942 + }, + { + "epoch": 0.22, + "learning_rate": 2.3636895704085215e-07, + "logits/chosen": -3.327674150466919, + "logits/rejected": -3.1382312774658203, + "logps/chosen": -262.4370422363281, + "logps/rejected": -285.7188720703125, + "loss": 0.3137, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32218700647354126, + "rewards/margins": 1.9246999025344849, + "rewards/rejected": -1.6025128364562988, + "step": 1943 + }, + { + "epoch": 0.22, + "learning_rate": 2.3633384057122788e-07, + "logits/chosen": -3.5266926288604736, + "logits/rejected": -3.4114933013916016, + "logps/chosen": -461.6474609375, + "logps/rejected": -384.744384765625, + "loss": 0.4509, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6036498546600342, + "rewards/margins": 1.4447903633117676, + "rewards/rejected": -2.048440456390381, + "step": 1944 + }, + { + "epoch": 0.22, + "learning_rate": 2.3629872410160364e-07, + "logits/chosen": -4.050859451293945, + "logits/rejected": -4.083256244659424, + "logps/chosen": -223.20828247070312, + "logps/rejected": -262.51312255859375, + "loss": 0.4942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18191909790039062, + "rewards/margins": 1.1304326057434082, + "rewards/rejected": -1.3123517036437988, + "step": 1945 + }, + { + "epoch": 0.22, + "learning_rate": 2.362636076319794e-07, + "logits/chosen": -3.7139179706573486, + "logits/rejected": -4.133528709411621, + "logps/chosen": -205.20828247070312, + "logps/rejected": -310.1703186035156, + "loss": 0.1853, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5596264600753784, + "rewards/margins": 3.109354019165039, + "rewards/rejected": -2.549727439880371, + "step": 1946 + }, + { + "epoch": 0.22, + "learning_rate": 2.3622849116235512e-07, + "logits/chosen": -3.1618762016296387, + "logits/rejected": -3.259639024734497, + "logps/chosen": -203.43765258789062, + "logps/rejected": -220.77877807617188, + "loss": 0.9563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5748702883720398, + "rewards/margins": 0.8919491767883301, + "rewards/rejected": -1.466819405555725, + "step": 1947 + }, + { + "epoch": 0.22, + "learning_rate": 2.361933746927309e-07, + "logits/chosen": -3.091965675354004, + "logits/rejected": -2.996055841445923, + "logps/chosen": -333.53045654296875, + "logps/rejected": -257.8176574707031, + "loss": 0.4503, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012640029191970825, + "rewards/margins": 0.912082850933075, + "rewards/rejected": -0.9247229099273682, + "step": 1948 + }, + { + "epoch": 0.22, + "learning_rate": 2.361582582231066e-07, + "logits/chosen": -3.3650078773498535, + "logits/rejected": -3.174588680267334, + "logps/chosen": -226.8917694091797, + "logps/rejected": -103.08650207519531, + "loss": 0.7467, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2040969431400299, + "rewards/margins": 0.2637457847595215, + "rewards/rejected": -0.467842698097229, + "step": 1949 + }, + { + "epoch": 0.22, + "learning_rate": 2.3612314175348238e-07, + "logits/chosen": -2.694668769836426, + "logits/rejected": -2.980736255645752, + "logps/chosen": -249.96926879882812, + "logps/rejected": -200.3289337158203, + "loss": 0.6669, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9649742245674133, + "rewards/margins": 0.18468379974365234, + "rewards/rejected": -1.1496580839157104, + "step": 1950 + }, + { + "epoch": 0.22, + "learning_rate": 2.3608802528385813e-07, + "logits/chosen": -3.2263569831848145, + "logits/rejected": -3.305718421936035, + "logps/chosen": -168.17361450195312, + "logps/rejected": -147.99964904785156, + "loss": 0.6989, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.897217333316803, + "rewards/margins": 0.7248235940933228, + "rewards/rejected": -1.6220409870147705, + "step": 1951 + }, + { + "epoch": 0.23, + "learning_rate": 2.3605290881423386e-07, + "logits/chosen": -3.6969847679138184, + "logits/rejected": -3.545210123062134, + "logps/chosen": -252.175048828125, + "logps/rejected": -289.9844055175781, + "loss": 0.4321, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15360704064369202, + "rewards/margins": 1.4460314512252808, + "rewards/rejected": -1.5996384620666504, + "step": 1952 + }, + { + "epoch": 0.23, + "learning_rate": 2.3601779234460961e-07, + "logits/chosen": -3.2407126426696777, + "logits/rejected": -2.8367509841918945, + "logps/chosen": -274.98681640625, + "logps/rejected": -207.9128875732422, + "loss": 0.4495, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31745845079421997, + "rewards/margins": 0.7776055932044983, + "rewards/rejected": -1.0950640439987183, + "step": 1953 + }, + { + "epoch": 0.23, + "learning_rate": 2.3598267587498537e-07, + "logits/chosen": -2.8842508792877197, + "logits/rejected": -2.9146432876586914, + "logps/chosen": -306.57452392578125, + "logps/rejected": -245.10397338867188, + "loss": 0.7073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.70942223072052, + "rewards/margins": 0.39295345544815063, + "rewards/rejected": -1.1023757457733154, + "step": 1954 + }, + { + "epoch": 0.23, + "learning_rate": 2.359475594053611e-07, + "logits/chosen": -2.928433418273926, + "logits/rejected": -2.936575412750244, + "logps/chosen": -320.8022155761719, + "logps/rejected": -266.6853332519531, + "loss": 0.327, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06687840819358826, + "rewards/margins": 1.47237229347229, + "rewards/rejected": -1.5392507314682007, + "step": 1955 + }, + { + "epoch": 0.23, + "learning_rate": 2.3591244293573685e-07, + "logits/chosen": -3.7143688201904297, + "logits/rejected": -3.4185824394226074, + "logps/chosen": -234.03256225585938, + "logps/rejected": -261.76666259765625, + "loss": 0.7314, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.666238009929657, + "rewards/margins": 0.875117301940918, + "rewards/rejected": -1.5413552522659302, + "step": 1956 + }, + { + "epoch": 0.23, + "learning_rate": 2.3587732646611258e-07, + "logits/chosen": -2.9856066703796387, + "logits/rejected": -3.025017499923706, + "logps/chosen": -486.88458251953125, + "logps/rejected": -332.754150390625, + "loss": 0.1526, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5096101760864258, + "rewards/margins": 2.5398807525634766, + "rewards/rejected": -2.030270576477051, + "step": 1957 + }, + { + "epoch": 0.23, + "learning_rate": 2.3584220999648833e-07, + "logits/chosen": -3.129814624786377, + "logits/rejected": -3.169912815093994, + "logps/chosen": -460.1618347167969, + "logps/rejected": -255.0945281982422, + "loss": 0.346, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3262975513935089, + "rewards/margins": 1.0936552286148071, + "rewards/rejected": -0.767357587814331, + "step": 1958 + }, + { + "epoch": 0.23, + "learning_rate": 2.358070935268641e-07, + "logits/chosen": -3.6455092430114746, + "logits/rejected": -3.701657295227051, + "logps/chosen": -234.68646240234375, + "logps/rejected": -222.25436401367188, + "loss": 0.7702, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06427282094955444, + "rewards/margins": 0.5861705541610718, + "rewards/rejected": -0.5218977928161621, + "step": 1959 + }, + { + "epoch": 0.23, + "learning_rate": 2.357719770572398e-07, + "logits/chosen": -2.662120819091797, + "logits/rejected": -3.1801023483276367, + "logps/chosen": -308.42352294921875, + "logps/rejected": -174.07528686523438, + "loss": 0.4272, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12972846627235413, + "rewards/margins": 1.031456708908081, + "rewards/rejected": -0.9017282724380493, + "step": 1960 + }, + { + "epoch": 0.23, + "learning_rate": 2.357368605876156e-07, + "logits/chosen": -3.6806488037109375, + "logits/rejected": -3.452808380126953, + "logps/chosen": -179.83517456054688, + "logps/rejected": -155.84814453125, + "loss": 0.3821, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04810775816440582, + "rewards/margins": 0.8990609645843506, + "rewards/rejected": -0.9471687078475952, + "step": 1961 + }, + { + "epoch": 0.23, + "learning_rate": 2.3570174411799135e-07, + "logits/chosen": -3.4087164402008057, + "logits/rejected": -3.025362730026245, + "logps/chosen": -273.7395324707031, + "logps/rejected": -211.30308532714844, + "loss": 0.712, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5862331986427307, + "rewards/margins": 0.4231279492378235, + "rewards/rejected": -1.0093611478805542, + "step": 1962 + }, + { + "epoch": 0.23, + "learning_rate": 2.3566662764836707e-07, + "logits/chosen": -3.027560234069824, + "logits/rejected": -3.2989187240600586, + "logps/chosen": -189.6392822265625, + "logps/rejected": -178.92807006835938, + "loss": 0.4945, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1672583371400833, + "rewards/margins": 1.1380326747894287, + "rewards/rejected": -0.9707743525505066, + "step": 1963 + }, + { + "epoch": 0.23, + "learning_rate": 2.3563151117874283e-07, + "logits/chosen": -2.862318992614746, + "logits/rejected": -2.620781898498535, + "logps/chosen": -412.0824890136719, + "logps/rejected": -407.7791748046875, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15770098567008972, + "rewards/margins": 1.1947884559631348, + "rewards/rejected": -1.3524894714355469, + "step": 1964 + }, + { + "epoch": 0.23, + "learning_rate": 2.3559639470911855e-07, + "logits/chosen": -3.1342275142669678, + "logits/rejected": -3.2022297382354736, + "logps/chosen": -250.08865356445312, + "logps/rejected": -252.90341186523438, + "loss": 0.4769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32614508271217346, + "rewards/margins": 0.6992385983467102, + "rewards/rejected": -1.025383710861206, + "step": 1965 + }, + { + "epoch": 0.23, + "learning_rate": 2.355612782394943e-07, + "logits/chosen": -3.786177158355713, + "logits/rejected": -3.3957486152648926, + "logps/chosen": -404.34014892578125, + "logps/rejected": -175.4032440185547, + "loss": 0.5511, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08581207692623138, + "rewards/margins": 1.0726323127746582, + "rewards/rejected": -1.1584444046020508, + "step": 1966 + }, + { + "epoch": 0.23, + "learning_rate": 2.3552616176987006e-07, + "logits/chosen": -3.10373854637146, + "logits/rejected": -3.129697799682617, + "logps/chosen": -182.16091918945312, + "logps/rejected": -181.70484924316406, + "loss": 0.6101, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28600138425827026, + "rewards/margins": 0.9202946424484253, + "rewards/rejected": -1.2062960863113403, + "step": 1967 + }, + { + "epoch": 0.23, + "learning_rate": 2.354910453002458e-07, + "logits/chosen": -2.781548500061035, + "logits/rejected": -2.763784408569336, + "logps/chosen": -120.17839050292969, + "logps/rejected": -209.00851440429688, + "loss": 0.304, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1158633828163147, + "rewards/margins": 1.9739372730255127, + "rewards/rejected": -1.8580738306045532, + "step": 1968 + }, + { + "epoch": 0.23, + "learning_rate": 2.3545592883062154e-07, + "logits/chosen": -3.1833155155181885, + "logits/rejected": -2.836817741394043, + "logps/chosen": -197.14303588867188, + "logps/rejected": -160.02499389648438, + "loss": 0.75, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33621856570243835, + "rewards/margins": 0.18338410556316376, + "rewards/rejected": -0.5196026563644409, + "step": 1969 + }, + { + "epoch": 0.23, + "learning_rate": 2.3542081236099732e-07, + "logits/chosen": -2.8824148178100586, + "logits/rejected": -2.9640679359436035, + "logps/chosen": -247.38136291503906, + "logps/rejected": -211.27462768554688, + "loss": 0.4154, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17264491319656372, + "rewards/margins": 1.2930083274841309, + "rewards/rejected": -1.1203633546829224, + "step": 1970 + }, + { + "epoch": 0.23, + "learning_rate": 2.3538569589137305e-07, + "logits/chosen": -2.7850444316864014, + "logits/rejected": -3.2077322006225586, + "logps/chosen": -280.5623779296875, + "logps/rejected": -247.1150665283203, + "loss": 0.4595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.013271048665046692, + "rewards/margins": 1.8012468814849854, + "rewards/rejected": -1.814517855644226, + "step": 1971 + }, + { + "epoch": 0.23, + "learning_rate": 2.353505794217488e-07, + "logits/chosen": -2.7597410678863525, + "logits/rejected": -2.4582085609436035, + "logps/chosen": -498.0703125, + "logps/rejected": -239.3566131591797, + "loss": 0.4264, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12834206223487854, + "rewards/margins": 1.2244586944580078, + "rewards/rejected": -1.3528008460998535, + "step": 1972 + }, + { + "epoch": 0.23, + "learning_rate": 2.3531546295212453e-07, + "logits/chosen": -3.6037542819976807, + "logits/rejected": -3.8507797718048096, + "logps/chosen": -230.17218017578125, + "logps/rejected": -518.9234619140625, + "loss": 0.2009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1032576709985733, + "rewards/margins": 2.7321343421936035, + "rewards/rejected": -2.6288766860961914, + "step": 1973 + }, + { + "epoch": 0.23, + "learning_rate": 2.3528034648250029e-07, + "logits/chosen": -3.1235060691833496, + "logits/rejected": -3.231555938720703, + "logps/chosen": -261.3963317871094, + "logps/rejected": -269.98565673828125, + "loss": 0.5174, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008741632103919983, + "rewards/margins": 1.5905370712280273, + "rewards/rejected": -1.5817954540252686, + "step": 1974 + }, + { + "epoch": 0.23, + "learning_rate": 2.3524523001287604e-07, + "logits/chosen": -2.926250696182251, + "logits/rejected": -2.9241366386413574, + "logps/chosen": -326.24908447265625, + "logps/rejected": -220.0181427001953, + "loss": 0.5579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09044589102268219, + "rewards/margins": 0.535925030708313, + "rewards/rejected": -0.6263708472251892, + "step": 1975 + }, + { + "epoch": 0.23, + "learning_rate": 2.3521011354325177e-07, + "logits/chosen": -3.4068362712860107, + "logits/rejected": -3.146085262298584, + "logps/chosen": -177.4674530029297, + "logps/rejected": -161.8980712890625, + "loss": 0.3209, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3288155496120453, + "rewards/margins": 1.8098738193511963, + "rewards/rejected": -1.4810583591461182, + "step": 1976 + }, + { + "epoch": 0.23, + "learning_rate": 2.3517499707362752e-07, + "logits/chosen": -3.3953781127929688, + "logits/rejected": -3.2584385871887207, + "logps/chosen": -196.67868041992188, + "logps/rejected": -189.63523864746094, + "loss": 0.3843, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46858957409858704, + "rewards/margins": 1.648219108581543, + "rewards/rejected": -2.1168088912963867, + "step": 1977 + }, + { + "epoch": 0.23, + "learning_rate": 2.3513988060400327e-07, + "logits/chosen": -2.5541305541992188, + "logits/rejected": -2.795699119567871, + "logps/chosen": -381.5532531738281, + "logps/rejected": -178.51487731933594, + "loss": 0.7529, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10719103366136551, + "rewards/margins": 0.40942907333374023, + "rewards/rejected": -0.5166201591491699, + "step": 1978 + }, + { + "epoch": 0.23, + "learning_rate": 2.35104764134379e-07, + "logits/chosen": -3.1287455558776855, + "logits/rejected": -3.098891258239746, + "logps/chosen": -291.9648742675781, + "logps/rejected": -223.3717041015625, + "loss": 0.2122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4046216309070587, + "rewards/margins": 1.9311535358428955, + "rewards/rejected": -1.5265318155288696, + "step": 1979 + }, + { + "epoch": 0.23, + "learning_rate": 2.3506964766475476e-07, + "logits/chosen": -3.5724217891693115, + "logits/rejected": -3.437025308609009, + "logps/chosen": -266.0279541015625, + "logps/rejected": -211.51556396484375, + "loss": 0.4137, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21183404326438904, + "rewards/margins": 1.0518218278884888, + "rewards/rejected": -1.2636559009552002, + "step": 1980 + }, + { + "epoch": 0.23, + "learning_rate": 2.3503453119513048e-07, + "logits/chosen": -2.911797046661377, + "logits/rejected": -2.946897268295288, + "logps/chosen": -208.14990234375, + "logps/rejected": -205.21763610839844, + "loss": 0.8525, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8678486347198486, + "rewards/margins": 0.11905436217784882, + "rewards/rejected": -0.9869030117988586, + "step": 1981 + }, + { + "epoch": 0.23, + "learning_rate": 2.3499941472550626e-07, + "logits/chosen": -2.9123899936676025, + "logits/rejected": -2.8558435440063477, + "logps/chosen": -288.6016845703125, + "logps/rejected": -322.5012512207031, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2832695543766022, + "rewards/margins": 1.048529863357544, + "rewards/rejected": -1.3317995071411133, + "step": 1982 + }, + { + "epoch": 0.23, + "learning_rate": 2.3496429825588202e-07, + "logits/chosen": -3.104579448699951, + "logits/rejected": -3.136007785797119, + "logps/chosen": -197.6335906982422, + "logps/rejected": -290.9420166015625, + "loss": 0.5118, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16035132110118866, + "rewards/margins": 0.9162065982818604, + "rewards/rejected": -1.076557993888855, + "step": 1983 + }, + { + "epoch": 0.23, + "learning_rate": 2.3492918178625774e-07, + "logits/chosen": -2.4632530212402344, + "logits/rejected": -2.5979862213134766, + "logps/chosen": -137.07080078125, + "logps/rejected": -156.28280639648438, + "loss": 0.3892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2150254100561142, + "rewards/margins": 1.5731987953186035, + "rewards/rejected": -1.7882241010665894, + "step": 1984 + }, + { + "epoch": 0.23, + "learning_rate": 2.348940653166335e-07, + "logits/chosen": -3.679448127746582, + "logits/rejected": -3.8115761280059814, + "logps/chosen": -146.7847900390625, + "logps/rejected": -139.30975341796875, + "loss": 0.472, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10033592581748962, + "rewards/margins": 1.1884117126464844, + "rewards/rejected": -1.0880756378173828, + "step": 1985 + }, + { + "epoch": 0.23, + "learning_rate": 2.3485894884700925e-07, + "logits/chosen": -3.388212203979492, + "logits/rejected": -3.599100351333618, + "logps/chosen": -153.46485900878906, + "logps/rejected": -210.74172973632812, + "loss": 0.3317, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00030625052750110626, + "rewards/margins": 1.9318147897720337, + "rewards/rejected": -1.9315085411071777, + "step": 1986 + }, + { + "epoch": 0.23, + "learning_rate": 2.3482383237738498e-07, + "logits/chosen": -2.721313953399658, + "logits/rejected": -2.9026601314544678, + "logps/chosen": -347.9246826171875, + "logps/rejected": -385.1693115234375, + "loss": 0.4992, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07772264629602432, + "rewards/margins": 1.1386456489562988, + "rewards/rejected": -1.0609229803085327, + "step": 1987 + }, + { + "epoch": 0.23, + "learning_rate": 2.3478871590776073e-07, + "logits/chosen": -3.024780035018921, + "logits/rejected": -3.2264184951782227, + "logps/chosen": -245.78680419921875, + "logps/rejected": -172.29356384277344, + "loss": 0.4822, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01904572919011116, + "rewards/margins": 0.9324353337287903, + "rewards/rejected": -0.9514811635017395, + "step": 1988 + }, + { + "epoch": 0.23, + "learning_rate": 2.3475359943813646e-07, + "logits/chosen": -3.036740779876709, + "logits/rejected": -2.59208345413208, + "logps/chosen": -267.9580993652344, + "logps/rejected": -239.78988647460938, + "loss": 0.5305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5468452572822571, + "rewards/margins": 0.8903722763061523, + "rewards/rejected": -1.4372174739837646, + "step": 1989 + }, + { + "epoch": 0.23, + "learning_rate": 2.3471848296851221e-07, + "logits/chosen": -2.996565818786621, + "logits/rejected": -2.908280849456787, + "logps/chosen": -145.6241455078125, + "logps/rejected": -158.10855102539062, + "loss": 0.5997, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5281105041503906, + "rewards/margins": 0.5710313320159912, + "rewards/rejected": -1.0991419553756714, + "step": 1990 + }, + { + "epoch": 0.23, + "learning_rate": 2.3468336649888797e-07, + "logits/chosen": -3.249218702316284, + "logits/rejected": -3.673222541809082, + "logps/chosen": -210.15866088867188, + "logps/rejected": -336.4881286621094, + "loss": 0.3395, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11744435131549835, + "rewards/margins": 3.1911227703094482, + "rewards/rejected": -3.3085670471191406, + "step": 1991 + }, + { + "epoch": 0.23, + "learning_rate": 2.346482500292637e-07, + "logits/chosen": -2.7963690757751465, + "logits/rejected": -3.1021127700805664, + "logps/chosen": -271.590087890625, + "logps/rejected": -234.71060180664062, + "loss": 0.4475, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1693440079689026, + "rewards/margins": 2.1083009243011475, + "rewards/rejected": -1.9389569759368896, + "step": 1992 + }, + { + "epoch": 0.23, + "learning_rate": 2.3461313355963948e-07, + "logits/chosen": -2.612342357635498, + "logits/rejected": -2.4271016120910645, + "logps/chosen": -414.65728759765625, + "logps/rejected": -272.4967956542969, + "loss": 0.6568, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8659661412239075, + "rewards/margins": 0.27033287286758423, + "rewards/rejected": -1.1362990140914917, + "step": 1993 + }, + { + "epoch": 0.23, + "learning_rate": 2.3457801709001518e-07, + "logits/chosen": -3.2016282081604004, + "logits/rejected": -3.3491063117980957, + "logps/chosen": -133.52964782714844, + "logps/rejected": -174.03402709960938, + "loss": 0.4785, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12429818511009216, + "rewards/margins": 1.8087689876556396, + "rewards/rejected": -1.6844708919525146, + "step": 1994 + }, + { + "epoch": 0.23, + "learning_rate": 2.3454290062039096e-07, + "logits/chosen": -3.4598541259765625, + "logits/rejected": -3.361279010772705, + "logps/chosen": -271.35675048828125, + "logps/rejected": -277.79791259765625, + "loss": 0.707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6304394006729126, + "rewards/margins": 1.3628253936767578, + "rewards/rejected": -1.9932647943496704, + "step": 1995 + }, + { + "epoch": 0.23, + "learning_rate": 2.345077841507667e-07, + "logits/chosen": -2.536715030670166, + "logits/rejected": -2.762017250061035, + "logps/chosen": -190.5369873046875, + "logps/rejected": -277.1086730957031, + "loss": 0.3889, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3141373097896576, + "rewards/margins": 1.5980970859527588, + "rewards/rejected": -1.2839598655700684, + "step": 1996 + }, + { + "epoch": 0.23, + "learning_rate": 2.3447266768114244e-07, + "logits/chosen": -3.0305919647216797, + "logits/rejected": -2.9997315406799316, + "logps/chosen": -472.67926025390625, + "logps/rejected": -311.43670654296875, + "loss": 0.4757, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08996890485286713, + "rewards/margins": 1.4198189973831177, + "rewards/rejected": -1.3298500776290894, + "step": 1997 + }, + { + "epoch": 0.23, + "learning_rate": 2.344375512115182e-07, + "logits/chosen": -3.108970880508423, + "logits/rejected": -3.1756434440612793, + "logps/chosen": -256.4834899902344, + "logps/rejected": -341.9066162109375, + "loss": 0.513, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25651776790618896, + "rewards/margins": 1.23410964012146, + "rewards/rejected": -1.4906272888183594, + "step": 1998 + }, + { + "epoch": 0.23, + "learning_rate": 2.3440243474189395e-07, + "logits/chosen": -2.774517297744751, + "logits/rejected": -2.6736412048339844, + "logps/chosen": -266.9547424316406, + "logps/rejected": -213.7591552734375, + "loss": 0.5036, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02348414435982704, + "rewards/margins": 1.35770845413208, + "rewards/rejected": -1.3811925649642944, + "step": 1999 + }, + { + "epoch": 0.23, + "learning_rate": 2.3436731827226967e-07, + "logits/chosen": -3.4116063117980957, + "logits/rejected": -2.9712471961975098, + "logps/chosen": -211.96937561035156, + "logps/rejected": -190.48724365234375, + "loss": 1.03, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.45111793279647827, + "rewards/margins": 0.2641064524650574, + "rewards/rejected": -0.7152243256568909, + "step": 2000 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -2.84497332572937, + "eval_logits/rejected": -2.8033573627471924, + "eval_logps/chosen": -293.25732421875, + "eval_logps/rejected": -233.89837646484375, + "eval_loss": 0.45588183403015137, + "eval_rewards/accuracies": 0.7571428418159485, + "eval_rewards/chosen": 0.07976274937391281, + "eval_rewards/margins": 1.035339117050171, + "eval_rewards/rejected": -0.9555763006210327, + "eval_runtime": 32.7912, + "eval_samples_per_second": 2.135, + "eval_steps_per_second": 1.067, + "step": 2000 + }, + { + "epoch": 0.23, + "learning_rate": 2.3433220180264543e-07, + "logits/chosen": -3.631974697113037, + "logits/rejected": -3.316451072692871, + "logps/chosen": -205.1908721923828, + "logps/rejected": -337.1497497558594, + "loss": 0.1594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005931135267019272, + "rewards/margins": 1.8907032012939453, + "rewards/rejected": -1.8847721815109253, + "step": 2001 + }, + { + "epoch": 0.23, + "learning_rate": 2.3429708533302116e-07, + "logits/chosen": -2.9462764263153076, + "logits/rejected": -2.821960926055908, + "logps/chosen": -189.9939727783203, + "logps/rejected": -253.6620330810547, + "loss": 0.351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3489038944244385, + "rewards/margins": 1.5633512735366821, + "rewards/rejected": -1.912255048751831, + "step": 2002 + }, + { + "epoch": 0.23, + "learning_rate": 2.342619688633969e-07, + "logits/chosen": -3.9396228790283203, + "logits/rejected": -3.848588466644287, + "logps/chosen": -148.9018096923828, + "logps/rejected": -193.3266143798828, + "loss": 0.4213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.319566011428833, + "rewards/margins": 2.0531301498413086, + "rewards/rejected": -2.3726961612701416, + "step": 2003 + }, + { + "epoch": 0.23, + "learning_rate": 2.342268523937727e-07, + "logits/chosen": -3.3765323162078857, + "logits/rejected": -3.3767380714416504, + "logps/chosen": -241.10079956054688, + "logps/rejected": -232.6448211669922, + "loss": 0.7504, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6027411222457886, + "rewards/margins": 1.359520673751831, + "rewards/rejected": -1.9622617959976196, + "step": 2004 + }, + { + "epoch": 0.23, + "learning_rate": 2.3419173592414842e-07, + "logits/chosen": -3.2411155700683594, + "logits/rejected": -3.378081798553467, + "logps/chosen": -459.50311279296875, + "logps/rejected": -670.8847045898438, + "loss": 0.4194, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3092588484287262, + "rewards/margins": 1.3548946380615234, + "rewards/rejected": -1.6641534566879272, + "step": 2005 + }, + { + "epoch": 0.23, + "learning_rate": 2.3415661945452417e-07, + "logits/chosen": -3.3376681804656982, + "logits/rejected": -3.168686866760254, + "logps/chosen": -406.0707092285156, + "logps/rejected": -239.297607421875, + "loss": 0.3926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4092297852039337, + "rewards/margins": 1.8080594539642334, + "rewards/rejected": -2.217289447784424, + "step": 2006 + }, + { + "epoch": 0.23, + "learning_rate": 2.3412150298489992e-07, + "logits/chosen": -3.073347330093384, + "logits/rejected": -3.071197986602783, + "logps/chosen": -182.3142852783203, + "logps/rejected": -223.7637939453125, + "loss": 0.2561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.025876428931951523, + "rewards/margins": 1.7001011371612549, + "rewards/rejected": -1.674224853515625, + "step": 2007 + }, + { + "epoch": 0.23, + "learning_rate": 2.3408638651527565e-07, + "logits/chosen": -2.645876884460449, + "logits/rejected": -2.553802013397217, + "logps/chosen": -143.4980010986328, + "logps/rejected": -225.02066040039062, + "loss": 0.551, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30724093317985535, + "rewards/margins": 0.5976823568344116, + "rewards/rejected": -0.9049233198165894, + "step": 2008 + }, + { + "epoch": 0.23, + "learning_rate": 2.340512700456514e-07, + "logits/chosen": -2.7979679107666016, + "logits/rejected": -3.0682616233825684, + "logps/chosen": -214.00424194335938, + "logps/rejected": -242.20034790039062, + "loss": 0.3809, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0684867799282074, + "rewards/margins": 1.3817732334136963, + "rewards/rejected": -1.313286542892456, + "step": 2009 + }, + { + "epoch": 0.23, + "learning_rate": 2.3401615357602713e-07, + "logits/chosen": -3.442659616470337, + "logits/rejected": -3.7547972202301025, + "logps/chosen": -143.4029083251953, + "logps/rejected": -194.121337890625, + "loss": 0.568, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04603178799152374, + "rewards/margins": 1.5408352613449097, + "rewards/rejected": -1.5868672132492065, + "step": 2010 + }, + { + "epoch": 0.23, + "learning_rate": 2.339810371064029e-07, + "logits/chosen": -3.143305778503418, + "logits/rejected": -2.797666311264038, + "logps/chosen": -206.4252166748047, + "logps/rejected": -227.5640869140625, + "loss": 0.4027, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.028666503727436066, + "rewards/margins": 0.896481454372406, + "rewards/rejected": -0.8678149580955505, + "step": 2011 + }, + { + "epoch": 0.23, + "learning_rate": 2.3394592063677864e-07, + "logits/chosen": -2.2647078037261963, + "logits/rejected": -2.648935317993164, + "logps/chosen": -198.26852416992188, + "logps/rejected": -214.53927612304688, + "loss": 0.2062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39364778995513916, + "rewards/margins": 2.059122085571289, + "rewards/rejected": -1.665474534034729, + "step": 2012 + }, + { + "epoch": 0.23, + "learning_rate": 2.3391080416715437e-07, + "logits/chosen": -3.081793785095215, + "logits/rejected": -3.18727445602417, + "logps/chosen": -383.7090148925781, + "logps/rejected": -410.2918701171875, + "loss": 0.5071, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17957527935504913, + "rewards/margins": 1.3364853858947754, + "rewards/rejected": -1.1569101810455322, + "step": 2013 + }, + { + "epoch": 0.23, + "learning_rate": 2.3387568769753012e-07, + "logits/chosen": -3.4525787830352783, + "logits/rejected": -3.156954050064087, + "logps/chosen": -919.3490600585938, + "logps/rejected": -304.83258056640625, + "loss": 0.6537, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19651490449905396, + "rewards/margins": 0.9749191999435425, + "rewards/rejected": -1.1714341640472412, + "step": 2014 + }, + { + "epoch": 0.23, + "learning_rate": 2.338405712279059e-07, + "logits/chosen": -3.2447586059570312, + "logits/rejected": -2.891024112701416, + "logps/chosen": -277.0435791015625, + "logps/rejected": -337.3553771972656, + "loss": 0.8479, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21248409152030945, + "rewards/margins": 0.5197092890739441, + "rewards/rejected": -0.7321934700012207, + "step": 2015 + }, + { + "epoch": 0.23, + "learning_rate": 2.3380545475828163e-07, + "logits/chosen": -2.3986520767211914, + "logits/rejected": -2.9115982055664062, + "logps/chosen": -225.97000122070312, + "logps/rejected": -271.666259765625, + "loss": 0.5806, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3381935954093933, + "rewards/margins": 0.7461787462234497, + "rewards/rejected": -1.0843722820281982, + "step": 2016 + }, + { + "epoch": 0.23, + "learning_rate": 2.3377033828865738e-07, + "logits/chosen": -2.841732978820801, + "logits/rejected": -3.3018264770507812, + "logps/chosen": -151.11618041992188, + "logps/rejected": -166.3741455078125, + "loss": 0.4516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16383850574493408, + "rewards/margins": 0.8887876272201538, + "rewards/rejected": -1.052626132965088, + "step": 2017 + }, + { + "epoch": 0.23, + "learning_rate": 2.337352218190331e-07, + "logits/chosen": -3.245800495147705, + "logits/rejected": -3.195681095123291, + "logps/chosen": -184.68496704101562, + "logps/rejected": -324.7975769042969, + "loss": 0.2785, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24927213788032532, + "rewards/margins": 2.601172924041748, + "rewards/rejected": -2.351900577545166, + "step": 2018 + }, + { + "epoch": 0.23, + "learning_rate": 2.3370010534940886e-07, + "logits/chosen": -3.6032605171203613, + "logits/rejected": -3.8894782066345215, + "logps/chosen": -90.32426452636719, + "logps/rejected": -189.17861938476562, + "loss": 0.268, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05410033091902733, + "rewards/margins": 2.307948589324951, + "rewards/rejected": -2.253848075866699, + "step": 2019 + }, + { + "epoch": 0.23, + "learning_rate": 2.3366498887978462e-07, + "logits/chosen": -2.290137767791748, + "logits/rejected": -2.274712085723877, + "logps/chosen": -313.7291564941406, + "logps/rejected": -258.10540771484375, + "loss": 0.513, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5373225212097168, + "rewards/margins": 0.8017616271972656, + "rewards/rejected": -1.3390840291976929, + "step": 2020 + }, + { + "epoch": 0.23, + "learning_rate": 2.3362987241016035e-07, + "logits/chosen": -2.9940185546875, + "logits/rejected": -3.140697479248047, + "logps/chosen": -275.61041259765625, + "logps/rejected": -196.7899169921875, + "loss": 0.2742, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6242948174476624, + "rewards/margins": 1.6437617540359497, + "rewards/rejected": -1.0194669961929321, + "step": 2021 + }, + { + "epoch": 0.23, + "learning_rate": 2.335947559405361e-07, + "logits/chosen": -3.541834831237793, + "logits/rejected": -3.439565658569336, + "logps/chosen": -172.88687133789062, + "logps/rejected": -255.30221557617188, + "loss": 0.2693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.458914577960968, + "rewards/margins": 2.003291130065918, + "rewards/rejected": -2.462205648422241, + "step": 2022 + }, + { + "epoch": 0.23, + "learning_rate": 2.3355963947091185e-07, + "logits/chosen": -3.9574637413024902, + "logits/rejected": -3.7782845497131348, + "logps/chosen": -264.4451599121094, + "logps/rejected": -276.3128967285156, + "loss": 0.3568, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5034382343292236, + "rewards/margins": 2.0167386531829834, + "rewards/rejected": -2.520176887512207, + "step": 2023 + }, + { + "epoch": 0.23, + "learning_rate": 2.3352452300128758e-07, + "logits/chosen": -3.063586950302124, + "logits/rejected": -3.0610733032226562, + "logps/chosen": -268.8010559082031, + "logps/rejected": -231.368408203125, + "loss": 0.3525, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1789417415857315, + "rewards/margins": 1.3715598583221436, + "rewards/rejected": -1.550501823425293, + "step": 2024 + }, + { + "epoch": 0.23, + "learning_rate": 2.3348940653166334e-07, + "logits/chosen": -2.6767821311950684, + "logits/rejected": -2.8032383918762207, + "logps/chosen": -142.48199462890625, + "logps/rejected": -156.19973754882812, + "loss": 0.3928, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01817232370376587, + "rewards/margins": 1.2058762311935425, + "rewards/rejected": -1.2240486145019531, + "step": 2025 + }, + { + "epoch": 0.23, + "learning_rate": 2.3345429006203906e-07, + "logits/chosen": -3.4470295906066895, + "logits/rejected": -3.3284735679626465, + "logps/chosen": -245.08966064453125, + "logps/rejected": -232.45571899414062, + "loss": 0.653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3664732575416565, + "rewards/margins": 0.8882078528404236, + "rewards/rejected": -1.25468111038208, + "step": 2026 + }, + { + "epoch": 0.23, + "learning_rate": 2.3341917359241484e-07, + "logits/chosen": -2.7545552253723145, + "logits/rejected": -2.7538089752197266, + "logps/chosen": -150.04014587402344, + "logps/rejected": -214.3499755859375, + "loss": 0.2637, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41995683312416077, + "rewards/margins": 1.5865967273712158, + "rewards/rejected": -1.1666399240493774, + "step": 2027 + }, + { + "epoch": 0.23, + "learning_rate": 2.333840571227906e-07, + "logits/chosen": -3.548743486404419, + "logits/rejected": -3.684885263442993, + "logps/chosen": -252.48455810546875, + "logps/rejected": -194.57154846191406, + "loss": 0.2726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.001837063580751419, + "rewards/margins": 1.6874967813491821, + "rewards/rejected": -1.6893337965011597, + "step": 2028 + }, + { + "epoch": 0.23, + "learning_rate": 2.3334894065316632e-07, + "logits/chosen": -3.33207631111145, + "logits/rejected": -3.281633138656616, + "logps/chosen": -482.3106384277344, + "logps/rejected": -255.0568084716797, + "loss": 0.2113, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5264302492141724, + "rewards/margins": 2.3673295974731445, + "rewards/rejected": -1.8408994674682617, + "step": 2029 + }, + { + "epoch": 0.23, + "learning_rate": 2.3331382418354208e-07, + "logits/chosen": -2.669374942779541, + "logits/rejected": -2.921800374984741, + "logps/chosen": -364.39935302734375, + "logps/rejected": -233.28269958496094, + "loss": 0.6693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5869595408439636, + "rewards/margins": 0.3963136076927185, + "rewards/rejected": -0.9832731485366821, + "step": 2030 + }, + { + "epoch": 0.23, + "learning_rate": 2.3327870771391783e-07, + "logits/chosen": -3.4732565879821777, + "logits/rejected": -3.605494260787964, + "logps/chosen": -198.308349609375, + "logps/rejected": -190.59176635742188, + "loss": 0.274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.564396858215332, + "rewards/margins": 1.867499589920044, + "rewards/rejected": -1.3031026124954224, + "step": 2031 + }, + { + "epoch": 0.23, + "learning_rate": 2.3324359124429356e-07, + "logits/chosen": -3.0682919025421143, + "logits/rejected": -3.3911755084991455, + "logps/chosen": -194.8433837890625, + "logps/rejected": -226.01980590820312, + "loss": 0.4594, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09444911032915115, + "rewards/margins": 1.8091578483581543, + "rewards/rejected": -1.714708685874939, + "step": 2032 + }, + { + "epoch": 0.23, + "learning_rate": 2.332084747746693e-07, + "logits/chosen": -3.734302520751953, + "logits/rejected": -3.6981258392333984, + "logps/chosen": -263.3233337402344, + "logps/rejected": -304.0080261230469, + "loss": 0.2715, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4291212558746338, + "rewards/margins": 2.5984606742858887, + "rewards/rejected": -3.0275819301605225, + "step": 2033 + }, + { + "epoch": 0.23, + "learning_rate": 2.3317335830504504e-07, + "logits/chosen": -2.865103244781494, + "logits/rejected": -2.8553848266601562, + "logps/chosen": -304.69525146484375, + "logps/rejected": -270.1980895996094, + "loss": 0.4169, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3193957507610321, + "rewards/margins": 1.4623113870620728, + "rewards/rejected": -1.1429156064987183, + "step": 2034 + }, + { + "epoch": 0.23, + "learning_rate": 2.331382418354208e-07, + "logits/chosen": -3.4932947158813477, + "logits/rejected": -3.5716090202331543, + "logps/chosen": -97.14220428466797, + "logps/rejected": -182.97845458984375, + "loss": 0.2682, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4542883634567261, + "rewards/margins": 1.6249351501464844, + "rewards/rejected": -2.0792236328125, + "step": 2035 + }, + { + "epoch": 0.23, + "learning_rate": 2.3310312536579655e-07, + "logits/chosen": -2.827333450317383, + "logits/rejected": -3.074057102203369, + "logps/chosen": -187.10299682617188, + "logps/rejected": -214.9417724609375, + "loss": 0.3275, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.43410050868988037, + "rewards/margins": 2.3590221405029297, + "rewards/rejected": -1.9249216318130493, + "step": 2036 + }, + { + "epoch": 0.23, + "learning_rate": 2.3306800889617228e-07, + "logits/chosen": -3.311232566833496, + "logits/rejected": -2.9895005226135254, + "logps/chosen": -203.78494262695312, + "logps/rejected": -176.9379119873047, + "loss": 0.3698, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2735413610935211, + "rewards/margins": 1.570022702217102, + "rewards/rejected": -1.2964814901351929, + "step": 2037 + }, + { + "epoch": 0.23, + "learning_rate": 2.3303289242654806e-07, + "logits/chosen": -3.495087146759033, + "logits/rejected": -3.3478736877441406, + "logps/chosen": -236.81210327148438, + "logps/rejected": -227.91860961914062, + "loss": 0.448, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6140186190605164, + "rewards/margins": 1.039901614189148, + "rewards/rejected": -1.653920292854309, + "step": 2038 + }, + { + "epoch": 0.24, + "learning_rate": 2.329977759569238e-07, + "logits/chosen": -3.367856025695801, + "logits/rejected": -3.5354931354522705, + "logps/chosen": -98.99674224853516, + "logps/rejected": -166.5740966796875, + "loss": 0.456, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31941890716552734, + "rewards/margins": 1.4417237043380737, + "rewards/rejected": -1.761142611503601, + "step": 2039 + }, + { + "epoch": 0.24, + "learning_rate": 2.3296265948729954e-07, + "logits/chosen": -2.9856796264648438, + "logits/rejected": -3.249156951904297, + "logps/chosen": -236.30120849609375, + "logps/rejected": -281.39324951171875, + "loss": 0.301, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11615180224180222, + "rewards/margins": 1.4050902128219604, + "rewards/rejected": -1.5212421417236328, + "step": 2040 + }, + { + "epoch": 0.24, + "learning_rate": 2.329275430176753e-07, + "logits/chosen": -3.1886255741119385, + "logits/rejected": -3.8579916954040527, + "logps/chosen": -218.33627319335938, + "logps/rejected": -339.0235595703125, + "loss": 0.2139, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2469361275434494, + "rewards/margins": 2.5802955627441406, + "rewards/rejected": -2.333359718322754, + "step": 2041 + }, + { + "epoch": 0.24, + "learning_rate": 2.3289242654805102e-07, + "logits/chosen": -3.01922869682312, + "logits/rejected": -2.592362403869629, + "logps/chosen": -371.1042785644531, + "logps/rejected": -258.16058349609375, + "loss": 0.6831, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14682637155056, + "rewards/margins": 0.264029324054718, + "rewards/rejected": -0.4108557105064392, + "step": 2042 + }, + { + "epoch": 0.24, + "learning_rate": 2.3285731007842677e-07, + "logits/chosen": -3.91856050491333, + "logits/rejected": -3.606844663619995, + "logps/chosen": -339.1500549316406, + "logps/rejected": -207.0172119140625, + "loss": 0.6975, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4894591271877289, + "rewards/margins": 0.12524759769439697, + "rewards/rejected": -0.6147066950798035, + "step": 2043 + }, + { + "epoch": 0.24, + "learning_rate": 2.3282219360880253e-07, + "logits/chosen": -3.613152027130127, + "logits/rejected": -3.046384334564209, + "logps/chosen": -275.8511047363281, + "logps/rejected": -142.7661590576172, + "loss": 0.3015, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10897894948720932, + "rewards/margins": 1.6519333124160767, + "rewards/rejected": -1.5429542064666748, + "step": 2044 + }, + { + "epoch": 0.24, + "learning_rate": 2.3278707713917825e-07, + "logits/chosen": -3.560540199279785, + "logits/rejected": -3.3008222579956055, + "logps/chosen": -170.01898193359375, + "logps/rejected": -198.36279296875, + "loss": 0.2842, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1061142310500145, + "rewards/margins": 1.7505090236663818, + "rewards/rejected": -1.6443949937820435, + "step": 2045 + }, + { + "epoch": 0.24, + "learning_rate": 2.32751960669554e-07, + "logits/chosen": -2.879520893096924, + "logits/rejected": -2.573899269104004, + "logps/chosen": -406.53265380859375, + "logps/rejected": -242.12899780273438, + "loss": 0.8494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4708259701728821, + "rewards/margins": 0.6300718188285828, + "rewards/rejected": -1.1008977890014648, + "step": 2046 + }, + { + "epoch": 0.24, + "learning_rate": 2.3271684419992973e-07, + "logits/chosen": -3.217991352081299, + "logits/rejected": -2.8754897117614746, + "logps/chosen": -263.68121337890625, + "logps/rejected": -271.5013427734375, + "loss": 0.4251, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06216999888420105, + "rewards/margins": 0.8423784971237183, + "rewards/rejected": -0.9045485258102417, + "step": 2047 + }, + { + "epoch": 0.24, + "learning_rate": 2.326817277303055e-07, + "logits/chosen": -2.6422576904296875, + "logits/rejected": -2.5133559703826904, + "logps/chosen": -368.7999267578125, + "logps/rejected": -332.3048400878906, + "loss": 0.7202, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14642934501171112, + "rewards/margins": 0.6152974367141724, + "rewards/rejected": -0.7617268562316895, + "step": 2048 + }, + { + "epoch": 0.24, + "learning_rate": 2.3264661126068127e-07, + "logits/chosen": -3.1302313804626465, + "logits/rejected": -2.920776844024658, + "logps/chosen": -312.0892333984375, + "logps/rejected": -202.4169464111328, + "loss": 0.294, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3656645715236664, + "rewards/margins": 1.546316385269165, + "rewards/rejected": -1.1806517839431763, + "step": 2049 + }, + { + "epoch": 0.24, + "learning_rate": 2.32611494791057e-07, + "logits/chosen": -3.1597986221313477, + "logits/rejected": -3.3330230712890625, + "logps/chosen": -223.2503662109375, + "logps/rejected": -358.99755859375, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21997302770614624, + "rewards/margins": 2.1808881759643555, + "rewards/rejected": -1.960915207862854, + "step": 2050 + }, + { + "epoch": 0.24, + "learning_rate": 2.3257637832143275e-07, + "logits/chosen": -3.1756839752197266, + "logits/rejected": -2.7677314281463623, + "logps/chosen": -254.58428955078125, + "logps/rejected": -170.6003875732422, + "loss": 0.335, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13010364770889282, + "rewards/margins": 1.7006912231445312, + "rewards/rejected": -1.570587396621704, + "step": 2051 + }, + { + "epoch": 0.24, + "learning_rate": 2.325412618518085e-07, + "logits/chosen": -3.3310437202453613, + "logits/rejected": -3.3536481857299805, + "logps/chosen": -256.8912353515625, + "logps/rejected": -275.40472412109375, + "loss": 0.3766, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12958364188671112, + "rewards/margins": 1.7343131303787231, + "rewards/rejected": -1.604729413986206, + "step": 2052 + }, + { + "epoch": 0.24, + "learning_rate": 2.3250614538218423e-07, + "logits/chosen": -3.2945964336395264, + "logits/rejected": -3.4332611560821533, + "logps/chosen": -194.5066375732422, + "logps/rejected": -329.435791015625, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7774198055267334, + "rewards/margins": 4.159289360046387, + "rewards/rejected": -3.3818695545196533, + "step": 2053 + }, + { + "epoch": 0.24, + "learning_rate": 2.3247102891255999e-07, + "logits/chosen": -2.982991933822632, + "logits/rejected": -2.9232401847839355, + "logps/chosen": -213.48809814453125, + "logps/rejected": -351.4576416015625, + "loss": 0.3627, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13747844099998474, + "rewards/margins": 3.1277759075164795, + "rewards/rejected": -2.990297794342041, + "step": 2054 + }, + { + "epoch": 0.24, + "learning_rate": 2.324359124429357e-07, + "logits/chosen": -2.8734798431396484, + "logits/rejected": -2.8921728134155273, + "logps/chosen": -322.9697570800781, + "logps/rejected": -226.138671875, + "loss": 0.5787, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2873033285140991, + "rewards/margins": 1.1255102157592773, + "rewards/rejected": -0.8382070064544678, + "step": 2055 + }, + { + "epoch": 0.24, + "learning_rate": 2.3240079597331147e-07, + "logits/chosen": -2.7112746238708496, + "logits/rejected": -2.5812742710113525, + "logps/chosen": -196.1005859375, + "logps/rejected": -271.1686706542969, + "loss": 0.3086, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15070503950119019, + "rewards/margins": 2.9618914127349854, + "rewards/rejected": -2.8111863136291504, + "step": 2056 + }, + { + "epoch": 0.24, + "learning_rate": 2.3236567950368722e-07, + "logits/chosen": -3.264662504196167, + "logits/rejected": -3.1798624992370605, + "logps/chosen": -348.24273681640625, + "logps/rejected": -242.38970947265625, + "loss": 0.4781, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.515073835849762, + "rewards/margins": 1.0141679048538208, + "rewards/rejected": -0.4990941286087036, + "step": 2057 + }, + { + "epoch": 0.24, + "learning_rate": 2.3233056303406295e-07, + "logits/chosen": -3.362121820449829, + "logits/rejected": -3.3651020526885986, + "logps/chosen": -230.2619171142578, + "logps/rejected": -254.53724670410156, + "loss": 0.5208, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09543068706989288, + "rewards/margins": 1.0240224599838257, + "rewards/rejected": -0.9285917282104492, + "step": 2058 + }, + { + "epoch": 0.24, + "learning_rate": 2.322954465644387e-07, + "logits/chosen": -2.944413185119629, + "logits/rejected": -3.228311061859131, + "logps/chosen": -193.652099609375, + "logps/rejected": -226.46304321289062, + "loss": 0.6407, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17432984709739685, + "rewards/margins": 0.6353402733802795, + "rewards/rejected": -0.4610104560852051, + "step": 2059 + }, + { + "epoch": 0.24, + "learning_rate": 2.3226033009481448e-07, + "logits/chosen": -3.9251208305358887, + "logits/rejected": -3.854796886444092, + "logps/chosen": -204.34645080566406, + "logps/rejected": -273.2313232421875, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7787759304046631, + "rewards/margins": 1.3681988716125488, + "rewards/rejected": -2.146974563598633, + "step": 2060 + }, + { + "epoch": 0.24, + "learning_rate": 2.322252136251902e-07, + "logits/chosen": -2.9845826625823975, + "logits/rejected": -3.1362674236297607, + "logps/chosen": -179.7996826171875, + "logps/rejected": -229.71188354492188, + "loss": 0.4382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14942818880081177, + "rewards/margins": 0.7847808003425598, + "rewards/rejected": -0.9342089295387268, + "step": 2061 + }, + { + "epoch": 0.24, + "learning_rate": 2.3219009715556596e-07, + "logits/chosen": -3.1472270488739014, + "logits/rejected": -3.2027831077575684, + "logps/chosen": -250.66317749023438, + "logps/rejected": -249.25238037109375, + "loss": 0.5574, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3986305892467499, + "rewards/margins": 0.97455233335495, + "rewards/rejected": -0.5759217143058777, + "step": 2062 + }, + { + "epoch": 0.24, + "learning_rate": 2.321549806859417e-07, + "logits/chosen": -3.1092135906219482, + "logits/rejected": -2.8321921825408936, + "logps/chosen": -374.6018371582031, + "logps/rejected": -338.23846435546875, + "loss": 0.2209, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2629488706588745, + "rewards/margins": 1.969550371170044, + "rewards/rejected": -1.706601619720459, + "step": 2063 + }, + { + "epoch": 0.24, + "learning_rate": 2.3211986421631744e-07, + "logits/chosen": -2.9038593769073486, + "logits/rejected": -3.2201952934265137, + "logps/chosen": -224.44158935546875, + "logps/rejected": -218.9890594482422, + "loss": 0.6334, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05779045820236206, + "rewards/margins": 0.5659310817718506, + "rewards/rejected": -0.6237214803695679, + "step": 2064 + }, + { + "epoch": 0.24, + "learning_rate": 2.320847477466932e-07, + "logits/chosen": -2.447350025177002, + "logits/rejected": -2.329530715942383, + "logps/chosen": -241.9302520751953, + "logps/rejected": -236.34881591796875, + "loss": 0.6439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.045799076557159424, + "rewards/margins": 0.5019633769989014, + "rewards/rejected": -0.5477623343467712, + "step": 2065 + }, + { + "epoch": 0.24, + "learning_rate": 2.3204963127706893e-07, + "logits/chosen": -3.6228723526000977, + "logits/rejected": -3.3741965293884277, + "logps/chosen": -271.9809265136719, + "logps/rejected": -237.58929443359375, + "loss": 0.2628, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09044289588928223, + "rewards/margins": 2.3310208320617676, + "rewards/rejected": -2.2405776977539062, + "step": 2066 + }, + { + "epoch": 0.24, + "learning_rate": 2.3201451480744468e-07, + "logits/chosen": -3.1667304039001465, + "logits/rejected": -3.262972354888916, + "logps/chosen": -123.07827758789062, + "logps/rejected": -151.53941345214844, + "loss": 0.2892, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3378388583660126, + "rewards/margins": 1.7564888000488281, + "rewards/rejected": -1.4186499118804932, + "step": 2067 + }, + { + "epoch": 0.24, + "learning_rate": 2.3197939833782043e-07, + "logits/chosen": -3.7089505195617676, + "logits/rejected": -3.5454938411712646, + "logps/chosen": -137.44082641601562, + "logps/rejected": -157.4412841796875, + "loss": 0.6137, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41431868076324463, + "rewards/margins": 0.894723117351532, + "rewards/rejected": -1.3090417385101318, + "step": 2068 + }, + { + "epoch": 0.24, + "learning_rate": 2.3194428186819616e-07, + "logits/chosen": -3.1241326332092285, + "logits/rejected": -2.979635238647461, + "logps/chosen": -340.079833984375, + "logps/rejected": -253.205810546875, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24286359548568726, + "rewards/margins": 1.4917945861816406, + "rewards/rejected": -1.2489310503005981, + "step": 2069 + }, + { + "epoch": 0.24, + "learning_rate": 2.3190916539857191e-07, + "logits/chosen": -3.3570804595947266, + "logits/rejected": -3.229564905166626, + "logps/chosen": -162.0731201171875, + "logps/rejected": -130.47509765625, + "loss": 0.3454, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2674616575241089, + "rewards/margins": 1.1245052814483643, + "rewards/rejected": -0.8570435047149658, + "step": 2070 + }, + { + "epoch": 0.24, + "learning_rate": 2.3187404892894764e-07, + "logits/chosen": -2.6870970726013184, + "logits/rejected": -2.5410425662994385, + "logps/chosen": -357.74017333984375, + "logps/rejected": -200.80682373046875, + "loss": 0.7472, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6361706256866455, + "rewards/margins": 0.39179790019989014, + "rewards/rejected": -1.0279685258865356, + "step": 2071 + }, + { + "epoch": 0.24, + "learning_rate": 2.3183893245932342e-07, + "logits/chosen": -3.490499258041382, + "logits/rejected": -2.9236066341400146, + "logps/chosen": -211.46511840820312, + "logps/rejected": -208.77003479003906, + "loss": 0.5209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09810754656791687, + "rewards/margins": 0.593243420124054, + "rewards/rejected": -0.691351056098938, + "step": 2072 + }, + { + "epoch": 0.24, + "learning_rate": 2.3180381598969918e-07, + "logits/chosen": -2.4475386142730713, + "logits/rejected": -2.4813172817230225, + "logps/chosen": -239.48947143554688, + "logps/rejected": -199.1544189453125, + "loss": 0.4764, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42903584241867065, + "rewards/margins": 1.0240083932876587, + "rewards/rejected": -1.4530441761016846, + "step": 2073 + }, + { + "epoch": 0.24, + "learning_rate": 2.317686995200749e-07, + "logits/chosen": -4.11273717880249, + "logits/rejected": -3.5686588287353516, + "logps/chosen": -444.4793701171875, + "logps/rejected": -270.8744201660156, + "loss": 0.3164, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11237752437591553, + "rewards/margins": 1.9849063158035278, + "rewards/rejected": -1.8725286722183228, + "step": 2074 + }, + { + "epoch": 0.24, + "learning_rate": 2.3173358305045066e-07, + "logits/chosen": -3.6211235523223877, + "logits/rejected": -3.3090083599090576, + "logps/chosen": -255.5709228515625, + "logps/rejected": -201.60833740234375, + "loss": 0.441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05448861047625542, + "rewards/margins": 0.8456190228462219, + "rewards/rejected": -0.9001076221466064, + "step": 2075 + }, + { + "epoch": 0.24, + "learning_rate": 2.316984665808264e-07, + "logits/chosen": -2.211095094680786, + "logits/rejected": -2.259305953979492, + "logps/chosen": -380.33465576171875, + "logps/rejected": -292.9688720703125, + "loss": 0.3006, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6065548658370972, + "rewards/margins": 2.546454906463623, + "rewards/rejected": -1.9399000406265259, + "step": 2076 + }, + { + "epoch": 0.24, + "learning_rate": 2.3166335011120214e-07, + "logits/chosen": -3.1371800899505615, + "logits/rejected": -3.3831777572631836, + "logps/chosen": -372.75347900390625, + "logps/rejected": -202.69241333007812, + "loss": 0.3879, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14730143547058105, + "rewards/margins": 1.5739052295684814, + "rewards/rejected": -1.42660391330719, + "step": 2077 + }, + { + "epoch": 0.24, + "learning_rate": 2.316282336415779e-07, + "logits/chosen": -3.3189592361450195, + "logits/rejected": -3.094216823577881, + "logps/chosen": -293.01885986328125, + "logps/rejected": -193.8881072998047, + "loss": 0.3246, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027197472751140594, + "rewards/margins": 1.5831927061080933, + "rewards/rejected": -1.555995225906372, + "step": 2078 + }, + { + "epoch": 0.24, + "learning_rate": 2.3159311717195362e-07, + "logits/chosen": -2.4798660278320312, + "logits/rejected": -2.34287166595459, + "logps/chosen": -218.00634765625, + "logps/rejected": -256.8708801269531, + "loss": 0.2574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16102617979049683, + "rewards/margins": 1.4642114639282227, + "rewards/rejected": -1.303185224533081, + "step": 2079 + }, + { + "epoch": 0.24, + "learning_rate": 2.3155800070232937e-07, + "logits/chosen": -2.9939517974853516, + "logits/rejected": -3.3702993392944336, + "logps/chosen": -301.1270751953125, + "logps/rejected": -247.39715576171875, + "loss": 0.1962, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16677403450012207, + "rewards/margins": 3.0479025840759277, + "rewards/rejected": -2.8811285495758057, + "step": 2080 + }, + { + "epoch": 0.24, + "learning_rate": 2.3152288423270513e-07, + "logits/chosen": -3.201848030090332, + "logits/rejected": -3.272211790084839, + "logps/chosen": -164.5875244140625, + "logps/rejected": -234.74484252929688, + "loss": 0.4146, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17439353466033936, + "rewards/margins": 1.2795435190200806, + "rewards/rejected": -1.45393705368042, + "step": 2081 + }, + { + "epoch": 0.24, + "learning_rate": 2.3148776776308085e-07, + "logits/chosen": -3.4412922859191895, + "logits/rejected": -3.293165922164917, + "logps/chosen": -274.3909912109375, + "logps/rejected": -241.2705841064453, + "loss": 0.3418, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5082772970199585, + "rewards/margins": 2.3811819553375244, + "rewards/rejected": -1.8729045391082764, + "step": 2082 + }, + { + "epoch": 0.24, + "learning_rate": 2.3145265129345664e-07, + "logits/chosen": -2.939328670501709, + "logits/rejected": -2.76759672164917, + "logps/chosen": -234.68016052246094, + "logps/rejected": -265.6005859375, + "loss": 0.7198, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1831745207309723, + "rewards/margins": 0.11841154098510742, + "rewards/rejected": -0.3015860915184021, + "step": 2083 + }, + { + "epoch": 0.24, + "learning_rate": 2.314175348238324e-07, + "logits/chosen": -3.9192757606506348, + "logits/rejected": -3.9525022506713867, + "logps/chosen": -220.15306091308594, + "logps/rejected": -256.71136474609375, + "loss": 0.6264, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3700861930847168, + "rewards/margins": 1.2942720651626587, + "rewards/rejected": -1.6643580198287964, + "step": 2084 + }, + { + "epoch": 0.24, + "learning_rate": 2.3138241835420812e-07, + "logits/chosen": -2.8299174308776855, + "logits/rejected": -3.205439567565918, + "logps/chosen": -241.68505859375, + "logps/rejected": -176.82130432128906, + "loss": 0.2818, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5955206155776978, + "rewards/margins": 1.596709966659546, + "rewards/rejected": -1.0011893510818481, + "step": 2085 + }, + { + "epoch": 0.24, + "learning_rate": 2.3134730188458387e-07, + "logits/chosen": -3.220149040222168, + "logits/rejected": -3.125932216644287, + "logps/chosen": -458.686767578125, + "logps/rejected": -328.49542236328125, + "loss": 0.412, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08934766054153442, + "rewards/margins": 1.6326868534088135, + "rewards/rejected": -1.5433392524719238, + "step": 2086 + }, + { + "epoch": 0.24, + "learning_rate": 2.313121854149596e-07, + "logits/chosen": -2.034285306930542, + "logits/rejected": -2.003303050994873, + "logps/chosen": -530.0022583007812, + "logps/rejected": -336.278564453125, + "loss": 0.3998, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5011769533157349, + "rewards/margins": 1.1516534090042114, + "rewards/rejected": -0.650476336479187, + "step": 2087 + }, + { + "epoch": 0.24, + "learning_rate": 2.3127706894533535e-07, + "logits/chosen": -2.691768169403076, + "logits/rejected": -2.978412628173828, + "logps/chosen": -214.77301025390625, + "logps/rejected": -369.90252685546875, + "loss": 0.2466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3249848484992981, + "rewards/margins": 2.362424850463867, + "rewards/rejected": -2.0374398231506348, + "step": 2088 + }, + { + "epoch": 0.24, + "learning_rate": 2.312419524757111e-07, + "logits/chosen": -3.6816093921661377, + "logits/rejected": -3.253176689147949, + "logps/chosen": -359.3407897949219, + "logps/rejected": -330.1440124511719, + "loss": 0.1486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015503372997045517, + "rewards/margins": 3.212719440460205, + "rewards/rejected": -3.197216033935547, + "step": 2089 + }, + { + "epoch": 0.24, + "learning_rate": 2.3120683600608683e-07, + "logits/chosen": -3.1668877601623535, + "logits/rejected": -3.0535621643066406, + "logps/chosen": -400.6658020019531, + "logps/rejected": -362.7208251953125, + "loss": 0.6347, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6740178465843201, + "rewards/margins": 0.5710243582725525, + "rewards/rejected": -1.245042085647583, + "step": 2090 + }, + { + "epoch": 0.24, + "learning_rate": 2.3117171953646259e-07, + "logits/chosen": -2.8681182861328125, + "logits/rejected": -2.6557397842407227, + "logps/chosen": -106.12987518310547, + "logps/rejected": -183.69265747070312, + "loss": 0.5506, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2046343982219696, + "rewards/margins": 1.1567753553390503, + "rewards/rejected": -0.9521410465240479, + "step": 2091 + }, + { + "epoch": 0.24, + "learning_rate": 2.3113660306683831e-07, + "logits/chosen": -3.3728270530700684, + "logits/rejected": -3.6698052883148193, + "logps/chosen": -379.8182678222656, + "logps/rejected": -254.52731323242188, + "loss": 0.3957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03467864543199539, + "rewards/margins": 1.7818523645401, + "rewards/rejected": -1.8165309429168701, + "step": 2092 + }, + { + "epoch": 0.24, + "learning_rate": 2.3110148659721407e-07, + "logits/chosen": -2.8968770503997803, + "logits/rejected": -2.829472303390503, + "logps/chosen": -247.21621704101562, + "logps/rejected": -295.7564697265625, + "loss": 0.5919, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14039793610572815, + "rewards/margins": 0.5136997699737549, + "rewards/rejected": -0.3733018934726715, + "step": 2093 + }, + { + "epoch": 0.24, + "learning_rate": 2.3106637012758985e-07, + "logits/chosen": -3.104576587677002, + "logits/rejected": -3.4068703651428223, + "logps/chosen": -176.1011199951172, + "logps/rejected": -160.91513061523438, + "loss": 0.5162, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2899596393108368, + "rewards/margins": 0.9834280014038086, + "rewards/rejected": -1.2733876705169678, + "step": 2094 + }, + { + "epoch": 0.24, + "learning_rate": 2.3103125365796558e-07, + "logits/chosen": -3.8687262535095215, + "logits/rejected": -3.768993854522705, + "logps/chosen": -199.6669921875, + "logps/rejected": -169.57044982910156, + "loss": 0.4006, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3814202845096588, + "rewards/margins": 1.068402886390686, + "rewards/rejected": -0.6869826316833496, + "step": 2095 + }, + { + "epoch": 0.24, + "learning_rate": 2.3099613718834133e-07, + "logits/chosen": -2.8024728298187256, + "logits/rejected": -2.988694906234741, + "logps/chosen": -268.6394958496094, + "logps/rejected": -374.6436462402344, + "loss": 0.8894, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8080625534057617, + "rewards/margins": 0.7284382581710815, + "rewards/rejected": -1.5365009307861328, + "step": 2096 + }, + { + "epoch": 0.24, + "learning_rate": 2.3096102071871708e-07, + "logits/chosen": -2.6128599643707275, + "logits/rejected": -2.492452383041382, + "logps/chosen": -300.01947021484375, + "logps/rejected": -275.435791015625, + "loss": 0.4176, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17781177163124084, + "rewards/margins": 1.2151402235031128, + "rewards/rejected": -1.0373283624649048, + "step": 2097 + }, + { + "epoch": 0.24, + "learning_rate": 2.309259042490928e-07, + "logits/chosen": -3.0148143768310547, + "logits/rejected": -3.1258163452148438, + "logps/chosen": -164.94236755371094, + "logps/rejected": -166.98150634765625, + "loss": 0.5652, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29530441761016846, + "rewards/margins": 0.7061986923217773, + "rewards/rejected": -0.4108943045139313, + "step": 2098 + }, + { + "epoch": 0.24, + "learning_rate": 2.3089078777946856e-07, + "logits/chosen": -3.3342514038085938, + "logits/rejected": -3.188351631164551, + "logps/chosen": -161.34930419921875, + "logps/rejected": -229.7317657470703, + "loss": 0.1732, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24056200683116913, + "rewards/margins": 2.5383083820343018, + "rewards/rejected": -2.297746419906616, + "step": 2099 + }, + { + "epoch": 0.24, + "learning_rate": 2.308556713098443e-07, + "logits/chosen": -2.6050167083740234, + "logits/rejected": -2.6193628311157227, + "logps/chosen": -354.6103515625, + "logps/rejected": -367.6568298339844, + "loss": 0.5786, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20415526628494263, + "rewards/margins": 0.8980470895767212, + "rewards/rejected": -0.6938918828964233, + "step": 2100 + }, + { + "epoch": 0.24, + "learning_rate": 2.3082055484022005e-07, + "logits/chosen": -2.9589338302612305, + "logits/rejected": -2.891653537750244, + "logps/chosen": -156.6099853515625, + "logps/rejected": -174.7583770751953, + "loss": 0.5957, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2888055145740509, + "rewards/margins": 0.4744884669780731, + "rewards/rejected": -0.7632939219474792, + "step": 2101 + }, + { + "epoch": 0.24, + "learning_rate": 2.307854383705958e-07, + "logits/chosen": -2.8973965644836426, + "logits/rejected": -2.8866379261016846, + "logps/chosen": -188.91928100585938, + "logps/rejected": -225.80870056152344, + "loss": 0.4731, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06485879421234131, + "rewards/margins": 1.3465664386749268, + "rewards/rejected": -1.411425232887268, + "step": 2102 + }, + { + "epoch": 0.24, + "learning_rate": 2.3075032190097153e-07, + "logits/chosen": -3.013587236404419, + "logits/rejected": -2.985133171081543, + "logps/chosen": -226.9334259033203, + "logps/rejected": -255.89234924316406, + "loss": 0.263, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1455492079257965, + "rewards/margins": 2.1877567768096924, + "rewards/rejected": -2.333305835723877, + "step": 2103 + }, + { + "epoch": 0.24, + "learning_rate": 2.3071520543134728e-07, + "logits/chosen": -3.0888326168060303, + "logits/rejected": -3.1155600547790527, + "logps/chosen": -211.76461791992188, + "logps/rejected": -248.92959594726562, + "loss": 0.3834, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30154404044151306, + "rewards/margins": 1.7661001682281494, + "rewards/rejected": -1.4645562171936035, + "step": 2104 + }, + { + "epoch": 0.24, + "learning_rate": 2.3068008896172306e-07, + "logits/chosen": -3.4710254669189453, + "logits/rejected": -3.2883238792419434, + "logps/chosen": -241.4379425048828, + "logps/rejected": -137.45843505859375, + "loss": 0.5681, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36827781796455383, + "rewards/margins": 1.1108441352844238, + "rewards/rejected": -1.4791220426559448, + "step": 2105 + }, + { + "epoch": 0.24, + "learning_rate": 2.306449724920988e-07, + "logits/chosen": -3.1294000148773193, + "logits/rejected": -3.027726650238037, + "logps/chosen": -258.2811279296875, + "logps/rejected": -280.6197204589844, + "loss": 0.6973, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2621428072452545, + "rewards/margins": 0.6313274502754211, + "rewards/rejected": -0.893470287322998, + "step": 2106 + }, + { + "epoch": 0.24, + "learning_rate": 2.3060985602247454e-07, + "logits/chosen": -3.519259452819824, + "logits/rejected": -3.564070701599121, + "logps/chosen": -368.4876403808594, + "logps/rejected": -262.3271789550781, + "loss": 0.3158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15722541511058807, + "rewards/margins": 1.3306314945220947, + "rewards/rejected": -1.1734061241149902, + "step": 2107 + }, + { + "epoch": 0.24, + "learning_rate": 2.3057473955285027e-07, + "logits/chosen": -2.4241862297058105, + "logits/rejected": -2.557363510131836, + "logps/chosen": -350.5266418457031, + "logps/rejected": -244.41880798339844, + "loss": 0.7731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1726996898651123, + "rewards/margins": 0.4294752776622772, + "rewards/rejected": -0.6021749377250671, + "step": 2108 + }, + { + "epoch": 0.24, + "learning_rate": 2.3053962308322602e-07, + "logits/chosen": -3.217029333114624, + "logits/rejected": -2.760246992111206, + "logps/chosen": -170.1758575439453, + "logps/rejected": -248.49200439453125, + "loss": 0.5335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15970605611801147, + "rewards/margins": 2.787921905517578, + "rewards/rejected": -2.9476280212402344, + "step": 2109 + }, + { + "epoch": 0.24, + "learning_rate": 2.3050450661360178e-07, + "logits/chosen": -2.86850643157959, + "logits/rejected": -2.8641371726989746, + "logps/chosen": -341.783447265625, + "logps/rejected": -363.9459228515625, + "loss": 0.6159, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4317886233329773, + "rewards/margins": 1.6602858304977417, + "rewards/rejected": -2.092074394226074, + "step": 2110 + }, + { + "epoch": 0.24, + "learning_rate": 2.304693901439775e-07, + "logits/chosen": -2.5138039588928223, + "logits/rejected": -2.5743088722229004, + "logps/chosen": -417.4977722167969, + "logps/rejected": -359.70416259765625, + "loss": 0.393, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03080321103334427, + "rewards/margins": 1.373356580734253, + "rewards/rejected": -1.3425533771514893, + "step": 2111 + }, + { + "epoch": 0.24, + "learning_rate": 2.3043427367435326e-07, + "logits/chosen": -2.5206520557403564, + "logits/rejected": -2.533428192138672, + "logps/chosen": -369.29827880859375, + "logps/rejected": -283.76068115234375, + "loss": 0.1784, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7609325051307678, + "rewards/margins": 2.096494674682617, + "rewards/rejected": -1.3355623483657837, + "step": 2112 + }, + { + "epoch": 0.24, + "learning_rate": 2.30399157204729e-07, + "logits/chosen": -3.4599108695983887, + "logits/rejected": -3.4545326232910156, + "logps/chosen": -190.36834716796875, + "logps/rejected": -267.7120361328125, + "loss": 0.2081, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46892499923706055, + "rewards/margins": 2.933673858642578, + "rewards/rejected": -3.4025988578796387, + "step": 2113 + }, + { + "epoch": 0.24, + "learning_rate": 2.3036404073510474e-07, + "logits/chosen": -3.257903814315796, + "logits/rejected": -3.3460612297058105, + "logps/chosen": -206.3463134765625, + "logps/rejected": -219.52035522460938, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08126439899206161, + "rewards/margins": 0.7224301099777222, + "rewards/rejected": -0.6411657333374023, + "step": 2114 + }, + { + "epoch": 0.24, + "learning_rate": 2.303289242654805e-07, + "logits/chosen": -3.4247398376464844, + "logits/rejected": -3.5034990310668945, + "logps/chosen": -135.21697998046875, + "logps/rejected": -168.78558349609375, + "loss": 0.7413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40579739212989807, + "rewards/margins": 2.020038604736328, + "rewards/rejected": -2.4258358478546143, + "step": 2115 + }, + { + "epoch": 0.24, + "learning_rate": 2.3029380779585622e-07, + "logits/chosen": -3.3096399307250977, + "logits/rejected": -3.077197551727295, + "logps/chosen": -410.85015869140625, + "logps/rejected": -367.3661804199219, + "loss": 0.2309, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39888066053390503, + "rewards/margins": 1.8421542644500732, + "rewards/rejected": -1.4432735443115234, + "step": 2116 + }, + { + "epoch": 0.24, + "learning_rate": 2.30258691326232e-07, + "logits/chosen": -3.084416389465332, + "logits/rejected": -3.2645788192749023, + "logps/chosen": -209.12745666503906, + "logps/rejected": -261.8200988769531, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21082201600074768, + "rewards/margins": 1.3201878070831299, + "rewards/rejected": -1.5310097932815552, + "step": 2117 + }, + { + "epoch": 0.24, + "learning_rate": 2.3022357485660776e-07, + "logits/chosen": -3.0738563537597656, + "logits/rejected": -2.97420072555542, + "logps/chosen": -150.8150634765625, + "logps/rejected": -225.51405334472656, + "loss": 0.3607, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.034167468547821045, + "rewards/margins": 1.929117202758789, + "rewards/rejected": -1.9632846117019653, + "step": 2118 + }, + { + "epoch": 0.24, + "learning_rate": 2.3018845838698348e-07, + "logits/chosen": -2.6215896606445312, + "logits/rejected": -2.531309127807617, + "logps/chosen": -325.0052490234375, + "logps/rejected": -288.5998840332031, + "loss": 0.3867, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17428207397460938, + "rewards/margins": 1.0078134536743164, + "rewards/rejected": -0.8335314393043518, + "step": 2119 + }, + { + "epoch": 0.24, + "learning_rate": 2.3015334191735924e-07, + "logits/chosen": -3.269624710083008, + "logits/rejected": -3.0596323013305664, + "logps/chosen": -446.0211486816406, + "logps/rejected": -192.98489379882812, + "loss": 0.3913, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5741363167762756, + "rewards/margins": 1.5559260845184326, + "rewards/rejected": -0.9817897081375122, + "step": 2120 + }, + { + "epoch": 0.24, + "learning_rate": 2.30118225447735e-07, + "logits/chosen": -3.132638454437256, + "logits/rejected": -3.095646619796753, + "logps/chosen": -306.4154968261719, + "logps/rejected": -255.46347045898438, + "loss": 0.5111, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4185672402381897, + "rewards/margins": 1.0858190059661865, + "rewards/rejected": -1.5043861865997314, + "step": 2121 + }, + { + "epoch": 0.24, + "learning_rate": 2.3008310897811072e-07, + "logits/chosen": -3.2063019275665283, + "logits/rejected": -3.1988487243652344, + "logps/chosen": -432.0948486328125, + "logps/rejected": -280.7615966796875, + "loss": 0.4917, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06945712864398956, + "rewards/margins": 1.3280484676361084, + "rewards/rejected": -1.2585911750793457, + "step": 2122 + }, + { + "epoch": 0.24, + "learning_rate": 2.3004799250848647e-07, + "logits/chosen": -2.981205463409424, + "logits/rejected": -3.1691946983337402, + "logps/chosen": -139.61550903320312, + "logps/rejected": -225.16111755371094, + "loss": 0.3533, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32160258293151855, + "rewards/margins": 1.6840468645095825, + "rewards/rejected": -1.362444281578064, + "step": 2123 + }, + { + "epoch": 0.24, + "learning_rate": 2.300128760388622e-07, + "logits/chosen": -3.129077911376953, + "logits/rejected": -3.0811550617218018, + "logps/chosen": -416.6397705078125, + "logps/rejected": -304.04876708984375, + "loss": 0.4072, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4374493658542633, + "rewards/margins": 2.1136257648468018, + "rewards/rejected": -1.6761763095855713, + "step": 2124 + }, + { + "epoch": 0.24, + "learning_rate": 2.2997775956923795e-07, + "logits/chosen": -3.1177971363067627, + "logits/rejected": -3.068120002746582, + "logps/chosen": -261.89117431640625, + "logps/rejected": -259.0257568359375, + "loss": 0.3158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17802907526493073, + "rewards/margins": 1.4171087741851807, + "rewards/rejected": -1.5951377153396606, + "step": 2125 + }, + { + "epoch": 0.25, + "learning_rate": 2.2994264309961373e-07, + "logits/chosen": -3.05033802986145, + "logits/rejected": -2.8604319095611572, + "logps/chosen": -484.1619567871094, + "logps/rejected": -488.71258544921875, + "loss": 0.2581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07930582761764526, + "rewards/margins": 2.209566831588745, + "rewards/rejected": -2.288872718811035, + "step": 2126 + }, + { + "epoch": 0.25, + "learning_rate": 2.2990752662998943e-07, + "logits/chosen": -3.3130016326904297, + "logits/rejected": -3.182955741882324, + "logps/chosen": -242.98472595214844, + "logps/rejected": -311.72210693359375, + "loss": 0.5221, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08628827333450317, + "rewards/margins": 1.0533286333084106, + "rewards/rejected": -0.9670404195785522, + "step": 2127 + }, + { + "epoch": 0.25, + "learning_rate": 2.2987241016036521e-07, + "logits/chosen": -2.757323741912842, + "logits/rejected": -2.734171152114868, + "logps/chosen": -228.89212036132812, + "logps/rejected": -325.0506896972656, + "loss": 0.45, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12936830520629883, + "rewards/margins": 0.9810525178909302, + "rewards/rejected": -0.8516842126846313, + "step": 2128 + }, + { + "epoch": 0.25, + "learning_rate": 2.2983729369074097e-07, + "logits/chosen": -2.6155567169189453, + "logits/rejected": -2.8236327171325684, + "logps/chosen": -230.96731567382812, + "logps/rejected": -174.28579711914062, + "loss": 0.3177, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15025682747364044, + "rewards/margins": 1.6429872512817383, + "rewards/rejected": -1.4927302598953247, + "step": 2129 + }, + { + "epoch": 0.25, + "learning_rate": 2.298021772211167e-07, + "logits/chosen": -2.4323248863220215, + "logits/rejected": -2.529294013977051, + "logps/chosen": -558.7015991210938, + "logps/rejected": -296.4462890625, + "loss": 0.3067, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6510332822799683, + "rewards/margins": 1.8151662349700928, + "rewards/rejected": -1.164132833480835, + "step": 2130 + }, + { + "epoch": 0.25, + "learning_rate": 2.2976706075149245e-07, + "logits/chosen": -3.9073331356048584, + "logits/rejected": -4.021336555480957, + "logps/chosen": -196.57164001464844, + "logps/rejected": -162.32533264160156, + "loss": 0.5155, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33393171429634094, + "rewards/margins": 1.0446873903274536, + "rewards/rejected": -0.7107555866241455, + "step": 2131 + }, + { + "epoch": 0.25, + "learning_rate": 2.2973194428186818e-07, + "logits/chosen": -3.0298361778259277, + "logits/rejected": -3.081550121307373, + "logps/chosen": -319.3509826660156, + "logps/rejected": -316.0582275390625, + "loss": 0.5098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3999989628791809, + "rewards/margins": 0.7974104881286621, + "rewards/rejected": -1.1974093914031982, + "step": 2132 + }, + { + "epoch": 0.25, + "learning_rate": 2.2969682781224393e-07, + "logits/chosen": -2.8984832763671875, + "logits/rejected": -2.7823643684387207, + "logps/chosen": -254.94943237304688, + "logps/rejected": -154.06027221679688, + "loss": 0.8982, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.48639917373657227, + "rewards/margins": -0.02118399739265442, + "rewards/rejected": -0.46521520614624023, + "step": 2133 + }, + { + "epoch": 0.25, + "learning_rate": 2.2966171134261968e-07, + "logits/chosen": -2.907137393951416, + "logits/rejected": -2.778745651245117, + "logps/chosen": -366.7825012207031, + "logps/rejected": -330.40545654296875, + "loss": 0.6041, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25338396430015564, + "rewards/margins": 1.3865423202514648, + "rewards/rejected": -1.1331582069396973, + "step": 2134 + }, + { + "epoch": 0.25, + "learning_rate": 2.296265948729954e-07, + "logits/chosen": -2.639051675796509, + "logits/rejected": -2.8714752197265625, + "logps/chosen": -328.53765869140625, + "logps/rejected": -214.4018096923828, + "loss": 0.7175, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2575297951698303, + "rewards/margins": 0.008671015501022339, + "rewards/rejected": -0.26620084047317505, + "step": 2135 + }, + { + "epoch": 0.25, + "learning_rate": 2.2959147840337117e-07, + "logits/chosen": -3.967879056930542, + "logits/rejected": -3.6781632900238037, + "logps/chosen": -294.2057189941406, + "logps/rejected": -185.2998046875, + "loss": 0.3645, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17556343972682953, + "rewards/margins": 1.1041561365127563, + "rewards/rejected": -0.9285928010940552, + "step": 2136 + }, + { + "epoch": 0.25, + "learning_rate": 2.295563619337469e-07, + "logits/chosen": -3.4967775344848633, + "logits/rejected": -3.530625343322754, + "logps/chosen": -292.8888244628906, + "logps/rejected": -278.8633728027344, + "loss": 0.3244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02556251734495163, + "rewards/margins": 1.678627610206604, + "rewards/rejected": -1.7041900157928467, + "step": 2137 + }, + { + "epoch": 0.25, + "learning_rate": 2.2952124546412265e-07, + "logits/chosen": -3.174499988555908, + "logits/rejected": -3.5422496795654297, + "logps/chosen": -97.04664611816406, + "logps/rejected": -201.9088592529297, + "loss": 0.498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26214560866355896, + "rewards/margins": 1.7778913974761963, + "rewards/rejected": -2.040037155151367, + "step": 2138 + }, + { + "epoch": 0.25, + "learning_rate": 2.2948612899449843e-07, + "logits/chosen": -3.057988405227661, + "logits/rejected": -2.7497944831848145, + "logps/chosen": -143.536865234375, + "logps/rejected": -136.38714599609375, + "loss": 0.3324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36133721470832825, + "rewards/margins": 1.2990679740905762, + "rewards/rejected": -1.6604052782058716, + "step": 2139 + }, + { + "epoch": 0.25, + "learning_rate": 2.2945101252487415e-07, + "logits/chosen": -3.433320999145508, + "logits/rejected": -3.2139768600463867, + "logps/chosen": -195.34268188476562, + "logps/rejected": -181.19691467285156, + "loss": 0.6035, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10047314316034317, + "rewards/margins": 0.6875261664390564, + "rewards/rejected": -0.7879993319511414, + "step": 2140 + }, + { + "epoch": 0.25, + "learning_rate": 2.294158960552499e-07, + "logits/chosen": -3.0975584983825684, + "logits/rejected": -2.981736660003662, + "logps/chosen": -435.87939453125, + "logps/rejected": -239.91470336914062, + "loss": 0.3697, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030903533101081848, + "rewards/margins": 1.337636947631836, + "rewards/rejected": -1.3685402870178223, + "step": 2141 + }, + { + "epoch": 0.25, + "learning_rate": 2.2938077958562566e-07, + "logits/chosen": -2.9738893508911133, + "logits/rejected": -2.558286190032959, + "logps/chosen": -214.34483337402344, + "logps/rejected": -250.37782287597656, + "loss": 0.5872, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6212382912635803, + "rewards/margins": 0.658962607383728, + "rewards/rejected": -1.2802008390426636, + "step": 2142 + }, + { + "epoch": 0.25, + "learning_rate": 2.293456631160014e-07, + "logits/chosen": -3.1387698650360107, + "logits/rejected": -3.114185333251953, + "logps/chosen": -390.3049621582031, + "logps/rejected": -325.4262390136719, + "loss": 0.492, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07930421084165573, + "rewards/margins": 1.013663649559021, + "rewards/rejected": -1.0929678678512573, + "step": 2143 + }, + { + "epoch": 0.25, + "learning_rate": 2.2931054664637714e-07, + "logits/chosen": -3.1458261013031006, + "logits/rejected": -3.1077966690063477, + "logps/chosen": -311.1947326660156, + "logps/rejected": -292.24249267578125, + "loss": 0.2497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018325001001358032, + "rewards/margins": 2.187077760696411, + "rewards/rejected": -2.2054026126861572, + "step": 2144 + }, + { + "epoch": 0.25, + "learning_rate": 2.2927543017675287e-07, + "logits/chosen": -2.2506399154663086, + "logits/rejected": -2.366401433944702, + "logps/chosen": -443.18243408203125, + "logps/rejected": -272.187744140625, + "loss": 0.3243, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.285902202129364, + "rewards/margins": 1.2258096933364868, + "rewards/rejected": -0.9399075508117676, + "step": 2145 + }, + { + "epoch": 0.25, + "learning_rate": 2.2924031370712863e-07, + "logits/chosen": -3.9018802642822266, + "logits/rejected": -3.819483995437622, + "logps/chosen": -310.84307861328125, + "logps/rejected": -324.6479187011719, + "loss": 0.1897, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6406594514846802, + "rewards/margins": 3.827509880065918, + "rewards/rejected": -3.1868505477905273, + "step": 2146 + }, + { + "epoch": 0.25, + "learning_rate": 2.2920519723750438e-07, + "logits/chosen": -3.35387921333313, + "logits/rejected": -3.498723268508911, + "logps/chosen": -264.9539794921875, + "logps/rejected": -240.6407470703125, + "loss": 0.28, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1976519525051117, + "rewards/margins": 1.7366485595703125, + "rewards/rejected": -1.934300422668457, + "step": 2147 + }, + { + "epoch": 0.25, + "learning_rate": 2.291700807678801e-07, + "logits/chosen": -2.9523720741271973, + "logits/rejected": -3.0167789459228516, + "logps/chosen": -221.262939453125, + "logps/rejected": -213.5052490234375, + "loss": 0.3951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32022199034690857, + "rewards/margins": 1.2957648038864136, + "rewards/rejected": -1.6159868240356445, + "step": 2148 + }, + { + "epoch": 0.25, + "learning_rate": 2.2913496429825586e-07, + "logits/chosen": -3.675983428955078, + "logits/rejected": -3.4586997032165527, + "logps/chosen": -255.3311767578125, + "logps/rejected": -206.3752899169922, + "loss": 0.8866, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8855583667755127, + "rewards/margins": -0.09528318047523499, + "rewards/rejected": -0.7902751564979553, + "step": 2149 + }, + { + "epoch": 0.25, + "learning_rate": 2.2909984782863164e-07, + "logits/chosen": -3.1246275901794434, + "logits/rejected": -2.9770803451538086, + "logps/chosen": -306.7972106933594, + "logps/rejected": -369.6507873535156, + "loss": 0.2454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05419187992811203, + "rewards/margins": 1.6424165964126587, + "rewards/rejected": -1.6966084241867065, + "step": 2150 + }, + { + "epoch": 0.25, + "learning_rate": 2.2906473135900737e-07, + "logits/chosen": -2.799499988555908, + "logits/rejected": -2.881180763244629, + "logps/chosen": -280.1490478515625, + "logps/rejected": -216.60858154296875, + "loss": 0.5795, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.058351997286081314, + "rewards/margins": 0.6957638263702393, + "rewards/rejected": -0.754115879535675, + "step": 2151 + }, + { + "epoch": 0.25, + "learning_rate": 2.2902961488938312e-07, + "logits/chosen": -3.0426626205444336, + "logits/rejected": -2.8782923221588135, + "logps/chosen": -199.07327270507812, + "logps/rejected": -176.74562072753906, + "loss": 0.5265, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3715325593948364, + "rewards/margins": 0.44901910424232483, + "rewards/rejected": -0.8205517530441284, + "step": 2152 + }, + { + "epoch": 0.25, + "learning_rate": 2.2899449841975885e-07, + "logits/chosen": -3.3847169876098633, + "logits/rejected": -3.440019130706787, + "logps/chosen": -199.2718048095703, + "logps/rejected": -242.8067626953125, + "loss": 0.1795, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6725295186042786, + "rewards/margins": 2.372034788131714, + "rewards/rejected": -1.69950532913208, + "step": 2153 + }, + { + "epoch": 0.25, + "learning_rate": 2.289593819501346e-07, + "logits/chosen": -3.2121388912200928, + "logits/rejected": -3.1259593963623047, + "logps/chosen": -77.61213684082031, + "logps/rejected": -260.663818359375, + "loss": 0.2035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5566335916519165, + "rewards/margins": 2.4029579162597656, + "rewards/rejected": -1.8463245630264282, + "step": 2154 + }, + { + "epoch": 0.25, + "learning_rate": 2.2892426548051036e-07, + "logits/chosen": -3.8060905933380127, + "logits/rejected": -4.063305854797363, + "logps/chosen": -96.44733428955078, + "logps/rejected": -146.20587158203125, + "loss": 0.2959, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15789060294628143, + "rewards/margins": 1.3584789037704468, + "rewards/rejected": -1.200588345527649, + "step": 2155 + }, + { + "epoch": 0.25, + "learning_rate": 2.2888914901088608e-07, + "logits/chosen": -3.04556941986084, + "logits/rejected": -2.8953006267547607, + "logps/chosen": -322.14044189453125, + "logps/rejected": -291.58837890625, + "loss": 0.3588, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05088639259338379, + "rewards/margins": 1.2984827756881714, + "rewards/rejected": -1.2475963830947876, + "step": 2156 + }, + { + "epoch": 0.25, + "learning_rate": 2.2885403254126184e-07, + "logits/chosen": -2.5830042362213135, + "logits/rejected": -2.5951828956604004, + "logps/chosen": -190.1724853515625, + "logps/rejected": -248.76187133789062, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5667991042137146, + "rewards/margins": 0.7928729057312012, + "rewards/rejected": -1.359671950340271, + "step": 2157 + }, + { + "epoch": 0.25, + "learning_rate": 2.288189160716376e-07, + "logits/chosen": -3.824737071990967, + "logits/rejected": -3.4306282997131348, + "logps/chosen": -316.813232421875, + "logps/rejected": -212.4075469970703, + "loss": 0.5233, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1164102703332901, + "rewards/margins": 1.068872094154358, + "rewards/rejected": -0.952461838722229, + "step": 2158 + }, + { + "epoch": 0.25, + "learning_rate": 2.2878379960201332e-07, + "logits/chosen": -2.5841102600097656, + "logits/rejected": -2.3273773193359375, + "logps/chosen": -341.92291259765625, + "logps/rejected": -198.899658203125, + "loss": 0.5798, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15881577134132385, + "rewards/margins": 0.6533850431442261, + "rewards/rejected": -0.8122007846832275, + "step": 2159 + }, + { + "epoch": 0.25, + "learning_rate": 2.287486831323891e-07, + "logits/chosen": -2.1588921546936035, + "logits/rejected": -2.281071662902832, + "logps/chosen": -338.48046875, + "logps/rejected": -367.73541259765625, + "loss": 0.3756, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6069843769073486, + "rewards/margins": 1.4542444944381714, + "rewards/rejected": -0.8472601175308228, + "step": 2160 + }, + { + "epoch": 0.25, + "learning_rate": 2.287135666627648e-07, + "logits/chosen": -3.171647310256958, + "logits/rejected": -2.9164974689483643, + "logps/chosen": -246.63812255859375, + "logps/rejected": -244.59567260742188, + "loss": 0.4204, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4366302490234375, + "rewards/margins": 1.1840856075286865, + "rewards/rejected": -1.6207157373428345, + "step": 2161 + }, + { + "epoch": 0.25, + "learning_rate": 2.2867845019314058e-07, + "logits/chosen": -3.344777822494507, + "logits/rejected": -3.183298349380493, + "logps/chosen": -380.3085632324219, + "logps/rejected": -311.85821533203125, + "loss": 0.5028, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21874305605888367, + "rewards/margins": 1.9667285680770874, + "rewards/rejected": -2.185471534729004, + "step": 2162 + }, + { + "epoch": 0.25, + "learning_rate": 2.2864333372351633e-07, + "logits/chosen": -3.1040329933166504, + "logits/rejected": -2.866542100906372, + "logps/chosen": -274.49066162109375, + "logps/rejected": -221.03579711914062, + "loss": 0.4141, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.004150062799453735, + "rewards/margins": 1.3637328147888184, + "rewards/rejected": -1.359582781791687, + "step": 2163 + }, + { + "epoch": 0.25, + "learning_rate": 2.2860821725389206e-07, + "logits/chosen": -3.423794984817505, + "logits/rejected": -3.181846857070923, + "logps/chosen": -198.95223999023438, + "logps/rejected": -207.7142791748047, + "loss": 0.4869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19943265616893768, + "rewards/margins": 1.7241932153701782, + "rewards/rejected": -1.9236259460449219, + "step": 2164 + }, + { + "epoch": 0.25, + "learning_rate": 2.2857310078426782e-07, + "logits/chosen": -3.3602797985076904, + "logits/rejected": -3.5906894207000732, + "logps/chosen": -374.57989501953125, + "logps/rejected": -455.7204284667969, + "loss": 0.2646, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3703863322734833, + "rewards/margins": 1.5944361686706543, + "rewards/rejected": -1.2240499258041382, + "step": 2165 + }, + { + "epoch": 0.25, + "learning_rate": 2.2853798431464357e-07, + "logits/chosen": -3.4440345764160156, + "logits/rejected": -3.7901735305786133, + "logps/chosen": -335.39263916015625, + "logps/rejected": -258.5957946777344, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04834523797035217, + "rewards/margins": 2.580047845840454, + "rewards/rejected": -2.5317025184631348, + "step": 2166 + }, + { + "epoch": 0.25, + "learning_rate": 2.285028678450193e-07, + "logits/chosen": -2.877209424972534, + "logits/rejected": -3.0024654865264893, + "logps/chosen": -196.6121063232422, + "logps/rejected": -253.6957244873047, + "loss": 0.378, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32331985235214233, + "rewards/margins": 1.4587980508804321, + "rewards/rejected": -1.7821178436279297, + "step": 2167 + }, + { + "epoch": 0.25, + "learning_rate": 2.2846775137539505e-07, + "logits/chosen": -3.247687339782715, + "logits/rejected": -2.576735734939575, + "logps/chosen": -266.5221252441406, + "logps/rejected": -134.5208282470703, + "loss": 0.3538, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14202986657619476, + "rewards/margins": 1.1841765642166138, + "rewards/rejected": -1.0421466827392578, + "step": 2168 + }, + { + "epoch": 0.25, + "learning_rate": 2.2843263490577078e-07, + "logits/chosen": -3.1439571380615234, + "logits/rejected": -3.1819911003112793, + "logps/chosen": -293.97064208984375, + "logps/rejected": -315.087646484375, + "loss": 0.2163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5528956651687622, + "rewards/margins": 2.2351021766662598, + "rewards/rejected": -1.682206630706787, + "step": 2169 + }, + { + "epoch": 0.25, + "learning_rate": 2.2839751843614653e-07, + "logits/chosen": -3.42669677734375, + "logits/rejected": -3.2778358459472656, + "logps/chosen": -297.4933776855469, + "logps/rejected": -259.46990966796875, + "loss": 0.4767, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07736729085445404, + "rewards/margins": 1.8937163352966309, + "rewards/rejected": -1.8163491487503052, + "step": 2170 + }, + { + "epoch": 0.25, + "learning_rate": 2.283624019665223e-07, + "logits/chosen": -3.3399159908294678, + "logits/rejected": -2.733170509338379, + "logps/chosen": -364.28509521484375, + "logps/rejected": -177.2841796875, + "loss": 0.3621, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13456812500953674, + "rewards/margins": 1.6440664529800415, + "rewards/rejected": -1.5094983577728271, + "step": 2171 + }, + { + "epoch": 0.25, + "learning_rate": 2.2832728549689801e-07, + "logits/chosen": -3.493020534515381, + "logits/rejected": -3.351998805999756, + "logps/chosen": -238.95553588867188, + "logps/rejected": -194.61776733398438, + "loss": 0.7171, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10810390114784241, + "rewards/margins": 1.0185710191726685, + "rewards/rejected": -1.1266748905181885, + "step": 2172 + }, + { + "epoch": 0.25, + "learning_rate": 2.282921690272738e-07, + "logits/chosen": -3.4962539672851562, + "logits/rejected": -3.570619821548462, + "logps/chosen": -276.0008544921875, + "logps/rejected": -332.01263427734375, + "loss": 0.2421, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5366507768630981, + "rewards/margins": 1.9456170797348022, + "rewards/rejected": -1.408966302871704, + "step": 2173 + }, + { + "epoch": 0.25, + "learning_rate": 2.2825705255764955e-07, + "logits/chosen": -3.6297452449798584, + "logits/rejected": -3.4391212463378906, + "logps/chosen": -169.07568359375, + "logps/rejected": -153.61801147460938, + "loss": 0.4104, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10558347404003143, + "rewards/margins": 0.9694736003875732, + "rewards/rejected": -0.8638901710510254, + "step": 2174 + }, + { + "epoch": 0.25, + "learning_rate": 2.2822193608802528e-07, + "logits/chosen": -3.5033671855926514, + "logits/rejected": -3.094743251800537, + "logps/chosen": -287.56732177734375, + "logps/rejected": -252.23291015625, + "loss": 0.5742, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0036702752113342285, + "rewards/margins": 0.6941072940826416, + "rewards/rejected": -0.6904370188713074, + "step": 2175 + }, + { + "epoch": 0.25, + "learning_rate": 2.2818681961840103e-07, + "logits/chosen": -3.6756601333618164, + "logits/rejected": -3.496523380279541, + "logps/chosen": -224.6309356689453, + "logps/rejected": -249.92306518554688, + "loss": 0.409, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14091336727142334, + "rewards/margins": 1.437800407409668, + "rewards/rejected": -1.2968870401382446, + "step": 2176 + }, + { + "epoch": 0.25, + "learning_rate": 2.2815170314877676e-07, + "logits/chosen": -2.996642589569092, + "logits/rejected": -2.9101545810699463, + "logps/chosen": -384.51800537109375, + "logps/rejected": -260.63800048828125, + "loss": 0.5525, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05160914361476898, + "rewards/margins": 1.1540337800979614, + "rewards/rejected": -1.1024246215820312, + "step": 2177 + }, + { + "epoch": 0.25, + "learning_rate": 2.281165866791525e-07, + "logits/chosen": -2.4927492141723633, + "logits/rejected": -2.503787040710449, + "logps/chosen": -275.6834411621094, + "logps/rejected": -230.14968872070312, + "loss": 0.5949, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6375506520271301, + "rewards/margins": 0.836368203163147, + "rewards/rejected": -1.4739189147949219, + "step": 2178 + }, + { + "epoch": 0.25, + "learning_rate": 2.2808147020952826e-07, + "logits/chosen": -3.007434844970703, + "logits/rejected": -3.0320026874542236, + "logps/chosen": -236.13670349121094, + "logps/rejected": -146.66213989257812, + "loss": 0.6096, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15751954913139343, + "rewards/margins": 0.669103741645813, + "rewards/rejected": -0.826623260974884, + "step": 2179 + }, + { + "epoch": 0.25, + "learning_rate": 2.28046353739904e-07, + "logits/chosen": -3.16267728805542, + "logits/rejected": -3.1818904876708984, + "logps/chosen": -456.2713928222656, + "logps/rejected": -257.8637390136719, + "loss": 0.2638, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06230699270963669, + "rewards/margins": 1.9982186555862427, + "rewards/rejected": -2.06052565574646, + "step": 2180 + }, + { + "epoch": 0.25, + "learning_rate": 2.2801123727027975e-07, + "logits/chosen": -3.335986852645874, + "logits/rejected": -3.234585762023926, + "logps/chosen": -168.18502807617188, + "logps/rejected": -117.86390686035156, + "loss": 0.6098, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25341537594795227, + "rewards/margins": 0.2837415337562561, + "rewards/rejected": -0.5371569395065308, + "step": 2181 + }, + { + "epoch": 0.25, + "learning_rate": 2.2797612080065553e-07, + "logits/chosen": -2.9828405380249023, + "logits/rejected": -2.8815841674804688, + "logps/chosen": -298.37127685546875, + "logps/rejected": -332.8902282714844, + "loss": 0.4075, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1333828866481781, + "rewards/margins": 1.594447135925293, + "rewards/rejected": -1.461064100265503, + "step": 2182 + }, + { + "epoch": 0.25, + "learning_rate": 2.2794100433103123e-07, + "logits/chosen": -3.875690460205078, + "logits/rejected": -3.8597521781921387, + "logps/chosen": -284.37811279296875, + "logps/rejected": -309.10894775390625, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021378308534622192, + "rewards/margins": 1.0725690126419067, + "rewards/rejected": -1.0511906147003174, + "step": 2183 + }, + { + "epoch": 0.25, + "learning_rate": 2.27905887861407e-07, + "logits/chosen": -3.3483641147613525, + "logits/rejected": -3.337801456451416, + "logps/chosen": -143.1961212158203, + "logps/rejected": -253.67123413085938, + "loss": 0.3619, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15064403414726257, + "rewards/margins": 2.736097574234009, + "rewards/rejected": -2.886741876602173, + "step": 2184 + }, + { + "epoch": 0.25, + "learning_rate": 2.2787077139178273e-07, + "logits/chosen": -2.9544291496276855, + "logits/rejected": -2.800509214401245, + "logps/chosen": -252.32518005371094, + "logps/rejected": -268.2479248046875, + "loss": 0.3739, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.573763370513916, + "rewards/margins": 1.9583191871643066, + "rewards/rejected": -2.5320825576782227, + "step": 2185 + }, + { + "epoch": 0.25, + "learning_rate": 2.278356549221585e-07, + "logits/chosen": -3.152986526489258, + "logits/rejected": -2.855604410171509, + "logps/chosen": -150.97015380859375, + "logps/rejected": -128.8043212890625, + "loss": 0.4921, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06456160545349121, + "rewards/margins": 0.8390650153160095, + "rewards/rejected": -0.903626561164856, + "step": 2186 + }, + { + "epoch": 0.25, + "learning_rate": 2.2780053845253424e-07, + "logits/chosen": -2.6811299324035645, + "logits/rejected": -2.849144220352173, + "logps/chosen": -174.7205047607422, + "logps/rejected": -169.53009033203125, + "loss": 1.0741, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6410802602767944, + "rewards/margins": 0.19858889281749725, + "rewards/rejected": -0.8396690487861633, + "step": 2187 + }, + { + "epoch": 0.25, + "learning_rate": 2.2776542198290997e-07, + "logits/chosen": -3.2585272789001465, + "logits/rejected": -3.177778959274292, + "logps/chosen": -498.5721130371094, + "logps/rejected": -376.8558044433594, + "loss": 0.6065, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6107594966888428, + "rewards/margins": 0.8449984788894653, + "rewards/rejected": -1.4557580947875977, + "step": 2188 + }, + { + "epoch": 0.25, + "learning_rate": 2.2773030551328572e-07, + "logits/chosen": -3.739912986755371, + "logits/rejected": -3.3018579483032227, + "logps/chosen": -233.6374969482422, + "logps/rejected": -207.07614135742188, + "loss": 0.5847, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15453240275382996, + "rewards/margins": 0.39640113711357117, + "rewards/rejected": -0.5509334802627563, + "step": 2189 + }, + { + "epoch": 0.25, + "learning_rate": 2.2769518904366145e-07, + "logits/chosen": -2.671588897705078, + "logits/rejected": -2.4264400005340576, + "logps/chosen": -292.240234375, + "logps/rejected": -255.5592803955078, + "loss": 0.8081, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31521016359329224, + "rewards/margins": 0.46280139684677124, + "rewards/rejected": -0.7780115604400635, + "step": 2190 + }, + { + "epoch": 0.25, + "learning_rate": 2.276600725740372e-07, + "logits/chosen": -2.468827247619629, + "logits/rejected": -2.5742075443267822, + "logps/chosen": -340.28753662109375, + "logps/rejected": -389.3198547363281, + "loss": 0.6655, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3011487126350403, + "rewards/margins": 0.8277727365493774, + "rewards/rejected": -1.128921389579773, + "step": 2191 + }, + { + "epoch": 0.25, + "learning_rate": 2.2762495610441296e-07, + "logits/chosen": -2.9728903770446777, + "logits/rejected": -3.1461896896362305, + "logps/chosen": -167.0803985595703, + "logps/rejected": -184.75753784179688, + "loss": 0.3716, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2924802899360657, + "rewards/margins": 1.4994313716888428, + "rewards/rejected": -1.2069510221481323, + "step": 2192 + }, + { + "epoch": 0.25, + "learning_rate": 2.2758983963478869e-07, + "logits/chosen": -2.8169963359832764, + "logits/rejected": -2.824395179748535, + "logps/chosen": -324.3594970703125, + "logps/rejected": -242.78765869140625, + "loss": 0.4923, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24145160615444183, + "rewards/margins": 0.8665487766265869, + "rewards/rejected": -1.10800039768219, + "step": 2193 + }, + { + "epoch": 0.25, + "learning_rate": 2.2755472316516447e-07, + "logits/chosen": -2.9910857677459717, + "logits/rejected": -3.1291580200195312, + "logps/chosen": -133.91082763671875, + "logps/rejected": -339.8657531738281, + "loss": 0.4806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12414442002773285, + "rewards/margins": 3.300455093383789, + "rewards/rejected": -3.1763110160827637, + "step": 2194 + }, + { + "epoch": 0.25, + "learning_rate": 2.2751960669554022e-07, + "logits/chosen": -2.8426129817962646, + "logits/rejected": -2.8806068897247314, + "logps/chosen": -238.0952606201172, + "logps/rejected": -184.66696166992188, + "loss": 0.8204, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2021978795528412, + "rewards/margins": 0.14173412322998047, + "rewards/rejected": -0.34393200278282166, + "step": 2195 + }, + { + "epoch": 0.25, + "learning_rate": 2.2748449022591595e-07, + "logits/chosen": -3.8034210205078125, + "logits/rejected": -3.7070631980895996, + "logps/chosen": -257.9671325683594, + "logps/rejected": -327.8367614746094, + "loss": 0.2542, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21462735533714294, + "rewards/margins": 2.504133462905884, + "rewards/rejected": -2.289506435394287, + "step": 2196 + }, + { + "epoch": 0.25, + "learning_rate": 2.274493737562917e-07, + "logits/chosen": -2.9795875549316406, + "logits/rejected": -2.9511022567749023, + "logps/chosen": -386.87554931640625, + "logps/rejected": -285.961669921875, + "loss": 0.1439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7351185083389282, + "rewards/margins": 2.8124542236328125, + "rewards/rejected": -2.0773355960845947, + "step": 2197 + }, + { + "epoch": 0.25, + "learning_rate": 2.2741425728666743e-07, + "logits/chosen": -3.3551764488220215, + "logits/rejected": -3.2607264518737793, + "logps/chosen": -228.473388671875, + "logps/rejected": -244.93682861328125, + "loss": 0.4245, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.018006615340709686, + "rewards/margins": 0.9778416156768799, + "rewards/rejected": -0.9598349928855896, + "step": 2198 + }, + { + "epoch": 0.25, + "learning_rate": 2.2737914081704318e-07, + "logits/chosen": -3.5319674015045166, + "logits/rejected": -3.5533721446990967, + "logps/chosen": -96.42144775390625, + "logps/rejected": -102.57795715332031, + "loss": 0.4744, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1410767138004303, + "rewards/margins": 0.5446161031723022, + "rewards/rejected": -0.6856928467750549, + "step": 2199 + }, + { + "epoch": 0.25, + "learning_rate": 2.2734402434741894e-07, + "logits/chosen": -3.078167200088501, + "logits/rejected": -3.207195997238159, + "logps/chosen": -333.6822509765625, + "logps/rejected": -255.9193572998047, + "loss": 0.6359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3659137487411499, + "rewards/margins": 0.42312949895858765, + "rewards/rejected": -0.7890431880950928, + "step": 2200 + }, + { + "epoch": 0.25, + "learning_rate": 2.2730890787779466e-07, + "logits/chosen": -2.517956018447876, + "logits/rejected": -2.4445226192474365, + "logps/chosen": -333.677490234375, + "logps/rejected": -211.13037109375, + "loss": 0.2447, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5625838041305542, + "rewards/margins": 2.46097993850708, + "rewards/rejected": -1.8983962535858154, + "step": 2201 + }, + { + "epoch": 0.25, + "learning_rate": 2.2727379140817042e-07, + "logits/chosen": -2.747490644454956, + "logits/rejected": -2.8774454593658447, + "logps/chosen": -257.6216125488281, + "logps/rejected": -269.10150146484375, + "loss": 0.4677, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3422894775867462, + "rewards/margins": 0.6943532228469849, + "rewards/rejected": -0.35206377506256104, + "step": 2202 + }, + { + "epoch": 0.25, + "learning_rate": 2.2723867493854617e-07, + "logits/chosen": -2.322866201400757, + "logits/rejected": -2.334084987640381, + "logps/chosen": -304.0146484375, + "logps/rejected": -262.3143615722656, + "loss": 0.6888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3600270450115204, + "rewards/margins": 0.4759736657142639, + "rewards/rejected": -0.8360008001327515, + "step": 2203 + }, + { + "epoch": 0.25, + "learning_rate": 2.272035584689219e-07, + "logits/chosen": -2.68637752532959, + "logits/rejected": -2.8301007747650146, + "logps/chosen": -237.6531982421875, + "logps/rejected": -224.08953857421875, + "loss": 0.3836, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5830736756324768, + "rewards/margins": 1.2394698858261108, + "rewards/rejected": -0.6563961505889893, + "step": 2204 + }, + { + "epoch": 0.25, + "learning_rate": 2.2716844199929768e-07, + "logits/chosen": -3.033590793609619, + "logits/rejected": -2.818540573120117, + "logps/chosen": -273.6932067871094, + "logps/rejected": -222.65040588378906, + "loss": 0.681, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5942991375923157, + "rewards/margins": 0.5739060044288635, + "rewards/rejected": -1.1682050228118896, + "step": 2205 + }, + { + "epoch": 0.25, + "learning_rate": 2.2713332552967338e-07, + "logits/chosen": -3.183271646499634, + "logits/rejected": -3.354708194732666, + "logps/chosen": -194.7556610107422, + "logps/rejected": -251.96849060058594, + "loss": 0.1792, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06630364060401917, + "rewards/margins": 2.051818370819092, + "rewards/rejected": -1.985514521598816, + "step": 2206 + }, + { + "epoch": 0.25, + "learning_rate": 2.2709820906004916e-07, + "logits/chosen": -3.4493441581726074, + "logits/rejected": -3.841705799102783, + "logps/chosen": -117.37997436523438, + "logps/rejected": -174.9344482421875, + "loss": 0.3439, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19674226641654968, + "rewards/margins": 1.5272915363311768, + "rewards/rejected": -1.7240337133407593, + "step": 2207 + }, + { + "epoch": 0.25, + "learning_rate": 2.2706309259042491e-07, + "logits/chosen": -2.7928309440612793, + "logits/rejected": -2.6430015563964844, + "logps/chosen": -180.92123413085938, + "logps/rejected": -255.7613525390625, + "loss": 0.4399, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06936469674110413, + "rewards/margins": 0.8699275851249695, + "rewards/rejected": -0.800562858581543, + "step": 2208 + }, + { + "epoch": 0.25, + "learning_rate": 2.2702797612080064e-07, + "logits/chosen": -2.7166121006011963, + "logits/rejected": -2.8603410720825195, + "logps/chosen": -421.296875, + "logps/rejected": -200.14544677734375, + "loss": 0.5442, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2157871425151825, + "rewards/margins": 0.5476203560829163, + "rewards/rejected": -0.3318331837654114, + "step": 2209 + }, + { + "epoch": 0.25, + "learning_rate": 2.269928596511764e-07, + "logits/chosen": -3.432258129119873, + "logits/rejected": -3.5610079765319824, + "logps/chosen": -195.3793487548828, + "logps/rejected": -168.77395629882812, + "loss": 0.4232, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021966442465782166, + "rewards/margins": 1.0210915803909302, + "rewards/rejected": -0.9991251230239868, + "step": 2210 + }, + { + "epoch": 0.25, + "learning_rate": 2.2695774318155215e-07, + "logits/chosen": -2.771503448486328, + "logits/rejected": -2.740757465362549, + "logps/chosen": -325.5924377441406, + "logps/rejected": -288.82373046875, + "loss": 0.4106, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34828251600265503, + "rewards/margins": 1.9800866842269897, + "rewards/rejected": -2.328369140625, + "step": 2211 + }, + { + "epoch": 0.26, + "learning_rate": 2.2692262671192788e-07, + "logits/chosen": -3.233912467956543, + "logits/rejected": -3.304896116256714, + "logps/chosen": -110.72102355957031, + "logps/rejected": -103.87687683105469, + "loss": 0.7458, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.014604642987251282, + "rewards/margins": 0.4600673317909241, + "rewards/rejected": -0.445462703704834, + "step": 2212 + }, + { + "epoch": 0.26, + "learning_rate": 2.2688751024230363e-07, + "logits/chosen": -2.416687488555908, + "logits/rejected": -2.513040542602539, + "logps/chosen": -161.75949096679688, + "logps/rejected": -208.6759796142578, + "loss": 0.6325, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.26415586471557617, + "rewards/margins": 0.599382758140564, + "rewards/rejected": -0.8635387420654297, + "step": 2213 + }, + { + "epoch": 0.26, + "learning_rate": 2.2685239377267936e-07, + "logits/chosen": -2.7444801330566406, + "logits/rejected": -3.098662853240967, + "logps/chosen": -383.8021545410156, + "logps/rejected": -271.0512390136719, + "loss": 0.2592, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3276050090789795, + "rewards/margins": 1.4762645959854126, + "rewards/rejected": -1.148659586906433, + "step": 2214 + }, + { + "epoch": 0.26, + "learning_rate": 2.268172773030551e-07, + "logits/chosen": -3.567033290863037, + "logits/rejected": -3.446316957473755, + "logps/chosen": -503.92425537109375, + "logps/rejected": -296.9543762207031, + "loss": 0.3258, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13412217795848846, + "rewards/margins": 1.6012651920318604, + "rewards/rejected": -1.7353874444961548, + "step": 2215 + }, + { + "epoch": 0.26, + "learning_rate": 2.267821608334309e-07, + "logits/chosen": -2.9030933380126953, + "logits/rejected": -2.9637844562530518, + "logps/chosen": -498.39849853515625, + "logps/rejected": -255.85308837890625, + "loss": 0.4309, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7231199145317078, + "rewards/margins": 0.7103961706161499, + "rewards/rejected": 0.012723691761493683, + "step": 2216 + }, + { + "epoch": 0.26, + "learning_rate": 2.267470443638066e-07, + "logits/chosen": -3.359099864959717, + "logits/rejected": -3.1479880809783936, + "logps/chosen": -129.34469604492188, + "logps/rejected": -171.88211059570312, + "loss": 0.2768, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20070721209049225, + "rewards/margins": 1.8131184577941895, + "rewards/rejected": -2.0138256549835205, + "step": 2217 + }, + { + "epoch": 0.26, + "learning_rate": 2.2671192789418237e-07, + "logits/chosen": -3.5284810066223145, + "logits/rejected": -3.7530899047851562, + "logps/chosen": -111.2460708618164, + "logps/rejected": -190.41525268554688, + "loss": 0.5121, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06265842914581299, + "rewards/margins": 1.574991226196289, + "rewards/rejected": -1.6376495361328125, + "step": 2218 + }, + { + "epoch": 0.26, + "learning_rate": 2.2667681142455813e-07, + "logits/chosen": -2.9116177558898926, + "logits/rejected": -3.0700182914733887, + "logps/chosen": -259.44573974609375, + "logps/rejected": -279.88458251953125, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4714348316192627, + "rewards/margins": 2.3420186042785645, + "rewards/rejected": -1.8705838918685913, + "step": 2219 + }, + { + "epoch": 0.26, + "learning_rate": 2.2664169495493385e-07, + "logits/chosen": -3.32549786567688, + "logits/rejected": -3.1573853492736816, + "logps/chosen": -429.72613525390625, + "logps/rejected": -315.81298828125, + "loss": 0.2518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07049253582954407, + "rewards/margins": 2.0963709354400635, + "rewards/rejected": -2.1668636798858643, + "step": 2220 + }, + { + "epoch": 0.26, + "learning_rate": 2.266065784853096e-07, + "logits/chosen": -2.5333878993988037, + "logits/rejected": -2.593008279800415, + "logps/chosen": -248.7277069091797, + "logps/rejected": -215.50869750976562, + "loss": 0.4678, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0822727233171463, + "rewards/margins": 1.0089495182037354, + "rewards/rejected": -0.926676869392395, + "step": 2221 + }, + { + "epoch": 0.26, + "learning_rate": 2.2657146201568534e-07, + "logits/chosen": -3.3910741806030273, + "logits/rejected": -3.4796977043151855, + "logps/chosen": -275.5450134277344, + "logps/rejected": -328.5380859375, + "loss": 0.8617, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3655571937561035, + "rewards/margins": 0.3967825770378113, + "rewards/rejected": -1.7623398303985596, + "step": 2222 + }, + { + "epoch": 0.26, + "learning_rate": 2.265363455460611e-07, + "logits/chosen": -2.503227472305298, + "logits/rejected": -2.6724305152893066, + "logps/chosen": -259.76019287109375, + "logps/rejected": -210.8515167236328, + "loss": 0.2937, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16455549001693726, + "rewards/margins": 1.9994087219238281, + "rewards/rejected": -1.8348530530929565, + "step": 2223 + }, + { + "epoch": 0.26, + "learning_rate": 2.2650122907643684e-07, + "logits/chosen": -3.1582465171813965, + "logits/rejected": -3.1813392639160156, + "logps/chosen": -276.953369140625, + "logps/rejected": -202.95489501953125, + "loss": 0.3505, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10009332746267319, + "rewards/margins": 1.81458580493927, + "rewards/rejected": -1.7144925594329834, + "step": 2224 + }, + { + "epoch": 0.26, + "learning_rate": 2.2646611260681257e-07, + "logits/chosen": -3.238157272338867, + "logits/rejected": -3.109389305114746, + "logps/chosen": -327.6822509765625, + "logps/rejected": -177.24972534179688, + "loss": 0.4003, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2721123695373535, + "rewards/margins": 1.6343474388122559, + "rewards/rejected": -1.3622353076934814, + "step": 2225 + }, + { + "epoch": 0.26, + "learning_rate": 2.2643099613718832e-07, + "logits/chosen": -3.5098376274108887, + "logits/rejected": -3.6221776008605957, + "logps/chosen": -275.2200927734375, + "logps/rejected": -207.06979370117188, + "loss": 0.3724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25513190031051636, + "rewards/margins": 1.1890825033187866, + "rewards/rejected": -1.4442143440246582, + "step": 2226 + }, + { + "epoch": 0.26, + "learning_rate": 2.263958796675641e-07, + "logits/chosen": -2.8032596111297607, + "logits/rejected": -2.477343797683716, + "logps/chosen": -318.05938720703125, + "logps/rejected": -340.1149597167969, + "loss": 0.6735, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21060171723365784, + "rewards/margins": 0.6630598306655884, + "rewards/rejected": -0.8736615777015686, + "step": 2227 + }, + { + "epoch": 0.26, + "learning_rate": 2.2636076319793983e-07, + "logits/chosen": -3.5269153118133545, + "logits/rejected": -3.714320659637451, + "logps/chosen": -229.12062072753906, + "logps/rejected": -323.8396911621094, + "loss": 0.2625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08395881205797195, + "rewards/margins": 3.221024513244629, + "rewards/rejected": -3.304983139038086, + "step": 2228 + }, + { + "epoch": 0.26, + "learning_rate": 2.2632564672831559e-07, + "logits/chosen": -2.9582176208496094, + "logits/rejected": -3.087999105453491, + "logps/chosen": -243.1240234375, + "logps/rejected": -205.3972625732422, + "loss": 0.3026, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5497586727142334, + "rewards/margins": 2.2877979278564453, + "rewards/rejected": -1.7380391359329224, + "step": 2229 + }, + { + "epoch": 0.26, + "learning_rate": 2.2629053025869131e-07, + "logits/chosen": -3.924192428588867, + "logits/rejected": -3.7524094581604004, + "logps/chosen": -207.98471069335938, + "logps/rejected": -190.6427001953125, + "loss": 0.5701, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2102999687194824, + "rewards/margins": 0.8181689381599426, + "rewards/rejected": -2.0284688472747803, + "step": 2230 + }, + { + "epoch": 0.26, + "learning_rate": 2.2625541378906707e-07, + "logits/chosen": -3.1217308044433594, + "logits/rejected": -3.0200002193450928, + "logps/chosen": -219.8317413330078, + "logps/rejected": -128.22268676757812, + "loss": 0.3917, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06199701130390167, + "rewards/margins": 0.9160330891609192, + "rewards/rejected": -0.9780300855636597, + "step": 2231 + }, + { + "epoch": 0.26, + "learning_rate": 2.2622029731944282e-07, + "logits/chosen": -3.3196372985839844, + "logits/rejected": -3.142343759536743, + "logps/chosen": -181.75997924804688, + "logps/rejected": -197.08424377441406, + "loss": 0.5066, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4157918393611908, + "rewards/margins": 1.1415128707885742, + "rewards/rejected": -1.5573046207427979, + "step": 2232 + }, + { + "epoch": 0.26, + "learning_rate": 2.2618518084981855e-07, + "logits/chosen": -2.6792635917663574, + "logits/rejected": -2.5943212509155273, + "logps/chosen": -380.5477294921875, + "logps/rejected": -300.2999572753906, + "loss": 0.5525, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7593969702720642, + "rewards/margins": 0.9040735960006714, + "rewards/rejected": -1.6634706258773804, + "step": 2233 + }, + { + "epoch": 0.26, + "learning_rate": 2.261500643801943e-07, + "logits/chosen": -2.836219310760498, + "logits/rejected": -2.73982834815979, + "logps/chosen": -322.259033203125, + "logps/rejected": -303.5791015625, + "loss": 0.8128, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33855730295181274, + "rewards/margins": 0.0060512349009513855, + "rewards/rejected": -0.34460848569869995, + "step": 2234 + }, + { + "epoch": 0.26, + "learning_rate": 2.2611494791057003e-07, + "logits/chosen": -2.683610439300537, + "logits/rejected": -2.830195188522339, + "logps/chosen": -375.2545166015625, + "logps/rejected": -368.57177734375, + "loss": 0.2837, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.74430251121521, + "rewards/margins": 1.4760124683380127, + "rewards/rejected": -0.7317099571228027, + "step": 2235 + }, + { + "epoch": 0.26, + "learning_rate": 2.2607983144094578e-07, + "logits/chosen": -3.5386037826538086, + "logits/rejected": -4.169234275817871, + "logps/chosen": -247.97384643554688, + "logps/rejected": -352.4546813964844, + "loss": 0.5946, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2025674432516098, + "rewards/margins": 1.3374712467193604, + "rewards/rejected": -1.5400387048721313, + "step": 2236 + }, + { + "epoch": 0.26, + "learning_rate": 2.2604471497132154e-07, + "logits/chosen": -3.09023380279541, + "logits/rejected": -3.0649635791778564, + "logps/chosen": -211.0482635498047, + "logps/rejected": -278.8951721191406, + "loss": 0.3222, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08305928111076355, + "rewards/margins": 1.8281651735305786, + "rewards/rejected": -1.7451059818267822, + "step": 2237 + }, + { + "epoch": 0.26, + "learning_rate": 2.2600959850169727e-07, + "logits/chosen": -2.829986095428467, + "logits/rejected": -3.001282215118408, + "logps/chosen": -335.6710205078125, + "logps/rejected": -313.0057678222656, + "loss": 0.3762, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1497618705034256, + "rewards/margins": 1.4237704277038574, + "rewards/rejected": -1.573532223701477, + "step": 2238 + }, + { + "epoch": 0.26, + "learning_rate": 2.2597448203207305e-07, + "logits/chosen": -3.5548465251922607, + "logits/rejected": -3.7334554195404053, + "logps/chosen": -311.49920654296875, + "logps/rejected": -268.40765380859375, + "loss": 0.32, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05918455868959427, + "rewards/margins": 1.4807531833648682, + "rewards/rejected": -1.4215686321258545, + "step": 2239 + }, + { + "epoch": 0.26, + "learning_rate": 2.259393655624488e-07, + "logits/chosen": -3.6163840293884277, + "logits/rejected": -3.726585865020752, + "logps/chosen": -179.51388549804688, + "logps/rejected": -156.46107482910156, + "loss": 0.3629, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5495415925979614, + "rewards/margins": 1.4631555080413818, + "rewards/rejected": -0.9136137962341309, + "step": 2240 + }, + { + "epoch": 0.26, + "learning_rate": 2.2590424909282453e-07, + "logits/chosen": -4.048492431640625, + "logits/rejected": -3.6627912521362305, + "logps/chosen": -275.87420654296875, + "logps/rejected": -234.1333465576172, + "loss": 0.5873, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23903483152389526, + "rewards/margins": 0.8965874910354614, + "rewards/rejected": -1.135622262954712, + "step": 2241 + }, + { + "epoch": 0.26, + "learning_rate": 2.2586913262320028e-07, + "logits/chosen": -2.667475938796997, + "logits/rejected": -2.7740554809570312, + "logps/chosen": -154.42405700683594, + "logps/rejected": -212.1448211669922, + "loss": 0.3884, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.041147418320178986, + "rewards/margins": 1.4833874702453613, + "rewards/rejected": -1.4422399997711182, + "step": 2242 + }, + { + "epoch": 0.26, + "learning_rate": 2.25834016153576e-07, + "logits/chosen": -2.6408472061157227, + "logits/rejected": -2.5702755451202393, + "logps/chosen": -334.9706115722656, + "logps/rejected": -355.46429443359375, + "loss": 0.4369, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1447509378194809, + "rewards/margins": 1.4408013820648193, + "rewards/rejected": -1.5855523347854614, + "step": 2243 + }, + { + "epoch": 0.26, + "learning_rate": 2.2579889968395176e-07, + "logits/chosen": -3.1651813983917236, + "logits/rejected": -3.2105979919433594, + "logps/chosen": -300.7978515625, + "logps/rejected": -302.9718322753906, + "loss": 0.6221, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3748359680175781, + "rewards/margins": 1.004964828491211, + "rewards/rejected": -0.6301287412643433, + "step": 2244 + }, + { + "epoch": 0.26, + "learning_rate": 2.2576378321432752e-07, + "logits/chosen": -3.193474531173706, + "logits/rejected": -3.421196699142456, + "logps/chosen": -130.29763793945312, + "logps/rejected": -188.87966918945312, + "loss": 0.3667, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.890293300151825, + "rewards/margins": 1.6719597578048706, + "rewards/rejected": -0.7816663980484009, + "step": 2245 + }, + { + "epoch": 0.26, + "learning_rate": 2.2572866674470324e-07, + "logits/chosen": -4.204399108886719, + "logits/rejected": -3.5902903079986572, + "logps/chosen": -243.09902954101562, + "logps/rejected": -157.76229858398438, + "loss": 0.506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.264714777469635, + "rewards/margins": 1.1124815940856934, + "rewards/rejected": -1.3771963119506836, + "step": 2246 + }, + { + "epoch": 0.26, + "learning_rate": 2.25693550275079e-07, + "logits/chosen": -3.070166826248169, + "logits/rejected": -2.9078359603881836, + "logps/chosen": -335.2261962890625, + "logps/rejected": -293.58782958984375, + "loss": 0.094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9442602396011353, + "rewards/margins": 2.633467674255371, + "rewards/rejected": -1.6892074346542358, + "step": 2247 + }, + { + "epoch": 0.26, + "learning_rate": 2.2565843380545475e-07, + "logits/chosen": -3.242949962615967, + "logits/rejected": -3.335228681564331, + "logps/chosen": -332.5345458984375, + "logps/rejected": -274.72711181640625, + "loss": 0.221, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15956228971481323, + "rewards/margins": 2.2111854553222656, + "rewards/rejected": -2.0516231060028076, + "step": 2248 + }, + { + "epoch": 0.26, + "learning_rate": 2.2562331733583048e-07, + "logits/chosen": -2.97841739654541, + "logits/rejected": -2.8605456352233887, + "logps/chosen": -289.9798278808594, + "logps/rejected": -238.18922424316406, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5521097779273987, + "rewards/margins": 2.0496795177459717, + "rewards/rejected": -1.4975697994232178, + "step": 2249 + }, + { + "epoch": 0.26, + "learning_rate": 2.2558820086620626e-07, + "logits/chosen": -3.384396553039551, + "logits/rejected": -3.1368472576141357, + "logps/chosen": -283.4202575683594, + "logps/rejected": -179.10008239746094, + "loss": 0.6747, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10693298280239105, + "rewards/margins": 0.4967597424983978, + "rewards/rejected": -0.6036927103996277, + "step": 2250 + }, + { + "epoch": 0.26, + "learning_rate": 2.2555308439658196e-07, + "logits/chosen": -3.3267688751220703, + "logits/rejected": -3.4959793090820312, + "logps/chosen": -275.69586181640625, + "logps/rejected": -225.05133056640625, + "loss": 0.4673, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4311789572238922, + "rewards/margins": 1.9876725673675537, + "rewards/rejected": -2.418851613998413, + "step": 2251 + }, + { + "epoch": 0.26, + "learning_rate": 2.2551796792695774e-07, + "logits/chosen": -3.4552927017211914, + "logits/rejected": -3.367327928543091, + "logps/chosen": -499.72039794921875, + "logps/rejected": -357.138671875, + "loss": 0.5066, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12537826597690582, + "rewards/margins": 1.9460505247116089, + "rewards/rejected": -1.8206722736358643, + "step": 2252 + }, + { + "epoch": 0.26, + "learning_rate": 2.254828514573335e-07, + "logits/chosen": -2.8800249099731445, + "logits/rejected": -3.164947748184204, + "logps/chosen": -136.47613525390625, + "logps/rejected": -227.69471740722656, + "loss": 0.3149, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10136361420154572, + "rewards/margins": 1.9796645641326904, + "rewards/rejected": -2.0810279846191406, + "step": 2253 + }, + { + "epoch": 0.26, + "learning_rate": 2.2544773498770922e-07, + "logits/chosen": -2.4175760746002197, + "logits/rejected": -2.373337507247925, + "logps/chosen": -337.4205322265625, + "logps/rejected": -283.81549072265625, + "loss": 0.4172, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2961743474006653, + "rewards/margins": 1.4359197616577148, + "rewards/rejected": -1.1397454738616943, + "step": 2254 + }, + { + "epoch": 0.26, + "learning_rate": 2.2541261851808497e-07, + "logits/chosen": -3.231973171234131, + "logits/rejected": -2.943213939666748, + "logps/chosen": -326.1588439941406, + "logps/rejected": -273.28497314453125, + "loss": 0.4713, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14142769575119019, + "rewards/margins": 2.073690891265869, + "rewards/rejected": -1.9322631359100342, + "step": 2255 + }, + { + "epoch": 0.26, + "learning_rate": 2.2537750204846073e-07, + "logits/chosen": -2.9443790912628174, + "logits/rejected": -2.9598214626312256, + "logps/chosen": -489.3149719238281, + "logps/rejected": -478.10638427734375, + "loss": 0.6087, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.558492124080658, + "rewards/margins": 1.161177396774292, + "rewards/rejected": -1.7196694612503052, + "step": 2256 + }, + { + "epoch": 0.26, + "learning_rate": 2.2534238557883646e-07, + "logits/chosen": -3.1329293251037598, + "logits/rejected": -3.4967658519744873, + "logps/chosen": -275.5706787109375, + "logps/rejected": -271.963623046875, + "loss": 0.512, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8178631663322449, + "rewards/margins": 2.155769109725952, + "rewards/rejected": -2.9736320972442627, + "step": 2257 + }, + { + "epoch": 0.26, + "learning_rate": 2.253072691092122e-07, + "logits/chosen": -3.827704429626465, + "logits/rejected": -3.7706682682037354, + "logps/chosen": -213.4423370361328, + "logps/rejected": -269.92901611328125, + "loss": 0.3274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05744795501232147, + "rewards/margins": 1.8602038621902466, + "rewards/rejected": -1.9176517724990845, + "step": 2258 + }, + { + "epoch": 0.26, + "learning_rate": 2.2527215263958794e-07, + "logits/chosen": -2.552145004272461, + "logits/rejected": -2.794638156890869, + "logps/chosen": -309.59417724609375, + "logps/rejected": -250.362060546875, + "loss": 0.4627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3958032727241516, + "rewards/margins": 1.2265560626983643, + "rewards/rejected": -1.6223593950271606, + "step": 2259 + }, + { + "epoch": 0.26, + "learning_rate": 2.252370361699637e-07, + "logits/chosen": -3.6009974479675293, + "logits/rejected": -3.0541000366210938, + "logps/chosen": -328.2898254394531, + "logps/rejected": -250.28208923339844, + "loss": 0.269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4292284846305847, + "rewards/margins": 1.7118525505065918, + "rewards/rejected": -1.2826241254806519, + "step": 2260 + }, + { + "epoch": 0.26, + "learning_rate": 2.2520191970033947e-07, + "logits/chosen": -2.3224234580993652, + "logits/rejected": -2.2000749111175537, + "logps/chosen": -483.48291015625, + "logps/rejected": -439.1147155761719, + "loss": 0.2603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44971317052841187, + "rewards/margins": 1.9741356372833252, + "rewards/rejected": -1.5244224071502686, + "step": 2261 + }, + { + "epoch": 0.26, + "learning_rate": 2.2516680323071517e-07, + "logits/chosen": -3.180293560028076, + "logits/rejected": -2.922402858734131, + "logps/chosen": -162.604248046875, + "logps/rejected": -226.42010498046875, + "loss": 0.3257, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3010476231575012, + "rewards/margins": 2.251763343811035, + "rewards/rejected": -1.9507155418395996, + "step": 2262 + }, + { + "epoch": 0.26, + "learning_rate": 2.2513168676109095e-07, + "logits/chosen": -3.921855926513672, + "logits/rejected": -3.881495475769043, + "logps/chosen": -288.08709716796875, + "logps/rejected": -198.5615234375, + "loss": 0.5503, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6627322435379028, + "rewards/margins": 1.1447124481201172, + "rewards/rejected": -1.8074445724487305, + "step": 2263 + }, + { + "epoch": 0.26, + "learning_rate": 2.250965702914667e-07, + "logits/chosen": -3.5100176334381104, + "logits/rejected": -3.2538137435913086, + "logps/chosen": -304.4443054199219, + "logps/rejected": -280.97332763671875, + "loss": 0.6767, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4202141761779785, + "rewards/margins": 0.7322701215744019, + "rewards/rejected": -1.1524842977523804, + "step": 2264 + }, + { + "epoch": 0.26, + "learning_rate": 2.2506145382184243e-07, + "logits/chosen": -3.3186898231506348, + "logits/rejected": -3.2877187728881836, + "logps/chosen": -203.36044311523438, + "logps/rejected": -159.20184326171875, + "loss": 0.4219, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18551760911941528, + "rewards/margins": 1.2251909971237183, + "rewards/rejected": -1.0396734476089478, + "step": 2265 + }, + { + "epoch": 0.26, + "learning_rate": 2.250263373522182e-07, + "logits/chosen": -3.026066780090332, + "logits/rejected": -3.175785779953003, + "logps/chosen": -175.00706481933594, + "logps/rejected": -174.04225158691406, + "loss": 0.2577, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3849083483219147, + "rewards/margins": 1.82658052444458, + "rewards/rejected": -1.4416720867156982, + "step": 2266 + }, + { + "epoch": 0.26, + "learning_rate": 2.2499122088259392e-07, + "logits/chosen": -3.1592414379119873, + "logits/rejected": -3.0226528644561768, + "logps/chosen": -234.56674194335938, + "logps/rejected": -240.3091583251953, + "loss": 0.2013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42934906482696533, + "rewards/margins": 2.0805978775024414, + "rewards/rejected": -1.6512489318847656, + "step": 2267 + }, + { + "epoch": 0.26, + "learning_rate": 2.2495610441296967e-07, + "logits/chosen": -3.399369716644287, + "logits/rejected": -3.2219815254211426, + "logps/chosen": -208.96234130859375, + "logps/rejected": -252.11996459960938, + "loss": 0.7089, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4053388237953186, + "rewards/margins": 0.9835020303726196, + "rewards/rejected": -1.388840675354004, + "step": 2268 + }, + { + "epoch": 0.26, + "learning_rate": 2.2492098794334542e-07, + "logits/chosen": -3.368628978729248, + "logits/rejected": -3.531409978866577, + "logps/chosen": -316.14813232421875, + "logps/rejected": -269.9251403808594, + "loss": 0.2962, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29628127813339233, + "rewards/margins": 1.735966444015503, + "rewards/rejected": -1.4396851062774658, + "step": 2269 + }, + { + "epoch": 0.26, + "learning_rate": 2.2488587147372115e-07, + "logits/chosen": -3.462883949279785, + "logits/rejected": -3.572935104370117, + "logps/chosen": -182.82998657226562, + "logps/rejected": -194.19821166992188, + "loss": 0.2686, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3008688688278198, + "rewards/margins": 1.8331443071365356, + "rewards/rejected": -1.5322753190994263, + "step": 2270 + }, + { + "epoch": 0.26, + "learning_rate": 2.248507550040969e-07, + "logits/chosen": -3.124620199203491, + "logits/rejected": -3.184330940246582, + "logps/chosen": -316.53289794921875, + "logps/rejected": -246.68612670898438, + "loss": 0.3911, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1311386078596115, + "rewards/margins": 1.0782136917114258, + "rewards/rejected": -1.2093522548675537, + "step": 2271 + }, + { + "epoch": 0.26, + "learning_rate": 2.2481563853447268e-07, + "logits/chosen": -3.1124401092529297, + "logits/rejected": -3.0829367637634277, + "logps/chosen": -248.4344940185547, + "logps/rejected": -163.2216796875, + "loss": 0.4247, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027779266238212585, + "rewards/margins": 0.9924795627593994, + "rewards/rejected": -0.9647003412246704, + "step": 2272 + }, + { + "epoch": 0.26, + "learning_rate": 2.247805220648484e-07, + "logits/chosen": -3.8498446941375732, + "logits/rejected": -3.9508254528045654, + "logps/chosen": -450.53851318359375, + "logps/rejected": -388.12445068359375, + "loss": 0.8231, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08393092453479767, + "rewards/margins": 0.8466235399246216, + "rewards/rejected": -0.7626925706863403, + "step": 2273 + }, + { + "epoch": 0.26, + "learning_rate": 2.2474540559522417e-07, + "logits/chosen": -2.8272480964660645, + "logits/rejected": -2.8338663578033447, + "logps/chosen": -207.17022705078125, + "logps/rejected": -241.3815155029297, + "loss": 0.2304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3198411464691162, + "rewards/margins": 1.5751274824142456, + "rewards/rejected": -1.2552860975265503, + "step": 2274 + }, + { + "epoch": 0.26, + "learning_rate": 2.247102891255999e-07, + "logits/chosen": -3.4083895683288574, + "logits/rejected": -3.4066579341888428, + "logps/chosen": -254.93682861328125, + "logps/rejected": -250.876708984375, + "loss": 0.3784, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33506911993026733, + "rewards/margins": 2.063014507293701, + "rewards/rejected": -2.3980836868286133, + "step": 2275 + }, + { + "epoch": 0.26, + "learning_rate": 2.2467517265597565e-07, + "logits/chosen": -3.3835816383361816, + "logits/rejected": -3.297212600708008, + "logps/chosen": -236.33331298828125, + "logps/rejected": -230.04714965820312, + "loss": 0.6339, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006941929459571838, + "rewards/margins": 0.6138090491294861, + "rewards/rejected": -0.6068670749664307, + "step": 2276 + }, + { + "epoch": 0.26, + "learning_rate": 2.246400561863514e-07, + "logits/chosen": -3.1519381999969482, + "logits/rejected": -3.1097850799560547, + "logps/chosen": -264.0862731933594, + "logps/rejected": -244.04307556152344, + "loss": 0.6354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42900213599205017, + "rewards/margins": 0.920783281326294, + "rewards/rejected": -1.3497854471206665, + "step": 2277 + }, + { + "epoch": 0.26, + "learning_rate": 2.2460493971672713e-07, + "logits/chosen": -3.354160785675049, + "logits/rejected": -3.571348190307617, + "logps/chosen": -108.08409881591797, + "logps/rejected": -188.0736846923828, + "loss": 0.5215, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40679460763931274, + "rewards/margins": 1.5039973258972168, + "rewards/rejected": -1.9107921123504639, + "step": 2278 + }, + { + "epoch": 0.26, + "learning_rate": 2.2456982324710288e-07, + "logits/chosen": -2.8309545516967773, + "logits/rejected": -2.372727632522583, + "logps/chosen": -108.12954711914062, + "logps/rejected": -144.30599975585938, + "loss": 0.6019, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49348244071006775, + "rewards/margins": 0.5499218702316284, + "rewards/rejected": -1.043404221534729, + "step": 2279 + }, + { + "epoch": 0.26, + "learning_rate": 2.245347067774786e-07, + "logits/chosen": -3.8833813667297363, + "logits/rejected": -3.5503346920013428, + "logps/chosen": -541.40380859375, + "logps/rejected": -204.13046264648438, + "loss": 0.29, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14965233206748962, + "rewards/margins": 1.7203710079193115, + "rewards/rejected": -1.870023488998413, + "step": 2280 + }, + { + "epoch": 0.26, + "learning_rate": 2.2449959030785436e-07, + "logits/chosen": -3.4936163425445557, + "logits/rejected": -3.4844467639923096, + "logps/chosen": -168.93914794921875, + "logps/rejected": -158.01846313476562, + "loss": 0.499, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01446482539176941, + "rewards/margins": 1.1886029243469238, + "rewards/rejected": -1.2030677795410156, + "step": 2281 + }, + { + "epoch": 0.26, + "learning_rate": 2.2446447383823012e-07, + "logits/chosen": -3.4351913928985596, + "logits/rejected": -3.2744905948638916, + "logps/chosen": -238.66021728515625, + "logps/rejected": -289.5440979003906, + "loss": 0.382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09510049223899841, + "rewards/margins": 1.0728797912597656, + "rewards/rejected": -1.1679801940917969, + "step": 2282 + }, + { + "epoch": 0.26, + "learning_rate": 2.2442935736860584e-07, + "logits/chosen": -3.14445424079895, + "logits/rejected": -2.8833515644073486, + "logps/chosen": -280.6742248535156, + "logps/rejected": -278.46588134765625, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6236699819564819, + "rewards/margins": 2.594407796859741, + "rewards/rejected": -1.9707375764846802, + "step": 2283 + }, + { + "epoch": 0.26, + "learning_rate": 2.2439424089898162e-07, + "logits/chosen": -3.1564388275146484, + "logits/rejected": -2.964195728302002, + "logps/chosen": -365.9484558105469, + "logps/rejected": -265.7365417480469, + "loss": 0.4196, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11767662316560745, + "rewards/margins": 1.4449424743652344, + "rewards/rejected": -1.327265977859497, + "step": 2284 + }, + { + "epoch": 0.26, + "learning_rate": 2.2435912442935738e-07, + "logits/chosen": -2.6983301639556885, + "logits/rejected": -2.59895396232605, + "logps/chosen": -239.6616668701172, + "logps/rejected": -323.2125549316406, + "loss": 0.6148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42109742760658264, + "rewards/margins": 0.9226599931716919, + "rewards/rejected": -1.3437573909759521, + "step": 2285 + }, + { + "epoch": 0.26, + "learning_rate": 2.243240079597331e-07, + "logits/chosen": -3.0785109996795654, + "logits/rejected": -2.861117124557495, + "logps/chosen": -255.2427520751953, + "logps/rejected": -225.98379516601562, + "loss": 0.4827, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2939375638961792, + "rewards/margins": 1.5898536443710327, + "rewards/rejected": -1.883791208267212, + "step": 2286 + }, + { + "epoch": 0.26, + "learning_rate": 2.2428889149010886e-07, + "logits/chosen": -3.454373359680176, + "logits/rejected": -3.180373430252075, + "logps/chosen": -486.55059814453125, + "logps/rejected": -258.9229736328125, + "loss": 0.2199, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14558573067188263, + "rewards/margins": 2.226757287979126, + "rewards/rejected": -2.081171751022339, + "step": 2287 + }, + { + "epoch": 0.26, + "learning_rate": 2.242537750204846e-07, + "logits/chosen": -2.8930914402008057, + "logits/rejected": -3.1478424072265625, + "logps/chosen": -243.96063232421875, + "logps/rejected": -336.82122802734375, + "loss": 0.2154, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19600819051265717, + "rewards/margins": 2.4578299522399902, + "rewards/rejected": -2.261821746826172, + "step": 2288 + }, + { + "epoch": 0.26, + "learning_rate": 2.2421865855086034e-07, + "logits/chosen": -3.2064030170440674, + "logits/rejected": -3.1297738552093506, + "logps/chosen": -206.52078247070312, + "logps/rejected": -337.63446044921875, + "loss": 0.2045, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3067767322063446, + "rewards/margins": 2.6514620780944824, + "rewards/rejected": -2.3446850776672363, + "step": 2289 + }, + { + "epoch": 0.26, + "learning_rate": 2.241835420812361e-07, + "logits/chosen": -3.1266894340515137, + "logits/rejected": -3.0699071884155273, + "logps/chosen": -281.8687744140625, + "logps/rejected": -320.9389343261719, + "loss": 0.4521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01643332466483116, + "rewards/margins": 0.9680436849594116, + "rewards/rejected": -0.9844770431518555, + "step": 2290 + }, + { + "epoch": 0.26, + "learning_rate": 2.2414842561161182e-07, + "logits/chosen": -2.4313154220581055, + "logits/rejected": -2.5549235343933105, + "logps/chosen": -265.5350036621094, + "logps/rejected": -290.1593322753906, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18767738342285156, + "rewards/margins": 0.8938902616500854, + "rewards/rejected": -1.081567645072937, + "step": 2291 + }, + { + "epoch": 0.26, + "learning_rate": 2.2411330914198758e-07, + "logits/chosen": -3.4065773487091064, + "logits/rejected": -3.153229236602783, + "logps/chosen": -350.052001953125, + "logps/rejected": -348.8642883300781, + "loss": 0.5971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45484742522239685, + "rewards/margins": 0.8081840872764587, + "rewards/rejected": -1.2630316019058228, + "step": 2292 + }, + { + "epoch": 0.26, + "learning_rate": 2.2407819267236333e-07, + "logits/chosen": -3.0807082653045654, + "logits/rejected": -3.3376882076263428, + "logps/chosen": -333.7201843261719, + "logps/rejected": -363.69647216796875, + "loss": 0.4653, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3106958568096161, + "rewards/margins": 1.1861242055892944, + "rewards/rejected": -0.8754282593727112, + "step": 2293 + }, + { + "epoch": 0.26, + "learning_rate": 2.2404307620273906e-07, + "logits/chosen": -3.154092311859131, + "logits/rejected": -3.6609325408935547, + "logps/chosen": -148.61679077148438, + "logps/rejected": -237.959716796875, + "loss": 0.1795, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24423475563526154, + "rewards/margins": 2.550305128097534, + "rewards/rejected": -2.794539451599121, + "step": 2294 + }, + { + "epoch": 0.26, + "learning_rate": 2.2400795973311484e-07, + "logits/chosen": -3.5372209548950195, + "logits/rejected": -3.53617000579834, + "logps/chosen": -328.703857421875, + "logps/rejected": -260.7945861816406, + "loss": 0.3041, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13486038148403168, + "rewards/margins": 2.131824254989624, + "rewards/rejected": -2.2666842937469482, + "step": 2295 + }, + { + "epoch": 0.26, + "learning_rate": 2.2397284326349054e-07, + "logits/chosen": -3.1791696548461914, + "logits/rejected": -3.312778949737549, + "logps/chosen": -320.55169677734375, + "logps/rejected": -327.502197265625, + "loss": 0.3846, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15459033846855164, + "rewards/margins": 1.1614190340042114, + "rewards/rejected": -1.0068286657333374, + "step": 2296 + }, + { + "epoch": 0.26, + "learning_rate": 2.2393772679386632e-07, + "logits/chosen": -3.3655989170074463, + "logits/rejected": -3.001248836517334, + "logps/chosen": -287.95892333984375, + "logps/rejected": -138.47171020507812, + "loss": 0.4552, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.433413565158844, + "rewards/margins": 0.844413161277771, + "rewards/rejected": -1.2778267860412598, + "step": 2297 + }, + { + "epoch": 0.26, + "learning_rate": 2.2390261032424207e-07, + "logits/chosen": -3.427274703979492, + "logits/rejected": -2.9941210746765137, + "logps/chosen": -381.20654296875, + "logps/rejected": -271.62371826171875, + "loss": 0.3796, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5053874850273132, + "rewards/margins": 2.2276675701141357, + "rewards/rejected": -1.7222801446914673, + "step": 2298 + }, + { + "epoch": 0.27, + "learning_rate": 2.238674938546178e-07, + "logits/chosen": -3.1821961402893066, + "logits/rejected": -3.077338933944702, + "logps/chosen": -140.84918212890625, + "logps/rejected": -210.26116943359375, + "loss": 0.5102, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10953521728515625, + "rewards/margins": 1.084930181503296, + "rewards/rejected": -1.1944653987884521, + "step": 2299 + }, + { + "epoch": 0.27, + "learning_rate": 2.2383237738499355e-07, + "logits/chosen": -2.7200963497161865, + "logits/rejected": -2.6300225257873535, + "logps/chosen": -147.16705322265625, + "logps/rejected": -205.0081329345703, + "loss": 0.3961, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1571280062198639, + "rewards/margins": 0.8892449140548706, + "rewards/rejected": -1.0463730096817017, + "step": 2300 + }, + { + "epoch": 0.27, + "learning_rate": 2.237972609153693e-07, + "logits/chosen": -3.1947226524353027, + "logits/rejected": -2.8795852661132812, + "logps/chosen": -274.0819091796875, + "logps/rejected": -298.4087219238281, + "loss": 0.3128, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35198846459388733, + "rewards/margins": 1.8519809246063232, + "rewards/rejected": -2.2039694786071777, + "step": 2301 + }, + { + "epoch": 0.27, + "learning_rate": 2.2376214444574504e-07, + "logits/chosen": -3.267542839050293, + "logits/rejected": -3.1906023025512695, + "logps/chosen": -134.63031005859375, + "logps/rejected": -136.79678344726562, + "loss": 0.9008, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3767961859703064, + "rewards/margins": 0.3492196500301361, + "rewards/rejected": -0.7260158061981201, + "step": 2302 + }, + { + "epoch": 0.27, + "learning_rate": 2.237270279761208e-07, + "logits/chosen": -3.2612714767456055, + "logits/rejected": -3.221637725830078, + "logps/chosen": -149.6515350341797, + "logps/rejected": -191.91078186035156, + "loss": 0.4007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21605056524276733, + "rewards/margins": 1.591149091720581, + "rewards/rejected": -1.8071998357772827, + "step": 2303 + }, + { + "epoch": 0.27, + "learning_rate": 2.2369191150649652e-07, + "logits/chosen": -3.7730138301849365, + "logits/rejected": -3.7268903255462646, + "logps/chosen": -183.25645446777344, + "logps/rejected": -159.1355438232422, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09885180741548538, + "rewards/margins": 1.5030276775360107, + "rewards/rejected": -1.4041757583618164, + "step": 2304 + }, + { + "epoch": 0.27, + "learning_rate": 2.2365679503687227e-07, + "logits/chosen": -3.5077261924743652, + "logits/rejected": -3.2736268043518066, + "logps/chosen": -111.56582641601562, + "logps/rejected": -127.36540222167969, + "loss": 0.6173, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3931378126144409, + "rewards/margins": 0.8509576916694641, + "rewards/rejected": -1.2440954446792603, + "step": 2305 + }, + { + "epoch": 0.27, + "learning_rate": 2.2362167856724805e-07, + "logits/chosen": -2.835000514984131, + "logits/rejected": -2.863379955291748, + "logps/chosen": -96.53521728515625, + "logps/rejected": -163.51773071289062, + "loss": 0.3677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.040494468063116074, + "rewards/margins": 1.4579505920410156, + "rewards/rejected": -1.4984450340270996, + "step": 2306 + }, + { + "epoch": 0.27, + "learning_rate": 2.2358656209762378e-07, + "logits/chosen": -3.65985107421875, + "logits/rejected": -3.4756650924682617, + "logps/chosen": -284.971923828125, + "logps/rejected": -213.6151123046875, + "loss": 0.5499, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.005902983248233795, + "rewards/margins": 0.5541761517524719, + "rewards/rejected": -0.5482732057571411, + "step": 2307 + }, + { + "epoch": 0.27, + "learning_rate": 2.2355144562799953e-07, + "logits/chosen": -3.4789464473724365, + "logits/rejected": -3.4813599586486816, + "logps/chosen": -225.6255340576172, + "logps/rejected": -251.912353515625, + "loss": 0.2841, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3957343101501465, + "rewards/margins": 2.797872543334961, + "rewards/rejected": -2.4021384716033936, + "step": 2308 + }, + { + "epoch": 0.27, + "learning_rate": 2.2351632915837529e-07, + "logits/chosen": -3.571413993835449, + "logits/rejected": -3.6593780517578125, + "logps/chosen": -227.0102996826172, + "logps/rejected": -219.4932861328125, + "loss": 0.3867, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17431065440177917, + "rewards/margins": 0.9619319438934326, + "rewards/rejected": -0.7876212000846863, + "step": 2309 + }, + { + "epoch": 0.27, + "learning_rate": 2.23481212688751e-07, + "logits/chosen": -3.300116777420044, + "logits/rejected": -3.3857243061065674, + "logps/chosen": -203.71774291992188, + "logps/rejected": -134.4215087890625, + "loss": 0.3669, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3072929382324219, + "rewards/margins": 0.9881787300109863, + "rewards/rejected": -0.6808857917785645, + "step": 2310 + }, + { + "epoch": 0.27, + "learning_rate": 2.2344609621912677e-07, + "logits/chosen": -3.2789535522460938, + "logits/rejected": -3.0446102619171143, + "logps/chosen": -560.724853515625, + "logps/rejected": -352.95025634765625, + "loss": 0.4581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07534122467041016, + "rewards/margins": 1.242357611656189, + "rewards/rejected": -1.3176988363265991, + "step": 2311 + }, + { + "epoch": 0.27, + "learning_rate": 2.234109797495025e-07, + "logits/chosen": -3.2544755935668945, + "logits/rejected": -3.3811397552490234, + "logps/chosen": -243.82464599609375, + "logps/rejected": -356.3487854003906, + "loss": 0.4196, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06084499508142471, + "rewards/margins": 1.62021005153656, + "rewards/rejected": -1.559365153312683, + "step": 2312 + }, + { + "epoch": 0.27, + "learning_rate": 2.2337586327987825e-07, + "logits/chosen": -3.1081643104553223, + "logits/rejected": -3.223085403442383, + "logps/chosen": -141.86044311523438, + "logps/rejected": -173.1543426513672, + "loss": 0.4082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11057089269161224, + "rewards/margins": 0.8069924116134644, + "rewards/rejected": -0.9175633192062378, + "step": 2313 + }, + { + "epoch": 0.27, + "learning_rate": 2.23340746810254e-07, + "logits/chosen": -3.9556684494018555, + "logits/rejected": -3.7129368782043457, + "logps/chosen": -638.3978271484375, + "logps/rejected": -383.7227783203125, + "loss": 0.2822, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24593685567378998, + "rewards/margins": 2.0285086631774902, + "rewards/rejected": -1.7825716733932495, + "step": 2314 + }, + { + "epoch": 0.27, + "learning_rate": 2.2330563034062973e-07, + "logits/chosen": -3.3882644176483154, + "logits/rejected": -3.101759910583496, + "logps/chosen": -245.13487243652344, + "logps/rejected": -349.3167419433594, + "loss": 0.2494, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27796271443367004, + "rewards/margins": 1.9709676504135132, + "rewards/rejected": -1.693004846572876, + "step": 2315 + }, + { + "epoch": 0.27, + "learning_rate": 2.2327051387100548e-07, + "logits/chosen": -3.3345963954925537, + "logits/rejected": -3.2201483249664307, + "logps/chosen": -171.52688598632812, + "logps/rejected": -227.0782928466797, + "loss": 0.4838, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30608487129211426, + "rewards/margins": 1.734795093536377, + "rewards/rejected": -2.040879964828491, + "step": 2316 + }, + { + "epoch": 0.27, + "learning_rate": 2.2323539740138126e-07, + "logits/chosen": -2.705482006072998, + "logits/rejected": -2.21264910697937, + "logps/chosen": -445.8128967285156, + "logps/rejected": -307.3106994628906, + "loss": 0.1778, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6456524729728699, + "rewards/margins": 2.289884567260742, + "rewards/rejected": -1.6442320346832275, + "step": 2317 + }, + { + "epoch": 0.27, + "learning_rate": 2.23200280931757e-07, + "logits/chosen": -2.2589786052703857, + "logits/rejected": -2.402052879333496, + "logps/chosen": -430.25799560546875, + "logps/rejected": -407.96087646484375, + "loss": 0.4989, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14651212096214294, + "rewards/margins": 0.8603757619857788, + "rewards/rejected": -1.0068879127502441, + "step": 2318 + }, + { + "epoch": 0.27, + "learning_rate": 2.2316516446213274e-07, + "logits/chosen": -2.9790573120117188, + "logits/rejected": -3.3048415184020996, + "logps/chosen": -121.721923828125, + "logps/rejected": -189.17747497558594, + "loss": 0.6564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3535494804382324, + "rewards/margins": 0.9624842405319214, + "rewards/rejected": -1.3160337209701538, + "step": 2319 + }, + { + "epoch": 0.27, + "learning_rate": 2.2313004799250847e-07, + "logits/chosen": -2.879859209060669, + "logits/rejected": -2.9390296936035156, + "logps/chosen": -196.1049041748047, + "logps/rejected": -228.0198211669922, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11319644749164581, + "rewards/margins": 2.323320150375366, + "rewards/rejected": -2.436516761779785, + "step": 2320 + }, + { + "epoch": 0.27, + "learning_rate": 2.2309493152288423e-07, + "logits/chosen": -2.8447914123535156, + "logits/rejected": -3.143139362335205, + "logps/chosen": -182.441162109375, + "logps/rejected": -164.0215606689453, + "loss": 0.3091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23421475291252136, + "rewards/margins": 2.2441415786743164, + "rewards/rejected": -2.47835636138916, + "step": 2321 + }, + { + "epoch": 0.27, + "learning_rate": 2.2305981505325998e-07, + "logits/chosen": -3.70208740234375, + "logits/rejected": -3.7647862434387207, + "logps/chosen": -141.14073181152344, + "logps/rejected": -197.44236755371094, + "loss": 0.4809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42657071352005005, + "rewards/margins": 1.5194764137268066, + "rewards/rejected": -1.9460471868515015, + "step": 2322 + }, + { + "epoch": 0.27, + "learning_rate": 2.230246985836357e-07, + "logits/chosen": -3.2442376613616943, + "logits/rejected": -3.343928098678589, + "logps/chosen": -117.53077697753906, + "logps/rejected": -150.13473510742188, + "loss": 0.455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23510989546775818, + "rewards/margins": 1.0165972709655762, + "rewards/rejected": -1.2517071962356567, + "step": 2323 + }, + { + "epoch": 0.27, + "learning_rate": 2.2298958211401146e-07, + "logits/chosen": -3.367222309112549, + "logits/rejected": -3.1284360885620117, + "logps/chosen": -178.44094848632812, + "logps/rejected": -163.55770874023438, + "loss": 0.4787, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3506614863872528, + "rewards/margins": 1.3835852146148682, + "rewards/rejected": -1.7342467308044434, + "step": 2324 + }, + { + "epoch": 0.27, + "learning_rate": 2.2295446564438722e-07, + "logits/chosen": -3.0737433433532715, + "logits/rejected": -3.311189889907837, + "logps/chosen": -283.84478759765625, + "logps/rejected": -379.0599365234375, + "loss": 0.2715, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3319266438484192, + "rewards/margins": 2.4585964679718018, + "rewards/rejected": -2.1266698837280273, + "step": 2325 + }, + { + "epoch": 0.27, + "learning_rate": 2.2291934917476294e-07, + "logits/chosen": -3.051520586013794, + "logits/rejected": -3.007539749145508, + "logps/chosen": -329.06658935546875, + "logps/rejected": -350.6230163574219, + "loss": 0.175, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23875963687896729, + "rewards/margins": 2.203643798828125, + "rewards/rejected": -1.9648841619491577, + "step": 2326 + }, + { + "epoch": 0.27, + "learning_rate": 2.228842327051387e-07, + "logits/chosen": -3.1601006984710693, + "logits/rejected": -3.1559200286865234, + "logps/chosen": -338.2620544433594, + "logps/rejected": -356.1646728515625, + "loss": 0.3907, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0801306888461113, + "rewards/margins": 1.1949632167816162, + "rewards/rejected": -1.275093913078308, + "step": 2327 + }, + { + "epoch": 0.27, + "learning_rate": 2.2284911623551442e-07, + "logits/chosen": -2.860367774963379, + "logits/rejected": -3.049008846282959, + "logps/chosen": -300.8227233886719, + "logps/rejected": -230.3771209716797, + "loss": 0.5203, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009084880352020264, + "rewards/margins": 1.1613528728485107, + "rewards/rejected": -1.1522679328918457, + "step": 2328 + }, + { + "epoch": 0.27, + "learning_rate": 2.228139997658902e-07, + "logits/chosen": -3.1793651580810547, + "logits/rejected": -2.9272918701171875, + "logps/chosen": -147.4453887939453, + "logps/rejected": -223.3256378173828, + "loss": 0.3405, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17410321533679962, + "rewards/margins": 1.507697582244873, + "rewards/rejected": -1.3335944414138794, + "step": 2329 + }, + { + "epoch": 0.27, + "learning_rate": 2.2277888329626596e-07, + "logits/chosen": -2.694965124130249, + "logits/rejected": -2.8185579776763916, + "logps/chosen": -228.50149536132812, + "logps/rejected": -227.23251342773438, + "loss": 0.958, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2796976864337921, + "rewards/margins": 0.5063636898994446, + "rewards/rejected": -0.7860614061355591, + "step": 2330 + }, + { + "epoch": 0.27, + "learning_rate": 2.2274376682664169e-07, + "logits/chosen": -3.498776912689209, + "logits/rejected": -3.654204845428467, + "logps/chosen": -312.8533935546875, + "logps/rejected": -277.02435302734375, + "loss": 0.284, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10883574187755585, + "rewards/margins": 2.124263048171997, + "rewards/rejected": -2.015427350997925, + "step": 2331 + }, + { + "epoch": 0.27, + "learning_rate": 2.2270865035701744e-07, + "logits/chosen": -3.4159011840820312, + "logits/rejected": -3.096810817718506, + "logps/chosen": -205.45248413085938, + "logps/rejected": -277.9186096191406, + "loss": 0.3513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47306978702545166, + "rewards/margins": 1.7605385780334473, + "rewards/rejected": -2.2336084842681885, + "step": 2332 + }, + { + "epoch": 0.27, + "learning_rate": 2.2267353388739317e-07, + "logits/chosen": -3.2760019302368164, + "logits/rejected": -3.4436821937561035, + "logps/chosen": -341.2275695800781, + "logps/rejected": -391.20831298828125, + "loss": 0.5589, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48584336042404175, + "rewards/margins": 0.903161346912384, + "rewards/rejected": -1.3890047073364258, + "step": 2333 + }, + { + "epoch": 0.27, + "learning_rate": 2.2263841741776892e-07, + "logits/chosen": -3.21671986579895, + "logits/rejected": -3.2600603103637695, + "logps/chosen": -253.56224060058594, + "logps/rejected": -151.76132202148438, + "loss": 0.4079, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2966836988925934, + "rewards/margins": 1.0916377305984497, + "rewards/rejected": -0.7949540615081787, + "step": 2334 + }, + { + "epoch": 0.27, + "learning_rate": 2.2260330094814467e-07, + "logits/chosen": -2.8790950775146484, + "logits/rejected": -2.8491828441619873, + "logps/chosen": -174.6387939453125, + "logps/rejected": -179.0290985107422, + "loss": 0.7913, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7612828612327576, + "rewards/margins": 1.011436104774475, + "rewards/rejected": -1.7727190256118774, + "step": 2335 + }, + { + "epoch": 0.27, + "learning_rate": 2.225681844785204e-07, + "logits/chosen": -4.086311340332031, + "logits/rejected": -3.9808340072631836, + "logps/chosen": -235.6486358642578, + "logps/rejected": -209.19998168945312, + "loss": 0.2772, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.196656733751297, + "rewards/margins": 1.6769564151763916, + "rewards/rejected": -1.4802995920181274, + "step": 2336 + }, + { + "epoch": 0.27, + "learning_rate": 2.2253306800889616e-07, + "logits/chosen": -3.225292444229126, + "logits/rejected": -3.1381125450134277, + "logps/chosen": -326.66363525390625, + "logps/rejected": -308.2825622558594, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24390682578086853, + "rewards/margins": 2.5328760147094727, + "rewards/rejected": -2.288969039916992, + "step": 2337 + }, + { + "epoch": 0.27, + "learning_rate": 2.224979515392719e-07, + "logits/chosen": -3.8074159622192383, + "logits/rejected": -3.481955051422119, + "logps/chosen": -289.1299743652344, + "logps/rejected": -218.94273376464844, + "loss": 0.4258, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.044199563562870026, + "rewards/margins": 1.2237370014190674, + "rewards/rejected": -1.1795374155044556, + "step": 2338 + }, + { + "epoch": 0.27, + "learning_rate": 2.2246283506964764e-07, + "logits/chosen": -3.373215913772583, + "logits/rejected": -3.6372809410095215, + "logps/chosen": -182.498291015625, + "logps/rejected": -232.4148712158203, + "loss": 0.4181, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36814001202583313, + "rewards/margins": 0.9056377410888672, + "rewards/rejected": -1.273777723312378, + "step": 2339 + }, + { + "epoch": 0.27, + "learning_rate": 2.2242771860002342e-07, + "logits/chosen": -3.0227952003479004, + "logits/rejected": -2.999978542327881, + "logps/chosen": -282.85040283203125, + "logps/rejected": -234.45497131347656, + "loss": 0.4303, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3275481164455414, + "rewards/margins": 1.1583924293518066, + "rewards/rejected": -1.4859405755996704, + "step": 2340 + }, + { + "epoch": 0.27, + "learning_rate": 2.2239260213039914e-07, + "logits/chosen": -3.8096840381622314, + "logits/rejected": -3.490859270095825, + "logps/chosen": -263.4416198730469, + "logps/rejected": -146.11439514160156, + "loss": 0.6514, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09396440535783768, + "rewards/margins": 1.118654489517212, + "rewards/rejected": -1.2126188278198242, + "step": 2341 + }, + { + "epoch": 0.27, + "learning_rate": 2.223574856607749e-07, + "logits/chosen": -3.435049533843994, + "logits/rejected": -3.6003971099853516, + "logps/chosen": -333.01055908203125, + "logps/rejected": -308.71649169921875, + "loss": 0.4915, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04661216586828232, + "rewards/margins": 1.5785157680511475, + "rewards/rejected": -1.5319035053253174, + "step": 2342 + }, + { + "epoch": 0.27, + "learning_rate": 2.2232236919115065e-07, + "logits/chosen": -2.554391622543335, + "logits/rejected": -2.4934206008911133, + "logps/chosen": -445.8623962402344, + "logps/rejected": -279.84063720703125, + "loss": 0.362, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3051025867462158, + "rewards/margins": 2.2693121433258057, + "rewards/rejected": -1.9642094373703003, + "step": 2343 + }, + { + "epoch": 0.27, + "learning_rate": 2.2228725272152638e-07, + "logits/chosen": -2.769512414932251, + "logits/rejected": -2.9518849849700928, + "logps/chosen": -325.8170166015625, + "logps/rejected": -283.3702392578125, + "loss": 0.3585, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19465875625610352, + "rewards/margins": 1.7896649837493896, + "rewards/rejected": -1.5950063467025757, + "step": 2344 + }, + { + "epoch": 0.27, + "learning_rate": 2.2225213625190213e-07, + "logits/chosen": -3.531412124633789, + "logits/rejected": -3.475614309310913, + "logps/chosen": -179.06591796875, + "logps/rejected": -204.80856323242188, + "loss": 0.6645, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20357200503349304, + "rewards/margins": 0.48782479763031006, + "rewards/rejected": -0.6913967132568359, + "step": 2345 + }, + { + "epoch": 0.27, + "learning_rate": 2.222170197822779e-07, + "logits/chosen": -1.813718557357788, + "logits/rejected": -2.064091205596924, + "logps/chosen": -412.2948913574219, + "logps/rejected": -246.18441772460938, + "loss": 0.154, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2545378804206848, + "rewards/margins": 2.5239691734313965, + "rewards/rejected": -2.2694313526153564, + "step": 2346 + }, + { + "epoch": 0.27, + "learning_rate": 2.2218190331265361e-07, + "logits/chosen": -2.628925323486328, + "logits/rejected": -2.913165330886841, + "logps/chosen": -106.1702651977539, + "logps/rejected": -156.7030487060547, + "loss": 0.3892, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23329822719097137, + "rewards/margins": 2.0119333267211914, + "rewards/rejected": -1.7786349058151245, + "step": 2347 + }, + { + "epoch": 0.27, + "learning_rate": 2.2214678684302937e-07, + "logits/chosen": -3.372081756591797, + "logits/rejected": -3.3796191215515137, + "logps/chosen": -113.38690948486328, + "logps/rejected": -193.27114868164062, + "loss": 0.1448, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6297838687896729, + "rewards/margins": 2.606245517730713, + "rewards/rejected": -1.97646164894104, + "step": 2348 + }, + { + "epoch": 0.27, + "learning_rate": 2.221116703734051e-07, + "logits/chosen": -3.517207145690918, + "logits/rejected": -3.4619932174682617, + "logps/chosen": -353.9598083496094, + "logps/rejected": -261.6011047363281, + "loss": 0.2014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3443419337272644, + "rewards/margins": 1.7278470993041992, + "rewards/rejected": -1.3835052251815796, + "step": 2349 + }, + { + "epoch": 0.27, + "learning_rate": 2.2207655390378085e-07, + "logits/chosen": -3.1491665840148926, + "logits/rejected": -3.0899605751037598, + "logps/chosen": -120.68238830566406, + "logps/rejected": -180.17630004882812, + "loss": 0.2524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.046226829290390015, + "rewards/margins": 2.2348098754882812, + "rewards/rejected": -2.281036853790283, + "step": 2350 + }, + { + "epoch": 0.27, + "learning_rate": 2.2204143743415663e-07, + "logits/chosen": -3.08530330657959, + "logits/rejected": -2.5792651176452637, + "logps/chosen": -297.60345458984375, + "logps/rejected": -146.40257263183594, + "loss": 0.8012, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4886108636856079, + "rewards/margins": 0.6560665965080261, + "rewards/rejected": -1.1446774005889893, + "step": 2351 + }, + { + "epoch": 0.27, + "learning_rate": 2.2200632096453236e-07, + "logits/chosen": -3.4146993160247803, + "logits/rejected": -3.084359645843506, + "logps/chosen": -152.92005920410156, + "logps/rejected": -150.97763061523438, + "loss": 0.3178, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3558596670627594, + "rewards/margins": 1.545936942100525, + "rewards/rejected": -1.1900771856307983, + "step": 2352 + }, + { + "epoch": 0.27, + "learning_rate": 2.219712044949081e-07, + "logits/chosen": -3.4668455123901367, + "logits/rejected": -3.2362961769104004, + "logps/chosen": -244.27468872070312, + "logps/rejected": -202.76588439941406, + "loss": 0.4321, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48818355798721313, + "rewards/margins": 1.0799890756607056, + "rewards/rejected": -1.5681726932525635, + "step": 2353 + }, + { + "epoch": 0.27, + "learning_rate": 2.2193608802528387e-07, + "logits/chosen": -3.551710605621338, + "logits/rejected": -3.1826279163360596, + "logps/chosen": -377.43194580078125, + "logps/rejected": -189.50640869140625, + "loss": 0.2818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21525511145591736, + "rewards/margins": 1.5323140621185303, + "rewards/rejected": -1.7475690841674805, + "step": 2354 + }, + { + "epoch": 0.27, + "learning_rate": 2.219009715556596e-07, + "logits/chosen": -2.9195122718811035, + "logits/rejected": -2.8835558891296387, + "logps/chosen": -159.45071411132812, + "logps/rejected": -159.88906860351562, + "loss": 0.4978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4135933816432953, + "rewards/margins": 0.9657815098762512, + "rewards/rejected": -1.3793747425079346, + "step": 2355 + }, + { + "epoch": 0.27, + "learning_rate": 2.2186585508603535e-07, + "logits/chosen": -2.9549036026000977, + "logits/rejected": -2.754807472229004, + "logps/chosen": -275.7275085449219, + "logps/rejected": -274.50927734375, + "loss": 0.488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26981228590011597, + "rewards/margins": 1.8283768892288208, + "rewards/rejected": -2.098189115524292, + "step": 2356 + }, + { + "epoch": 0.27, + "learning_rate": 2.2183073861641107e-07, + "logits/chosen": -3.308736801147461, + "logits/rejected": -3.3536837100982666, + "logps/chosen": -318.6805725097656, + "logps/rejected": -286.7496032714844, + "loss": 0.3379, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6624822616577148, + "rewards/margins": 2.1455230712890625, + "rewards/rejected": -1.4830409288406372, + "step": 2357 + }, + { + "epoch": 0.27, + "learning_rate": 2.2179562214678683e-07, + "logits/chosen": -2.864276170730591, + "logits/rejected": -3.079488754272461, + "logps/chosen": -342.0240783691406, + "logps/rejected": -248.77630615234375, + "loss": 0.3909, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21693213284015656, + "rewards/margins": 2.3979406356811523, + "rewards/rejected": -2.1810085773468018, + "step": 2358 + }, + { + "epoch": 0.27, + "learning_rate": 2.2176050567716258e-07, + "logits/chosen": -3.6980276107788086, + "logits/rejected": -3.1580188274383545, + "logps/chosen": -371.4825439453125, + "logps/rejected": -315.49456787109375, + "loss": 0.373, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5246973037719727, + "rewards/margins": 1.3481178283691406, + "rewards/rejected": -0.823420524597168, + "step": 2359 + }, + { + "epoch": 0.27, + "learning_rate": 2.217253892075383e-07, + "logits/chosen": -3.343250036239624, + "logits/rejected": -2.905071258544922, + "logps/chosen": -308.9192199707031, + "logps/rejected": -257.91400146484375, + "loss": 0.3459, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1643187701702118, + "rewards/margins": 2.3319225311279297, + "rewards/rejected": -2.496241569519043, + "step": 2360 + }, + { + "epoch": 0.27, + "learning_rate": 2.2169027273791406e-07, + "logits/chosen": -3.4855165481567383, + "logits/rejected": -3.3890535831451416, + "logps/chosen": -160.6721954345703, + "logps/rejected": -175.79075622558594, + "loss": 0.5457, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.021240979433059692, + "rewards/margins": 0.49967584013938904, + "rewards/rejected": -0.5209168195724487, + "step": 2361 + }, + { + "epoch": 0.27, + "learning_rate": 2.2165515626828984e-07, + "logits/chosen": -3.5524086952209473, + "logits/rejected": -3.327573776245117, + "logps/chosen": -294.0095520019531, + "logps/rejected": -249.95724487304688, + "loss": 0.7239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5186745524406433, + "rewards/margins": 0.7030817270278931, + "rewards/rejected": -1.2217562198638916, + "step": 2362 + }, + { + "epoch": 0.27, + "learning_rate": 2.2162003979866557e-07, + "logits/chosen": -2.519744634628296, + "logits/rejected": -2.6741037368774414, + "logps/chosen": -447.0435485839844, + "logps/rejected": -438.4686279296875, + "loss": 0.2051, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4449297785758972, + "rewards/margins": 2.3131048679351807, + "rewards/rejected": -1.8681750297546387, + "step": 2363 + }, + { + "epoch": 0.27, + "learning_rate": 2.2158492332904132e-07, + "logits/chosen": -2.644977569580078, + "logits/rejected": -3.027986526489258, + "logps/chosen": -297.0125732421875, + "logps/rejected": -204.385009765625, + "loss": 0.3048, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03932652249932289, + "rewards/margins": 2.2451958656311035, + "rewards/rejected": -2.205869436264038, + "step": 2364 + }, + { + "epoch": 0.27, + "learning_rate": 2.2154980685941705e-07, + "logits/chosen": -3.01935076713562, + "logits/rejected": -2.8828582763671875, + "logps/chosen": -208.87442016601562, + "logps/rejected": -217.8555908203125, + "loss": 0.4533, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011846423149108887, + "rewards/margins": 0.9866576194763184, + "rewards/rejected": -0.9985040426254272, + "step": 2365 + }, + { + "epoch": 0.27, + "learning_rate": 2.215146903897928e-07, + "logits/chosen": -2.729572057723999, + "logits/rejected": -2.713742733001709, + "logps/chosen": -271.5955810546875, + "logps/rejected": -316.9332275390625, + "loss": 0.1964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09065475314855576, + "rewards/margins": 2.017472743988037, + "rewards/rejected": -2.1081275939941406, + "step": 2366 + }, + { + "epoch": 0.27, + "learning_rate": 2.2147957392016856e-07, + "logits/chosen": -3.062025785446167, + "logits/rejected": -3.1488003730773926, + "logps/chosen": -231.14218139648438, + "logps/rejected": -216.8812713623047, + "loss": 0.3381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15053454041481018, + "rewards/margins": 1.1994781494140625, + "rewards/rejected": -1.3500127792358398, + "step": 2367 + }, + { + "epoch": 0.27, + "learning_rate": 2.214444574505443e-07, + "logits/chosen": -2.9477925300598145, + "logits/rejected": -3.02471661567688, + "logps/chosen": -263.9590148925781, + "logps/rejected": -291.89361572265625, + "loss": 0.532, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0771530345082283, + "rewards/margins": 1.1787853240966797, + "rewards/rejected": -1.2559382915496826, + "step": 2368 + }, + { + "epoch": 0.27, + "learning_rate": 2.2140934098092004e-07, + "logits/chosen": -2.950366973876953, + "logits/rejected": -2.7080912590026855, + "logps/chosen": -213.1247100830078, + "logps/rejected": -177.0506591796875, + "loss": 0.4165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4065900444984436, + "rewards/margins": 1.0092453956604004, + "rewards/rejected": -1.4158353805541992, + "step": 2369 + }, + { + "epoch": 0.27, + "learning_rate": 2.213742245112958e-07, + "logits/chosen": -3.305753707885742, + "logits/rejected": -3.1292917728424072, + "logps/chosen": -219.12379455566406, + "logps/rejected": -186.06771850585938, + "loss": 0.8962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8282966613769531, + "rewards/margins": 0.2661586105823517, + "rewards/rejected": -1.094455361366272, + "step": 2370 + }, + { + "epoch": 0.27, + "learning_rate": 2.2133910804167152e-07, + "logits/chosen": -2.7228991985321045, + "logits/rejected": -2.8949356079101562, + "logps/chosen": -160.4250946044922, + "logps/rejected": -223.53753662109375, + "loss": 0.2595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30080682039260864, + "rewards/margins": 2.7960963249206543, + "rewards/rejected": -3.0969035625457764, + "step": 2371 + }, + { + "epoch": 0.27, + "learning_rate": 2.2130399157204728e-07, + "logits/chosen": -3.3176345825195312, + "logits/rejected": -3.7617435455322266, + "logps/chosen": -103.75447845458984, + "logps/rejected": -183.79425048828125, + "loss": 0.3138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.062077343463897705, + "rewards/margins": 1.5383375883102417, + "rewards/rejected": -1.6004149913787842, + "step": 2372 + }, + { + "epoch": 0.27, + "learning_rate": 2.21268875102423e-07, + "logits/chosen": -2.5565836429595947, + "logits/rejected": -2.4830503463745117, + "logps/chosen": -352.04693603515625, + "logps/rejected": -152.4482421875, + "loss": 0.5654, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5806581974029541, + "rewards/margins": 0.867794930934906, + "rewards/rejected": -1.4484530687332153, + "step": 2373 + }, + { + "epoch": 0.27, + "learning_rate": 2.2123375863279878e-07, + "logits/chosen": -3.0290164947509766, + "logits/rejected": -3.0526957511901855, + "logps/chosen": -149.31640625, + "logps/rejected": -224.25064086914062, + "loss": 0.3692, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19528524577617645, + "rewards/margins": 2.2810399532318115, + "rewards/rejected": -2.085754632949829, + "step": 2374 + }, + { + "epoch": 0.27, + "learning_rate": 2.2119864216317454e-07, + "logits/chosen": -3.318833827972412, + "logits/rejected": -3.4359707832336426, + "logps/chosen": -276.9692077636719, + "logps/rejected": -350.209228515625, + "loss": 0.3379, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12020000070333481, + "rewards/margins": 2.2103943824768066, + "rewards/rejected": -2.330594539642334, + "step": 2375 + }, + { + "epoch": 0.27, + "learning_rate": 2.2116352569355026e-07, + "logits/chosen": -3.231630802154541, + "logits/rejected": -3.184823513031006, + "logps/chosen": -203.3464813232422, + "logps/rejected": -181.96673583984375, + "loss": 0.3575, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20183923840522766, + "rewards/margins": 1.325305461883545, + "rewards/rejected": -1.1234662532806396, + "step": 2376 + }, + { + "epoch": 0.27, + "learning_rate": 2.2112840922392602e-07, + "logits/chosen": -3.107874631881714, + "logits/rejected": -2.8020365238189697, + "logps/chosen": -211.1685791015625, + "logps/rejected": -183.49667358398438, + "loss": 0.3015, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25903183221817017, + "rewards/margins": 1.9042845964431763, + "rewards/rejected": -1.6452527046203613, + "step": 2377 + }, + { + "epoch": 0.27, + "learning_rate": 2.2109329275430175e-07, + "logits/chosen": -3.025005578994751, + "logits/rejected": -3.2265372276306152, + "logps/chosen": -243.36477661132812, + "logps/rejected": -337.70404052734375, + "loss": 0.3672, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35006654262542725, + "rewards/margins": 1.5083962678909302, + "rewards/rejected": -1.8584628105163574, + "step": 2378 + }, + { + "epoch": 0.27, + "learning_rate": 2.210581762846775e-07, + "logits/chosen": -3.4176032543182373, + "logits/rejected": -3.6933553218841553, + "logps/chosen": -182.20391845703125, + "logps/rejected": -228.9852294921875, + "loss": 0.2641, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3721153438091278, + "rewards/margins": 3.1165225505828857, + "rewards/rejected": -2.7444071769714355, + "step": 2379 + }, + { + "epoch": 0.27, + "learning_rate": 2.2102305981505325e-07, + "logits/chosen": -3.6676979064941406, + "logits/rejected": -4.063460350036621, + "logps/chosen": -135.11375427246094, + "logps/rejected": -207.39349365234375, + "loss": 0.4093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24123400449752808, + "rewards/margins": 1.9569981098175049, + "rewards/rejected": -2.198232412338257, + "step": 2380 + }, + { + "epoch": 0.27, + "learning_rate": 2.2098794334542898e-07, + "logits/chosen": -2.870166301727295, + "logits/rejected": -2.632882833480835, + "logps/chosen": -439.5860290527344, + "logps/rejected": -281.8860778808594, + "loss": 0.3691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21394672989845276, + "rewards/margins": 1.2313594818115234, + "rewards/rejected": -1.4453063011169434, + "step": 2381 + }, + { + "epoch": 0.27, + "learning_rate": 2.2095282687580473e-07, + "logits/chosen": -3.380429983139038, + "logits/rejected": -3.255276679992676, + "logps/chosen": -236.44692993164062, + "logps/rejected": -285.94061279296875, + "loss": 0.5173, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18474343419075012, + "rewards/margins": 1.9238542318344116, + "rewards/rejected": -2.108597755432129, + "step": 2382 + }, + { + "epoch": 0.27, + "learning_rate": 2.209177104061805e-07, + "logits/chosen": -2.41460919380188, + "logits/rejected": -2.5124406814575195, + "logps/chosen": -235.37158203125, + "logps/rejected": -299.4578552246094, + "loss": 0.2426, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5221603512763977, + "rewards/margins": 2.3304672241210938, + "rewards/rejected": -1.80830717086792, + "step": 2383 + }, + { + "epoch": 0.27, + "learning_rate": 2.2088259393655622e-07, + "logits/chosen": -3.5568745136260986, + "logits/rejected": -2.92708420753479, + "logps/chosen": -370.26666259765625, + "logps/rejected": -294.4802551269531, + "loss": 0.3657, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5223014950752258, + "rewards/margins": 1.697596788406372, + "rewards/rejected": -2.2198984622955322, + "step": 2384 + }, + { + "epoch": 0.27, + "learning_rate": 2.20847477466932e-07, + "logits/chosen": -3.124657154083252, + "logits/rejected": -3.283043622970581, + "logps/chosen": -253.21127319335938, + "logps/rejected": -238.40670776367188, + "loss": 0.3067, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33296895027160645, + "rewards/margins": 1.9213249683380127, + "rewards/rejected": -1.5883560180664062, + "step": 2385 + }, + { + "epoch": 0.28, + "learning_rate": 2.2081236099730772e-07, + "logits/chosen": -2.3110172748565674, + "logits/rejected": -2.652594566345215, + "logps/chosen": -442.261474609375, + "logps/rejected": -365.875732421875, + "loss": 0.3976, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.268903911113739, + "rewards/margins": 1.5361521244049072, + "rewards/rejected": -1.267248272895813, + "step": 2386 + }, + { + "epoch": 0.28, + "learning_rate": 2.2077724452768348e-07, + "logits/chosen": -2.6843395233154297, + "logits/rejected": -2.880399703979492, + "logps/chosen": -297.4222106933594, + "logps/rejected": -306.49822998046875, + "loss": 0.3316, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22305253148078918, + "rewards/margins": 1.7236433029174805, + "rewards/rejected": -1.9466958045959473, + "step": 2387 + }, + { + "epoch": 0.28, + "learning_rate": 2.2074212805805923e-07, + "logits/chosen": -3.077138662338257, + "logits/rejected": -3.1536471843719482, + "logps/chosen": -182.40321350097656, + "logps/rejected": -185.0398712158203, + "loss": 0.4377, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007775649428367615, + "rewards/margins": 1.5823495388031006, + "rewards/rejected": -1.5745739936828613, + "step": 2388 + }, + { + "epoch": 0.28, + "learning_rate": 2.2070701158843496e-07, + "logits/chosen": -3.8942527770996094, + "logits/rejected": -3.7809829711914062, + "logps/chosen": -246.1905975341797, + "logps/rejected": -258.8467712402344, + "loss": 0.392, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3766113817691803, + "rewards/margins": 1.232938289642334, + "rewards/rejected": -0.8563268780708313, + "step": 2389 + }, + { + "epoch": 0.28, + "learning_rate": 2.206718951188107e-07, + "logits/chosen": -3.1586809158325195, + "logits/rejected": -3.657280921936035, + "logps/chosen": -280.7421875, + "logps/rejected": -308.3777160644531, + "loss": 0.3288, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14101499319076538, + "rewards/margins": 3.210261821746826, + "rewards/rejected": -3.069246768951416, + "step": 2390 + }, + { + "epoch": 0.28, + "learning_rate": 2.2063677864918647e-07, + "logits/chosen": -3.016045093536377, + "logits/rejected": -2.9702980518341064, + "logps/chosen": -411.37939453125, + "logps/rejected": -207.38729858398438, + "loss": 0.4823, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0860232338309288, + "rewards/margins": 1.1013425588607788, + "rewards/rejected": -1.0153193473815918, + "step": 2391 + }, + { + "epoch": 0.28, + "learning_rate": 2.206016621795622e-07, + "logits/chosen": -3.190985679626465, + "logits/rejected": -3.198812961578369, + "logps/chosen": -195.42367553710938, + "logps/rejected": -239.6539306640625, + "loss": 0.4967, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005746133625507355, + "rewards/margins": 0.8703901767730713, + "rewards/rejected": -0.8646441102027893, + "step": 2392 + }, + { + "epoch": 0.28, + "learning_rate": 2.2056654570993795e-07, + "logits/chosen": -2.549551010131836, + "logits/rejected": -2.4927704334259033, + "logps/chosen": -261.33135986328125, + "logps/rejected": -256.83306884765625, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21703052520751953, + "rewards/margins": 2.591268539428711, + "rewards/rejected": -2.8082990646362305, + "step": 2393 + }, + { + "epoch": 0.28, + "learning_rate": 2.2053142924031368e-07, + "logits/chosen": -3.2351911067962646, + "logits/rejected": -3.5481748580932617, + "logps/chosen": -113.99003601074219, + "logps/rejected": -227.33538818359375, + "loss": 0.3424, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2743552625179291, + "rewards/margins": 1.4932503700256348, + "rewards/rejected": -1.7676056623458862, + "step": 2394 + }, + { + "epoch": 0.28, + "learning_rate": 2.2049631277068943e-07, + "logits/chosen": -2.7589290142059326, + "logits/rejected": -2.7966554164886475, + "logps/chosen": -373.5641174316406, + "logps/rejected": -280.7613525390625, + "loss": 0.5189, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.018179267644882202, + "rewards/margins": 0.6036527156829834, + "rewards/rejected": -0.621832013130188, + "step": 2395 + }, + { + "epoch": 0.28, + "learning_rate": 2.204611963010652e-07, + "logits/chosen": -2.3533010482788086, + "logits/rejected": -2.350330352783203, + "logps/chosen": -225.6588134765625, + "logps/rejected": -245.79742431640625, + "loss": 0.453, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05030819773674011, + "rewards/margins": 0.9770106077194214, + "rewards/rejected": -1.0273189544677734, + "step": 2396 + }, + { + "epoch": 0.28, + "learning_rate": 2.2042607983144094e-07, + "logits/chosen": -3.4988393783569336, + "logits/rejected": -3.3664591312408447, + "logps/chosen": -236.99954223632812, + "logps/rejected": -218.96031188964844, + "loss": 0.2357, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3076488673686981, + "rewards/margins": 1.9368271827697754, + "rewards/rejected": -1.629178524017334, + "step": 2397 + }, + { + "epoch": 0.28, + "learning_rate": 2.203909633618167e-07, + "logits/chosen": -2.6876895427703857, + "logits/rejected": -2.8589649200439453, + "logps/chosen": -144.1785888671875, + "logps/rejected": -226.133056640625, + "loss": 0.2208, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2778404951095581, + "rewards/margins": 1.5523988008499146, + "rewards/rejected": -1.2745583057403564, + "step": 2398 + }, + { + "epoch": 0.28, + "learning_rate": 2.2035584689219244e-07, + "logits/chosen": -2.7386555671691895, + "logits/rejected": -2.68749737739563, + "logps/chosen": -279.5812683105469, + "logps/rejected": -289.7491455078125, + "loss": 0.4608, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3533630073070526, + "rewards/margins": 1.2324186563491821, + "rewards/rejected": -1.5857816934585571, + "step": 2399 + }, + { + "epoch": 0.28, + "learning_rate": 2.2032073042256817e-07, + "logits/chosen": -3.0557334423065186, + "logits/rejected": -2.8801448345184326, + "logps/chosen": -388.2572326660156, + "logps/rejected": -261.0730895996094, + "loss": 0.6777, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23178040981292725, + "rewards/margins": 1.0250412225723267, + "rewards/rejected": -1.2568215131759644, + "step": 2400 + }, + { + "epoch": 0.28, + "learning_rate": 2.2028561395294393e-07, + "logits/chosen": -3.4354753494262695, + "logits/rejected": -3.4416000843048096, + "logps/chosen": -197.03607177734375, + "logps/rejected": -338.26092529296875, + "loss": 0.371, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12878771126270294, + "rewards/margins": 1.818946123123169, + "rewards/rejected": -1.9477338790893555, + "step": 2401 + }, + { + "epoch": 0.28, + "learning_rate": 2.2025049748331965e-07, + "logits/chosen": -3.4868335723876953, + "logits/rejected": -3.625142812728882, + "logps/chosen": -225.0349884033203, + "logps/rejected": -217.71945190429688, + "loss": 0.1972, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4626690149307251, + "rewards/margins": 1.8271989822387695, + "rewards/rejected": -1.364530086517334, + "step": 2402 + }, + { + "epoch": 0.28, + "learning_rate": 2.202153810136954e-07, + "logits/chosen": -3.113661289215088, + "logits/rejected": -3.020430088043213, + "logps/chosen": -235.8622589111328, + "logps/rejected": -304.64794921875, + "loss": 0.4089, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05186605453491211, + "rewards/margins": 1.4089009761810303, + "rewards/rejected": -1.3570349216461182, + "step": 2403 + }, + { + "epoch": 0.28, + "learning_rate": 2.2018026454407116e-07, + "logits/chosen": -4.021939277648926, + "logits/rejected": -3.979612350463867, + "logps/chosen": -237.0137939453125, + "logps/rejected": -249.8342742919922, + "loss": 0.3147, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.032747410237789154, + "rewards/margins": 2.257519245147705, + "rewards/rejected": -2.290266752243042, + "step": 2404 + }, + { + "epoch": 0.28, + "learning_rate": 2.201451480744469e-07, + "logits/chosen": -3.4714596271514893, + "logits/rejected": -3.585513114929199, + "logps/chosen": -185.29000854492188, + "logps/rejected": -233.47500610351562, + "loss": 0.5943, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0920921340584755, + "rewards/margins": 1.0241678953170776, + "rewards/rejected": -1.116260051727295, + "step": 2405 + }, + { + "epoch": 0.28, + "learning_rate": 2.2011003160482264e-07, + "logits/chosen": -3.750382900238037, + "logits/rejected": -3.3204398155212402, + "logps/chosen": -333.0164794921875, + "logps/rejected": -330.2599792480469, + "loss": 0.1647, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7107330560684204, + "rewards/margins": 2.8035542964935303, + "rewards/rejected": -2.0928211212158203, + "step": 2406 + }, + { + "epoch": 0.28, + "learning_rate": 2.2007491513519842e-07, + "logits/chosen": -3.603848457336426, + "logits/rejected": -3.0772993564605713, + "logps/chosen": -189.6065673828125, + "logps/rejected": -232.31509399414062, + "loss": 0.3877, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19944944977760315, + "rewards/margins": 1.851662039756775, + "rewards/rejected": -2.0511114597320557, + "step": 2407 + }, + { + "epoch": 0.28, + "learning_rate": 2.2003979866557415e-07, + "logits/chosen": -3.3868868350982666, + "logits/rejected": -3.2851734161376953, + "logps/chosen": -281.1761779785156, + "logps/rejected": -270.9393005371094, + "loss": 0.2951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3579005300998688, + "rewards/margins": 1.955430269241333, + "rewards/rejected": -2.313331127166748, + "step": 2408 + }, + { + "epoch": 0.28, + "learning_rate": 2.200046821959499e-07, + "logits/chosen": -3.0388286113739014, + "logits/rejected": -3.128875494003296, + "logps/chosen": -120.84025573730469, + "logps/rejected": -185.491455078125, + "loss": 0.3228, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46276143193244934, + "rewards/margins": 2.207402229309082, + "rewards/rejected": -1.7446409463882446, + "step": 2409 + }, + { + "epoch": 0.28, + "learning_rate": 2.1996956572632563e-07, + "logits/chosen": -3.182076930999756, + "logits/rejected": -3.0563042163848877, + "logps/chosen": -236.756103515625, + "logps/rejected": -262.5376892089844, + "loss": 0.4881, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16176122426986694, + "rewards/margins": 0.8078562021255493, + "rewards/rejected": -0.969617486000061, + "step": 2410 + }, + { + "epoch": 0.28, + "learning_rate": 2.1993444925670138e-07, + "logits/chosen": -2.08149790763855, + "logits/rejected": -2.1729912757873535, + "logps/chosen": -407.6117858886719, + "logps/rejected": -252.29693603515625, + "loss": 0.4656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3368873596191406, + "rewards/margins": 0.8198865652084351, + "rewards/rejected": -0.4829992651939392, + "step": 2411 + }, + { + "epoch": 0.28, + "learning_rate": 2.1989933278707714e-07, + "logits/chosen": -3.274141788482666, + "logits/rejected": -3.042417526245117, + "logps/chosen": -199.77655029296875, + "logps/rejected": -204.34878540039062, + "loss": 0.3464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03936605527997017, + "rewards/margins": 1.9879788160324097, + "rewards/rejected": -2.0273449420928955, + "step": 2412 + }, + { + "epoch": 0.28, + "learning_rate": 2.1986421631745287e-07, + "logits/chosen": -3.353074073791504, + "logits/rejected": -3.4450905323028564, + "logps/chosen": -232.88137817382812, + "logps/rejected": -373.7280578613281, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5070069432258606, + "rewards/margins": 2.8949999809265137, + "rewards/rejected": -2.387993097305298, + "step": 2413 + }, + { + "epoch": 0.28, + "learning_rate": 2.1982909984782862e-07, + "logits/chosen": -3.47517728805542, + "logits/rejected": -3.363884925842285, + "logps/chosen": -145.9418182373047, + "logps/rejected": -180.2159881591797, + "loss": 0.3109, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09704026579856873, + "rewards/margins": 2.841865062713623, + "rewards/rejected": -2.7448248863220215, + "step": 2414 + }, + { + "epoch": 0.28, + "learning_rate": 2.1979398337820437e-07, + "logits/chosen": -3.598708391189575, + "logits/rejected": -3.782074213027954, + "logps/chosen": -135.23104858398438, + "logps/rejected": -296.7024841308594, + "loss": 0.2586, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.020806431770324707, + "rewards/margins": 2.9122676849365234, + "rewards/rejected": -2.891461133956909, + "step": 2415 + }, + { + "epoch": 0.28, + "learning_rate": 2.197588669085801e-07, + "logits/chosen": -3.503203868865967, + "logits/rejected": -3.418900728225708, + "logps/chosen": -294.357421875, + "logps/rejected": -271.21673583984375, + "loss": 0.7155, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6230675578117371, + "rewards/margins": 0.2991870045661926, + "rewards/rejected": -0.9222546219825745, + "step": 2416 + }, + { + "epoch": 0.28, + "learning_rate": 2.1972375043895586e-07, + "logits/chosen": -3.8274247646331787, + "logits/rejected": -3.6736860275268555, + "logps/chosen": -128.14199829101562, + "logps/rejected": -233.97918701171875, + "loss": 0.6057, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10199102759361267, + "rewards/margins": 0.7906545400619507, + "rewards/rejected": -0.8926455974578857, + "step": 2417 + }, + { + "epoch": 0.28, + "learning_rate": 2.1968863396933158e-07, + "logits/chosen": -3.103031635284424, + "logits/rejected": -2.6888766288757324, + "logps/chosen": -201.49899291992188, + "logps/rejected": -184.32421875, + "loss": 0.4075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12238103151321411, + "rewards/margins": 1.5899425745010376, + "rewards/rejected": -1.7123236656188965, + "step": 2418 + }, + { + "epoch": 0.28, + "learning_rate": 2.1965351749970736e-07, + "logits/chosen": -3.2060046195983887, + "logits/rejected": -3.0454280376434326, + "logps/chosen": -264.29901123046875, + "logps/rejected": -240.30360412597656, + "loss": 0.2789, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15718016028404236, + "rewards/margins": 1.744826316833496, + "rewards/rejected": -1.5876461267471313, + "step": 2419 + }, + { + "epoch": 0.28, + "learning_rate": 2.1961840103008312e-07, + "logits/chosen": -3.383967399597168, + "logits/rejected": -3.0167808532714844, + "logps/chosen": -211.47976684570312, + "logps/rejected": -127.46572875976562, + "loss": 0.3933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0747290700674057, + "rewards/margins": 1.108471155166626, + "rewards/rejected": -1.1832003593444824, + "step": 2420 + }, + { + "epoch": 0.28, + "learning_rate": 2.1958328456045884e-07, + "logits/chosen": -3.1186742782592773, + "logits/rejected": -3.0124318599700928, + "logps/chosen": -356.3381652832031, + "logps/rejected": -297.26544189453125, + "loss": 0.5971, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1529424637556076, + "rewards/margins": 1.3905649185180664, + "rewards/rejected": -1.2376224994659424, + "step": 2421 + }, + { + "epoch": 0.28, + "learning_rate": 2.195481680908346e-07, + "logits/chosen": -2.620532989501953, + "logits/rejected": -3.1646366119384766, + "logps/chosen": -335.073974609375, + "logps/rejected": -406.078857421875, + "loss": 0.1737, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00973445177078247, + "rewards/margins": 2.3623414039611816, + "rewards/rejected": -2.352606773376465, + "step": 2422 + }, + { + "epoch": 0.28, + "learning_rate": 2.1951305162121033e-07, + "logits/chosen": -2.7919023036956787, + "logits/rejected": -3.311110258102417, + "logps/chosen": -259.38714599609375, + "logps/rejected": -214.59597778320312, + "loss": 0.1789, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2546161711215973, + "rewards/margins": 2.521509885787964, + "rewards/rejected": -2.2668936252593994, + "step": 2423 + }, + { + "epoch": 0.28, + "learning_rate": 2.1947793515158608e-07, + "logits/chosen": -2.4544413089752197, + "logits/rejected": -2.812197208404541, + "logps/chosen": -226.77920532226562, + "logps/rejected": -232.41847229003906, + "loss": 0.2356, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029917694628238678, + "rewards/margins": 1.5192826986312866, + "rewards/rejected": -1.4893651008605957, + "step": 2424 + }, + { + "epoch": 0.28, + "learning_rate": 2.1944281868196183e-07, + "logits/chosen": -3.0819594860076904, + "logits/rejected": -3.089066743850708, + "logps/chosen": -261.36407470703125, + "logps/rejected": -311.8839111328125, + "loss": 0.2, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3528956174850464, + "rewards/margins": 2.5575754642486572, + "rewards/rejected": -2.9104714393615723, + "step": 2425 + }, + { + "epoch": 0.28, + "learning_rate": 2.1940770221233756e-07, + "logits/chosen": -4.018985748291016, + "logits/rejected": -3.828914165496826, + "logps/chosen": -456.403076171875, + "logps/rejected": -353.1015930175781, + "loss": 0.5421, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0994906425476074, + "rewards/margins": 2.3149473667144775, + "rewards/rejected": -3.414437770843506, + "step": 2426 + }, + { + "epoch": 0.28, + "learning_rate": 2.1937258574271331e-07, + "logits/chosen": -3.517800807952881, + "logits/rejected": -3.415616035461426, + "logps/chosen": -118.27099609375, + "logps/rejected": -123.96674346923828, + "loss": 0.5355, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4435257315635681, + "rewards/margins": 0.7377454042434692, + "rewards/rejected": -0.2942197024822235, + "step": 2427 + }, + { + "epoch": 0.28, + "learning_rate": 2.193374692730891e-07, + "logits/chosen": -2.7372374534606934, + "logits/rejected": -2.770352602005005, + "logps/chosen": -210.067138671875, + "logps/rejected": -224.475341796875, + "loss": 0.3228, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08157536387443542, + "rewards/margins": 1.7329797744750977, + "rewards/rejected": -1.6514043807983398, + "step": 2428 + }, + { + "epoch": 0.28, + "learning_rate": 2.193023528034648e-07, + "logits/chosen": -3.1419503688812256, + "logits/rejected": -2.997148275375366, + "logps/chosen": -153.3421173095703, + "logps/rejected": -258.42919921875, + "loss": 0.1555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08342582732439041, + "rewards/margins": 2.4668238162994385, + "rewards/rejected": -2.5502495765686035, + "step": 2429 + }, + { + "epoch": 0.28, + "learning_rate": 2.1926723633384058e-07, + "logits/chosen": -3.320883274078369, + "logits/rejected": -3.1587305068969727, + "logps/chosen": -547.282470703125, + "logps/rejected": -454.83868408203125, + "loss": 0.321, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1741361767053604, + "rewards/margins": 1.6035101413726807, + "rewards/rejected": -1.7776463031768799, + "step": 2430 + }, + { + "epoch": 0.28, + "learning_rate": 2.192321198642163e-07, + "logits/chosen": -3.7074661254882812, + "logits/rejected": -3.3746659755706787, + "logps/chosen": -281.88360595703125, + "logps/rejected": -231.26478576660156, + "loss": 0.2027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16728559136390686, + "rewards/margins": 2.1524500846862793, + "rewards/rejected": -2.319735527038574, + "step": 2431 + }, + { + "epoch": 0.28, + "learning_rate": 2.1919700339459206e-07, + "logits/chosen": -2.621852397918701, + "logits/rejected": -2.738388776779175, + "logps/chosen": -230.6024627685547, + "logps/rejected": -217.95655822753906, + "loss": 0.2405, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3766915798187256, + "rewards/margins": 2.4472131729125977, + "rewards/rejected": -2.070521354675293, + "step": 2432 + }, + { + "epoch": 0.28, + "learning_rate": 2.191618869249678e-07, + "logits/chosen": -2.5729453563690186, + "logits/rejected": -2.510162353515625, + "logps/chosen": -219.2705841064453, + "logps/rejected": -253.13291931152344, + "loss": 0.5897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19528169929981232, + "rewards/margins": 1.7140196561813354, + "rewards/rejected": -1.9093014001846313, + "step": 2433 + }, + { + "epoch": 0.28, + "learning_rate": 2.1912677045534354e-07, + "logits/chosen": -3.5079660415649414, + "logits/rejected": -3.80427885055542, + "logps/chosen": -322.02587890625, + "logps/rejected": -291.5752258300781, + "loss": 0.4048, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36178719997406006, + "rewards/margins": 1.2521203756332397, + "rewards/rejected": -0.8903331756591797, + "step": 2434 + }, + { + "epoch": 0.28, + "learning_rate": 2.190916539857193e-07, + "logits/chosen": -3.756080389022827, + "logits/rejected": -3.5474343299865723, + "logps/chosen": -199.63638305664062, + "logps/rejected": -111.511474609375, + "loss": 0.4583, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.256507933139801, + "rewards/margins": 1.2519280910491943, + "rewards/rejected": -0.9954202175140381, + "step": 2435 + }, + { + "epoch": 0.28, + "learning_rate": 2.1905653751609505e-07, + "logits/chosen": -3.2701361179351807, + "logits/rejected": -2.8404221534729004, + "logps/chosen": -204.3403778076172, + "logps/rejected": -204.09979248046875, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21677035093307495, + "rewards/margins": 1.8074097633361816, + "rewards/rejected": -2.0241804122924805, + "step": 2436 + }, + { + "epoch": 0.28, + "learning_rate": 2.1902142104647077e-07, + "logits/chosen": -3.6798810958862305, + "logits/rejected": -3.082275867462158, + "logps/chosen": -324.6055908203125, + "logps/rejected": -257.0491943359375, + "loss": 0.5478, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23082469403743744, + "rewards/margins": 1.3557943105697632, + "rewards/rejected": -1.1249696016311646, + "step": 2437 + }, + { + "epoch": 0.28, + "learning_rate": 2.1898630457684653e-07, + "logits/chosen": -3.4572887420654297, + "logits/rejected": -3.2142372131347656, + "logps/chosen": -216.39266967773438, + "logps/rejected": -143.99981689453125, + "loss": 0.4205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13798195123672485, + "rewards/margins": 1.1712979078292847, + "rewards/rejected": -1.3092797994613647, + "step": 2438 + }, + { + "epoch": 0.28, + "learning_rate": 2.1895118810722225e-07, + "logits/chosen": -3.1805362701416016, + "logits/rejected": -3.137821912765503, + "logps/chosen": -320.88018798828125, + "logps/rejected": -365.652099609375, + "loss": 0.52, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0820050984621048, + "rewards/margins": 2.076395034790039, + "rewards/rejected": -1.9943897724151611, + "step": 2439 + }, + { + "epoch": 0.28, + "learning_rate": 2.18916071637598e-07, + "logits/chosen": -2.8748421669006348, + "logits/rejected": -3.0962469577789307, + "logps/chosen": -222.6476593017578, + "logps/rejected": -240.46768188476562, + "loss": 0.3569, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22386044263839722, + "rewards/margins": 1.2036077976226807, + "rewards/rejected": -0.9797472953796387, + "step": 2440 + }, + { + "epoch": 0.28, + "learning_rate": 2.188809551679738e-07, + "logits/chosen": -3.0261402130126953, + "logits/rejected": -2.900965690612793, + "logps/chosen": -222.828857421875, + "logps/rejected": -256.79217529296875, + "loss": 0.4532, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08372204750776291, + "rewards/margins": 1.3232771158218384, + "rewards/rejected": -1.2395551204681396, + "step": 2441 + }, + { + "epoch": 0.28, + "learning_rate": 2.1884583869834952e-07, + "logits/chosen": -2.801417589187622, + "logits/rejected": -2.838425874710083, + "logps/chosen": -157.430419921875, + "logps/rejected": -155.1853485107422, + "loss": 0.566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3712718188762665, + "rewards/margins": 0.6495455503463745, + "rewards/rejected": -1.0208172798156738, + "step": 2442 + }, + { + "epoch": 0.28, + "learning_rate": 2.1881072222872527e-07, + "logits/chosen": -2.575927972793579, + "logits/rejected": -2.4217116832733154, + "logps/chosen": -248.05368041992188, + "logps/rejected": -260.1338195800781, + "loss": 0.5805, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12393037229776382, + "rewards/margins": 0.6171920299530029, + "rewards/rejected": -0.4932616651058197, + "step": 2443 + }, + { + "epoch": 0.28, + "learning_rate": 2.1877560575910102e-07, + "logits/chosen": -3.6244165897369385, + "logits/rejected": -3.8091020584106445, + "logps/chosen": -128.9793701171875, + "logps/rejected": -192.08836364746094, + "loss": 0.5929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5581544637680054, + "rewards/margins": 2.3534533977508545, + "rewards/rejected": -2.9116082191467285, + "step": 2444 + }, + { + "epoch": 0.28, + "learning_rate": 2.1874048928947675e-07, + "logits/chosen": -3.1359071731567383, + "logits/rejected": -2.9606525897979736, + "logps/chosen": -403.9376525878906, + "logps/rejected": -179.67617797851562, + "loss": 0.2565, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5444480180740356, + "rewards/margins": 1.719770073890686, + "rewards/rejected": -1.1753219366073608, + "step": 2445 + }, + { + "epoch": 0.28, + "learning_rate": 2.187053728198525e-07, + "logits/chosen": -3.228257656097412, + "logits/rejected": -3.000086545944214, + "logps/chosen": -321.0558776855469, + "logps/rejected": -194.30027770996094, + "loss": 0.4433, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12248248606920242, + "rewards/margins": 1.4330296516418457, + "rewards/rejected": -1.3105472326278687, + "step": 2446 + }, + { + "epoch": 0.28, + "learning_rate": 2.1867025635022823e-07, + "logits/chosen": -3.75832462310791, + "logits/rejected": -3.7247042655944824, + "logps/chosen": -143.0338134765625, + "logps/rejected": -206.94908142089844, + "loss": 0.2749, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16749566793441772, + "rewards/margins": 1.5264328718185425, + "rewards/rejected": -1.3589372634887695, + "step": 2447 + }, + { + "epoch": 0.28, + "learning_rate": 2.1863513988060399e-07, + "logits/chosen": -3.0892677307128906, + "logits/rejected": -3.06630539894104, + "logps/chosen": -261.5441589355469, + "logps/rejected": -205.29661560058594, + "loss": 0.4242, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2856305241584778, + "rewards/margins": 0.8872074484825134, + "rewards/rejected": -1.1728379726409912, + "step": 2448 + }, + { + "epoch": 0.28, + "learning_rate": 2.1860002341097974e-07, + "logits/chosen": -3.215249538421631, + "logits/rejected": -3.2620458602905273, + "logps/chosen": -339.4765625, + "logps/rejected": -211.56326293945312, + "loss": 0.3774, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4658085107803345, + "rewards/margins": 1.4360580444335938, + "rewards/rejected": -1.9018666744232178, + "step": 2449 + }, + { + "epoch": 0.28, + "learning_rate": 2.1856490694135547e-07, + "logits/chosen": -2.947361469268799, + "logits/rejected": -2.968222141265869, + "logps/chosen": -328.7845764160156, + "logps/rejected": -269.6783447265625, + "loss": 0.3448, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3406258225440979, + "rewards/margins": 2.0198113918304443, + "rewards/rejected": -1.6791855096817017, + "step": 2450 + }, + { + "epoch": 0.28, + "learning_rate": 2.1852979047173122e-07, + "logits/chosen": -2.203819513320923, + "logits/rejected": -2.1777544021606445, + "logps/chosen": -308.4319763183594, + "logps/rejected": -300.3418884277344, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.251875638961792, + "rewards/margins": 1.6042571067810059, + "rewards/rejected": -1.3523813486099243, + "step": 2451 + }, + { + "epoch": 0.28, + "learning_rate": 2.18494674002107e-07, + "logits/chosen": -3.14052414894104, + "logits/rejected": -3.276562213897705, + "logps/chosen": -223.97531127929688, + "logps/rejected": -298.59765625, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06599549949169159, + "rewards/margins": 1.4284056425094604, + "rewards/rejected": -1.4944013357162476, + "step": 2452 + }, + { + "epoch": 0.28, + "learning_rate": 2.1845955753248273e-07, + "logits/chosen": -2.912686347961426, + "logits/rejected": -3.2183055877685547, + "logps/chosen": -120.59547424316406, + "logps/rejected": -198.07168579101562, + "loss": 0.3565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3066108226776123, + "rewards/margins": 2.31168270111084, + "rewards/rejected": -2.618293285369873, + "step": 2453 + }, + { + "epoch": 0.28, + "learning_rate": 2.1842444106285848e-07, + "logits/chosen": -2.7521955966949463, + "logits/rejected": -3.034346342086792, + "logps/chosen": -332.7185974121094, + "logps/rejected": -376.32928466796875, + "loss": 0.4256, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07439442723989487, + "rewards/margins": 1.4981579780578613, + "rewards/rejected": -1.4237632751464844, + "step": 2454 + }, + { + "epoch": 0.28, + "learning_rate": 2.183893245932342e-07, + "logits/chosen": -2.552117109298706, + "logits/rejected": -2.67108416557312, + "logps/chosen": -236.9710693359375, + "logps/rejected": -329.45623779296875, + "loss": 0.5294, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.021329151466488838, + "rewards/margins": 1.016088843345642, + "rewards/rejected": -0.9947596788406372, + "step": 2455 + }, + { + "epoch": 0.28, + "learning_rate": 2.1835420812360996e-07, + "logits/chosen": -3.25130033493042, + "logits/rejected": -3.200017213821411, + "logps/chosen": -194.1268310546875, + "logps/rejected": -305.9853210449219, + "loss": 0.4243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3273342251777649, + "rewards/margins": 2.5751430988311768, + "rewards/rejected": -2.902477264404297, + "step": 2456 + }, + { + "epoch": 0.28, + "learning_rate": 2.1831909165398572e-07, + "logits/chosen": -3.3787519931793213, + "logits/rejected": -3.4744396209716797, + "logps/chosen": -180.8157196044922, + "logps/rejected": -131.29067993164062, + "loss": 0.3407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07379353046417236, + "rewards/margins": 1.1642754077911377, + "rewards/rejected": -1.2380690574645996, + "step": 2457 + }, + { + "epoch": 0.28, + "learning_rate": 2.1828397518436145e-07, + "logits/chosen": -2.6219289302825928, + "logits/rejected": -2.8350653648376465, + "logps/chosen": -247.43992614746094, + "logps/rejected": -219.67323303222656, + "loss": 0.4002, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08709601312875748, + "rewards/margins": 1.2533464431762695, + "rewards/rejected": -1.340442419052124, + "step": 2458 + }, + { + "epoch": 0.28, + "learning_rate": 2.182488587147372e-07, + "logits/chosen": -2.7775309085845947, + "logits/rejected": -2.63043475151062, + "logps/chosen": -422.1219482421875, + "logps/rejected": -252.15554809570312, + "loss": 0.2352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019235610961914062, + "rewards/margins": 2.149407386779785, + "rewards/rejected": -2.1686432361602783, + "step": 2459 + }, + { + "epoch": 0.28, + "learning_rate": 2.1821374224511295e-07, + "logits/chosen": -4.210098743438721, + "logits/rejected": -4.035314083099365, + "logps/chosen": -223.18902587890625, + "logps/rejected": -210.02633666992188, + "loss": 0.5007, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06098141521215439, + "rewards/margins": 1.4619140625, + "rewards/rejected": -1.4009325504302979, + "step": 2460 + }, + { + "epoch": 0.28, + "learning_rate": 2.1817862577548868e-07, + "logits/chosen": -3.3811802864074707, + "logits/rejected": -3.4701714515686035, + "logps/chosen": -181.46426391601562, + "logps/rejected": -219.365966796875, + "loss": 0.2773, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09229210019111633, + "rewards/margins": 2.1939291954040527, + "rewards/rejected": -2.1016368865966797, + "step": 2461 + }, + { + "epoch": 0.28, + "learning_rate": 2.1814350930586446e-07, + "logits/chosen": -2.968677520751953, + "logits/rejected": -2.926361083984375, + "logps/chosen": -263.7087707519531, + "logps/rejected": -269.3233337402344, + "loss": 0.7407, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7600381374359131, + "rewards/margins": 0.8985783457756042, + "rewards/rejected": -1.6586166620254517, + "step": 2462 + }, + { + "epoch": 0.28, + "learning_rate": 2.1810839283624016e-07, + "logits/chosen": -3.7632837295532227, + "logits/rejected": -3.830670118331909, + "logps/chosen": -388.6455383300781, + "logps/rejected": -365.0164794921875, + "loss": 0.4336, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.449359655380249, + "rewards/margins": 1.8442012071609497, + "rewards/rejected": -2.293560743331909, + "step": 2463 + }, + { + "epoch": 0.28, + "learning_rate": 2.1807327636661594e-07, + "logits/chosen": -3.249234676361084, + "logits/rejected": -3.379873514175415, + "logps/chosen": -388.23150634765625, + "logps/rejected": -273.75994873046875, + "loss": 0.5919, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.200859934091568, + "rewards/margins": 0.6882392168045044, + "rewards/rejected": -0.48737916350364685, + "step": 2464 + }, + { + "epoch": 0.28, + "learning_rate": 2.180381598969917e-07, + "logits/chosen": -3.820143699645996, + "logits/rejected": -3.7728981971740723, + "logps/chosen": -181.84933471679688, + "logps/rejected": -76.54154968261719, + "loss": 0.8428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39957424998283386, + "rewards/margins": 0.039460718631744385, + "rewards/rejected": -0.43903499841690063, + "step": 2465 + }, + { + "epoch": 0.28, + "learning_rate": 2.1800304342736742e-07, + "logits/chosen": -3.8907947540283203, + "logits/rejected": -3.780057430267334, + "logps/chosen": -182.345947265625, + "logps/rejected": -196.57447814941406, + "loss": 0.5712, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20988132059574127, + "rewards/margins": 1.065144419670105, + "rewards/rejected": -0.8552630543708801, + "step": 2466 + }, + { + "epoch": 0.28, + "learning_rate": 2.1796792695774318e-07, + "logits/chosen": -3.7114272117614746, + "logits/rejected": -3.409396171569824, + "logps/chosen": -229.558837890625, + "logps/rejected": -260.908203125, + "loss": 0.4049, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011846736073493958, + "rewards/margins": 2.6365349292755127, + "rewards/rejected": -2.6483817100524902, + "step": 2467 + }, + { + "epoch": 0.28, + "learning_rate": 2.1793281048811893e-07, + "logits/chosen": -3.204981803894043, + "logits/rejected": -3.170121431350708, + "logps/chosen": -205.509033203125, + "logps/rejected": -313.5057678222656, + "loss": 0.3047, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0013806819915771484, + "rewards/margins": 1.938448190689087, + "rewards/rejected": -1.9370676279067993, + "step": 2468 + }, + { + "epoch": 0.28, + "learning_rate": 2.1789769401849466e-07, + "logits/chosen": -2.6218063831329346, + "logits/rejected": -2.562359094619751, + "logps/chosen": -334.5001220703125, + "logps/rejected": -213.37925720214844, + "loss": 0.7113, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.808554470539093, + "rewards/margins": 0.41915979981422424, + "rewards/rejected": -1.2277144193649292, + "step": 2469 + }, + { + "epoch": 0.28, + "learning_rate": 2.178625775488704e-07, + "logits/chosen": -3.3911120891571045, + "logits/rejected": -3.8220114707946777, + "logps/chosen": -256.6356506347656, + "logps/rejected": -254.3651580810547, + "loss": 0.2172, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04758182168006897, + "rewards/margins": 2.3955259323120117, + "rewards/rejected": -2.3479440212249756, + "step": 2470 + }, + { + "epoch": 0.28, + "learning_rate": 2.1782746107924614e-07, + "logits/chosen": -2.9821865558624268, + "logits/rejected": -2.9906139373779297, + "logps/chosen": -305.24993896484375, + "logps/rejected": -231.15438842773438, + "loss": 0.267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39678874611854553, + "rewards/margins": 1.6343164443969727, + "rewards/rejected": -2.0311052799224854, + "step": 2471 + }, + { + "epoch": 0.28, + "learning_rate": 2.177923446096219e-07, + "logits/chosen": -2.735107660293579, + "logits/rejected": -2.468890905380249, + "logps/chosen": -213.21627807617188, + "logps/rejected": -118.34982299804688, + "loss": 0.8883, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5310587882995605, + "rewards/margins": -0.013679593801498413, + "rewards/rejected": -0.5173791646957397, + "step": 2472 + }, + { + "epoch": 0.29, + "learning_rate": 2.1775722813999767e-07, + "logits/chosen": -3.4001710414886475, + "logits/rejected": -3.059512138366699, + "logps/chosen": -285.00543212890625, + "logps/rejected": -217.40570068359375, + "loss": 0.3331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12728194892406464, + "rewards/margins": 1.8616950511932373, + "rewards/rejected": -1.9889768362045288, + "step": 2473 + }, + { + "epoch": 0.29, + "learning_rate": 2.1772211167037337e-07, + "logits/chosen": -3.1767196655273438, + "logits/rejected": -3.449080467224121, + "logps/chosen": -263.1763610839844, + "logps/rejected": -282.4006042480469, + "loss": 0.1196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.54848313331604, + "rewards/margins": 3.045677423477173, + "rewards/rejected": -2.497194290161133, + "step": 2474 + }, + { + "epoch": 0.29, + "learning_rate": 2.1768699520074916e-07, + "logits/chosen": -3.092620849609375, + "logits/rejected": -3.1552720069885254, + "logps/chosen": -192.42620849609375, + "logps/rejected": -260.7003173828125, + "loss": 0.3234, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18665088713169098, + "rewards/margins": 2.06071138381958, + "rewards/rejected": -1.8740603923797607, + "step": 2475 + }, + { + "epoch": 0.29, + "learning_rate": 2.1765187873112488e-07, + "logits/chosen": -2.7265820503234863, + "logits/rejected": -2.891597270965576, + "logps/chosen": -316.4152526855469, + "logps/rejected": -281.15380859375, + "loss": 0.5573, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40571850538253784, + "rewards/margins": 1.1902090311050415, + "rewards/rejected": -1.5959275960922241, + "step": 2476 + }, + { + "epoch": 0.29, + "learning_rate": 2.1761676226150064e-07, + "logits/chosen": -2.54168701171875, + "logits/rejected": -2.733599901199341, + "logps/chosen": -228.31362915039062, + "logps/rejected": -147.59320068359375, + "loss": 0.4779, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3791736364364624, + "rewards/margins": 1.1870970726013184, + "rewards/rejected": -0.807923436164856, + "step": 2477 + }, + { + "epoch": 0.29, + "learning_rate": 2.175816457918764e-07, + "logits/chosen": -2.7614684104919434, + "logits/rejected": -2.716972827911377, + "logps/chosen": -578.1192016601562, + "logps/rejected": -459.1675720214844, + "loss": 0.3616, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03346632421016693, + "rewards/margins": 2.1798722743988037, + "rewards/rejected": -2.1464059352874756, + "step": 2478 + }, + { + "epoch": 0.29, + "learning_rate": 2.1754652932225212e-07, + "logits/chosen": -3.615689992904663, + "logits/rejected": -3.5108532905578613, + "logps/chosen": -226.25509643554688, + "logps/rejected": -274.2037353515625, + "loss": 0.3841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3713642358779907, + "rewards/margins": 3.025009870529175, + "rewards/rejected": -3.396373987197876, + "step": 2479 + }, + { + "epoch": 0.29, + "learning_rate": 2.1751141285262787e-07, + "logits/chosen": -2.46044659614563, + "logits/rejected": -2.639129638671875, + "logps/chosen": -433.7906494140625, + "logps/rejected": -299.1061096191406, + "loss": 0.1842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10184717178344727, + "rewards/margins": 1.8733896017074585, + "rewards/rejected": -1.9752366542816162, + "step": 2480 + }, + { + "epoch": 0.29, + "learning_rate": 2.1747629638300363e-07, + "logits/chosen": -3.120635986328125, + "logits/rejected": -3.331043004989624, + "logps/chosen": -377.9277038574219, + "logps/rejected": -202.9185791015625, + "loss": 0.3452, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18138794600963593, + "rewards/margins": 1.8247970342636108, + "rewards/rejected": -2.0061850547790527, + "step": 2481 + }, + { + "epoch": 0.29, + "learning_rate": 2.1744117991337935e-07, + "logits/chosen": -3.5460946559906006, + "logits/rejected": -3.69732666015625, + "logps/chosen": -355.28875732421875, + "logps/rejected": -326.8369140625, + "loss": 0.6411, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4959222972393036, + "rewards/margins": 0.5692657232284546, + "rewards/rejected": -1.065187931060791, + "step": 2482 + }, + { + "epoch": 0.29, + "learning_rate": 2.174060634437551e-07, + "logits/chosen": -2.488938570022583, + "logits/rejected": -2.4261369705200195, + "logps/chosen": -467.16400146484375, + "logps/rejected": -316.6492004394531, + "loss": 0.6097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1638650894165039, + "rewards/margins": 0.9151331186294556, + "rewards/rejected": -1.0789982080459595, + "step": 2483 + }, + { + "epoch": 0.29, + "learning_rate": 2.1737094697413083e-07, + "logits/chosen": -2.5669665336608887, + "logits/rejected": -2.4397716522216797, + "logps/chosen": -287.9142761230469, + "logps/rejected": -308.5404052734375, + "loss": 0.6615, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30597782135009766, + "rewards/margins": 0.6199995279312134, + "rewards/rejected": -0.925977349281311, + "step": 2484 + }, + { + "epoch": 0.29, + "learning_rate": 2.173358305045066e-07, + "logits/chosen": -2.5982868671417236, + "logits/rejected": -2.5021119117736816, + "logps/chosen": -132.1014404296875, + "logps/rejected": -238.27015686035156, + "loss": 0.3, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17167960107326508, + "rewards/margins": 1.7378793954849243, + "rewards/rejected": -1.5662000179290771, + "step": 2485 + }, + { + "epoch": 0.29, + "learning_rate": 2.1730071403488237e-07, + "logits/chosen": -3.404895782470703, + "logits/rejected": -3.0755972862243652, + "logps/chosen": -208.4241485595703, + "logps/rejected": -139.91775512695312, + "loss": 0.4817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.025990381836891174, + "rewards/margins": 0.827010989189148, + "rewards/rejected": -0.8530014157295227, + "step": 2486 + }, + { + "epoch": 0.29, + "learning_rate": 2.172655975652581e-07, + "logits/chosen": -2.539649248123169, + "logits/rejected": -2.4222071170806885, + "logps/chosen": -140.99176025390625, + "logps/rejected": -267.51507568359375, + "loss": 0.3436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1797463446855545, + "rewards/margins": 1.2750368118286133, + "rewards/rejected": -1.4547832012176514, + "step": 2487 + }, + { + "epoch": 0.29, + "learning_rate": 2.1723048109563385e-07, + "logits/chosen": -2.7007298469543457, + "logits/rejected": -2.281275510787964, + "logps/chosen": -345.035400390625, + "logps/rejected": -217.8950653076172, + "loss": 0.402, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.011309966444969177, + "rewards/margins": 1.118504524230957, + "rewards/rejected": -1.1071945428848267, + "step": 2488 + }, + { + "epoch": 0.29, + "learning_rate": 2.171953646260096e-07, + "logits/chosen": -2.99935245513916, + "logits/rejected": -2.631479263305664, + "logps/chosen": -305.01983642578125, + "logps/rejected": -213.12973022460938, + "loss": 0.6579, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11633267253637314, + "rewards/margins": 1.1751036643981934, + "rewards/rejected": -1.058771014213562, + "step": 2489 + }, + { + "epoch": 0.29, + "learning_rate": 2.1716024815638533e-07, + "logits/chosen": -3.290658473968506, + "logits/rejected": -3.1096949577331543, + "logps/chosen": -199.30447387695312, + "logps/rejected": -240.88946533203125, + "loss": 0.241, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1683589518070221, + "rewards/margins": 1.7305457592010498, + "rewards/rejected": -1.5621867179870605, + "step": 2490 + }, + { + "epoch": 0.29, + "learning_rate": 2.1712513168676108e-07, + "logits/chosen": -2.925703525543213, + "logits/rejected": -2.7297894954681396, + "logps/chosen": -385.4110412597656, + "logps/rejected": -379.5626220703125, + "loss": 0.2164, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10115164518356323, + "rewards/margins": 2.2671008110046387, + "rewards/rejected": -2.1659491062164307, + "step": 2491 + }, + { + "epoch": 0.29, + "learning_rate": 2.170900152171368e-07, + "logits/chosen": -2.33902645111084, + "logits/rejected": -2.3498055934906006, + "logps/chosen": -318.0898132324219, + "logps/rejected": -296.12347412109375, + "loss": 0.6364, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2034817636013031, + "rewards/margins": 0.799735426902771, + "rewards/rejected": -1.003217101097107, + "step": 2492 + }, + { + "epoch": 0.29, + "learning_rate": 2.1705489874751257e-07, + "logits/chosen": -2.928410530090332, + "logits/rejected": -3.112292766571045, + "logps/chosen": -183.60125732421875, + "logps/rejected": -200.45562744140625, + "loss": 0.411, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17708800733089447, + "rewards/margins": 1.5838189125061035, + "rewards/rejected": -1.7609069347381592, + "step": 2493 + }, + { + "epoch": 0.29, + "learning_rate": 2.1701978227788832e-07, + "logits/chosen": -2.3980016708374023, + "logits/rejected": -2.6251823902130127, + "logps/chosen": -479.4037170410156, + "logps/rejected": -382.66455078125, + "loss": 0.4058, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49874454736709595, + "rewards/margins": 1.3575128316879272, + "rewards/rejected": -1.856257438659668, + "step": 2494 + }, + { + "epoch": 0.29, + "learning_rate": 2.1698466580826405e-07, + "logits/chosen": -2.6151421070098877, + "logits/rejected": -2.9194958209991455, + "logps/chosen": -319.22930908203125, + "logps/rejected": -308.5216369628906, + "loss": 0.5001, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06488540023565292, + "rewards/margins": 0.6228733062744141, + "rewards/rejected": -0.5579878687858582, + "step": 2495 + }, + { + "epoch": 0.29, + "learning_rate": 2.1694954933863983e-07, + "logits/chosen": -3.4282264709472656, + "logits/rejected": -3.456735134124756, + "logps/chosen": -153.66282653808594, + "logps/rejected": -176.79000854492188, + "loss": 0.6492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5922830104827881, + "rewards/margins": 0.6304630041122437, + "rewards/rejected": -1.2227461338043213, + "step": 2496 + }, + { + "epoch": 0.29, + "learning_rate": 2.1691443286901558e-07, + "logits/chosen": -3.7550277709960938, + "logits/rejected": -3.6395514011383057, + "logps/chosen": -533.3838500976562, + "logps/rejected": -313.1332702636719, + "loss": 0.3975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5365990996360779, + "rewards/margins": 1.555617332458496, + "rewards/rejected": -2.0922164916992188, + "step": 2497 + }, + { + "epoch": 0.29, + "learning_rate": 2.168793163993913e-07, + "logits/chosen": -3.8064944744110107, + "logits/rejected": -4.0364532470703125, + "logps/chosen": -105.87013244628906, + "logps/rejected": -229.16986083984375, + "loss": 0.2893, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17462556064128876, + "rewards/margins": 3.602158784866333, + "rewards/rejected": -3.4275331497192383, + "step": 2498 + }, + { + "epoch": 0.29, + "learning_rate": 2.1684419992976706e-07, + "logits/chosen": -3.3039333820343018, + "logits/rejected": -3.3841021060943604, + "logps/chosen": -190.86904907226562, + "logps/rejected": -275.48675537109375, + "loss": 0.1699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039414308965206146, + "rewards/margins": 3.3313801288604736, + "rewards/rejected": -3.3707942962646484, + "step": 2499 + }, + { + "epoch": 0.29, + "learning_rate": 2.168090834601428e-07, + "logits/chosen": -3.6833810806274414, + "logits/rejected": -3.439293622970581, + "logps/chosen": -252.95263671875, + "logps/rejected": -200.35391235351562, + "loss": 0.5439, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06675035506486893, + "rewards/margins": 0.9833582043647766, + "rewards/rejected": -0.9166078567504883, + "step": 2500 + }, + { + "epoch": 0.29, + "learning_rate": 2.1677396699051854e-07, + "logits/chosen": -2.9705872535705566, + "logits/rejected": -2.7476954460144043, + "logps/chosen": -172.15023803710938, + "logps/rejected": -202.17086791992188, + "loss": 0.4181, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3489150404930115, + "rewards/margins": 1.3464860916137695, + "rewards/rejected": -1.6954011917114258, + "step": 2501 + }, + { + "epoch": 0.29, + "learning_rate": 2.167388505208943e-07, + "logits/chosen": -3.088698148727417, + "logits/rejected": -3.155203342437744, + "logps/chosen": -329.0819396972656, + "logps/rejected": -288.0634765625, + "loss": 0.63, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.053108327090740204, + "rewards/margins": 0.7766106128692627, + "rewards/rejected": -0.8297189474105835, + "step": 2502 + }, + { + "epoch": 0.29, + "learning_rate": 2.1670373405127002e-07, + "logits/chosen": -2.687331199645996, + "logits/rejected": -2.371080160140991, + "logps/chosen": -304.59906005859375, + "logps/rejected": -429.2099304199219, + "loss": 0.3054, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.010310269892215729, + "rewards/margins": 2.3503966331481934, + "rewards/rejected": -2.3607070446014404, + "step": 2503 + }, + { + "epoch": 0.29, + "learning_rate": 2.1666861758164578e-07, + "logits/chosen": -3.417501449584961, + "logits/rejected": -3.6031932830810547, + "logps/chosen": -85.20997619628906, + "logps/rejected": -220.92807006835938, + "loss": 0.265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36196520924568176, + "rewards/margins": 2.267026424407959, + "rewards/rejected": -1.9050612449645996, + "step": 2504 + }, + { + "epoch": 0.29, + "learning_rate": 2.1663350111202153e-07, + "logits/chosen": -2.596254825592041, + "logits/rejected": -2.492746591567993, + "logps/chosen": -122.69634246826172, + "logps/rejected": -126.95936584472656, + "loss": 0.5136, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0050395820289850235, + "rewards/margins": 0.6098289489746094, + "rewards/rejected": -0.6047893762588501, + "step": 2505 + }, + { + "epoch": 0.29, + "learning_rate": 2.1659838464239726e-07, + "logits/chosen": -3.1768832206726074, + "logits/rejected": -3.516199827194214, + "logps/chosen": -301.1124572753906, + "logps/rejected": -347.6476745605469, + "loss": 0.3775, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.050177350640296936, + "rewards/margins": 2.3043107986450195, + "rewards/rejected": -2.3544881343841553, + "step": 2506 + }, + { + "epoch": 0.29, + "learning_rate": 2.1656326817277304e-07, + "logits/chosen": -2.9308624267578125, + "logits/rejected": -3.1785178184509277, + "logps/chosen": -156.17010498046875, + "logps/rejected": -347.2279052734375, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22914133965969086, + "rewards/margins": 2.547045946121216, + "rewards/rejected": -2.7761874198913574, + "step": 2507 + }, + { + "epoch": 0.29, + "learning_rate": 2.1652815170314874e-07, + "logits/chosen": -2.790318727493286, + "logits/rejected": -3.004650354385376, + "logps/chosen": -352.009521484375, + "logps/rejected": -352.0276794433594, + "loss": 0.6534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46743714809417725, + "rewards/margins": 0.6324673891067505, + "rewards/rejected": -1.0999045372009277, + "step": 2508 + }, + { + "epoch": 0.29, + "learning_rate": 2.1649303523352452e-07, + "logits/chosen": -3.4628429412841797, + "logits/rejected": -3.037594795227051, + "logps/chosen": -315.0083312988281, + "logps/rejected": -223.48208618164062, + "loss": 0.6253, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1919640600681305, + "rewards/margins": 1.0366277694702148, + "rewards/rejected": -1.2285919189453125, + "step": 2509 + }, + { + "epoch": 0.29, + "learning_rate": 2.1645791876390028e-07, + "logits/chosen": -3.2440714836120605, + "logits/rejected": -3.1097850799560547, + "logps/chosen": -363.18780517578125, + "logps/rejected": -280.4508056640625, + "loss": 0.4513, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.038079455494880676, + "rewards/margins": 1.7575528621673584, + "rewards/rejected": -1.7194733619689941, + "step": 2510 + }, + { + "epoch": 0.29, + "learning_rate": 2.16422802294276e-07, + "logits/chosen": -3.711042642593384, + "logits/rejected": -3.768460750579834, + "logps/chosen": -161.98663330078125, + "logps/rejected": -228.379150390625, + "loss": 0.4019, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23478330671787262, + "rewards/margins": 1.4729939699172974, + "rewards/rejected": -1.7077771425247192, + "step": 2511 + }, + { + "epoch": 0.29, + "learning_rate": 2.1638768582465176e-07, + "logits/chosen": -2.934598684310913, + "logits/rejected": -3.1725707054138184, + "logps/chosen": -226.66461181640625, + "logps/rejected": -152.4705352783203, + "loss": 0.3879, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07610709220170975, + "rewards/margins": 1.4340448379516602, + "rewards/rejected": -1.3579376935958862, + "step": 2512 + }, + { + "epoch": 0.29, + "learning_rate": 2.163525693550275e-07, + "logits/chosen": -3.305607795715332, + "logits/rejected": -3.9097390174865723, + "logps/chosen": -218.16860961914062, + "logps/rejected": -312.2205810546875, + "loss": 0.2426, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.045376554131507874, + "rewards/margins": 1.9525346755981445, + "rewards/rejected": -1.9071580171585083, + "step": 2513 + }, + { + "epoch": 0.29, + "learning_rate": 2.1631745288540324e-07, + "logits/chosen": -3.5929651260375977, + "logits/rejected": -3.5992867946624756, + "logps/chosen": -285.7209167480469, + "logps/rejected": -367.5343933105469, + "loss": 0.2158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06939283758401871, + "rewards/margins": 2.251302719116211, + "rewards/rejected": -2.1819100379943848, + "step": 2514 + }, + { + "epoch": 0.29, + "learning_rate": 2.16282336415779e-07, + "logits/chosen": -3.190619468688965, + "logits/rejected": -3.1460177898406982, + "logps/chosen": -171.0941925048828, + "logps/rejected": -163.57052612304688, + "loss": 0.3984, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4423140287399292, + "rewards/margins": 1.649274230003357, + "rewards/rejected": -2.0915884971618652, + "step": 2515 + }, + { + "epoch": 0.29, + "learning_rate": 2.1624721994615472e-07, + "logits/chosen": -2.877197742462158, + "logits/rejected": -2.999577045440674, + "logps/chosen": -216.1131134033203, + "logps/rejected": -295.2169189453125, + "loss": 0.4658, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03068036586046219, + "rewards/margins": 1.2160348892211914, + "rewards/rejected": -1.185354471206665, + "step": 2516 + }, + { + "epoch": 0.29, + "learning_rate": 2.1621210347653047e-07, + "logits/chosen": -3.5639023780822754, + "logits/rejected": -3.7309510707855225, + "logps/chosen": -276.4259338378906, + "logps/rejected": -268.25958251953125, + "loss": 0.1469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48831528425216675, + "rewards/margins": 2.5214827060699463, + "rewards/rejected": -2.0331673622131348, + "step": 2517 + }, + { + "epoch": 0.29, + "learning_rate": 2.1617698700690625e-07, + "logits/chosen": -2.5635616779327393, + "logits/rejected": -2.519801139831543, + "logps/chosen": -291.76727294921875, + "logps/rejected": -278.26409912109375, + "loss": 0.5788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5102211833000183, + "rewards/margins": 0.7334524393081665, + "rewards/rejected": -1.24367356300354, + "step": 2518 + }, + { + "epoch": 0.29, + "learning_rate": 2.1614187053728195e-07, + "logits/chosen": -3.2372398376464844, + "logits/rejected": -3.210988759994507, + "logps/chosen": -240.8280029296875, + "logps/rejected": -197.1525421142578, + "loss": 0.4792, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21338310837745667, + "rewards/margins": 1.8720418214797974, + "rewards/rejected": -2.0854249000549316, + "step": 2519 + }, + { + "epoch": 0.29, + "learning_rate": 2.1610675406765773e-07, + "logits/chosen": -3.1961827278137207, + "logits/rejected": -3.081766366958618, + "logps/chosen": -327.99652099609375, + "logps/rejected": -204.96917724609375, + "loss": 0.5202, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08068427443504333, + "rewards/margins": 0.7461433410644531, + "rewards/rejected": -0.8268276453018188, + "step": 2520 + }, + { + "epoch": 0.29, + "learning_rate": 2.1607163759803346e-07, + "logits/chosen": -2.6684317588806152, + "logits/rejected": -2.4371871948242188, + "logps/chosen": -339.41448974609375, + "logps/rejected": -303.41058349609375, + "loss": 0.472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.00563429668545723, + "rewards/margins": 1.2148531675338745, + "rewards/rejected": -1.2204875946044922, + "step": 2521 + }, + { + "epoch": 0.29, + "learning_rate": 2.1603652112840922e-07, + "logits/chosen": -2.910371780395508, + "logits/rejected": -3.216799736022949, + "logps/chosen": -251.15113830566406, + "logps/rejected": -401.2746276855469, + "loss": 0.3364, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4468386173248291, + "rewards/margins": 2.117999792098999, + "rewards/rejected": -1.67116117477417, + "step": 2522 + }, + { + "epoch": 0.29, + "learning_rate": 2.1600140465878497e-07, + "logits/chosen": -2.570773124694824, + "logits/rejected": -2.95436692237854, + "logps/chosen": -238.2709503173828, + "logps/rejected": -237.205322265625, + "loss": 0.183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8135675191879272, + "rewards/margins": 2.1649529933929443, + "rewards/rejected": -1.3513855934143066, + "step": 2523 + }, + { + "epoch": 0.29, + "learning_rate": 2.159662881891607e-07, + "logits/chosen": -3.20365834236145, + "logits/rejected": -3.417567491531372, + "logps/chosen": -128.83523559570312, + "logps/rejected": -233.34886169433594, + "loss": 0.3022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24098876118659973, + "rewards/margins": 2.528226375579834, + "rewards/rejected": -2.2872376441955566, + "step": 2524 + }, + { + "epoch": 0.29, + "learning_rate": 2.1593117171953645e-07, + "logits/chosen": -3.1513779163360596, + "logits/rejected": -3.181032180786133, + "logps/chosen": -270.15362548828125, + "logps/rejected": -331.47271728515625, + "loss": 0.4683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9286358952522278, + "rewards/margins": 2.737130641937256, + "rewards/rejected": -3.665766716003418, + "step": 2525 + }, + { + "epoch": 0.29, + "learning_rate": 2.158960552499122e-07, + "logits/chosen": -2.8657476902008057, + "logits/rejected": -2.9526169300079346, + "logps/chosen": -357.7826843261719, + "logps/rejected": -350.6910400390625, + "loss": 0.3207, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5264121294021606, + "rewards/margins": 1.7213901281356812, + "rewards/rejected": -1.19497811794281, + "step": 2526 + }, + { + "epoch": 0.29, + "learning_rate": 2.1586093878028793e-07, + "logits/chosen": -2.9215714931488037, + "logits/rejected": -2.723855972290039, + "logps/chosen": -175.39907836914062, + "logps/rejected": -188.17745971679688, + "loss": 0.4783, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34966182708740234, + "rewards/margins": 0.9792813658714294, + "rewards/rejected": -1.3289430141448975, + "step": 2527 + }, + { + "epoch": 0.29, + "learning_rate": 2.1582582231066369e-07, + "logits/chosen": -2.8442330360412598, + "logits/rejected": -2.813645839691162, + "logps/chosen": -155.81375122070312, + "logps/rejected": -274.1573791503906, + "loss": 0.4808, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5195708870887756, + "rewards/margins": 1.7433580160140991, + "rewards/rejected": -2.2629289627075195, + "step": 2528 + }, + { + "epoch": 0.29, + "learning_rate": 2.1579070584103941e-07, + "logits/chosen": -3.446017265319824, + "logits/rejected": -3.0477077960968018, + "logps/chosen": -240.87806701660156, + "logps/rejected": -238.14952087402344, + "loss": 0.5247, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6172577142715454, + "rewards/margins": 0.8756598234176636, + "rewards/rejected": -1.492917537689209, + "step": 2529 + }, + { + "epoch": 0.29, + "learning_rate": 2.157555893714152e-07, + "logits/chosen": -3.696570634841919, + "logits/rejected": -3.5547797679901123, + "logps/chosen": -220.66912841796875, + "logps/rejected": -238.52651977539062, + "loss": 0.8338, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8832507133483887, + "rewards/margins": 0.30547815561294556, + "rewards/rejected": -1.1887288093566895, + "step": 2530 + }, + { + "epoch": 0.29, + "learning_rate": 2.1572047290179095e-07, + "logits/chosen": -3.44429349899292, + "logits/rejected": -3.0229990482330322, + "logps/chosen": -329.0097351074219, + "logps/rejected": -322.86669921875, + "loss": 0.5423, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8805943727493286, + "rewards/margins": 2.0709004402160645, + "rewards/rejected": -2.9514946937561035, + "step": 2531 + }, + { + "epoch": 0.29, + "learning_rate": 2.1568535643216667e-07, + "logits/chosen": -2.947079658508301, + "logits/rejected": -2.8432843685150146, + "logps/chosen": -369.27947998046875, + "logps/rejected": -440.1089782714844, + "loss": 0.1344, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4164121747016907, + "rewards/margins": 2.5227394104003906, + "rewards/rejected": -2.1063272953033447, + "step": 2532 + }, + { + "epoch": 0.29, + "learning_rate": 2.1565023996254243e-07, + "logits/chosen": -2.693087577819824, + "logits/rejected": -2.7966902256011963, + "logps/chosen": -246.074951171875, + "logps/rejected": -409.5174255371094, + "loss": 0.2052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4320990741252899, + "rewards/margins": 2.4942519664764404, + "rewards/rejected": -2.9263510704040527, + "step": 2533 + }, + { + "epoch": 0.29, + "learning_rate": 2.1561512349291818e-07, + "logits/chosen": -3.289893388748169, + "logits/rejected": -3.4131593704223633, + "logps/chosen": -265.2598876953125, + "logps/rejected": -296.5223388671875, + "loss": 0.2348, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030001459643244743, + "rewards/margins": 1.9808032512664795, + "rewards/rejected": -1.9508018493652344, + "step": 2534 + }, + { + "epoch": 0.29, + "learning_rate": 2.155800070232939e-07, + "logits/chosen": -3.509627103805542, + "logits/rejected": -3.2508726119995117, + "logps/chosen": -332.2469482421875, + "logps/rejected": -252.56614685058594, + "loss": 1.4328, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.513911783695221, + "rewards/margins": 0.11125612258911133, + "rewards/rejected": -0.625167965888977, + "step": 2535 + }, + { + "epoch": 0.29, + "learning_rate": 2.1554489055366966e-07, + "logits/chosen": -3.171164035797119, + "logits/rejected": -3.180454730987549, + "logps/chosen": -233.70899963378906, + "logps/rejected": -226.4114532470703, + "loss": 0.8238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6192416548728943, + "rewards/margins": 0.14890477061271667, + "rewards/rejected": -0.7681463956832886, + "step": 2536 + }, + { + "epoch": 0.29, + "learning_rate": 2.155097740840454e-07, + "logits/chosen": -3.039989709854126, + "logits/rejected": -2.9134738445281982, + "logps/chosen": -383.3440856933594, + "logps/rejected": -268.11175537109375, + "loss": 0.4467, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14948640763759613, + "rewards/margins": 1.0681140422821045, + "rewards/rejected": -0.9186275005340576, + "step": 2537 + }, + { + "epoch": 0.29, + "learning_rate": 2.1547465761442115e-07, + "logits/chosen": -2.9930341243743896, + "logits/rejected": -3.3929364681243896, + "logps/chosen": -208.77340698242188, + "logps/rejected": -223.36050415039062, + "loss": 0.3317, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.050011053681373596, + "rewards/margins": 1.6575568914413452, + "rewards/rejected": -1.6075458526611328, + "step": 2538 + }, + { + "epoch": 0.29, + "learning_rate": 2.154395411447969e-07, + "logits/chosen": -2.3850817680358887, + "logits/rejected": -2.4263038635253906, + "logps/chosen": -207.4864959716797, + "logps/rejected": -342.36761474609375, + "loss": 0.7779, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11761902272701263, + "rewards/margins": 0.9998621940612793, + "rewards/rejected": -0.8822430968284607, + "step": 2539 + }, + { + "epoch": 0.29, + "learning_rate": 2.1540442467517263e-07, + "logits/chosen": -3.1116700172424316, + "logits/rejected": -3.1031975746154785, + "logps/chosen": -450.1258239746094, + "logps/rejected": -204.19207763671875, + "loss": 0.5807, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.26815319061279297, + "rewards/margins": 0.8769459128379822, + "rewards/rejected": -0.6087927222251892, + "step": 2540 + }, + { + "epoch": 0.29, + "learning_rate": 2.153693082055484e-07, + "logits/chosen": -2.4168190956115723, + "logits/rejected": -2.45223069190979, + "logps/chosen": -480.0483093261719, + "logps/rejected": -409.9466552734375, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.705863356590271, + "rewards/margins": 2.023974895477295, + "rewards/rejected": -1.3181114196777344, + "step": 2541 + }, + { + "epoch": 0.29, + "learning_rate": 2.1533419173592416e-07, + "logits/chosen": -3.2713074684143066, + "logits/rejected": -3.0394349098205566, + "logps/chosen": -320.599853515625, + "logps/rejected": -235.3441162109375, + "loss": 0.3978, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3097507953643799, + "rewards/margins": 0.9010695219039917, + "rewards/rejected": -1.2108204364776611, + "step": 2542 + }, + { + "epoch": 0.29, + "learning_rate": 2.152990752662999e-07, + "logits/chosen": -3.918215036392212, + "logits/rejected": -3.584228754043579, + "logps/chosen": -205.4032440185547, + "logps/rejected": -176.38973999023438, + "loss": 0.5921, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47760850191116333, + "rewards/margins": 1.3396594524383545, + "rewards/rejected": -1.8172677755355835, + "step": 2543 + }, + { + "epoch": 0.29, + "learning_rate": 2.1526395879667564e-07, + "logits/chosen": -3.277860641479492, + "logits/rejected": -3.349273681640625, + "logps/chosen": -299.9290466308594, + "logps/rejected": -234.55770874023438, + "loss": 0.2794, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28574448823928833, + "rewards/margins": 2.639697313308716, + "rewards/rejected": -2.3539528846740723, + "step": 2544 + }, + { + "epoch": 0.29, + "learning_rate": 2.1522884232705137e-07, + "logits/chosen": -3.2285404205322266, + "logits/rejected": -3.213987350463867, + "logps/chosen": -262.91851806640625, + "logps/rejected": -211.62171936035156, + "loss": 0.5756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03728321194648743, + "rewards/margins": 0.7637776136398315, + "rewards/rejected": -0.8010609149932861, + "step": 2545 + }, + { + "epoch": 0.29, + "learning_rate": 2.1519372585742712e-07, + "logits/chosen": -3.1560704708099365, + "logits/rejected": -2.6517655849456787, + "logps/chosen": -351.4471435546875, + "logps/rejected": -331.6099853515625, + "loss": 0.7588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11821253597736359, + "rewards/margins": 1.321184754371643, + "rewards/rejected": -1.4393974542617798, + "step": 2546 + }, + { + "epoch": 0.29, + "learning_rate": 2.1515860938780288e-07, + "logits/chosen": -2.8120527267456055, + "logits/rejected": -2.7895634174346924, + "logps/chosen": -521.9832763671875, + "logps/rejected": -383.49774169921875, + "loss": 0.6791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25195562839508057, + "rewards/margins": 0.43304774165153503, + "rewards/rejected": -0.685003399848938, + "step": 2547 + }, + { + "epoch": 0.29, + "learning_rate": 2.151234929181786e-07, + "logits/chosen": -2.7360854148864746, + "logits/rejected": -2.5680344104766846, + "logps/chosen": -370.57940673828125, + "logps/rejected": -304.3961486816406, + "loss": 0.2929, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1524173766374588, + "rewards/margins": 1.5483709573745728, + "rewards/rejected": -1.7007882595062256, + "step": 2548 + }, + { + "epoch": 0.29, + "learning_rate": 2.1508837644855436e-07, + "logits/chosen": -3.075923442840576, + "logits/rejected": -3.179980754852295, + "logps/chosen": -142.56776428222656, + "logps/rejected": -177.65765380859375, + "loss": 0.2434, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2216678261756897, + "rewards/margins": 1.9188214540481567, + "rewards/rejected": -1.6971535682678223, + "step": 2549 + }, + { + "epoch": 0.29, + "learning_rate": 2.150532599789301e-07, + "logits/chosen": -3.268261432647705, + "logits/rejected": -3.513883113861084, + "logps/chosen": -170.75173950195312, + "logps/rejected": -190.55337524414062, + "loss": 0.6115, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31909674406051636, + "rewards/margins": 0.8725062608718872, + "rewards/rejected": -1.1916028261184692, + "step": 2550 + }, + { + "epoch": 0.29, + "learning_rate": 2.1501814350930584e-07, + "logits/chosen": -2.614130973815918, + "logits/rejected": -2.864084482192993, + "logps/chosen": -337.18707275390625, + "logps/rejected": -504.7647399902344, + "loss": 0.2505, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.526158332824707, + "rewards/margins": 2.322004795074463, + "rewards/rejected": -1.7958464622497559, + "step": 2551 + }, + { + "epoch": 0.29, + "learning_rate": 2.1498302703968162e-07, + "logits/chosen": -2.758056163787842, + "logits/rejected": -2.6023311614990234, + "logps/chosen": -158.41049194335938, + "logps/rejected": -161.56605529785156, + "loss": 0.6111, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.355548232793808, + "rewards/margins": 0.38794904947280884, + "rewards/rejected": -0.7434972524642944, + "step": 2552 + }, + { + "epoch": 0.29, + "learning_rate": 2.1494791057005732e-07, + "logits/chosen": -2.4254207611083984, + "logits/rejected": -2.475022077560425, + "logps/chosen": -252.745361328125, + "logps/rejected": -207.0758514404297, + "loss": 0.3084, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6883941888809204, + "rewards/margins": 2.0777859687805176, + "rewards/rejected": -1.3893918991088867, + "step": 2553 + }, + { + "epoch": 0.29, + "learning_rate": 2.149127941004331e-07, + "logits/chosen": -2.533390522003174, + "logits/rejected": -2.7753233909606934, + "logps/chosen": -231.60511779785156, + "logps/rejected": -182.68295288085938, + "loss": 0.5893, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.045577578246593475, + "rewards/margins": 0.7031983733177185, + "rewards/rejected": -0.7487759590148926, + "step": 2554 + }, + { + "epoch": 0.29, + "learning_rate": 2.1487767763080885e-07, + "logits/chosen": -3.790057420730591, + "logits/rejected": -3.7323083877563477, + "logps/chosen": -226.44937133789062, + "logps/rejected": -225.21630859375, + "loss": 0.3676, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25751665234565735, + "rewards/margins": 1.1721609830856323, + "rewards/rejected": -0.9146443009376526, + "step": 2555 + }, + { + "epoch": 0.29, + "learning_rate": 2.1484256116118458e-07, + "logits/chosen": -2.9925804138183594, + "logits/rejected": -3.332345485687256, + "logps/chosen": -247.73797607421875, + "logps/rejected": -314.68743896484375, + "loss": 0.3908, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30423909425735474, + "rewards/margins": 1.9378318786621094, + "rewards/rejected": -1.6335928440093994, + "step": 2556 + }, + { + "epoch": 0.29, + "learning_rate": 2.1480744469156034e-07, + "logits/chosen": -2.9305856227874756, + "logits/rejected": -3.015263557434082, + "logps/chosen": -327.0535583496094, + "logps/rejected": -324.77947998046875, + "loss": 0.1972, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8379375338554382, + "rewards/margins": 3.107719659805298, + "rewards/rejected": -2.269782066345215, + "step": 2557 + }, + { + "epoch": 0.29, + "learning_rate": 2.147723282219361e-07, + "logits/chosen": -2.9105234146118164, + "logits/rejected": -2.9280948638916016, + "logps/chosen": -111.18600463867188, + "logps/rejected": -244.12619018554688, + "loss": 0.3171, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1900024116039276, + "rewards/margins": 2.0559754371643066, + "rewards/rejected": -1.8659732341766357, + "step": 2558 + }, + { + "epoch": 0.3, + "learning_rate": 2.1473721175231182e-07, + "logits/chosen": -2.8897509574890137, + "logits/rejected": -2.5914406776428223, + "logps/chosen": -227.49989318847656, + "logps/rejected": -154.2394561767578, + "loss": 0.534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.501918375492096, + "rewards/margins": 0.9221572875976562, + "rewards/rejected": -1.4240756034851074, + "step": 2559 + }, + { + "epoch": 0.3, + "learning_rate": 2.1470209528268757e-07, + "logits/chosen": -3.153500556945801, + "logits/rejected": -3.268545150756836, + "logps/chosen": -436.768310546875, + "logps/rejected": -219.27935791015625, + "loss": 0.7682, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6188145875930786, + "rewards/margins": 0.7104479670524597, + "rewards/rejected": -1.3292624950408936, + "step": 2560 + }, + { + "epoch": 0.3, + "learning_rate": 2.146669788130633e-07, + "logits/chosen": -3.2748684883117676, + "logits/rejected": -3.43843150138855, + "logps/chosen": -220.39195251464844, + "logps/rejected": -224.68157958984375, + "loss": 0.5758, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018315039575099945, + "rewards/margins": 0.841328501701355, + "rewards/rejected": -0.8596435785293579, + "step": 2561 + }, + { + "epoch": 0.3, + "learning_rate": 2.1463186234343905e-07, + "logits/chosen": -3.733929395675659, + "logits/rejected": -3.668494701385498, + "logps/chosen": -289.71087646484375, + "logps/rejected": -191.19894409179688, + "loss": 0.2235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12815839052200317, + "rewards/margins": 2.065446615219116, + "rewards/rejected": -1.9372881650924683, + "step": 2562 + }, + { + "epoch": 0.3, + "learning_rate": 2.1459674587381483e-07, + "logits/chosen": -2.9200353622436523, + "logits/rejected": -3.256335735321045, + "logps/chosen": -132.3724822998047, + "logps/rejected": -162.9464569091797, + "loss": 0.3497, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36982202529907227, + "rewards/margins": 2.162919759750366, + "rewards/rejected": -1.793097734451294, + "step": 2563 + }, + { + "epoch": 0.3, + "learning_rate": 2.1456162940419056e-07, + "logits/chosen": -2.884148120880127, + "logits/rejected": -2.9516119956970215, + "logps/chosen": -181.06773376464844, + "logps/rejected": -215.36285400390625, + "loss": 0.5618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8933176398277283, + "rewards/margins": 1.1003203392028809, + "rewards/rejected": -1.993638038635254, + "step": 2564 + }, + { + "epoch": 0.3, + "learning_rate": 2.1452651293456631e-07, + "logits/chosen": -2.8888416290283203, + "logits/rejected": -2.7180867195129395, + "logps/chosen": -242.84286499023438, + "logps/rejected": -236.66026306152344, + "loss": 0.35, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13380342721939087, + "rewards/margins": 1.6444716453552246, + "rewards/rejected": -1.5106680393218994, + "step": 2565 + }, + { + "epoch": 0.3, + "learning_rate": 2.1449139646494204e-07, + "logits/chosen": -2.8170509338378906, + "logits/rejected": -2.6424622535705566, + "logps/chosen": -245.99082946777344, + "logps/rejected": -167.81605529785156, + "loss": 0.739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.394553542137146, + "rewards/margins": 0.5255584716796875, + "rewards/rejected": -0.9201120138168335, + "step": 2566 + }, + { + "epoch": 0.3, + "learning_rate": 2.144562799953178e-07, + "logits/chosen": -2.9923300743103027, + "logits/rejected": -3.051290512084961, + "logps/chosen": -250.1763916015625, + "logps/rejected": -224.6210479736328, + "loss": 0.5478, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24610325694084167, + "rewards/margins": 1.632594108581543, + "rewards/rejected": -1.386490821838379, + "step": 2567 + }, + { + "epoch": 0.3, + "learning_rate": 2.1442116352569355e-07, + "logits/chosen": -3.3210184574127197, + "logits/rejected": -3.7519757747650146, + "logps/chosen": -245.64862060546875, + "logps/rejected": -358.77203369140625, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34369543194770813, + "rewards/margins": 2.6013083457946777, + "rewards/rejected": -2.257612943649292, + "step": 2568 + }, + { + "epoch": 0.3, + "learning_rate": 2.1438604705606928e-07, + "logits/chosen": -3.4209508895874023, + "logits/rejected": -3.5462265014648438, + "logps/chosen": -248.39483642578125, + "logps/rejected": -252.47625732421875, + "loss": 0.4297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6377914547920227, + "rewards/margins": 2.059689998626709, + "rewards/rejected": -2.697481632232666, + "step": 2569 + }, + { + "epoch": 0.3, + "learning_rate": 2.1435093058644503e-07, + "logits/chosen": -2.7979516983032227, + "logits/rejected": -3.2873551845550537, + "logps/chosen": -369.0631408691406, + "logps/rejected": -342.6510009765625, + "loss": 0.2426, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07598162442445755, + "rewards/margins": 2.6163907051086426, + "rewards/rejected": -2.5404090881347656, + "step": 2570 + }, + { + "epoch": 0.3, + "learning_rate": 2.1431581411682078e-07, + "logits/chosen": -3.0711984634399414, + "logits/rejected": -3.2525575160980225, + "logps/chosen": -269.4237365722656, + "logps/rejected": -182.80599975585938, + "loss": 0.7902, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0086373090744019, + "rewards/margins": 1.2930245399475098, + "rewards/rejected": -2.301661729812622, + "step": 2571 + }, + { + "epoch": 0.3, + "learning_rate": 2.142806976471965e-07, + "logits/chosen": -3.042958974838257, + "logits/rejected": -2.8398070335388184, + "logps/chosen": -333.62127685546875, + "logps/rejected": -205.80789184570312, + "loss": 0.5471, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16432519257068634, + "rewards/margins": 0.8288993835449219, + "rewards/rejected": -0.664574146270752, + "step": 2572 + }, + { + "epoch": 0.3, + "learning_rate": 2.1424558117757227e-07, + "logits/chosen": -1.8645180463790894, + "logits/rejected": -1.9161763191223145, + "logps/chosen": -320.7381896972656, + "logps/rejected": -254.15463256835938, + "loss": 0.4689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23828072845935822, + "rewards/margins": 1.511570930480957, + "rewards/rejected": -1.7498517036437988, + "step": 2573 + }, + { + "epoch": 0.3, + "learning_rate": 2.14210464707948e-07, + "logits/chosen": -3.6634280681610107, + "logits/rejected": -3.960866928100586, + "logps/chosen": -191.92433166503906, + "logps/rejected": -228.18504333496094, + "loss": 0.5604, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14040455222129822, + "rewards/margins": 1.443699598312378, + "rewards/rejected": -1.3032950162887573, + "step": 2574 + }, + { + "epoch": 0.3, + "learning_rate": 2.1417534823832377e-07, + "logits/chosen": -2.7769393920898438, + "logits/rejected": -2.597374200820923, + "logps/chosen": -354.5603332519531, + "logps/rejected": -339.4692687988281, + "loss": 0.2693, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4016771614551544, + "rewards/margins": 1.757029414176941, + "rewards/rejected": -1.3553521633148193, + "step": 2575 + }, + { + "epoch": 0.3, + "learning_rate": 2.1414023176869953e-07, + "logits/chosen": -2.9424214363098145, + "logits/rejected": -2.755504608154297, + "logps/chosen": -229.32029724121094, + "logps/rejected": -234.5140380859375, + "loss": 0.9373, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5314222574234009, + "rewards/margins": 0.7831764817237854, + "rewards/rejected": -1.3145986795425415, + "step": 2576 + }, + { + "epoch": 0.3, + "learning_rate": 2.1410511529907525e-07, + "logits/chosen": -3.3437247276306152, + "logits/rejected": -3.464768409729004, + "logps/chosen": -356.91973876953125, + "logps/rejected": -299.2821044921875, + "loss": 0.2691, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5267232060432434, + "rewards/margins": 1.897353172302246, + "rewards/rejected": -1.3706300258636475, + "step": 2577 + }, + { + "epoch": 0.3, + "learning_rate": 2.14069998829451e-07, + "logits/chosen": -2.6547083854675293, + "logits/rejected": -2.710179567337036, + "logps/chosen": -394.1455383300781, + "logps/rejected": -322.2351379394531, + "loss": 0.2403, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2676967680454254, + "rewards/margins": 3.0415279865264893, + "rewards/rejected": -2.7738311290740967, + "step": 2578 + }, + { + "epoch": 0.3, + "learning_rate": 2.1403488235982676e-07, + "logits/chosen": -3.219705104827881, + "logits/rejected": -3.228687286376953, + "logps/chosen": -238.00192260742188, + "logps/rejected": -455.2807312011719, + "loss": 0.3909, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32765665650367737, + "rewards/margins": 1.227205514907837, + "rewards/rejected": -1.5548622608184814, + "step": 2579 + }, + { + "epoch": 0.3, + "learning_rate": 2.139997658902025e-07, + "logits/chosen": -3.0565598011016846, + "logits/rejected": -3.000286102294922, + "logps/chosen": -234.52352905273438, + "logps/rejected": -298.09271240234375, + "loss": 0.3102, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16635188460350037, + "rewards/margins": 1.56252121925354, + "rewards/rejected": -1.3961693048477173, + "step": 2580 + }, + { + "epoch": 0.3, + "learning_rate": 2.1396464942057824e-07, + "logits/chosen": -2.6357057094573975, + "logits/rejected": -2.665994882583618, + "logps/chosen": -280.0640563964844, + "logps/rejected": -317.63079833984375, + "loss": 0.3203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2933403253555298, + "rewards/margins": 1.5820248126983643, + "rewards/rejected": -1.8753650188446045, + "step": 2581 + }, + { + "epoch": 0.3, + "learning_rate": 2.1392953295095397e-07, + "logits/chosen": -2.7188618183135986, + "logits/rejected": -2.8137030601501465, + "logps/chosen": -200.86907958984375, + "logps/rejected": -293.8908386230469, + "loss": 0.3079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12379367649555206, + "rewards/margins": 1.5092010498046875, + "rewards/rejected": -1.6329946517944336, + "step": 2582 + }, + { + "epoch": 0.3, + "learning_rate": 2.1389441648132972e-07, + "logits/chosen": -3.7103869915008545, + "logits/rejected": -3.593686819076538, + "logps/chosen": -253.62274169921875, + "logps/rejected": -235.7546844482422, + "loss": 0.3558, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19297929108142853, + "rewards/margins": 1.938727617263794, + "rewards/rejected": -1.7457482814788818, + "step": 2583 + }, + { + "epoch": 0.3, + "learning_rate": 2.1385930001170548e-07, + "logits/chosen": -3.245603322982788, + "logits/rejected": -3.305307626724243, + "logps/chosen": -231.2584228515625, + "logps/rejected": -229.70326232910156, + "loss": 0.3879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3055317997932434, + "rewards/margins": 0.9713259935379028, + "rewards/rejected": -1.2768577337265015, + "step": 2584 + }, + { + "epoch": 0.3, + "learning_rate": 2.138241835420812e-07, + "logits/chosen": -2.4296247959136963, + "logits/rejected": -2.5137107372283936, + "logps/chosen": -269.891845703125, + "logps/rejected": -194.33457946777344, + "loss": 0.4433, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09491641819477081, + "rewards/margins": 0.7702800035476685, + "rewards/rejected": -0.6753636002540588, + "step": 2585 + }, + { + "epoch": 0.3, + "learning_rate": 2.1378906707245699e-07, + "logits/chosen": -2.8271713256835938, + "logits/rejected": -3.096623420715332, + "logps/chosen": -280.2093505859375, + "logps/rejected": -209.69635009765625, + "loss": 0.1166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2109682410955429, + "rewards/margins": 2.855173110961914, + "rewards/rejected": -2.64420485496521, + "step": 2586 + }, + { + "epoch": 0.3, + "learning_rate": 2.1375395060283274e-07, + "logits/chosen": -3.998806953430176, + "logits/rejected": -3.7143378257751465, + "logps/chosen": -269.6476745605469, + "logps/rejected": -192.35833740234375, + "loss": 0.2462, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5411118268966675, + "rewards/margins": 2.1918482780456543, + "rewards/rejected": -1.6507364511489868, + "step": 2587 + }, + { + "epoch": 0.3, + "learning_rate": 2.1371883413320847e-07, + "logits/chosen": -3.212614059448242, + "logits/rejected": -3.266993522644043, + "logps/chosen": -341.0892333984375, + "logps/rejected": -375.3945617675781, + "loss": 0.6666, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9270532131195068, + "rewards/margins": 1.1798666715621948, + "rewards/rejected": -2.106920003890991, + "step": 2588 + }, + { + "epoch": 0.3, + "learning_rate": 2.1368371766358422e-07, + "logits/chosen": -3.088294506072998, + "logits/rejected": -3.2397639751434326, + "logps/chosen": -201.5574493408203, + "logps/rejected": -195.56849670410156, + "loss": 0.3392, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22191745042800903, + "rewards/margins": 1.3379775285720825, + "rewards/rejected": -1.1160600185394287, + "step": 2589 + }, + { + "epoch": 0.3, + "learning_rate": 2.1364860119395995e-07, + "logits/chosen": -3.3521690368652344, + "logits/rejected": -3.457568645477295, + "logps/chosen": -227.6110076904297, + "logps/rejected": -270.6137390136719, + "loss": 0.2606, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3614324629306793, + "rewards/margins": 3.0437488555908203, + "rewards/rejected": -2.682316541671753, + "step": 2590 + }, + { + "epoch": 0.3, + "learning_rate": 2.136134847243357e-07, + "logits/chosen": -3.3407042026519775, + "logits/rejected": -3.23598313331604, + "logps/chosen": -160.33721923828125, + "logps/rejected": -194.5361785888672, + "loss": 0.2872, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9667548537254333, + "rewards/margins": 2.1675119400024414, + "rewards/rejected": -1.2007571458816528, + "step": 2591 + }, + { + "epoch": 0.3, + "learning_rate": 2.1357836825471146e-07, + "logits/chosen": -2.746734857559204, + "logits/rejected": -2.882901668548584, + "logps/chosen": -349.1083984375, + "logps/rejected": -295.4512634277344, + "loss": 0.3753, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31298744678497314, + "rewards/margins": 1.3062620162963867, + "rewards/rejected": -0.9932745695114136, + "step": 2592 + }, + { + "epoch": 0.3, + "learning_rate": 2.1354325178508718e-07, + "logits/chosen": -3.6634044647216797, + "logits/rejected": -3.5774505138397217, + "logps/chosen": -103.76362609863281, + "logps/rejected": -68.02978515625, + "loss": 0.5787, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7039738297462463, + "rewards/margins": 0.3043333888053894, + "rewards/rejected": -1.0083072185516357, + "step": 2593 + }, + { + "epoch": 0.3, + "learning_rate": 2.1350813531546294e-07, + "logits/chosen": -3.461884021759033, + "logits/rejected": -3.379714012145996, + "logps/chosen": -181.22329711914062, + "logps/rejected": -202.13223266601562, + "loss": 0.4143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6452523469924927, + "rewards/margins": 1.5031282901763916, + "rewards/rejected": -2.148380756378174, + "step": 2594 + }, + { + "epoch": 0.3, + "learning_rate": 2.134730188458387e-07, + "logits/chosen": -3.6769375801086426, + "logits/rejected": -3.6051723957061768, + "logps/chosen": -231.912109375, + "logps/rejected": -213.67449951171875, + "loss": 0.3504, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16769975423812866, + "rewards/margins": 2.318512439727783, + "rewards/rejected": -2.1508126258850098, + "step": 2595 + }, + { + "epoch": 0.3, + "learning_rate": 2.1343790237621442e-07, + "logits/chosen": -3.837965488433838, + "logits/rejected": -3.6957931518554688, + "logps/chosen": -233.676513671875, + "logps/rejected": -243.48460388183594, + "loss": 0.3155, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31379270553588867, + "rewards/margins": 1.7074849605560303, + "rewards/rejected": -2.021277666091919, + "step": 2596 + }, + { + "epoch": 0.3, + "learning_rate": 2.134027859065902e-07, + "logits/chosen": -3.7925162315368652, + "logits/rejected": -3.667675495147705, + "logps/chosen": -186.47421264648438, + "logps/rejected": -257.84417724609375, + "loss": 0.2493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24779394268989563, + "rewards/margins": 2.5216047763824463, + "rewards/rejected": -2.273810863494873, + "step": 2597 + }, + { + "epoch": 0.3, + "learning_rate": 2.1336766943696593e-07, + "logits/chosen": -3.8695459365844727, + "logits/rejected": -3.687941789627075, + "logps/chosen": -194.97474670410156, + "logps/rejected": -273.35821533203125, + "loss": 0.5707, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31056395173072815, + "rewards/margins": 1.2100341320037842, + "rewards/rejected": -1.52059805393219, + "step": 2598 + }, + { + "epoch": 0.3, + "learning_rate": 2.1333255296734168e-07, + "logits/chosen": -3.2129340171813965, + "logits/rejected": -3.2441821098327637, + "logps/chosen": -240.37953186035156, + "logps/rejected": -171.22369384765625, + "loss": 0.5942, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24090737104415894, + "rewards/margins": 1.164582371711731, + "rewards/rejected": -0.9236750602722168, + "step": 2599 + }, + { + "epoch": 0.3, + "learning_rate": 2.1329743649771743e-07, + "logits/chosen": -2.86246919631958, + "logits/rejected": -3.069891929626465, + "logps/chosen": -169.1517333984375, + "logps/rejected": -285.3898620605469, + "loss": 0.2674, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13462701439857483, + "rewards/margins": 3.2156643867492676, + "rewards/rejected": -3.0810372829437256, + "step": 2600 + }, + { + "epoch": 0.3, + "learning_rate": 2.1326232002809316e-07, + "logits/chosen": -3.2694320678710938, + "logits/rejected": -3.1203155517578125, + "logps/chosen": -225.84344482421875, + "logps/rejected": -182.9088134765625, + "loss": 0.6947, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12488748878240585, + "rewards/margins": 0.339462012052536, + "rewards/rejected": -0.46434950828552246, + "step": 2601 + }, + { + "epoch": 0.3, + "learning_rate": 2.1322720355846892e-07, + "logits/chosen": -3.072448492050171, + "logits/rejected": -2.9065980911254883, + "logps/chosen": -371.125732421875, + "logps/rejected": -233.52462768554688, + "loss": 0.4332, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1140744686126709, + "rewards/margins": 1.1411091089248657, + "rewards/rejected": -1.0270345211029053, + "step": 2602 + }, + { + "epoch": 0.3, + "learning_rate": 2.1319208708884467e-07, + "logits/chosen": -2.8249688148498535, + "logits/rejected": -3.252692699432373, + "logps/chosen": -143.95408630371094, + "logps/rejected": -528.7186889648438, + "loss": 0.4371, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08502054214477539, + "rewards/margins": 2.0863773822784424, + "rewards/rejected": -2.001356840133667, + "step": 2603 + }, + { + "epoch": 0.3, + "learning_rate": 2.131569706192204e-07, + "logits/chosen": -3.0703587532043457, + "logits/rejected": -2.903543472290039, + "logps/chosen": -334.08245849609375, + "logps/rejected": -219.74005126953125, + "loss": 0.1952, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4916192293167114, + "rewards/margins": 1.9880692958831787, + "rewards/rejected": -1.4964500665664673, + "step": 2604 + }, + { + "epoch": 0.3, + "learning_rate": 2.1312185414959615e-07, + "logits/chosen": -2.659576416015625, + "logits/rejected": -2.610520839691162, + "logps/chosen": -219.05325317382812, + "logps/rejected": -160.3195343017578, + "loss": 0.3556, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08446864783763885, + "rewards/margins": 1.3481814861297607, + "rewards/rejected": -1.263712763786316, + "step": 2605 + }, + { + "epoch": 0.3, + "learning_rate": 2.1308673767997188e-07, + "logits/chosen": -3.3327317237854004, + "logits/rejected": -3.3206560611724854, + "logps/chosen": -227.22195434570312, + "logps/rejected": -148.6520233154297, + "loss": 0.4152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.027636319398880005, + "rewards/margins": 1.0642647743225098, + "rewards/rejected": -1.091901183128357, + "step": 2606 + }, + { + "epoch": 0.3, + "learning_rate": 2.1305162121034763e-07, + "logits/chosen": -3.3311967849731445, + "logits/rejected": -3.4810538291931152, + "logps/chosen": -345.34423828125, + "logps/rejected": -275.7441101074219, + "loss": 0.2028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3735184967517853, + "rewards/margins": 2.5044138431549072, + "rewards/rejected": -2.1308953762054443, + "step": 2607 + }, + { + "epoch": 0.3, + "learning_rate": 2.130165047407234e-07, + "logits/chosen": -3.479562282562256, + "logits/rejected": -3.3001370429992676, + "logps/chosen": -222.87506103515625, + "logps/rejected": -218.8445281982422, + "loss": 0.4056, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07859775424003601, + "rewards/margins": 1.285064697265625, + "rewards/rejected": -1.3636624813079834, + "step": 2608 + }, + { + "epoch": 0.3, + "learning_rate": 2.1298138827109914e-07, + "logits/chosen": -2.7875185012817383, + "logits/rejected": -2.8810875415802, + "logps/chosen": -190.42747497558594, + "logps/rejected": -375.72894287109375, + "loss": 0.3292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08984656631946564, + "rewards/margins": 1.622300624847412, + "rewards/rejected": -1.7121471166610718, + "step": 2609 + }, + { + "epoch": 0.3, + "learning_rate": 2.129462718014749e-07, + "logits/chosen": -2.667707681655884, + "logits/rejected": -2.4137344360351562, + "logps/chosen": -251.4737548828125, + "logps/rejected": -222.12075805664062, + "loss": 0.2649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3819400370121002, + "rewards/margins": 1.4335930347442627, + "rewards/rejected": -1.8155330419540405, + "step": 2610 + }, + { + "epoch": 0.3, + "learning_rate": 2.1291115533185065e-07, + "logits/chosen": -2.9640867710113525, + "logits/rejected": -2.9014406204223633, + "logps/chosen": -341.06048583984375, + "logps/rejected": -276.04901123046875, + "loss": 0.3695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.027166053652763367, + "rewards/margins": 1.784468173980713, + "rewards/rejected": -1.8116341829299927, + "step": 2611 + }, + { + "epoch": 0.3, + "learning_rate": 2.1287603886222637e-07, + "logits/chosen": -4.178925037384033, + "logits/rejected": -3.6387276649475098, + "logps/chosen": -269.6713562011719, + "logps/rejected": -149.16397094726562, + "loss": 0.3383, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18593506515026093, + "rewards/margins": 1.5617293119430542, + "rewards/rejected": -1.3757941722869873, + "step": 2612 + }, + { + "epoch": 0.3, + "learning_rate": 2.1284092239260213e-07, + "logits/chosen": -2.8688013553619385, + "logits/rejected": -2.8532214164733887, + "logps/chosen": -415.79522705078125, + "logps/rejected": -301.96832275390625, + "loss": 1.1092, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.008213996887207, + "rewards/margins": 0.527400016784668, + "rewards/rejected": -1.535614013671875, + "step": 2613 + }, + { + "epoch": 0.3, + "learning_rate": 2.1280580592297786e-07, + "logits/chosen": -2.6170856952667236, + "logits/rejected": -2.904116153717041, + "logps/chosen": -361.64971923828125, + "logps/rejected": -264.6898498535156, + "loss": 0.4149, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.051516562700271606, + "rewards/margins": 1.5424813032150269, + "rewards/rejected": -1.5939979553222656, + "step": 2614 + }, + { + "epoch": 0.3, + "learning_rate": 2.127706894533536e-07, + "logits/chosen": -4.016365051269531, + "logits/rejected": -3.613889694213867, + "logps/chosen": -295.7431640625, + "logps/rejected": -167.41946411132812, + "loss": 0.3466, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20511853694915771, + "rewards/margins": 1.9600541591644287, + "rewards/rejected": -1.7549355030059814, + "step": 2615 + }, + { + "epoch": 0.3, + "learning_rate": 2.1273557298372936e-07, + "logits/chosen": -3.0946555137634277, + "logits/rejected": -3.032836437225342, + "logps/chosen": -306.7103271484375, + "logps/rejected": -287.1866760253906, + "loss": 0.5942, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5122720003128052, + "rewards/margins": 1.0914312601089478, + "rewards/rejected": -1.6037033796310425, + "step": 2616 + }, + { + "epoch": 0.3, + "learning_rate": 2.127004565141051e-07, + "logits/chosen": -3.9072630405426025, + "logits/rejected": -3.914475917816162, + "logps/chosen": -95.60518646240234, + "logps/rejected": -121.07215881347656, + "loss": 0.5527, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09743925929069519, + "rewards/margins": 0.8458166122436523, + "rewards/rejected": -0.9432559013366699, + "step": 2617 + }, + { + "epoch": 0.3, + "learning_rate": 2.1266534004448084e-07, + "logits/chosen": -2.8881773948669434, + "logits/rejected": -2.747392416000366, + "logps/chosen": -207.5545196533203, + "logps/rejected": -231.87057495117188, + "loss": 0.3731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22826243937015533, + "rewards/margins": 2.211359977722168, + "rewards/rejected": -2.439622402191162, + "step": 2618 + }, + { + "epoch": 0.3, + "learning_rate": 2.1263022357485657e-07, + "logits/chosen": -2.5903801918029785, + "logits/rejected": -2.3856489658355713, + "logps/chosen": -539.1986083984375, + "logps/rejected": -277.7161560058594, + "loss": 0.3475, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10978647321462631, + "rewards/margins": 2.167459011077881, + "rewards/rejected": -2.27724552154541, + "step": 2619 + }, + { + "epoch": 0.3, + "learning_rate": 2.1259510710523235e-07, + "logits/chosen": -2.7712082862854004, + "logits/rejected": -2.854727268218994, + "logps/chosen": -286.43865966796875, + "logps/rejected": -190.49935913085938, + "loss": 1.236, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1826320886611938, + "rewards/margins": 0.10737287998199463, + "rewards/rejected": -1.2900049686431885, + "step": 2620 + }, + { + "epoch": 0.3, + "learning_rate": 2.125599906356081e-07, + "logits/chosen": -3.4522032737731934, + "logits/rejected": -3.415865659713745, + "logps/chosen": -84.9923324584961, + "logps/rejected": -154.8427734375, + "loss": 0.5065, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007892109453678131, + "rewards/margins": 0.7958219647407532, + "rewards/rejected": -0.787929892539978, + "step": 2621 + }, + { + "epoch": 0.3, + "learning_rate": 2.1252487416598383e-07, + "logits/chosen": -2.8758649826049805, + "logits/rejected": -2.875692844390869, + "logps/chosen": -201.611328125, + "logps/rejected": -195.94918823242188, + "loss": 0.522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0891701877117157, + "rewards/margins": 0.9865264892578125, + "rewards/rejected": -1.0756967067718506, + "step": 2622 + }, + { + "epoch": 0.3, + "learning_rate": 2.124897576963596e-07, + "logits/chosen": -3.1700446605682373, + "logits/rejected": -2.8612446784973145, + "logps/chosen": -149.87327575683594, + "logps/rejected": -246.7994384765625, + "loss": 0.6418, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04152717813849449, + "rewards/margins": 0.6933844089508057, + "rewards/rejected": -0.7349116206169128, + "step": 2623 + }, + { + "epoch": 0.3, + "learning_rate": 2.1245464122673534e-07, + "logits/chosen": -2.560467004776001, + "logits/rejected": -2.861370086669922, + "logps/chosen": -234.38888549804688, + "logps/rejected": -249.68020629882812, + "loss": 0.3292, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.295142263174057, + "rewards/margins": 1.493542194366455, + "rewards/rejected": -1.1983999013900757, + "step": 2624 + }, + { + "epoch": 0.3, + "learning_rate": 2.1241952475711107e-07, + "logits/chosen": -3.5084965229034424, + "logits/rejected": -3.4222421646118164, + "logps/chosen": -242.83255004882812, + "logps/rejected": -213.56703186035156, + "loss": 0.2488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3930262327194214, + "rewards/margins": 1.8862134218215942, + "rewards/rejected": -1.4931873083114624, + "step": 2625 + }, + { + "epoch": 0.3, + "learning_rate": 2.1238440828748682e-07, + "logits/chosen": -2.7609899044036865, + "logits/rejected": -2.445192813873291, + "logps/chosen": -261.17474365234375, + "logps/rejected": -216.25392150878906, + "loss": 0.4826, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5984484553337097, + "rewards/margins": 0.617791473865509, + "rewards/rejected": -1.2162399291992188, + "step": 2626 + }, + { + "epoch": 0.3, + "learning_rate": 2.1234929181786255e-07, + "logits/chosen": -3.1811728477478027, + "logits/rejected": -3.130852222442627, + "logps/chosen": -364.94818115234375, + "logps/rejected": -268.77203369140625, + "loss": 0.2916, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2814651429653168, + "rewards/margins": 1.9705853462219238, + "rewards/rejected": -1.6891200542449951, + "step": 2627 + }, + { + "epoch": 0.3, + "learning_rate": 2.123141753482383e-07, + "logits/chosen": -2.4331860542297363, + "logits/rejected": -2.4532604217529297, + "logps/chosen": -286.90264892578125, + "logps/rejected": -229.51002502441406, + "loss": 0.2558, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6661115884780884, + "rewards/margins": 1.7842493057250977, + "rewards/rejected": -1.1181378364562988, + "step": 2628 + }, + { + "epoch": 0.3, + "learning_rate": 2.1227905887861406e-07, + "logits/chosen": -3.2210826873779297, + "logits/rejected": -3.2571213245391846, + "logps/chosen": -233.7112579345703, + "logps/rejected": -191.5620574951172, + "loss": 0.4412, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0010215938091278076, + "rewards/margins": 1.6510913372039795, + "rewards/rejected": -1.6521129608154297, + "step": 2629 + }, + { + "epoch": 0.3, + "learning_rate": 2.1224394240898979e-07, + "logits/chosen": -3.6039528846740723, + "logits/rejected": -3.635333299636841, + "logps/chosen": -201.91702270507812, + "logps/rejected": -206.30783081054688, + "loss": 0.2237, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35813838243484497, + "rewards/margins": 2.510892629623413, + "rewards/rejected": -2.152754306793213, + "step": 2630 + }, + { + "epoch": 0.3, + "learning_rate": 2.1220882593936557e-07, + "logits/chosen": -2.989579677581787, + "logits/rejected": -2.7776458263397217, + "logps/chosen": -391.1034240722656, + "logps/rejected": -455.87969970703125, + "loss": 0.5709, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36003822088241577, + "rewards/margins": 0.9418476223945618, + "rewards/rejected": -1.301885724067688, + "step": 2631 + }, + { + "epoch": 0.3, + "learning_rate": 2.1217370946974132e-07, + "logits/chosen": -3.085770845413208, + "logits/rejected": -2.9609150886535645, + "logps/chosen": -343.3042297363281, + "logps/rejected": -253.92869567871094, + "loss": 0.2088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1349964290857315, + "rewards/margins": 2.3497674465179443, + "rewards/rejected": -2.4847636222839355, + "step": 2632 + }, + { + "epoch": 0.3, + "learning_rate": 2.1213859300011705e-07, + "logits/chosen": -3.0040388107299805, + "logits/rejected": -3.138188362121582, + "logps/chosen": -356.5177001953125, + "logps/rejected": -225.9423370361328, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18886470794677734, + "rewards/margins": 1.668954849243164, + "rewards/rejected": -1.4800900220870972, + "step": 2633 + }, + { + "epoch": 0.3, + "learning_rate": 2.121034765304928e-07, + "logits/chosen": -3.32328462600708, + "logits/rejected": -2.9965553283691406, + "logps/chosen": -102.29368591308594, + "logps/rejected": -192.2423095703125, + "loss": 0.3921, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13508011400699615, + "rewards/margins": 1.2319879531860352, + "rewards/rejected": -1.3670681715011597, + "step": 2634 + }, + { + "epoch": 0.3, + "learning_rate": 2.1206836006086853e-07, + "logits/chosen": -3.0822315216064453, + "logits/rejected": -3.226738452911377, + "logps/chosen": -210.1138458251953, + "logps/rejected": -296.59503173828125, + "loss": 0.5019, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37011346220970154, + "rewards/margins": 1.0705934762954712, + "rewards/rejected": -0.700480043888092, + "step": 2635 + }, + { + "epoch": 0.3, + "learning_rate": 2.1203324359124428e-07, + "logits/chosen": -2.6529245376586914, + "logits/rejected": -2.863008499145508, + "logps/chosen": -273.86395263671875, + "logps/rejected": -112.42547607421875, + "loss": 0.5312, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4667474627494812, + "rewards/margins": 0.6671906113624573, + "rewards/rejected": -1.1339380741119385, + "step": 2636 + }, + { + "epoch": 0.3, + "learning_rate": 2.1199812712162004e-07, + "logits/chosen": -3.348909854888916, + "logits/rejected": -3.4405832290649414, + "logps/chosen": -245.3507537841797, + "logps/rejected": -328.1563720703125, + "loss": 0.1842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31078895926475525, + "rewards/margins": 2.1293139457702637, + "rewards/rejected": -2.440103054046631, + "step": 2637 + }, + { + "epoch": 0.3, + "learning_rate": 2.1196301065199576e-07, + "logits/chosen": -2.7455739974975586, + "logits/rejected": -2.6644973754882812, + "logps/chosen": -148.25282287597656, + "logps/rejected": -186.3067626953125, + "loss": 0.5144, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22366288304328918, + "rewards/margins": 0.8049038052558899, + "rewards/rejected": -0.5812409520149231, + "step": 2638 + }, + { + "epoch": 0.3, + "learning_rate": 2.1192789418237152e-07, + "logits/chosen": -1.7965762615203857, + "logits/rejected": -1.701200008392334, + "logps/chosen": -346.4522705078125, + "logps/rejected": -255.2733154296875, + "loss": 0.5984, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11234212666749954, + "rewards/margins": 0.9633269906044006, + "rewards/rejected": -0.8509848117828369, + "step": 2639 + }, + { + "epoch": 0.3, + "learning_rate": 2.1189277771274727e-07, + "logits/chosen": -3.385784864425659, + "logits/rejected": -3.2045576572418213, + "logps/chosen": -312.9085998535156, + "logps/rejected": -281.45745849609375, + "loss": 0.1073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27522900700569153, + "rewards/margins": 2.7667925357818604, + "rewards/rejected": -2.491563558578491, + "step": 2640 + }, + { + "epoch": 0.3, + "learning_rate": 2.11857661243123e-07, + "logits/chosen": -2.315624713897705, + "logits/rejected": -2.173530340194702, + "logps/chosen": -332.1817626953125, + "logps/rejected": -288.607421875, + "loss": 0.4544, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02453383058309555, + "rewards/margins": 1.3337042331695557, + "rewards/rejected": -1.3091704845428467, + "step": 2641 + }, + { + "epoch": 0.3, + "learning_rate": 2.1182254477349878e-07, + "logits/chosen": -2.7607760429382324, + "logits/rejected": -2.784654140472412, + "logps/chosen": -176.8272705078125, + "logps/rejected": -262.67938232421875, + "loss": 0.2918, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2575886845588684, + "rewards/margins": 1.6384268999099731, + "rewards/rejected": -1.380838394165039, + "step": 2642 + }, + { + "epoch": 0.3, + "learning_rate": 2.117874283038745e-07, + "logits/chosen": -3.110285758972168, + "logits/rejected": -3.1426384449005127, + "logps/chosen": -157.165283203125, + "logps/rejected": -244.91180419921875, + "loss": 0.4884, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07096487283706665, + "rewards/margins": 1.6198042631149292, + "rewards/rejected": -1.5488393306732178, + "step": 2643 + }, + { + "epoch": 0.3, + "learning_rate": 2.1175231183425026e-07, + "logits/chosen": -2.538911819458008, + "logits/rejected": -2.7284297943115234, + "logps/chosen": -152.67164611816406, + "logps/rejected": -122.08198547363281, + "loss": 0.5502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9580534100532532, + "rewards/margins": 1.063791036605835, + "rewards/rejected": -2.0218443870544434, + "step": 2644 + }, + { + "epoch": 0.3, + "learning_rate": 2.1171719536462601e-07, + "logits/chosen": -2.8962833881378174, + "logits/rejected": -3.2447714805603027, + "logps/chosen": -232.14337158203125, + "logps/rejected": -260.5798034667969, + "loss": 0.3313, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2283482402563095, + "rewards/margins": 2.361008882522583, + "rewards/rejected": -2.132660388946533, + "step": 2645 + }, + { + "epoch": 0.31, + "learning_rate": 2.1168207889500174e-07, + "logits/chosen": -2.660217046737671, + "logits/rejected": -2.7193503379821777, + "logps/chosen": -367.6368408203125, + "logps/rejected": -191.890869140625, + "loss": 0.4567, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6010500192642212, + "rewards/margins": 1.9173550605773926, + "rewards/rejected": -1.3163049221038818, + "step": 2646 + }, + { + "epoch": 0.31, + "learning_rate": 2.116469624253775e-07, + "logits/chosen": -2.9813153743743896, + "logits/rejected": -3.253883123397827, + "logps/chosen": -499.7625427246094, + "logps/rejected": -322.88751220703125, + "loss": 0.6191, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021469667553901672, + "rewards/margins": 0.857175350189209, + "rewards/rejected": -0.8357056379318237, + "step": 2647 + }, + { + "epoch": 0.31, + "learning_rate": 2.1161184595575325e-07, + "logits/chosen": -2.829207420349121, + "logits/rejected": -2.7558043003082275, + "logps/chosen": -277.6553649902344, + "logps/rejected": -334.64935302734375, + "loss": 0.4406, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06831994652748108, + "rewards/margins": 2.1513161659240723, + "rewards/rejected": -2.082996129989624, + "step": 2648 + }, + { + "epoch": 0.31, + "learning_rate": 2.1157672948612898e-07, + "logits/chosen": -3.97103214263916, + "logits/rejected": -3.8434014320373535, + "logps/chosen": -100.18197631835938, + "logps/rejected": -83.30469512939453, + "loss": 0.4779, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3769177496433258, + "rewards/margins": 0.8938929438591003, + "rewards/rejected": -0.5169751644134521, + "step": 2649 + }, + { + "epoch": 0.31, + "learning_rate": 2.1154161301650473e-07, + "logits/chosen": -3.3420674800872803, + "logits/rejected": -3.755899667739868, + "logps/chosen": -275.7237243652344, + "logps/rejected": -316.19622802734375, + "loss": 0.3554, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23422250151634216, + "rewards/margins": 2.5390076637268066, + "rewards/rejected": -2.7732300758361816, + "step": 2650 + }, + { + "epoch": 0.31, + "learning_rate": 2.1150649654688046e-07, + "logits/chosen": -3.2168405055999756, + "logits/rejected": -3.186497211456299, + "logps/chosen": -249.2946014404297, + "logps/rejected": -274.6036376953125, + "loss": 0.6596, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6279729008674622, + "rewards/margins": 0.36376339197158813, + "rewards/rejected": -0.9917362928390503, + "step": 2651 + }, + { + "epoch": 0.31, + "learning_rate": 2.114713800772562e-07, + "logits/chosen": -3.032636880874634, + "logits/rejected": -3.1316819190979004, + "logps/chosen": -368.83294677734375, + "logps/rejected": -294.6126708984375, + "loss": 0.2222, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10298866778612137, + "rewards/margins": 2.1913034915924072, + "rewards/rejected": -2.088315010070801, + "step": 2652 + }, + { + "epoch": 0.31, + "learning_rate": 2.11436263607632e-07, + "logits/chosen": -2.524991512298584, + "logits/rejected": -2.8310399055480957, + "logps/chosen": -155.92784118652344, + "logps/rejected": -240.8963623046875, + "loss": 0.5906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24226030707359314, + "rewards/margins": 1.8797134160995483, + "rewards/rejected": -2.121973752975464, + "step": 2653 + }, + { + "epoch": 0.31, + "learning_rate": 2.1140114713800772e-07, + "logits/chosen": -3.855231285095215, + "logits/rejected": -3.8131346702575684, + "logps/chosen": -201.9010467529297, + "logps/rejected": -158.7369384765625, + "loss": 0.5394, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3431272804737091, + "rewards/margins": 1.4349701404571533, + "rewards/rejected": -1.0918428897857666, + "step": 2654 + }, + { + "epoch": 0.31, + "learning_rate": 2.1136603066838347e-07, + "logits/chosen": -2.9801840782165527, + "logits/rejected": -3.0134928226470947, + "logps/chosen": -206.4908447265625, + "logps/rejected": -212.1171112060547, + "loss": 0.3557, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24139977991580963, + "rewards/margins": 1.6338372230529785, + "rewards/rejected": -1.39243745803833, + "step": 2655 + }, + { + "epoch": 0.31, + "learning_rate": 2.1133091419875923e-07, + "logits/chosen": -3.3574025630950928, + "logits/rejected": -3.284484386444092, + "logps/chosen": -355.9624328613281, + "logps/rejected": -294.3044738769531, + "loss": 0.2593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.051383137702941895, + "rewards/margins": 2.213636875152588, + "rewards/rejected": -2.1622536182403564, + "step": 2656 + }, + { + "epoch": 0.31, + "learning_rate": 2.1129579772913495e-07, + "logits/chosen": -2.746053695678711, + "logits/rejected": -2.828361749649048, + "logps/chosen": -256.38055419921875, + "logps/rejected": -281.7858581542969, + "loss": 0.1683, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8860880732536316, + "rewards/margins": 3.565058469772339, + "rewards/rejected": -2.6789703369140625, + "step": 2657 + }, + { + "epoch": 0.31, + "learning_rate": 2.112606812595107e-07, + "logits/chosen": -3.005098342895508, + "logits/rejected": -2.9978652000427246, + "logps/chosen": -260.446044921875, + "logps/rejected": -244.42608642578125, + "loss": 0.3022, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13928157091140747, + "rewards/margins": 1.4200856685638428, + "rewards/rejected": -1.28080415725708, + "step": 2658 + }, + { + "epoch": 0.31, + "learning_rate": 2.1122556478988644e-07, + "logits/chosen": -3.190901517868042, + "logits/rejected": -2.9818334579467773, + "logps/chosen": -306.5211486816406, + "logps/rejected": -280.48028564453125, + "loss": 0.3138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07627400755882263, + "rewards/margins": 2.1277923583984375, + "rewards/rejected": -2.204066514968872, + "step": 2659 + }, + { + "epoch": 0.31, + "learning_rate": 2.111904483202622e-07, + "logits/chosen": -3.647137403488159, + "logits/rejected": -3.631355047225952, + "logps/chosen": -259.9970703125, + "logps/rejected": -317.7362365722656, + "loss": 0.6997, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4353281557559967, + "rewards/margins": 1.0021271705627441, + "rewards/rejected": -1.4374552965164185, + "step": 2660 + }, + { + "epoch": 0.31, + "learning_rate": 2.1115533185063794e-07, + "logits/chosen": -3.617340564727783, + "logits/rejected": -4.030972957611084, + "logps/chosen": -198.78048706054688, + "logps/rejected": -331.6608581542969, + "loss": 0.4359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0650661289691925, + "rewards/margins": 3.150432586669922, + "rewards/rejected": -3.215498447418213, + "step": 2661 + }, + { + "epoch": 0.31, + "learning_rate": 2.1112021538101367e-07, + "logits/chosen": -2.8107078075408936, + "logits/rejected": -2.870521068572998, + "logps/chosen": -265.8528137207031, + "logps/rejected": -264.05474853515625, + "loss": 0.5018, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25905323028564453, + "rewards/margins": 0.8668520450592041, + "rewards/rejected": -0.6077988743782043, + "step": 2662 + }, + { + "epoch": 0.31, + "learning_rate": 2.1108509891138942e-07, + "logits/chosen": -3.0048599243164062, + "logits/rejected": -3.1161508560180664, + "logps/chosen": -132.58041381835938, + "logps/rejected": -160.42922973632812, + "loss": 0.376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24053612351417542, + "rewards/margins": 1.4679057598114014, + "rewards/rejected": -1.7084418535232544, + "step": 2663 + }, + { + "epoch": 0.31, + "learning_rate": 2.1104998244176515e-07, + "logits/chosen": -2.939281702041626, + "logits/rejected": -2.7568531036376953, + "logps/chosen": -217.7142791748047, + "logps/rejected": -264.04083251953125, + "loss": 0.3856, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2692546844482422, + "rewards/margins": 1.320460557937622, + "rewards/rejected": -1.0512057542800903, + "step": 2664 + }, + { + "epoch": 0.31, + "learning_rate": 2.1101486597214093e-07, + "logits/chosen": -2.4977235794067383, + "logits/rejected": -2.569164752960205, + "logps/chosen": -307.8876037597656, + "logps/rejected": -216.93710327148438, + "loss": 0.5921, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.00397057831287384, + "rewards/margins": 1.6792693138122559, + "rewards/rejected": -1.6832399368286133, + "step": 2665 + }, + { + "epoch": 0.31, + "learning_rate": 2.1097974950251669e-07, + "logits/chosen": -2.9322047233581543, + "logits/rejected": -2.8850135803222656, + "logps/chosen": -153.75851440429688, + "logps/rejected": -199.24948120117188, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1545974165201187, + "rewards/margins": 0.6758066415786743, + "rewards/rejected": -0.8304040431976318, + "step": 2666 + }, + { + "epoch": 0.31, + "learning_rate": 2.109446330328924e-07, + "logits/chosen": -2.508617877960205, + "logits/rejected": -2.6317267417907715, + "logps/chosen": -391.77093505859375, + "logps/rejected": -262.9716491699219, + "loss": 0.2904, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14566461741924286, + "rewards/margins": 1.6080398559570312, + "rewards/rejected": -1.462375283241272, + "step": 2667 + }, + { + "epoch": 0.31, + "learning_rate": 2.1090951656326817e-07, + "logits/chosen": -2.4445652961730957, + "logits/rejected": -2.621253252029419, + "logps/chosen": -383.74267578125, + "logps/rejected": -253.97607421875, + "loss": 0.2879, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47786152362823486, + "rewards/margins": 2.01947021484375, + "rewards/rejected": -1.5416086912155151, + "step": 2668 + }, + { + "epoch": 0.31, + "learning_rate": 2.1087440009364392e-07, + "logits/chosen": -2.8375632762908936, + "logits/rejected": -2.8650431632995605, + "logps/chosen": -395.013916015625, + "logps/rejected": -225.9221649169922, + "loss": 0.6505, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47935783863067627, + "rewards/margins": 1.5428438186645508, + "rewards/rejected": -2.0222015380859375, + "step": 2669 + }, + { + "epoch": 0.31, + "learning_rate": 2.1083928362401965e-07, + "logits/chosen": -3.415860176086426, + "logits/rejected": -3.3173723220825195, + "logps/chosen": -180.45108032226562, + "logps/rejected": -195.68460083007812, + "loss": 0.5787, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3478144109249115, + "rewards/margins": 1.1470049619674683, + "rewards/rejected": -1.4948192834854126, + "step": 2670 + }, + { + "epoch": 0.31, + "learning_rate": 2.108041671543954e-07, + "logits/chosen": -3.7328004837036133, + "logits/rejected": -3.8640236854553223, + "logps/chosen": -92.90747833251953, + "logps/rejected": -189.60092163085938, + "loss": 0.2147, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1304924190044403, + "rewards/margins": 2.4503655433654785, + "rewards/rejected": -2.319873332977295, + "step": 2671 + }, + { + "epoch": 0.31, + "learning_rate": 2.1076905068477113e-07, + "logits/chosen": -3.435408353805542, + "logits/rejected": -3.4077329635620117, + "logps/chosen": -103.98704528808594, + "logps/rejected": -138.59429931640625, + "loss": 0.375, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014223292469978333, + "rewards/margins": 1.1510636806488037, + "rewards/rejected": -1.1368404626846313, + "step": 2672 + }, + { + "epoch": 0.31, + "learning_rate": 2.1073393421514688e-07, + "logits/chosen": -3.4270856380462646, + "logits/rejected": -3.6255593299865723, + "logps/chosen": -99.92672729492188, + "logps/rejected": -123.4222412109375, + "loss": 0.48, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1662251055240631, + "rewards/margins": 1.2893134355545044, + "rewards/rejected": -1.4555386304855347, + "step": 2673 + }, + { + "epoch": 0.31, + "learning_rate": 2.1069881774552264e-07, + "logits/chosen": -3.5459532737731934, + "logits/rejected": -3.3756797313690186, + "logps/chosen": -281.7789306640625, + "logps/rejected": -275.4245300292969, + "loss": 0.7326, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.160372257232666, + "rewards/margins": 0.7282190322875977, + "rewards/rejected": -1.8885912895202637, + "step": 2674 + }, + { + "epoch": 0.31, + "learning_rate": 2.1066370127589836e-07, + "logits/chosen": -3.8656203746795654, + "logits/rejected": -3.727492332458496, + "logps/chosen": -232.46302795410156, + "logps/rejected": -183.06394958496094, + "loss": 0.4531, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3305974006652832, + "rewards/margins": 0.8914026618003845, + "rewards/rejected": -1.2220001220703125, + "step": 2675 + }, + { + "epoch": 0.31, + "learning_rate": 2.1062858480627414e-07, + "logits/chosen": -3.6443703174591064, + "logits/rejected": -3.1740593910217285, + "logps/chosen": -300.6732177734375, + "logps/rejected": -213.2513885498047, + "loss": 0.1587, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48693275451660156, + "rewards/margins": 2.5872771739959717, + "rewards/rejected": -2.10034441947937, + "step": 2676 + }, + { + "epoch": 0.31, + "learning_rate": 2.105934683366499e-07, + "logits/chosen": -3.6283326148986816, + "logits/rejected": -3.6519641876220703, + "logps/chosen": -315.93853759765625, + "logps/rejected": -424.2310791015625, + "loss": 0.395, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3778654932975769, + "rewards/margins": 1.5352487564086914, + "rewards/rejected": -1.9131141901016235, + "step": 2677 + }, + { + "epoch": 0.31, + "learning_rate": 2.1055835186702563e-07, + "logits/chosen": -2.7422573566436768, + "logits/rejected": -2.8363497257232666, + "logps/chosen": -355.2220458984375, + "logps/rejected": -262.51263427734375, + "loss": 0.3191, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04066774249076843, + "rewards/margins": 1.8454184532165527, + "rewards/rejected": -1.804750680923462, + "step": 2678 + }, + { + "epoch": 0.31, + "learning_rate": 2.1052323539740138e-07, + "logits/chosen": -2.541379451751709, + "logits/rejected": -2.456173896789551, + "logps/chosen": -324.5509338378906, + "logps/rejected": -393.8587341308594, + "loss": 0.3133, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02267785370349884, + "rewards/margins": 1.363777995109558, + "rewards/rejected": -1.3411000967025757, + "step": 2679 + }, + { + "epoch": 0.31, + "learning_rate": 2.104881189277771e-07, + "logits/chosen": -3.055565357208252, + "logits/rejected": -2.951294183731079, + "logps/chosen": -152.54299926757812, + "logps/rejected": -198.94424438476562, + "loss": 0.1704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2504234313964844, + "rewards/margins": 3.040806293487549, + "rewards/rejected": -2.7903826236724854, + "step": 2680 + }, + { + "epoch": 0.31, + "learning_rate": 2.1045300245815286e-07, + "logits/chosen": -2.892068386077881, + "logits/rejected": -2.700523614883423, + "logps/chosen": -202.7470703125, + "logps/rejected": -188.39675903320312, + "loss": 0.7224, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3664044141769409, + "rewards/margins": 0.3669832944869995, + "rewards/rejected": -0.7333877086639404, + "step": 2681 + }, + { + "epoch": 0.31, + "learning_rate": 2.1041788598852861e-07, + "logits/chosen": -2.5932185649871826, + "logits/rejected": -2.5721709728240967, + "logps/chosen": -244.40576171875, + "logps/rejected": -265.55419921875, + "loss": 0.4802, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8531277179718018, + "rewards/margins": 0.8602733612060547, + "rewards/rejected": -1.7134010791778564, + "step": 2682 + }, + { + "epoch": 0.31, + "learning_rate": 2.1038276951890434e-07, + "logits/chosen": -2.5868353843688965, + "logits/rejected": -2.671602249145508, + "logps/chosen": -199.66122436523438, + "logps/rejected": -393.555419921875, + "loss": 0.3582, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3236641585826874, + "rewards/margins": 3.164863109588623, + "rewards/rejected": -2.841198682785034, + "step": 2683 + }, + { + "epoch": 0.31, + "learning_rate": 2.103476530492801e-07, + "logits/chosen": -2.8019237518310547, + "logits/rejected": -2.676260232925415, + "logps/chosen": -161.70999145507812, + "logps/rejected": -196.5008544921875, + "loss": 0.494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46975642442703247, + "rewards/margins": 1.6134014129638672, + "rewards/rejected": -2.083158016204834, + "step": 2684 + }, + { + "epoch": 0.31, + "learning_rate": 2.1031253657965588e-07, + "logits/chosen": -2.8804428577423096, + "logits/rejected": -2.9575161933898926, + "logps/chosen": -196.95571899414062, + "logps/rejected": -206.85055541992188, + "loss": 0.3039, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00013312697410583496, + "rewards/margins": 1.4372384548187256, + "rewards/rejected": -1.4371052980422974, + "step": 2685 + }, + { + "epoch": 0.31, + "learning_rate": 2.1027742011003158e-07, + "logits/chosen": -2.6479103565216064, + "logits/rejected": -2.7668538093566895, + "logps/chosen": -404.8021545410156, + "logps/rejected": -376.9189453125, + "loss": 0.5218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28654834628105164, + "rewards/margins": 2.0833852291107178, + "rewards/rejected": -2.369933605194092, + "step": 2686 + }, + { + "epoch": 0.31, + "learning_rate": 2.1024230364040736e-07, + "logits/chosen": -2.721830368041992, + "logits/rejected": -2.6887221336364746, + "logps/chosen": -479.88433837890625, + "logps/rejected": -330.96258544921875, + "loss": 0.1944, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.529760479927063, + "rewards/margins": 2.816356658935547, + "rewards/rejected": -2.2865958213806152, + "step": 2687 + }, + { + "epoch": 0.31, + "learning_rate": 2.1020718717078309e-07, + "logits/chosen": -3.017599582672119, + "logits/rejected": -2.7422571182250977, + "logps/chosen": -574.6094360351562, + "logps/rejected": -324.6213684082031, + "loss": 0.3177, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11848056316375732, + "rewards/margins": 1.5183645486831665, + "rewards/rejected": -1.3998838663101196, + "step": 2688 + }, + { + "epoch": 0.31, + "learning_rate": 2.1017207070115884e-07, + "logits/chosen": -2.825831174850464, + "logits/rejected": -2.6222283840179443, + "logps/chosen": -247.04901123046875, + "logps/rejected": -203.26644897460938, + "loss": 0.623, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5861104130744934, + "rewards/margins": 0.5202609896659851, + "rewards/rejected": -1.1063714027404785, + "step": 2689 + }, + { + "epoch": 0.31, + "learning_rate": 2.101369542315346e-07, + "logits/chosen": -3.076383590698242, + "logits/rejected": -3.1518568992614746, + "logps/chosen": -454.2213439941406, + "logps/rejected": -299.42901611328125, + "loss": 0.2745, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47566550970077515, + "rewards/margins": 2.2643818855285645, + "rewards/rejected": -1.7887163162231445, + "step": 2690 + }, + { + "epoch": 0.31, + "learning_rate": 2.1010183776191032e-07, + "logits/chosen": -3.569575309753418, + "logits/rejected": -3.248314142227173, + "logps/chosen": -285.6998291015625, + "logps/rejected": -250.03831481933594, + "loss": 0.3628, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2644709646701813, + "rewards/margins": 1.9379606246948242, + "rewards/rejected": -1.6734895706176758, + "step": 2691 + }, + { + "epoch": 0.31, + "learning_rate": 2.1006672129228607e-07, + "logits/chosen": -3.544778347015381, + "logits/rejected": -3.777670383453369, + "logps/chosen": -258.34637451171875, + "logps/rejected": -272.9873352050781, + "loss": 0.5142, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6007353067398071, + "rewards/margins": 1.0859975814819336, + "rewards/rejected": -1.6867328882217407, + "step": 2692 + }, + { + "epoch": 0.31, + "learning_rate": 2.1003160482266183e-07, + "logits/chosen": -3.152451992034912, + "logits/rejected": -3.0305657386779785, + "logps/chosen": -345.98406982421875, + "logps/rejected": -336.1103515625, + "loss": 0.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.412387490272522, + "rewards/margins": 2.871246814727783, + "rewards/rejected": -2.4588592052459717, + "step": 2693 + }, + { + "epoch": 0.31, + "learning_rate": 2.0999648835303756e-07, + "logits/chosen": -3.2075588703155518, + "logits/rejected": -3.689764976501465, + "logps/chosen": -313.42242431640625, + "logps/rejected": -169.81350708007812, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5771796703338623, + "rewards/margins": 0.5056952238082886, + "rewards/rejected": -1.0828747749328613, + "step": 2694 + }, + { + "epoch": 0.31, + "learning_rate": 2.099613718834133e-07, + "logits/chosen": -2.710972547531128, + "logits/rejected": -2.895327091217041, + "logps/chosen": -350.9754638671875, + "logps/rejected": -232.68905639648438, + "loss": 0.4867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.036527350544929504, + "rewards/margins": 1.3880201578140259, + "rewards/rejected": -1.4245474338531494, + "step": 2695 + }, + { + "epoch": 0.31, + "learning_rate": 2.0992625541378904e-07, + "logits/chosen": -2.858287811279297, + "logits/rejected": -3.0463531017303467, + "logps/chosen": -313.5277404785156, + "logps/rejected": -379.3931579589844, + "loss": 0.4762, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31277066469192505, + "rewards/margins": 1.923231601715088, + "rewards/rejected": -2.236002206802368, + "step": 2696 + }, + { + "epoch": 0.31, + "learning_rate": 2.098911389441648e-07, + "logits/chosen": -3.0413875579833984, + "logits/rejected": -3.0731945037841797, + "logps/chosen": -278.4690856933594, + "logps/rejected": -399.407958984375, + "loss": 0.1736, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4262908101081848, + "rewards/margins": 3.117654323577881, + "rewards/rejected": -2.69136381149292, + "step": 2697 + }, + { + "epoch": 0.31, + "learning_rate": 2.0985602247454057e-07, + "logits/chosen": -3.3376400470733643, + "logits/rejected": -2.549386501312256, + "logps/chosen": -261.89373779296875, + "logps/rejected": -132.71653747558594, + "loss": 0.6113, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14452692866325378, + "rewards/margins": 0.4512198567390442, + "rewards/rejected": -0.5957468152046204, + "step": 2698 + }, + { + "epoch": 0.31, + "learning_rate": 2.098209060049163e-07, + "logits/chosen": -2.8589749336242676, + "logits/rejected": -3.1143991947174072, + "logps/chosen": -240.55950927734375, + "logps/rejected": -253.86822509765625, + "loss": 0.5735, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25031960010528564, + "rewards/margins": 0.9200965166091919, + "rewards/rejected": -1.1704161167144775, + "step": 2699 + }, + { + "epoch": 0.31, + "learning_rate": 2.0978578953529205e-07, + "logits/chosen": -3.168196678161621, + "logits/rejected": -3.3921029567718506, + "logps/chosen": -116.16915130615234, + "logps/rejected": -297.52691650390625, + "loss": 0.5029, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3023165166378021, + "rewards/margins": 1.0676770210266113, + "rewards/rejected": -1.3699935674667358, + "step": 2700 + }, + { + "epoch": 0.31, + "learning_rate": 2.097506730656678e-07, + "logits/chosen": -3.9658989906311035, + "logits/rejected": -3.6875076293945312, + "logps/chosen": -161.37594604492188, + "logps/rejected": -195.8558349609375, + "loss": 0.81, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12227240204811096, + "rewards/margins": 1.3459267616271973, + "rewards/rejected": -1.4681992530822754, + "step": 2701 + }, + { + "epoch": 0.31, + "learning_rate": 2.0971555659604353e-07, + "logits/chosen": -2.984095811843872, + "logits/rejected": -3.294487476348877, + "logps/chosen": -370.7571716308594, + "logps/rejected": -295.5445251464844, + "loss": 0.5289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05452115833759308, + "rewards/margins": 0.6429303884506226, + "rewards/rejected": -0.6974514722824097, + "step": 2702 + }, + { + "epoch": 0.31, + "learning_rate": 2.096804401264193e-07, + "logits/chosen": -2.801766872406006, + "logits/rejected": -2.8464720249176025, + "logps/chosen": -214.14891052246094, + "logps/rejected": -252.0380096435547, + "loss": 0.3709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11697833240032196, + "rewards/margins": 1.5305709838867188, + "rewards/rejected": -1.6475491523742676, + "step": 2703 + }, + { + "epoch": 0.31, + "learning_rate": 2.0964532365679501e-07, + "logits/chosen": -2.698483943939209, + "logits/rejected": -2.8396835327148438, + "logps/chosen": -370.0849609375, + "logps/rejected": -252.1450958251953, + "loss": 0.1364, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08998577296733856, + "rewards/margins": 2.55472469329834, + "rewards/rejected": -2.4647388458251953, + "step": 2704 + }, + { + "epoch": 0.31, + "learning_rate": 2.0961020718717077e-07, + "logits/chosen": -3.1203620433807373, + "logits/rejected": -3.3811442852020264, + "logps/chosen": -209.79498291015625, + "logps/rejected": -337.44732666015625, + "loss": 0.3616, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20113682746887207, + "rewards/margins": 2.4834189414978027, + "rewards/rejected": -2.2822821140289307, + "step": 2705 + }, + { + "epoch": 0.31, + "learning_rate": 2.0957509071754652e-07, + "logits/chosen": -2.432766914367676, + "logits/rejected": -2.4980971813201904, + "logps/chosen": -319.7143859863281, + "logps/rejected": -274.7018737792969, + "loss": 0.431, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6734658479690552, + "rewards/margins": 1.1623632907867432, + "rewards/rejected": -1.8358290195465088, + "step": 2706 + }, + { + "epoch": 0.31, + "learning_rate": 2.0953997424792225e-07, + "logits/chosen": -2.96154522895813, + "logits/rejected": -2.9254560470581055, + "logps/chosen": -310.79266357421875, + "logps/rejected": -329.29315185546875, + "loss": 0.4107, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35735946893692017, + "rewards/margins": 2.5898733139038086, + "rewards/rejected": -2.232513904571533, + "step": 2707 + }, + { + "epoch": 0.31, + "learning_rate": 2.09504857778298e-07, + "logits/chosen": -2.600987434387207, + "logits/rejected": -2.3445005416870117, + "logps/chosen": -406.8525390625, + "logps/rejected": -251.35968017578125, + "loss": 0.3385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05882759392261505, + "rewards/margins": 1.2079129219055176, + "rewards/rejected": -1.2667404413223267, + "step": 2708 + }, + { + "epoch": 0.31, + "learning_rate": 2.0946974130867373e-07, + "logits/chosen": -3.034916877746582, + "logits/rejected": -2.986525774002075, + "logps/chosen": -152.0174560546875, + "logps/rejected": -325.87103271484375, + "loss": 0.5273, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12014269828796387, + "rewards/margins": 1.3156898021697998, + "rewards/rejected": -1.4358325004577637, + "step": 2709 + }, + { + "epoch": 0.31, + "learning_rate": 2.094346248390495e-07, + "logits/chosen": -2.9180924892425537, + "logits/rejected": -3.030306100845337, + "logps/chosen": -335.950927734375, + "logps/rejected": -276.42486572265625, + "loss": 0.5605, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10493157804012299, + "rewards/margins": 1.2087838649749756, + "rewards/rejected": -1.1038521528244019, + "step": 2710 + }, + { + "epoch": 0.31, + "learning_rate": 2.0939950836942526e-07, + "logits/chosen": -3.11173415184021, + "logits/rejected": -3.039310932159424, + "logps/chosen": -160.8915557861328, + "logps/rejected": -87.95643615722656, + "loss": 0.8762, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7014458775520325, + "rewards/margins": 0.0006791055202484131, + "rewards/rejected": -0.7021249532699585, + "step": 2711 + }, + { + "epoch": 0.31, + "learning_rate": 2.09364391899801e-07, + "logits/chosen": -3.666139602661133, + "logits/rejected": -3.310901403427124, + "logps/chosen": -215.253173828125, + "logps/rejected": -81.03885650634766, + "loss": 0.988, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6212292909622192, + "rewards/margins": 0.3523818552494049, + "rewards/rejected": -0.9736111164093018, + "step": 2712 + }, + { + "epoch": 0.31, + "learning_rate": 2.0932927543017675e-07, + "logits/chosen": -2.936321496963501, + "logits/rejected": -3.2686567306518555, + "logps/chosen": -147.0630340576172, + "logps/rejected": -218.07647705078125, + "loss": 0.2062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5394455194473267, + "rewards/margins": 2.531698226928711, + "rewards/rejected": -1.9922525882720947, + "step": 2713 + }, + { + "epoch": 0.31, + "learning_rate": 2.092941589605525e-07, + "logits/chosen": -2.443145751953125, + "logits/rejected": -2.5913243293762207, + "logps/chosen": -179.33511352539062, + "logps/rejected": -228.26394653320312, + "loss": 0.5721, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3532053828239441, + "rewards/margins": 0.7078384757041931, + "rewards/rejected": -1.0610438585281372, + "step": 2714 + }, + { + "epoch": 0.31, + "learning_rate": 2.0925904249092823e-07, + "logits/chosen": -2.9418888092041016, + "logits/rejected": -3.1111104488372803, + "logps/chosen": -305.97149658203125, + "logps/rejected": -284.12255859375, + "loss": 0.3957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06284767389297485, + "rewards/margins": 1.6648646593093872, + "rewards/rejected": -1.7277122735977173, + "step": 2715 + }, + { + "epoch": 0.31, + "learning_rate": 2.0922392602130398e-07, + "logits/chosen": -2.8024463653564453, + "logits/rejected": -2.684154987335205, + "logps/chosen": -346.1974182128906, + "logps/rejected": -294.5653076171875, + "loss": 0.5388, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1463080495595932, + "rewards/margins": 1.6074026823043823, + "rewards/rejected": -1.461094856262207, + "step": 2716 + }, + { + "epoch": 0.31, + "learning_rate": 2.091888095516797e-07, + "logits/chosen": -2.924553871154785, + "logits/rejected": -3.0525951385498047, + "logps/chosen": -266.85491943359375, + "logps/rejected": -217.34909057617188, + "loss": 0.3502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08117246627807617, + "rewards/margins": 2.7042860984802246, + "rewards/rejected": -2.78545880317688, + "step": 2717 + }, + { + "epoch": 0.31, + "learning_rate": 2.0915369308205546e-07, + "logits/chosen": -2.6071321964263916, + "logits/rejected": -2.5663018226623535, + "logps/chosen": -390.0578308105469, + "logps/rejected": -307.9854736328125, + "loss": 0.5368, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30872267484664917, + "rewards/margins": 1.0495226383209229, + "rewards/rejected": -1.3582452535629272, + "step": 2718 + }, + { + "epoch": 0.31, + "learning_rate": 2.0911857661243124e-07, + "logits/chosen": -2.9327125549316406, + "logits/rejected": -2.772413730621338, + "logps/chosen": -336.01568603515625, + "logps/rejected": -425.3703308105469, + "loss": 0.3806, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1350601315498352, + "rewards/margins": 1.0636364221572876, + "rewards/rejected": -0.9285762310028076, + "step": 2719 + }, + { + "epoch": 0.31, + "learning_rate": 2.0908346014280694e-07, + "logits/chosen": -3.5731375217437744, + "logits/rejected": -3.603288412094116, + "logps/chosen": -271.04486083984375, + "logps/rejected": -201.68218994140625, + "loss": 0.2945, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2796739339828491, + "rewards/margins": 2.1193509101867676, + "rewards/rejected": -1.839676856994629, + "step": 2720 + }, + { + "epoch": 0.31, + "learning_rate": 2.0904834367318272e-07, + "logits/chosen": -3.2600088119506836, + "logits/rejected": -2.971642017364502, + "logps/chosen": -156.05426025390625, + "logps/rejected": -189.95968627929688, + "loss": 0.8102, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6407157182693481, + "rewards/margins": 0.3939170837402344, + "rewards/rejected": -1.0346328020095825, + "step": 2721 + }, + { + "epoch": 0.31, + "learning_rate": 2.0901322720355848e-07, + "logits/chosen": -4.045426368713379, + "logits/rejected": -3.636168956756592, + "logps/chosen": -308.62548828125, + "logps/rejected": -167.6669921875, + "loss": 0.3873, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19946065545082092, + "rewards/margins": 1.5284388065338135, + "rewards/rejected": -1.3289780616760254, + "step": 2722 + }, + { + "epoch": 0.31, + "learning_rate": 2.089781107339342e-07, + "logits/chosen": -2.959185838699341, + "logits/rejected": -3.1417200565338135, + "logps/chosen": -321.5123596191406, + "logps/rejected": -275.5606384277344, + "loss": 0.6014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11313152313232422, + "rewards/margins": 0.844805121421814, + "rewards/rejected": -0.9579366445541382, + "step": 2723 + }, + { + "epoch": 0.31, + "learning_rate": 2.0894299426430996e-07, + "logits/chosen": -3.3365931510925293, + "logits/rejected": -3.814706325531006, + "logps/chosen": -267.4439392089844, + "logps/rejected": -366.5554504394531, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7470001578330994, + "rewards/margins": 5.158563137054443, + "rewards/rejected": -4.411562442779541, + "step": 2724 + }, + { + "epoch": 0.31, + "learning_rate": 2.0890787779468569e-07, + "logits/chosen": -2.8866372108459473, + "logits/rejected": -3.149526834487915, + "logps/chosen": -148.21160888671875, + "logps/rejected": -210.41029357910156, + "loss": 0.4718, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10630086064338684, + "rewards/margins": 1.1432745456695557, + "rewards/rejected": -1.2495753765106201, + "step": 2725 + }, + { + "epoch": 0.31, + "learning_rate": 2.0887276132506144e-07, + "logits/chosen": -3.232724905014038, + "logits/rejected": -2.647637367248535, + "logps/chosen": -271.8125915527344, + "logps/rejected": -185.46218872070312, + "loss": 0.6035, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5131202340126038, + "rewards/margins": 0.8920029401779175, + "rewards/rejected": -1.4051231145858765, + "step": 2726 + }, + { + "epoch": 0.31, + "learning_rate": 2.088376448554372e-07, + "logits/chosen": -2.9529480934143066, + "logits/rejected": -3.0681285858154297, + "logps/chosen": -175.5751190185547, + "logps/rejected": -304.4449157714844, + "loss": 0.657, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2673693001270294, + "rewards/margins": 0.7302308678627014, + "rewards/rejected": -0.997600257396698, + "step": 2727 + }, + { + "epoch": 0.31, + "learning_rate": 2.0880252838581292e-07, + "logits/chosen": -2.607029438018799, + "logits/rejected": -2.7416880130767822, + "logps/chosen": -347.989013671875, + "logps/rejected": -281.9116516113281, + "loss": 0.093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15197797119617462, + "rewards/margins": 3.1610162258148193, + "rewards/rejected": -3.0090384483337402, + "step": 2728 + }, + { + "epoch": 0.31, + "learning_rate": 2.0876741191618868e-07, + "logits/chosen": -3.648715019226074, + "logits/rejected": -3.5327646732330322, + "logps/chosen": -212.24200439453125, + "logps/rejected": -179.3642578125, + "loss": 0.3657, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.011868447065353394, + "rewards/margins": 2.103546142578125, + "rewards/rejected": -2.115414619445801, + "step": 2729 + }, + { + "epoch": 0.31, + "learning_rate": 2.0873229544656446e-07, + "logits/chosen": -3.250645399093628, + "logits/rejected": -3.0961387157440186, + "logps/chosen": -333.5579833984375, + "logps/rejected": -286.25653076171875, + "loss": 0.1929, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3795652389526367, + "rewards/margins": 2.307131052017212, + "rewards/rejected": -1.927565574645996, + "step": 2730 + }, + { + "epoch": 0.31, + "learning_rate": 2.0869717897694016e-07, + "logits/chosen": -3.1712851524353027, + "logits/rejected": -2.850739002227783, + "logps/chosen": -303.90948486328125, + "logps/rejected": -318.20599365234375, + "loss": 0.2678, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1232505738735199, + "rewards/margins": 1.605102300643921, + "rewards/rejected": -1.7283529043197632, + "step": 2731 + }, + { + "epoch": 0.31, + "learning_rate": 2.0866206250731594e-07, + "logits/chosen": -3.1232945919036865, + "logits/rejected": -2.68102765083313, + "logps/chosen": -292.6473083496094, + "logps/rejected": -271.8005676269531, + "loss": 0.398, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03885522484779358, + "rewards/margins": 1.6380221843719482, + "rewards/rejected": -1.5991668701171875, + "step": 2732 + }, + { + "epoch": 0.32, + "learning_rate": 2.0862694603769166e-07, + "logits/chosen": -2.438480854034424, + "logits/rejected": -2.4167518615722656, + "logps/chosen": -462.29327392578125, + "logps/rejected": -441.25799560546875, + "loss": 0.4538, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03119295835494995, + "rewards/margins": 1.4113497734069824, + "rewards/rejected": -1.3801567554473877, + "step": 2733 + }, + { + "epoch": 0.32, + "learning_rate": 2.0859182956806742e-07, + "logits/chosen": -3.5268936157226562, + "logits/rejected": -3.4741175174713135, + "logps/chosen": -241.01690673828125, + "logps/rejected": -149.72216796875, + "loss": 0.3056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.334955096244812, + "rewards/margins": 2.012882709503174, + "rewards/rejected": -2.3478379249572754, + "step": 2734 + }, + { + "epoch": 0.32, + "learning_rate": 2.0855671309844317e-07, + "logits/chosen": -3.6205995082855225, + "logits/rejected": -3.635786533355713, + "logps/chosen": -207.8341827392578, + "logps/rejected": -207.9324951171875, + "loss": 0.6389, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.062294840812683105, + "rewards/margins": 0.6435567140579224, + "rewards/rejected": -0.5812618732452393, + "step": 2735 + }, + { + "epoch": 0.32, + "learning_rate": 2.085215966288189e-07, + "logits/chosen": -2.772484302520752, + "logits/rejected": -2.720337390899658, + "logps/chosen": -296.5679626464844, + "logps/rejected": -242.85589599609375, + "loss": 0.4225, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3278283178806305, + "rewards/margins": 1.6699044704437256, + "rewards/rejected": -1.342076063156128, + "step": 2736 + }, + { + "epoch": 0.32, + "learning_rate": 2.0848648015919465e-07, + "logits/chosen": -2.7820305824279785, + "logits/rejected": -2.8744516372680664, + "logps/chosen": -287.03204345703125, + "logps/rejected": -233.9835205078125, + "loss": 0.7323, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6748775243759155, + "rewards/margins": 1.3397912979125977, + "rewards/rejected": -2.0146689414978027, + "step": 2737 + }, + { + "epoch": 0.32, + "learning_rate": 2.084513636895704e-07, + "logits/chosen": -3.0112247467041016, + "logits/rejected": -3.2624130249023438, + "logps/chosen": -232.97894287109375, + "logps/rejected": -292.38720703125, + "loss": 0.25, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1521078199148178, + "rewards/margins": 2.252563953399658, + "rewards/rejected": -2.1004562377929688, + "step": 2738 + }, + { + "epoch": 0.32, + "learning_rate": 2.0841624721994613e-07, + "logits/chosen": -2.42109751701355, + "logits/rejected": -2.6753807067871094, + "logps/chosen": -214.0558624267578, + "logps/rejected": -324.70733642578125, + "loss": 0.882, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0905417650938034, + "rewards/margins": 0.23565584421157837, + "rewards/rejected": -0.32619768381118774, + "step": 2739 + }, + { + "epoch": 0.32, + "learning_rate": 2.083811307503219e-07, + "logits/chosen": -2.8686671257019043, + "logits/rejected": -2.8672688007354736, + "logps/chosen": -232.95529174804688, + "logps/rejected": -292.22552490234375, + "loss": 0.2575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020238623023033142, + "rewards/margins": 1.858863353729248, + "rewards/rejected": -1.8791018724441528, + "step": 2740 + }, + { + "epoch": 0.32, + "learning_rate": 2.0834601428069762e-07, + "logits/chosen": -3.703939437866211, + "logits/rejected": -3.371593952178955, + "logps/chosen": -302.5903625488281, + "logps/rejected": -235.95545959472656, + "loss": 0.5862, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09825241565704346, + "rewards/margins": 0.8189486265182495, + "rewards/rejected": -0.917201042175293, + "step": 2741 + }, + { + "epoch": 0.32, + "learning_rate": 2.0831089781107337e-07, + "logits/chosen": -2.69975209236145, + "logits/rejected": -3.0724833011627197, + "logps/chosen": -229.7742156982422, + "logps/rejected": -231.9947509765625, + "loss": 0.3098, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3144189715385437, + "rewards/margins": 2.1158127784729004, + "rewards/rejected": -1.801393747329712, + "step": 2742 + }, + { + "epoch": 0.32, + "learning_rate": 2.0827578134144915e-07, + "logits/chosen": -3.380939245223999, + "logits/rejected": -3.7571020126342773, + "logps/chosen": -148.39996337890625, + "logps/rejected": -160.85269165039062, + "loss": 0.3082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12602221965789795, + "rewards/margins": 1.8812940120697021, + "rewards/rejected": -1.7552719116210938, + "step": 2743 + }, + { + "epoch": 0.32, + "learning_rate": 2.0824066487182488e-07, + "logits/chosen": -2.8378567695617676, + "logits/rejected": -2.9269256591796875, + "logps/chosen": -484.6377868652344, + "logps/rejected": -449.677734375, + "loss": 0.3796, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0651644766330719, + "rewards/margins": 1.9449522495269775, + "rewards/rejected": -1.8797876834869385, + "step": 2744 + }, + { + "epoch": 0.32, + "learning_rate": 2.0820554840220063e-07, + "logits/chosen": -3.15161395072937, + "logits/rejected": -2.8768653869628906, + "logps/chosen": -261.294921875, + "logps/rejected": -246.78607177734375, + "loss": 0.6687, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5186463594436646, + "rewards/margins": 0.7705070972442627, + "rewards/rejected": -1.2891535758972168, + "step": 2745 + }, + { + "epoch": 0.32, + "learning_rate": 2.0817043193257639e-07, + "logits/chosen": -3.969273805618286, + "logits/rejected": -3.685112476348877, + "logps/chosen": -307.9874572753906, + "logps/rejected": -215.06178283691406, + "loss": 0.3798, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12640725076198578, + "rewards/margins": 1.3675227165222168, + "rewards/rejected": -1.2411154508590698, + "step": 2746 + }, + { + "epoch": 0.32, + "learning_rate": 2.081353154629521e-07, + "logits/chosen": -2.0217998027801514, + "logits/rejected": -2.2769839763641357, + "logps/chosen": -457.26739501953125, + "logps/rejected": -249.7937469482422, + "loss": 0.6075, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17885425686836243, + "rewards/margins": 0.5669353008270264, + "rewards/rejected": -0.7457895278930664, + "step": 2747 + }, + { + "epoch": 0.32, + "learning_rate": 2.0810019899332787e-07, + "logits/chosen": -3.174325942993164, + "logits/rejected": -3.2788195610046387, + "logps/chosen": -317.87310791015625, + "logps/rejected": -322.45166015625, + "loss": 0.3397, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18599393963813782, + "rewards/margins": 1.6060364246368408, + "rewards/rejected": -1.4200422763824463, + "step": 2748 + }, + { + "epoch": 0.32, + "learning_rate": 2.080650825237036e-07, + "logits/chosen": -3.426868200302124, + "logits/rejected": -3.4244260787963867, + "logps/chosen": -161.81173706054688, + "logps/rejected": -268.3340148925781, + "loss": 0.3584, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.020454153418540955, + "rewards/margins": 1.8233460187911987, + "rewards/rejected": -1.8438003063201904, + "step": 2749 + }, + { + "epoch": 0.32, + "learning_rate": 2.0802996605407935e-07, + "logits/chosen": -2.4972128868103027, + "logits/rejected": -2.537996292114258, + "logps/chosen": -182.00807189941406, + "logps/rejected": -166.27622985839844, + "loss": 0.558, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4161045253276825, + "rewards/margins": 0.886556088924408, + "rewards/rejected": -0.4704515337944031, + "step": 2750 + }, + { + "epoch": 0.32, + "learning_rate": 2.079948495844551e-07, + "logits/chosen": -3.267488718032837, + "logits/rejected": -3.155862808227539, + "logps/chosen": -194.3003387451172, + "logps/rejected": -231.23731994628906, + "loss": 0.3789, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06987342238426208, + "rewards/margins": 1.602082371711731, + "rewards/rejected": -1.6719558238983154, + "step": 2751 + }, + { + "epoch": 0.32, + "learning_rate": 2.0795973311483083e-07, + "logits/chosen": -2.986726760864258, + "logits/rejected": -3.078307867050171, + "logps/chosen": -269.416748046875, + "logps/rejected": -257.9798889160156, + "loss": 0.6469, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6248037815093994, + "rewards/margins": 0.4553123414516449, + "rewards/rejected": -1.0801160335540771, + "step": 2752 + }, + { + "epoch": 0.32, + "learning_rate": 2.079246166452066e-07, + "logits/chosen": -2.354102611541748, + "logits/rejected": -2.6754047870635986, + "logps/chosen": -321.333251953125, + "logps/rejected": -125.45684051513672, + "loss": 0.3747, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016368567943572998, + "rewards/margins": 0.8395527601242065, + "rewards/rejected": -0.8231841325759888, + "step": 2753 + }, + { + "epoch": 0.32, + "learning_rate": 2.0788950017558236e-07, + "logits/chosen": -2.1088876724243164, + "logits/rejected": -1.9361817836761475, + "logps/chosen": -321.8093566894531, + "logps/rejected": -336.0527648925781, + "loss": 0.3652, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.003968283534049988, + "rewards/margins": 1.2352946996688843, + "rewards/rejected": -1.2313264608383179, + "step": 2754 + }, + { + "epoch": 0.32, + "learning_rate": 2.078543837059581e-07, + "logits/chosen": -3.2046656608581543, + "logits/rejected": -2.6928887367248535, + "logps/chosen": -192.21133422851562, + "logps/rejected": -243.51084899902344, + "loss": 0.4363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14992228150367737, + "rewards/margins": 0.8116161823272705, + "rewards/rejected": -0.9615384340286255, + "step": 2755 + }, + { + "epoch": 0.32, + "learning_rate": 2.0781926723633384e-07, + "logits/chosen": -2.474734306335449, + "logits/rejected": -2.4608330726623535, + "logps/chosen": -262.71148681640625, + "logps/rejected": -251.32855224609375, + "loss": 0.2938, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13674023747444153, + "rewards/margins": 1.8013579845428467, + "rewards/rejected": -1.664617657661438, + "step": 2756 + }, + { + "epoch": 0.32, + "learning_rate": 2.0778415076670957e-07, + "logits/chosen": -2.46675968170166, + "logits/rejected": -2.7378435134887695, + "logps/chosen": -218.49761962890625, + "logps/rejected": -258.50616455078125, + "loss": 0.4627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20747564733028412, + "rewards/margins": 1.7474000453948975, + "rewards/rejected": -1.9548757076263428, + "step": 2757 + }, + { + "epoch": 0.32, + "learning_rate": 2.0774903429708533e-07, + "logits/chosen": -2.959700107574463, + "logits/rejected": -3.173567771911621, + "logps/chosen": -257.94110107421875, + "logps/rejected": -202.59756469726562, + "loss": 0.5463, + "rewards/accuracies": 0.75, + "rewards/chosen": 8.59573483467102e-05, + "rewards/margins": 0.7057973742485046, + "rewards/rejected": -0.705711305141449, + "step": 2758 + }, + { + "epoch": 0.32, + "learning_rate": 2.0771391782746108e-07, + "logits/chosen": -2.595471143722534, + "logits/rejected": -2.696042060852051, + "logps/chosen": -323.83038330078125, + "logps/rejected": -326.8404541015625, + "loss": 0.5636, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28396448493003845, + "rewards/margins": 1.4943914413452148, + "rewards/rejected": -1.7783560752868652, + "step": 2759 + }, + { + "epoch": 0.32, + "learning_rate": 2.076788013578368e-07, + "logits/chosen": -3.587393283843994, + "logits/rejected": -3.5495142936706543, + "logps/chosen": -138.2574005126953, + "logps/rejected": -197.32254028320312, + "loss": 0.2899, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5710176825523376, + "rewards/margins": 2.0169577598571777, + "rewards/rejected": -1.4459400177001953, + "step": 2760 + }, + { + "epoch": 0.32, + "learning_rate": 2.0764368488821256e-07, + "logits/chosen": -2.503964424133301, + "logits/rejected": -2.3353896141052246, + "logps/chosen": -277.02130126953125, + "logps/rejected": -181.201416015625, + "loss": 0.3995, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08484504371881485, + "rewards/margins": 1.4066998958587646, + "rewards/rejected": -1.4915450811386108, + "step": 2761 + }, + { + "epoch": 0.32, + "learning_rate": 2.076085684185883e-07, + "logits/chosen": -3.5791070461273193, + "logits/rejected": -3.2346675395965576, + "logps/chosen": -330.312744140625, + "logps/rejected": -341.7666931152344, + "loss": 0.5586, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4902004599571228, + "rewards/margins": 1.5270874500274658, + "rewards/rejected": -2.0172882080078125, + "step": 2762 + }, + { + "epoch": 0.32, + "learning_rate": 2.0757345194896404e-07, + "logits/chosen": -3.3797836303710938, + "logits/rejected": -3.712411403656006, + "logps/chosen": -243.3679656982422, + "logps/rejected": -208.43618774414062, + "loss": 0.381, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.051359936594963074, + "rewards/margins": 2.0788116455078125, + "rewards/rejected": -2.027451515197754, + "step": 2763 + }, + { + "epoch": 0.32, + "learning_rate": 2.0753833547933982e-07, + "logits/chosen": -2.7436885833740234, + "logits/rejected": -2.8331990242004395, + "logps/chosen": -218.19393920898438, + "logps/rejected": -211.0985107421875, + "loss": 0.3063, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15266427397727966, + "rewards/margins": 1.4829375743865967, + "rewards/rejected": -1.3302732706069946, + "step": 2764 + }, + { + "epoch": 0.32, + "learning_rate": 2.0750321900971552e-07, + "logits/chosen": -3.272725820541382, + "logits/rejected": -3.317335605621338, + "logps/chosen": -221.25201416015625, + "logps/rejected": -351.0550231933594, + "loss": 0.1994, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40522170066833496, + "rewards/margins": 2.570573568344116, + "rewards/rejected": -2.1653518676757812, + "step": 2765 + }, + { + "epoch": 0.32, + "learning_rate": 2.074681025400913e-07, + "logits/chosen": -3.786918878555298, + "logits/rejected": -3.7598862648010254, + "logps/chosen": -221.38348388671875, + "logps/rejected": -287.49920654296875, + "loss": 0.3288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20213985443115234, + "rewards/margins": 2.4138193130493164, + "rewards/rejected": -2.6159591674804688, + "step": 2766 + }, + { + "epoch": 0.32, + "learning_rate": 2.0743298607046706e-07, + "logits/chosen": -2.935328960418701, + "logits/rejected": -3.161776065826416, + "logps/chosen": -266.54510498046875, + "logps/rejected": -200.47454833984375, + "loss": 0.6284, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5107806324958801, + "rewards/margins": 1.097649097442627, + "rewards/rejected": -1.6084296703338623, + "step": 2767 + }, + { + "epoch": 0.32, + "learning_rate": 2.0739786960084278e-07, + "logits/chosen": -3.1065666675567627, + "logits/rejected": -3.133519411087036, + "logps/chosen": -208.64923095703125, + "logps/rejected": -164.1324462890625, + "loss": 0.3468, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.267901748418808, + "rewards/margins": 1.2319071292877197, + "rewards/rejected": -0.9640053510665894, + "step": 2768 + }, + { + "epoch": 0.32, + "learning_rate": 2.0736275313121854e-07, + "logits/chosen": -3.0599682331085205, + "logits/rejected": -3.438636064529419, + "logps/chosen": -126.22867584228516, + "logps/rejected": -323.46881103515625, + "loss": 0.1529, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11474213004112244, + "rewards/margins": 5.443012237548828, + "rewards/rejected": -5.328269958496094, + "step": 2769 + }, + { + "epoch": 0.32, + "learning_rate": 2.0732763666159427e-07, + "logits/chosen": -2.548311948776245, + "logits/rejected": -2.6767163276672363, + "logps/chosen": -273.38616943359375, + "logps/rejected": -240.0062713623047, + "loss": 0.3454, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2300184965133667, + "rewards/margins": 1.1098237037658691, + "rewards/rejected": -0.8798051476478577, + "step": 2770 + }, + { + "epoch": 0.32, + "learning_rate": 2.0729252019197002e-07, + "logits/chosen": -2.5090506076812744, + "logits/rejected": -3.0814170837402344, + "logps/chosen": -130.109130859375, + "logps/rejected": -165.18136596679688, + "loss": 0.6606, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07737506181001663, + "rewards/margins": 1.5364673137664795, + "rewards/rejected": -1.613842248916626, + "step": 2771 + }, + { + "epoch": 0.32, + "learning_rate": 2.0725740372234577e-07, + "logits/chosen": -2.8526933193206787, + "logits/rejected": -2.7082886695861816, + "logps/chosen": -211.28614807128906, + "logps/rejected": -276.4790954589844, + "loss": 0.4884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22731192409992218, + "rewards/margins": 1.1562751531600952, + "rewards/rejected": -1.383587121963501, + "step": 2772 + }, + { + "epoch": 0.32, + "learning_rate": 2.072222872527215e-07, + "logits/chosen": -3.091115951538086, + "logits/rejected": -3.334754467010498, + "logps/chosen": -289.0819396972656, + "logps/rejected": -285.72930908203125, + "loss": 0.3764, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12595519423484802, + "rewards/margins": 1.5188894271850586, + "rewards/rejected": -1.3929342031478882, + "step": 2773 + }, + { + "epoch": 0.32, + "learning_rate": 2.0718717078309725e-07, + "logits/chosen": -3.1591386795043945, + "logits/rejected": -3.301708936691284, + "logps/chosen": -128.5341339111328, + "logps/rejected": -224.38369750976562, + "loss": 0.3912, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.048657774925231934, + "rewards/margins": 1.9820069074630737, + "rewards/rejected": -2.0306649208068848, + "step": 2774 + }, + { + "epoch": 0.32, + "learning_rate": 2.0715205431347304e-07, + "logits/chosen": -2.909804344177246, + "logits/rejected": -3.3829526901245117, + "logps/chosen": -144.20606994628906, + "logps/rejected": -278.80078125, + "loss": 0.3897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14013497531414032, + "rewards/margins": 2.7419135570526123, + "rewards/rejected": -2.8820486068725586, + "step": 2775 + }, + { + "epoch": 0.32, + "learning_rate": 2.0711693784384874e-07, + "logits/chosen": -2.79217529296875, + "logits/rejected": -2.8686933517456055, + "logps/chosen": -139.03973388671875, + "logps/rejected": -317.21795654296875, + "loss": 0.3667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05444703996181488, + "rewards/margins": 2.672330617904663, + "rewards/rejected": -2.7267777919769287, + "step": 2776 + }, + { + "epoch": 0.32, + "learning_rate": 2.0708182137422452e-07, + "logits/chosen": -2.554117202758789, + "logits/rejected": -2.5956289768218994, + "logps/chosen": -413.4727783203125, + "logps/rejected": -317.3256530761719, + "loss": 0.3032, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07327502965927124, + "rewards/margins": 1.98712158203125, + "rewards/rejected": -1.9138466119766235, + "step": 2777 + }, + { + "epoch": 0.32, + "learning_rate": 2.0704670490460024e-07, + "logits/chosen": -2.46449613571167, + "logits/rejected": -2.649974822998047, + "logps/chosen": -416.12799072265625, + "logps/rejected": -309.6974182128906, + "loss": 0.4816, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34918850660324097, + "rewards/margins": 1.780132532119751, + "rewards/rejected": -1.4309438467025757, + "step": 2778 + }, + { + "epoch": 0.32, + "learning_rate": 2.07011588434976e-07, + "logits/chosen": -3.181380271911621, + "logits/rejected": -3.1958529949188232, + "logps/chosen": -235.20460510253906, + "logps/rejected": -182.61184692382812, + "loss": 0.3187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024216219782829285, + "rewards/margins": 1.2108616828918457, + "rewards/rejected": -1.2350780963897705, + "step": 2779 + }, + { + "epoch": 0.32, + "learning_rate": 2.0697647196535175e-07, + "logits/chosen": -3.7424967288970947, + "logits/rejected": -3.408094882965088, + "logps/chosen": -135.8411407470703, + "logps/rejected": -173.82858276367188, + "loss": 0.2339, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12902896106243134, + "rewards/margins": 2.2972166538238525, + "rewards/rejected": -2.1681878566741943, + "step": 2780 + }, + { + "epoch": 0.32, + "learning_rate": 2.0694135549572748e-07, + "logits/chosen": -2.892028331756592, + "logits/rejected": -2.8821470737457275, + "logps/chosen": -240.58624267578125, + "logps/rejected": -223.71792602539062, + "loss": 0.5902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6565995216369629, + "rewards/margins": 1.448161244392395, + "rewards/rejected": -2.1047606468200684, + "step": 2781 + }, + { + "epoch": 0.32, + "learning_rate": 2.0690623902610323e-07, + "logits/chosen": -3.3885231018066406, + "logits/rejected": -3.45500111579895, + "logps/chosen": -266.329833984375, + "logps/rejected": -124.86180877685547, + "loss": 0.6957, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24081218242645264, + "rewards/margins": 1.135359764099121, + "rewards/rejected": -1.3761719465255737, + "step": 2782 + }, + { + "epoch": 0.32, + "learning_rate": 2.0687112255647899e-07, + "logits/chosen": -2.9476523399353027, + "logits/rejected": -3.3540115356445312, + "logps/chosen": -206.1102752685547, + "logps/rejected": -278.2396240234375, + "loss": 0.1709, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33596497774124146, + "rewards/margins": 2.307621955871582, + "rewards/rejected": -1.9716570377349854, + "step": 2783 + }, + { + "epoch": 0.32, + "learning_rate": 2.0683600608685471e-07, + "logits/chosen": -3.0456900596618652, + "logits/rejected": -3.1665358543395996, + "logps/chosen": -352.8702087402344, + "logps/rejected": -287.72796630859375, + "loss": 0.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14109206199645996, + "rewards/margins": 3.021542549133301, + "rewards/rejected": -3.16263484954834, + "step": 2784 + }, + { + "epoch": 0.32, + "learning_rate": 2.0680088961723047e-07, + "logits/chosen": -3.062130928039551, + "logits/rejected": -3.2230515480041504, + "logps/chosen": -163.9191131591797, + "logps/rejected": -199.23892211914062, + "loss": 0.4972, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09201288968324661, + "rewards/margins": 1.4529011249542236, + "rewards/rejected": -1.5449140071868896, + "step": 2785 + }, + { + "epoch": 0.32, + "learning_rate": 2.067657731476062e-07, + "logits/chosen": -2.8953661918640137, + "logits/rejected": -2.7664976119995117, + "logps/chosen": -230.72195434570312, + "logps/rejected": -211.48867797851562, + "loss": 0.4531, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14753729104995728, + "rewards/margins": 1.2386822700500488, + "rewards/rejected": -1.0911450386047363, + "step": 2786 + }, + { + "epoch": 0.32, + "learning_rate": 2.0673065667798198e-07, + "logits/chosen": -4.048678398132324, + "logits/rejected": -3.816167116165161, + "logps/chosen": -160.11972045898438, + "logps/rejected": -149.6986541748047, + "loss": 0.7961, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3054066598415375, + "rewards/margins": 0.9730106592178345, + "rewards/rejected": -1.2784172296524048, + "step": 2787 + }, + { + "epoch": 0.32, + "learning_rate": 2.0669554020835773e-07, + "logits/chosen": -2.844991683959961, + "logits/rejected": -2.9796016216278076, + "logps/chosen": -180.3058624267578, + "logps/rejected": -466.3212890625, + "loss": 0.3845, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06598556041717529, + "rewards/margins": 1.9717737436294556, + "rewards/rejected": -1.9057883024215698, + "step": 2788 + }, + { + "epoch": 0.32, + "learning_rate": 2.0666042373873346e-07, + "logits/chosen": -3.070272445678711, + "logits/rejected": -3.1582517623901367, + "logps/chosen": -260.7377014160156, + "logps/rejected": -177.84510803222656, + "loss": 0.8234, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8787546753883362, + "rewards/margins": -0.21073278784751892, + "rewards/rejected": -0.6680218577384949, + "step": 2789 + }, + { + "epoch": 0.32, + "learning_rate": 2.066253072691092e-07, + "logits/chosen": -3.138500213623047, + "logits/rejected": -2.7797598838806152, + "logps/chosen": -315.20343017578125, + "logps/rejected": -388.59283447265625, + "loss": 0.2333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2606803774833679, + "rewards/margins": 1.995790958404541, + "rewards/rejected": -2.2564713954925537, + "step": 2790 + }, + { + "epoch": 0.32, + "learning_rate": 2.0659019079948496e-07, + "logits/chosen": -2.8877885341644287, + "logits/rejected": -3.1086130142211914, + "logps/chosen": -294.93609619140625, + "logps/rejected": -224.794921875, + "loss": 0.3392, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5658361911773682, + "rewards/margins": 2.3008816242218018, + "rewards/rejected": -1.7350455522537231, + "step": 2791 + }, + { + "epoch": 0.32, + "learning_rate": 2.065550743298607e-07, + "logits/chosen": -3.656029224395752, + "logits/rejected": -3.45927095413208, + "logps/chosen": -233.8111572265625, + "logps/rejected": -216.8090362548828, + "loss": 0.5791, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.552493691444397, + "rewards/margins": 1.6606298685073853, + "rewards/rejected": -2.213123321533203, + "step": 2792 + }, + { + "epoch": 0.32, + "learning_rate": 2.0651995786023645e-07, + "logits/chosen": -3.5274083614349365, + "logits/rejected": -3.4105100631713867, + "logps/chosen": -196.47276306152344, + "logps/rejected": -199.6464080810547, + "loss": 0.3224, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5164499878883362, + "rewards/margins": 2.451303720474243, + "rewards/rejected": -1.9348536729812622, + "step": 2793 + }, + { + "epoch": 0.32, + "learning_rate": 2.0648484139061217e-07, + "logits/chosen": -3.0147829055786133, + "logits/rejected": -2.6869168281555176, + "logps/chosen": -394.6365661621094, + "logps/rejected": -337.3128662109375, + "loss": 0.2182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10059966146945953, + "rewards/margins": 2.2805778980255127, + "rewards/rejected": -2.3811776638031006, + "step": 2794 + }, + { + "epoch": 0.32, + "learning_rate": 2.0644972492098793e-07, + "logits/chosen": -3.406358242034912, + "logits/rejected": -3.2000293731689453, + "logps/chosen": -259.47711181640625, + "logps/rejected": -237.4225616455078, + "loss": 0.3314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.025759266689419746, + "rewards/margins": 1.5062122344970703, + "rewards/rejected": -1.5319716930389404, + "step": 2795 + }, + { + "epoch": 0.32, + "learning_rate": 2.0641460845136368e-07, + "logits/chosen": -3.4297091960906982, + "logits/rejected": -3.576140880584717, + "logps/chosen": -277.160888671875, + "logps/rejected": -592.968017578125, + "loss": 0.3405, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10689663141965866, + "rewards/margins": 2.199827194213867, + "rewards/rejected": -2.092930555343628, + "step": 2796 + }, + { + "epoch": 0.32, + "learning_rate": 2.063794919817394e-07, + "logits/chosen": -3.271301746368408, + "logits/rejected": -3.4515151977539062, + "logps/chosen": -228.8732452392578, + "logps/rejected": -218.2284393310547, + "loss": 0.5952, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.722952127456665, + "rewards/margins": 0.5262311100959778, + "rewards/rejected": -1.249183177947998, + "step": 2797 + }, + { + "epoch": 0.32, + "learning_rate": 2.063443755121152e-07, + "logits/chosen": -3.462169647216797, + "logits/rejected": -3.5262911319732666, + "logps/chosen": -180.69976806640625, + "logps/rejected": -245.34774780273438, + "loss": 0.3497, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2832827568054199, + "rewards/margins": 2.034862518310547, + "rewards/rejected": -2.318145275115967, + "step": 2798 + }, + { + "epoch": 0.32, + "learning_rate": 2.0630925904249094e-07, + "logits/chosen": -2.850862503051758, + "logits/rejected": -2.941068649291992, + "logps/chosen": -196.75091552734375, + "logps/rejected": -187.80892944335938, + "loss": 0.5323, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4563661813735962, + "rewards/margins": 0.8169429302215576, + "rewards/rejected": -1.2733091115951538, + "step": 2799 + }, + { + "epoch": 0.32, + "learning_rate": 2.0627414257286667e-07, + "logits/chosen": -3.767493486404419, + "logits/rejected": -3.849728584289551, + "logps/chosen": -185.01150512695312, + "logps/rejected": -310.3260498046875, + "loss": 0.2299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06735554337501526, + "rewards/margins": 2.8156516551971436, + "rewards/rejected": -2.883007049560547, + "step": 2800 + }, + { + "epoch": 0.32, + "learning_rate": 2.0623902610324242e-07, + "logits/chosen": -3.025587558746338, + "logits/rejected": -2.7623236179351807, + "logps/chosen": -162.96011352539062, + "logps/rejected": -173.7723846435547, + "loss": 0.5137, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.053438737988471985, + "rewards/margins": 0.9953893423080444, + "rewards/rejected": -1.048828125, + "step": 2801 + }, + { + "epoch": 0.32, + "learning_rate": 2.0620390963361815e-07, + "logits/chosen": -2.5162322521209717, + "logits/rejected": -2.9308698177337646, + "logps/chosen": -311.500244140625, + "logps/rejected": -310.20361328125, + "loss": 0.3918, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3595004677772522, + "rewards/margins": 1.6310007572174072, + "rewards/rejected": -1.9905012845993042, + "step": 2802 + }, + { + "epoch": 0.32, + "learning_rate": 2.061687931639939e-07, + "logits/chosen": -3.3017165660858154, + "logits/rejected": -3.515655040740967, + "logps/chosen": -176.8379669189453, + "logps/rejected": -208.91122436523438, + "loss": 0.3498, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21133531630039215, + "rewards/margins": 1.9963289499282837, + "rewards/rejected": -1.7849936485290527, + "step": 2803 + }, + { + "epoch": 0.32, + "learning_rate": 2.0613367669436966e-07, + "logits/chosen": -2.5642123222351074, + "logits/rejected": -2.490652561187744, + "logps/chosen": -448.76416015625, + "logps/rejected": -173.724853515625, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9064850807189941, + "rewards/margins": 1.7373032569885254, + "rewards/rejected": -0.8308181762695312, + "step": 2804 + }, + { + "epoch": 0.32, + "learning_rate": 2.0609856022474539e-07, + "logits/chosen": -3.548060894012451, + "logits/rejected": -3.19390869140625, + "logps/chosen": -243.50192260742188, + "logps/rejected": -245.09463500976562, + "loss": 0.4526, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09417843073606491, + "rewards/margins": 1.1738923788070679, + "rewards/rejected": -1.0797138214111328, + "step": 2805 + }, + { + "epoch": 0.32, + "learning_rate": 2.0606344375512114e-07, + "logits/chosen": -2.5123791694641113, + "logits/rejected": -2.4777791500091553, + "logps/chosen": -459.89471435546875, + "logps/rejected": -254.26239013671875, + "loss": 0.5917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14879187941551208, + "rewards/margins": 0.48147064447402954, + "rewards/rejected": -0.630262553691864, + "step": 2806 + }, + { + "epoch": 0.32, + "learning_rate": 2.0602832728549687e-07, + "logits/chosen": -2.391932249069214, + "logits/rejected": -2.4828383922576904, + "logps/chosen": -355.3644104003906, + "logps/rejected": -379.36676025390625, + "loss": 0.2492, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04231920838356018, + "rewards/margins": 2.014566421508789, + "rewards/rejected": -2.0568857192993164, + "step": 2807 + }, + { + "epoch": 0.32, + "learning_rate": 2.0599321081587262e-07, + "logits/chosen": -3.1494741439819336, + "logits/rejected": -3.4154911041259766, + "logps/chosen": -219.63803100585938, + "logps/rejected": -263.78009033203125, + "loss": 0.2959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16116540133953094, + "rewards/margins": 2.823303699493408, + "rewards/rejected": -2.984469175338745, + "step": 2808 + }, + { + "epoch": 0.32, + "learning_rate": 2.059580943462484e-07, + "logits/chosen": -3.585617780685425, + "logits/rejected": -3.779629945755005, + "logps/chosen": -391.9552001953125, + "logps/rejected": -288.7772216796875, + "loss": 0.24, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0698586106300354, + "rewards/margins": 1.5362114906311035, + "rewards/rejected": -1.466352939605713, + "step": 2809 + }, + { + "epoch": 0.32, + "learning_rate": 2.059229778766241e-07, + "logits/chosen": -2.9250593185424805, + "logits/rejected": -3.21315860748291, + "logps/chosen": -180.68344116210938, + "logps/rejected": -217.66542053222656, + "loss": 0.5711, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024542585015296936, + "rewards/margins": 0.9317641854286194, + "rewards/rejected": -0.9072216749191284, + "step": 2810 + }, + { + "epoch": 0.32, + "learning_rate": 2.0588786140699988e-07, + "logits/chosen": -3.785414934158325, + "logits/rejected": -3.5212647914886475, + "logps/chosen": -331.46636962890625, + "logps/rejected": -310.186767578125, + "loss": 0.3021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1537300944328308, + "rewards/margins": 1.9160597324371338, + "rewards/rejected": -2.0697898864746094, + "step": 2811 + }, + { + "epoch": 0.32, + "learning_rate": 2.0585274493737564e-07, + "logits/chosen": -3.8488528728485107, + "logits/rejected": -3.7776103019714355, + "logps/chosen": -181.44529724121094, + "logps/rejected": -235.0676727294922, + "loss": 0.2383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27591148018836975, + "rewards/margins": 2.671863079071045, + "rewards/rejected": -2.395951747894287, + "step": 2812 + }, + { + "epoch": 0.32, + "learning_rate": 2.0581762846775136e-07, + "logits/chosen": -2.9657726287841797, + "logits/rejected": -3.0691418647766113, + "logps/chosen": -293.22412109375, + "logps/rejected": -170.36061096191406, + "loss": 0.7891, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0776626467704773, + "rewards/margins": 0.3171939253807068, + "rewards/rejected": -0.3948565721511841, + "step": 2813 + }, + { + "epoch": 0.32, + "learning_rate": 2.0578251199812712e-07, + "logits/chosen": -3.835513114929199, + "logits/rejected": -3.7529373168945312, + "logps/chosen": -291.42938232421875, + "logps/rejected": -264.9427490234375, + "loss": 0.3544, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1858813464641571, + "rewards/margins": 1.128273606300354, + "rewards/rejected": -1.314155101776123, + "step": 2814 + }, + { + "epoch": 0.32, + "learning_rate": 2.0574739552850285e-07, + "logits/chosen": -3.304427146911621, + "logits/rejected": -3.129387617111206, + "logps/chosen": -331.84478759765625, + "logps/rejected": -178.6880340576172, + "loss": 0.4213, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19511562585830688, + "rewards/margins": 0.9570579528808594, + "rewards/rejected": -0.7619423866271973, + "step": 2815 + }, + { + "epoch": 0.32, + "learning_rate": 2.057122790588786e-07, + "logits/chosen": -3.395862102508545, + "logits/rejected": -3.4491450786590576, + "logps/chosen": -218.5532989501953, + "logps/rejected": -176.27085876464844, + "loss": 0.3029, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30331024527549744, + "rewards/margins": 1.599184274673462, + "rewards/rejected": -1.2958738803863525, + "step": 2816 + }, + { + "epoch": 0.32, + "learning_rate": 2.0567716258925435e-07, + "logits/chosen": -3.0377089977264404, + "logits/rejected": -3.1854615211486816, + "logps/chosen": -200.83856201171875, + "logps/rejected": -249.68304443359375, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07921233773231506, + "rewards/margins": 1.6125292778015137, + "rewards/rejected": -1.5333170890808105, + "step": 2817 + }, + { + "epoch": 0.32, + "learning_rate": 2.0564204611963008e-07, + "logits/chosen": -3.005790948867798, + "logits/rejected": -3.1264028549194336, + "logps/chosen": -309.28253173828125, + "logps/rejected": -301.784912109375, + "loss": 0.124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4166223704814911, + "rewards/margins": 2.5665364265441895, + "rewards/rejected": -2.149913787841797, + "step": 2818 + }, + { + "epoch": 0.32, + "learning_rate": 2.0560692965000583e-07, + "logits/chosen": -3.5147624015808105, + "logits/rejected": -3.195995569229126, + "logps/chosen": -373.454345703125, + "logps/rejected": -318.0096435546875, + "loss": 0.217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49907389283180237, + "rewards/margins": 2.2315163612365723, + "rewards/rejected": -2.7305901050567627, + "step": 2819 + }, + { + "epoch": 0.33, + "learning_rate": 2.0557181318038161e-07, + "logits/chosen": -2.7856035232543945, + "logits/rejected": -2.7912845611572266, + "logps/chosen": -140.02053833007812, + "logps/rejected": -142.15567016601562, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09772336483001709, + "rewards/margins": 0.6865450739860535, + "rewards/rejected": -0.5888217091560364, + "step": 2820 + }, + { + "epoch": 0.33, + "learning_rate": 2.0553669671075734e-07, + "logits/chosen": -3.6302576065063477, + "logits/rejected": -3.485506772994995, + "logps/chosen": -252.49411010742188, + "logps/rejected": -312.7757568359375, + "loss": 0.307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2792898714542389, + "rewards/margins": 1.900126338005066, + "rewards/rejected": -1.6208364963531494, + "step": 2821 + }, + { + "epoch": 0.33, + "learning_rate": 2.055015802411331e-07, + "logits/chosen": -3.8503851890563965, + "logits/rejected": -3.8455076217651367, + "logps/chosen": -222.40640258789062, + "logps/rejected": -268.122314453125, + "loss": 0.3039, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.012696027755737305, + "rewards/margins": 2.311507225036621, + "rewards/rejected": -2.3242032527923584, + "step": 2822 + }, + { + "epoch": 0.33, + "learning_rate": 2.0546646377150882e-07, + "logits/chosen": -3.299320697784424, + "logits/rejected": -3.4471657276153564, + "logps/chosen": -300.37921142578125, + "logps/rejected": -329.3104553222656, + "loss": 0.2131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1285678595304489, + "rewards/margins": 2.6096432209014893, + "rewards/rejected": -2.738211154937744, + "step": 2823 + }, + { + "epoch": 0.33, + "learning_rate": 2.0543134730188458e-07, + "logits/chosen": -3.3328328132629395, + "logits/rejected": -3.4418020248413086, + "logps/chosen": -111.92011260986328, + "logps/rejected": -243.8980712890625, + "loss": 0.5381, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01608555018901825, + "rewards/margins": 1.337178111076355, + "rewards/rejected": -1.3210923671722412, + "step": 2824 + }, + { + "epoch": 0.33, + "learning_rate": 2.0539623083226033e-07, + "logits/chosen": -3.667369842529297, + "logits/rejected": -3.9021668434143066, + "logps/chosen": -160.41021728515625, + "logps/rejected": -196.22976684570312, + "loss": 0.2332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08319275081157684, + "rewards/margins": 2.984401226043701, + "rewards/rejected": -3.067594051361084, + "step": 2825 + }, + { + "epoch": 0.33, + "learning_rate": 2.0536111436263606e-07, + "logits/chosen": -3.1877806186676025, + "logits/rejected": -3.57426118850708, + "logps/chosen": -132.1624298095703, + "logps/rejected": -165.39495849609375, + "loss": 0.2547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24397379159927368, + "rewards/margins": 2.3526341915130615, + "rewards/rejected": -2.1086602210998535, + "step": 2826 + }, + { + "epoch": 0.33, + "learning_rate": 2.053259978930118e-07, + "logits/chosen": -2.2200443744659424, + "logits/rejected": -2.239499807357788, + "logps/chosen": -264.8586730957031, + "logps/rejected": -207.8159942626953, + "loss": 0.2803, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33564120531082153, + "rewards/margins": 1.5681999921798706, + "rewards/rejected": -1.903841257095337, + "step": 2827 + }, + { + "epoch": 0.33, + "learning_rate": 2.0529088142338757e-07, + "logits/chosen": -3.1319541931152344, + "logits/rejected": -3.5582733154296875, + "logps/chosen": -251.1297149658203, + "logps/rejected": -203.61962890625, + "loss": 0.4657, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5164710283279419, + "rewards/margins": 1.9681981801986694, + "rewards/rejected": -2.4846692085266113, + "step": 2828 + }, + { + "epoch": 0.33, + "learning_rate": 2.052557649537633e-07, + "logits/chosen": -2.409322738647461, + "logits/rejected": -2.3061256408691406, + "logps/chosen": -262.6789855957031, + "logps/rejected": -265.337158203125, + "loss": 0.3967, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13889148831367493, + "rewards/margins": 1.3595165014266968, + "rewards/rejected": -1.2206250429153442, + "step": 2829 + }, + { + "epoch": 0.33, + "learning_rate": 2.0522064848413905e-07, + "logits/chosen": -3.533191204071045, + "logits/rejected": -3.625004768371582, + "logps/chosen": -190.8134765625, + "logps/rejected": -202.21246337890625, + "loss": 0.8439, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.18298617005348206, + "rewards/margins": -0.12491105496883392, + "rewards/rejected": -0.05807510018348694, + "step": 2830 + }, + { + "epoch": 0.33, + "learning_rate": 2.0518553201451477e-07, + "logits/chosen": -2.260848045349121, + "logits/rejected": -2.3793628215789795, + "logps/chosen": -226.67410278320312, + "logps/rejected": -203.89141845703125, + "loss": 0.7461, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41478753089904785, + "rewards/margins": 0.7413772940635681, + "rewards/rejected": -1.1561648845672607, + "step": 2831 + }, + { + "epoch": 0.33, + "learning_rate": 2.0515041554489055e-07, + "logits/chosen": -3.3377904891967773, + "logits/rejected": -3.2721962928771973, + "logps/chosen": -445.6872863769531, + "logps/rejected": -260.747314453125, + "loss": 0.3066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37181031703948975, + "rewards/margins": 1.3510496616363525, + "rewards/rejected": -1.7228599786758423, + "step": 2832 + }, + { + "epoch": 0.33, + "learning_rate": 2.051152990752663e-07, + "logits/chosen": -3.283334970474243, + "logits/rejected": -3.422372817993164, + "logps/chosen": -283.694580078125, + "logps/rejected": -213.81796264648438, + "loss": 0.4655, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10639706254005432, + "rewards/margins": 0.8843005895614624, + "rewards/rejected": -0.7779035568237305, + "step": 2833 + }, + { + "epoch": 0.33, + "learning_rate": 2.0508018260564204e-07, + "logits/chosen": -2.571668863296509, + "logits/rejected": -2.686441659927368, + "logps/chosen": -436.0262451171875, + "logps/rejected": -267.1753234863281, + "loss": 0.2666, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09400790184736252, + "rewards/margins": 1.5225231647491455, + "rewards/rejected": -1.4285151958465576, + "step": 2834 + }, + { + "epoch": 0.33, + "learning_rate": 2.050450661360178e-07, + "logits/chosen": -3.2354795932769775, + "logits/rejected": -3.2029502391815186, + "logps/chosen": -245.97073364257812, + "logps/rejected": -203.56268310546875, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29072803258895874, + "rewards/margins": 1.2958617210388184, + "rewards/rejected": -1.0051336288452148, + "step": 2835 + }, + { + "epoch": 0.33, + "learning_rate": 2.0500994966639354e-07, + "logits/chosen": -3.5662002563476562, + "logits/rejected": -3.201505422592163, + "logps/chosen": -391.55828857421875, + "logps/rejected": -431.5497131347656, + "loss": 0.3071, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27092230319976807, + "rewards/margins": 2.3845252990722656, + "rewards/rejected": -2.113602876663208, + "step": 2836 + }, + { + "epoch": 0.33, + "learning_rate": 2.0497483319676927e-07, + "logits/chosen": -3.1553406715393066, + "logits/rejected": -3.542404890060425, + "logps/chosen": -224.5789337158203, + "logps/rejected": -195.91854858398438, + "loss": 0.409, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13149698078632355, + "rewards/margins": 1.4715056419372559, + "rewards/rejected": -1.3400086164474487, + "step": 2837 + }, + { + "epoch": 0.33, + "learning_rate": 2.0493971672714503e-07, + "logits/chosen": -2.8095169067382812, + "logits/rejected": -3.3172037601470947, + "logps/chosen": -206.4317169189453, + "logps/rejected": -356.79345703125, + "loss": 0.1801, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18328090012073517, + "rewards/margins": 4.173395156860352, + "rewards/rejected": -3.990114212036133, + "step": 2838 + }, + { + "epoch": 0.33, + "learning_rate": 2.0490460025752075e-07, + "logits/chosen": -3.175493001937866, + "logits/rejected": -3.241122007369995, + "logps/chosen": -343.7049560546875, + "logps/rejected": -304.8462829589844, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6818311214447021, + "rewards/margins": 0.706986129283905, + "rewards/rejected": -1.3888171911239624, + "step": 2839 + }, + { + "epoch": 0.33, + "learning_rate": 2.048694837878965e-07, + "logits/chosen": -3.2136454582214355, + "logits/rejected": -3.106694459915161, + "logps/chosen": -335.4702453613281, + "logps/rejected": -334.1417541503906, + "loss": 0.5216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11840799450874329, + "rewards/margins": 0.747544527053833, + "rewards/rejected": -0.8659524917602539, + "step": 2840 + }, + { + "epoch": 0.33, + "learning_rate": 2.0483436731827226e-07, + "logits/chosen": -3.2005863189697266, + "logits/rejected": -3.286083221435547, + "logps/chosen": -253.33087158203125, + "logps/rejected": -349.37103271484375, + "loss": 0.2865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15498188138008118, + "rewards/margins": 2.362828016281128, + "rewards/rejected": -2.5178098678588867, + "step": 2841 + }, + { + "epoch": 0.33, + "learning_rate": 2.04799250848648e-07, + "logits/chosen": -3.661076545715332, + "logits/rejected": -3.7284059524536133, + "logps/chosen": -98.0713119506836, + "logps/rejected": -204.99378967285156, + "loss": 0.3115, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07008009403944016, + "rewards/margins": 3.4748270511627197, + "rewards/rejected": -3.4047467708587646, + "step": 2842 + }, + { + "epoch": 0.33, + "learning_rate": 2.0476413437902377e-07, + "logits/chosen": -2.9179842472076416, + "logits/rejected": -2.96488881111145, + "logps/chosen": -299.5823059082031, + "logps/rejected": -297.158203125, + "loss": 0.6925, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07056504487991333, + "rewards/margins": 0.4271971583366394, + "rewards/rejected": -0.35663214325904846, + "step": 2843 + }, + { + "epoch": 0.33, + "learning_rate": 2.0472901790939952e-07, + "logits/chosen": -3.262031316757202, + "logits/rejected": -3.5883002281188965, + "logps/chosen": -165.2972412109375, + "logps/rejected": -274.709716796875, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05183995142579079, + "rewards/margins": 2.7336294651031494, + "rewards/rejected": -2.6817893981933594, + "step": 2844 + }, + { + "epoch": 0.33, + "learning_rate": 2.0469390143977525e-07, + "logits/chosen": -3.4286673069000244, + "logits/rejected": -3.506324291229248, + "logps/chosen": -193.91348266601562, + "logps/rejected": -258.2356262207031, + "loss": 0.5467, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20075538754463196, + "rewards/margins": 1.1406350135803223, + "rewards/rejected": -1.3413903713226318, + "step": 2845 + }, + { + "epoch": 0.33, + "learning_rate": 2.04658784970151e-07, + "logits/chosen": -3.846790313720703, + "logits/rejected": -3.740144968032837, + "logps/chosen": -266.9960021972656, + "logps/rejected": -275.77166748046875, + "loss": 0.166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36887016892433167, + "rewards/margins": 2.7621688842773438, + "rewards/rejected": -2.393298864364624, + "step": 2846 + }, + { + "epoch": 0.33, + "learning_rate": 2.0462366850052673e-07, + "logits/chosen": -3.2154715061187744, + "logits/rejected": -3.0804696083068848, + "logps/chosen": -156.24244689941406, + "logps/rejected": -226.55938720703125, + "loss": 0.3985, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16894464194774628, + "rewards/margins": 1.3082945346832275, + "rewards/rejected": -1.1393499374389648, + "step": 2847 + }, + { + "epoch": 0.33, + "learning_rate": 2.0458855203090248e-07, + "logits/chosen": -2.71974515914917, + "logits/rejected": -2.7606868743896484, + "logps/chosen": -326.88818359375, + "logps/rejected": -301.45758056640625, + "loss": 0.2039, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1638636291027069, + "rewards/margins": 2.366168975830078, + "rewards/rejected": -2.202305316925049, + "step": 2848 + }, + { + "epoch": 0.33, + "learning_rate": 2.0455343556127824e-07, + "logits/chosen": -4.187641143798828, + "logits/rejected": -3.686187267303467, + "logps/chosen": -306.5660400390625, + "logps/rejected": -220.70330810546875, + "loss": 0.5584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7266150712966919, + "rewards/margins": 0.987389862537384, + "rewards/rejected": -1.7140049934387207, + "step": 2849 + }, + { + "epoch": 0.33, + "learning_rate": 2.0451831909165397e-07, + "logits/chosen": -3.0918784141540527, + "logits/rejected": -3.3395795822143555, + "logps/chosen": -127.67268371582031, + "logps/rejected": -205.4291534423828, + "loss": 0.53, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3469720780849457, + "rewards/margins": 0.727628231048584, + "rewards/rejected": -1.074600338935852, + "step": 2850 + }, + { + "epoch": 0.33, + "learning_rate": 2.0448320262202972e-07, + "logits/chosen": -2.266724109649658, + "logits/rejected": -2.6965675354003906, + "logps/chosen": -367.3207092285156, + "logps/rejected": -236.07861328125, + "loss": 0.2026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44943588972091675, + "rewards/margins": 2.4586939811706543, + "rewards/rejected": -2.0092580318450928, + "step": 2851 + }, + { + "epoch": 0.33, + "learning_rate": 2.0444808615240545e-07, + "logits/chosen": -3.265845775604248, + "logits/rejected": -3.5107994079589844, + "logps/chosen": -376.89837646484375, + "logps/rejected": -406.1129150390625, + "loss": 0.4289, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08156808465719223, + "rewards/margins": 2.337326765060425, + "rewards/rejected": -2.255758762359619, + "step": 2852 + }, + { + "epoch": 0.33, + "learning_rate": 2.044129696827812e-07, + "logits/chosen": -3.413444995880127, + "logits/rejected": -3.628187894821167, + "logps/chosen": -233.23023986816406, + "logps/rejected": -418.2298278808594, + "loss": 0.5283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13183501362800598, + "rewards/margins": 0.9775283336639404, + "rewards/rejected": -1.109363317489624, + "step": 2853 + }, + { + "epoch": 0.33, + "learning_rate": 2.0437785321315698e-07, + "logits/chosen": -2.5081512928009033, + "logits/rejected": -2.5851986408233643, + "logps/chosen": -155.041748046875, + "logps/rejected": -205.93833923339844, + "loss": 0.404, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49991554021835327, + "rewards/margins": 1.5381900072097778, + "rewards/rejected": -1.0382745265960693, + "step": 2854 + }, + { + "epoch": 0.33, + "learning_rate": 2.043427367435327e-07, + "logits/chosen": -2.7078542709350586, + "logits/rejected": -2.7510008811950684, + "logps/chosen": -145.48406982421875, + "logps/rejected": -130.09799194335938, + "loss": 0.6858, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5758556127548218, + "rewards/margins": 0.5272294282913208, + "rewards/rejected": -1.1030850410461426, + "step": 2855 + }, + { + "epoch": 0.33, + "learning_rate": 2.0430762027390846e-07, + "logits/chosen": -3.8277082443237305, + "logits/rejected": -3.7483091354370117, + "logps/chosen": -107.4218521118164, + "logps/rejected": -194.98402404785156, + "loss": 0.3055, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3449106216430664, + "rewards/margins": 1.5563803911209106, + "rewards/rejected": -1.2114696502685547, + "step": 2856 + }, + { + "epoch": 0.33, + "learning_rate": 2.0427250380428422e-07, + "logits/chosen": -3.4146618843078613, + "logits/rejected": -3.019530773162842, + "logps/chosen": -248.57144165039062, + "logps/rejected": -315.67120361328125, + "loss": 0.3784, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08395737409591675, + "rewards/margins": 1.0278345346450806, + "rewards/rejected": -1.1117918491363525, + "step": 2857 + }, + { + "epoch": 0.33, + "learning_rate": 2.0423738733465994e-07, + "logits/chosen": -3.355469226837158, + "logits/rejected": -3.481398344039917, + "logps/chosen": -287.9048156738281, + "logps/rejected": -209.59620666503906, + "loss": 0.631, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22663965821266174, + "rewards/margins": 0.5752274394035339, + "rewards/rejected": -0.8018671274185181, + "step": 2858 + }, + { + "epoch": 0.33, + "learning_rate": 2.042022708650357e-07, + "logits/chosen": -3.780696153640747, + "logits/rejected": -3.8148794174194336, + "logps/chosen": -218.00885009765625, + "logps/rejected": -243.75308227539062, + "loss": 0.4824, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4431857764720917, + "rewards/margins": 2.1373233795166016, + "rewards/rejected": -1.694137692451477, + "step": 2859 + }, + { + "epoch": 0.33, + "learning_rate": 2.0416715439541142e-07, + "logits/chosen": -3.4989047050476074, + "logits/rejected": -3.5055723190307617, + "logps/chosen": -394.12890625, + "logps/rejected": -231.61492919921875, + "loss": 0.4735, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2130986750125885, + "rewards/margins": 2.1727070808410645, + "rewards/rejected": -1.9596086740493774, + "step": 2860 + }, + { + "epoch": 0.33, + "learning_rate": 2.0413203792578718e-07, + "logits/chosen": -2.994781017303467, + "logits/rejected": -2.5221192836761475, + "logps/chosen": -446.52862548828125, + "logps/rejected": -372.12017822265625, + "loss": 0.5404, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.052041590213775635, + "rewards/margins": 1.267101764678955, + "rewards/rejected": -1.3191434144973755, + "step": 2861 + }, + { + "epoch": 0.33, + "learning_rate": 2.0409692145616293e-07, + "logits/chosen": -3.3700790405273438, + "logits/rejected": -3.4661407470703125, + "logps/chosen": -143.74241638183594, + "logps/rejected": -221.1109161376953, + "loss": 0.2348, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28408685326576233, + "rewards/margins": 1.802150011062622, + "rewards/rejected": -1.5180631875991821, + "step": 2862 + }, + { + "epoch": 0.33, + "learning_rate": 2.0406180498653866e-07, + "logits/chosen": -3.284825325012207, + "logits/rejected": -3.4520740509033203, + "logps/chosen": -327.230224609375, + "logps/rejected": -281.76812744140625, + "loss": 0.2492, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24369294941425323, + "rewards/margins": 3.1893839836120605, + "rewards/rejected": -2.9456911087036133, + "step": 2863 + }, + { + "epoch": 0.33, + "learning_rate": 2.0402668851691441e-07, + "logits/chosen": -2.760578155517578, + "logits/rejected": -2.690955877304077, + "logps/chosen": -371.62115478515625, + "logps/rejected": -229.04368591308594, + "loss": 0.3358, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5389971137046814, + "rewards/margins": 1.2430037260055542, + "rewards/rejected": -0.7040066719055176, + "step": 2864 + }, + { + "epoch": 0.33, + "learning_rate": 2.039915720472902e-07, + "logits/chosen": -3.003124237060547, + "logits/rejected": -2.8219361305236816, + "logps/chosen": -185.51058959960938, + "logps/rejected": -178.19921875, + "loss": 0.5855, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23136897385120392, + "rewards/margins": 0.6254237294197083, + "rewards/rejected": -0.856792688369751, + "step": 2865 + }, + { + "epoch": 0.33, + "learning_rate": 2.0395645557766592e-07, + "logits/chosen": -3.105785369873047, + "logits/rejected": -3.4216971397399902, + "logps/chosen": -161.64126586914062, + "logps/rejected": -152.860595703125, + "loss": 0.6072, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.043644893914461136, + "rewards/margins": 1.243830919265747, + "rewards/rejected": -1.287475824356079, + "step": 2866 + }, + { + "epoch": 0.33, + "learning_rate": 2.0392133910804168e-07, + "logits/chosen": -3.3430590629577637, + "logits/rejected": -3.047654390335083, + "logps/chosen": -443.2104187011719, + "logps/rejected": -242.50274658203125, + "loss": 0.3402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0757419615983963, + "rewards/margins": 1.3590400218963623, + "rewards/rejected": -1.4347820281982422, + "step": 2867 + }, + { + "epoch": 0.33, + "learning_rate": 2.038862226384174e-07, + "logits/chosen": -3.5673904418945312, + "logits/rejected": -3.885878086090088, + "logps/chosen": -104.0085678100586, + "logps/rejected": -182.9176025390625, + "loss": 0.4167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0320337638258934, + "rewards/margins": 1.4112091064453125, + "rewards/rejected": -1.4432427883148193, + "step": 2868 + }, + { + "epoch": 0.33, + "learning_rate": 2.0385110616879316e-07, + "logits/chosen": -2.8177428245544434, + "logits/rejected": -2.9365592002868652, + "logps/chosen": -236.85020446777344, + "logps/rejected": -162.49288940429688, + "loss": 0.2878, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5083247423171997, + "rewards/margins": 1.562101125717163, + "rewards/rejected": -1.0537763833999634, + "step": 2869 + }, + { + "epoch": 0.33, + "learning_rate": 2.038159896991689e-07, + "logits/chosen": -3.3119957447052, + "logits/rejected": -3.5376977920532227, + "logps/chosen": -80.22996520996094, + "logps/rejected": -229.53500366210938, + "loss": 0.2071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06617847084999084, + "rewards/margins": 2.495213508605957, + "rewards/rejected": -2.429035186767578, + "step": 2870 + }, + { + "epoch": 0.33, + "learning_rate": 2.0378087322954464e-07, + "logits/chosen": -2.822551727294922, + "logits/rejected": -3.1431126594543457, + "logps/chosen": -201.25804138183594, + "logps/rejected": -216.75155639648438, + "loss": 0.4443, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24011985957622528, + "rewards/margins": 1.0317258834838867, + "rewards/rejected": -0.7916060090065002, + "step": 2871 + }, + { + "epoch": 0.33, + "learning_rate": 2.037457567599204e-07, + "logits/chosen": -3.9176366329193115, + "logits/rejected": -3.9342238903045654, + "logps/chosen": -198.0251922607422, + "logps/rejected": -265.783447265625, + "loss": 0.1576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24891816079616547, + "rewards/margins": 2.949993133544922, + "rewards/rejected": -3.198911190032959, + "step": 2872 + }, + { + "epoch": 0.33, + "learning_rate": 2.0371064029029615e-07, + "logits/chosen": -3.441892623901367, + "logits/rejected": -3.429807662963867, + "logps/chosen": -155.37017822265625, + "logps/rejected": -226.2512969970703, + "loss": 0.3728, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43890780210494995, + "rewards/margins": 1.76227605342865, + "rewards/rejected": -1.3233681917190552, + "step": 2873 + }, + { + "epoch": 0.33, + "learning_rate": 2.0367552382067187e-07, + "logits/chosen": -2.4810843467712402, + "logits/rejected": -2.4452452659606934, + "logps/chosen": -305.5585632324219, + "logps/rejected": -262.0847473144531, + "loss": 0.3351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4443497657775879, + "rewards/margins": 2.7303013801574707, + "rewards/rejected": -3.1746511459350586, + "step": 2874 + }, + { + "epoch": 0.33, + "learning_rate": 2.0364040735104763e-07, + "logits/chosen": -2.2279763221740723, + "logits/rejected": -2.673219919204712, + "logps/chosen": -185.39920043945312, + "logps/rejected": -227.1730499267578, + "loss": 0.4857, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03498871624469757, + "rewards/margins": 1.4246978759765625, + "rewards/rejected": -1.3897091150283813, + "step": 2875 + }, + { + "epoch": 0.33, + "learning_rate": 2.0360529088142335e-07, + "logits/chosen": -3.4194071292877197, + "logits/rejected": -3.7679789066314697, + "logps/chosen": -136.21278381347656, + "logps/rejected": -273.4063720703125, + "loss": 0.6594, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5958983302116394, + "rewards/margins": 1.835218071937561, + "rewards/rejected": -2.4311163425445557, + "step": 2876 + }, + { + "epoch": 0.33, + "learning_rate": 2.0357017441179913e-07, + "logits/chosen": -3.377349376678467, + "logits/rejected": -3.5270557403564453, + "logps/chosen": -147.3437957763672, + "logps/rejected": -179.2998504638672, + "loss": 0.5146, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2750656008720398, + "rewards/margins": 0.9859857559204102, + "rewards/rejected": -1.2610514163970947, + "step": 2877 + }, + { + "epoch": 0.33, + "learning_rate": 2.035350579421749e-07, + "logits/chosen": -3.3747410774230957, + "logits/rejected": -3.6004037857055664, + "logps/chosen": -132.59349060058594, + "logps/rejected": -145.92938232421875, + "loss": 0.4905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16789588332176208, + "rewards/margins": 0.9863411784172058, + "rewards/rejected": -1.1542370319366455, + "step": 2878 + }, + { + "epoch": 0.33, + "learning_rate": 2.0349994147255062e-07, + "logits/chosen": -3.310739040374756, + "logits/rejected": -3.25069260597229, + "logps/chosen": -309.7239990234375, + "logps/rejected": -194.3499298095703, + "loss": 0.3466, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23342959582805634, + "rewards/margins": 1.051843285560608, + "rewards/rejected": -0.8184137344360352, + "step": 2879 + }, + { + "epoch": 0.33, + "learning_rate": 2.0346482500292637e-07, + "logits/chosen": -2.6922240257263184, + "logits/rejected": -2.826249361038208, + "logps/chosen": -399.671142578125, + "logps/rejected": -294.33953857421875, + "loss": 0.3997, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1272048056125641, + "rewards/margins": 1.224534273147583, + "rewards/rejected": -1.0973294973373413, + "step": 2880 + }, + { + "epoch": 0.33, + "learning_rate": 2.0342970853330212e-07, + "logits/chosen": -2.624650478363037, + "logits/rejected": -2.749953031539917, + "logps/chosen": -311.1007995605469, + "logps/rejected": -484.07110595703125, + "loss": 0.6448, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6814218163490295, + "rewards/margins": 0.4598720669746399, + "rewards/rejected": -1.1412938833236694, + "step": 2881 + }, + { + "epoch": 0.33, + "learning_rate": 2.0339459206367785e-07, + "logits/chosen": -2.7654786109924316, + "logits/rejected": -2.634474992752075, + "logps/chosen": -358.39947509765625, + "logps/rejected": -334.8489685058594, + "loss": 0.6411, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005508989095687866, + "rewards/margins": 0.6813869476318359, + "rewards/rejected": -0.6758779883384705, + "step": 2882 + }, + { + "epoch": 0.33, + "learning_rate": 2.033594755940536e-07, + "logits/chosen": -3.3589162826538086, + "logits/rejected": -3.4439425468444824, + "logps/chosen": -259.4148254394531, + "logps/rejected": -226.098388671875, + "loss": 0.4831, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10776549577713013, + "rewards/margins": 1.1535016298294067, + "rewards/rejected": -1.2612671852111816, + "step": 2883 + }, + { + "epoch": 0.33, + "learning_rate": 2.0332435912442933e-07, + "logits/chosen": -3.2560248374938965, + "logits/rejected": -3.540004253387451, + "logps/chosen": -106.66690063476562, + "logps/rejected": -151.1693878173828, + "loss": 0.3813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25389766693115234, + "rewards/margins": 1.2234773635864258, + "rewards/rejected": -1.4773750305175781, + "step": 2884 + }, + { + "epoch": 0.33, + "learning_rate": 2.0328924265480509e-07, + "logits/chosen": -2.682955265045166, + "logits/rejected": -2.7434847354888916, + "logps/chosen": -237.13888549804688, + "logps/rejected": -188.9593963623047, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18113292753696442, + "rewards/margins": 1.7273635864257812, + "rewards/rejected": -1.9084967374801636, + "step": 2885 + }, + { + "epoch": 0.33, + "learning_rate": 2.0325412618518084e-07, + "logits/chosen": -3.130871057510376, + "logits/rejected": -3.096796751022339, + "logps/chosen": -448.5235900878906, + "logps/rejected": -366.7471923828125, + "loss": 0.5405, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2540725767612457, + "rewards/margins": 1.0548664331436157, + "rewards/rejected": -1.308938980102539, + "step": 2886 + }, + { + "epoch": 0.33, + "learning_rate": 2.0321900971555657e-07, + "logits/chosen": -2.966341972351074, + "logits/rejected": -2.6792943477630615, + "logps/chosen": -404.1424560546875, + "logps/rejected": -265.12969970703125, + "loss": 0.628, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33887067437171936, + "rewards/margins": 0.7187198400497437, + "rewards/rejected": -1.0575904846191406, + "step": 2887 + }, + { + "epoch": 0.33, + "learning_rate": 2.0318389324593235e-07, + "logits/chosen": -3.5187668800354004, + "logits/rejected": -3.288881301879883, + "logps/chosen": -180.57705688476562, + "logps/rejected": -159.48138427734375, + "loss": 0.5362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4802477955818176, + "rewards/margins": 0.7036935091018677, + "rewards/rejected": -1.1839412450790405, + "step": 2888 + }, + { + "epoch": 0.33, + "learning_rate": 2.031487767763081e-07, + "logits/chosen": -2.6170167922973633, + "logits/rejected": -3.084103584289551, + "logps/chosen": -216.09457397460938, + "logps/rejected": -291.9983825683594, + "loss": 0.2523, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1299726963043213, + "rewards/margins": 2.5426528453826904, + "rewards/rejected": -2.41267991065979, + "step": 2889 + }, + { + "epoch": 0.33, + "learning_rate": 2.0311366030668383e-07, + "logits/chosen": -3.602548599243164, + "logits/rejected": -3.6066975593566895, + "logps/chosen": -285.4060363769531, + "logps/rejected": -203.991455078125, + "loss": 0.6956, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5418540239334106, + "rewards/margins": 1.4668782949447632, + "rewards/rejected": -2.0087320804595947, + "step": 2890 + }, + { + "epoch": 0.33, + "learning_rate": 2.0307854383705958e-07, + "logits/chosen": -3.328801393508911, + "logits/rejected": -3.5992088317871094, + "logps/chosen": -222.65676879882812, + "logps/rejected": -256.3575439453125, + "loss": 0.2553, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2712659239768982, + "rewards/margins": 2.6190712451934814, + "rewards/rejected": -2.3478055000305176, + "step": 2891 + }, + { + "epoch": 0.33, + "learning_rate": 2.030434273674353e-07, + "logits/chosen": -3.5535213947296143, + "logits/rejected": -3.6672935485839844, + "logps/chosen": -263.6219177246094, + "logps/rejected": -281.49920654296875, + "loss": 0.2473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16253812611103058, + "rewards/margins": 1.9850643873214722, + "rewards/rejected": -1.822526216506958, + "step": 2892 + }, + { + "epoch": 0.33, + "learning_rate": 2.0300831089781106e-07, + "logits/chosen": -2.7230935096740723, + "logits/rejected": -2.833466053009033, + "logps/chosen": -237.63197326660156, + "logps/rejected": -339.1151123046875, + "loss": 0.226, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0367647185921669, + "rewards/margins": 2.592855930328369, + "rewards/rejected": -2.6296205520629883, + "step": 2893 + }, + { + "epoch": 0.33, + "learning_rate": 2.0297319442818682e-07, + "logits/chosen": -3.154160976409912, + "logits/rejected": -3.2895665168762207, + "logps/chosen": -169.5806884765625, + "logps/rejected": -214.6156463623047, + "loss": 0.5531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6379812359809875, + "rewards/margins": 1.3894729614257812, + "rewards/rejected": -2.027454137802124, + "step": 2894 + }, + { + "epoch": 0.33, + "learning_rate": 2.0293807795856254e-07, + "logits/chosen": -2.7812728881835938, + "logits/rejected": -2.794137477874756, + "logps/chosen": -172.37005615234375, + "logps/rejected": -240.3179168701172, + "loss": 0.3697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11813174188137054, + "rewards/margins": 1.116188883781433, + "rewards/rejected": -1.2343206405639648, + "step": 2895 + }, + { + "epoch": 0.33, + "learning_rate": 2.029029614889383e-07, + "logits/chosen": -2.7991766929626465, + "logits/rejected": -2.82218599319458, + "logps/chosen": -268.7731628417969, + "logps/rejected": -259.3908996582031, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.027704190462827682, + "rewards/margins": 1.890121340751648, + "rewards/rejected": -1.8624169826507568, + "step": 2896 + }, + { + "epoch": 0.33, + "learning_rate": 2.0286784501931405e-07, + "logits/chosen": -3.3356785774230957, + "logits/rejected": -3.206099033355713, + "logps/chosen": -182.0909881591797, + "logps/rejected": -149.77499389648438, + "loss": 0.2662, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7972962856292725, + "rewards/margins": 1.7562203407287598, + "rewards/rejected": -0.9589241147041321, + "step": 2897 + }, + { + "epoch": 0.33, + "learning_rate": 2.0283272854968978e-07, + "logits/chosen": -3.5309667587280273, + "logits/rejected": -3.4538121223449707, + "logps/chosen": -189.53086853027344, + "logps/rejected": -159.86663818359375, + "loss": 0.1641, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5535858869552612, + "rewards/margins": 2.20595121383667, + "rewards/rejected": -1.6523652076721191, + "step": 2898 + }, + { + "epoch": 0.33, + "learning_rate": 2.0279761208006556e-07, + "logits/chosen": -3.309152841567993, + "logits/rejected": -3.5300586223602295, + "logps/chosen": -120.97969055175781, + "logps/rejected": -274.0085754394531, + "loss": 0.1509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07035361975431442, + "rewards/margins": 2.709697961807251, + "rewards/rejected": -2.7800517082214355, + "step": 2899 + }, + { + "epoch": 0.33, + "learning_rate": 2.027624956104413e-07, + "logits/chosen": -3.1258974075317383, + "logits/rejected": -2.951890230178833, + "logps/chosen": -290.8453674316406, + "logps/rejected": -259.3034362792969, + "loss": 0.5096, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16470569372177124, + "rewards/margins": 0.9113531112670898, + "rewards/rejected": -1.0760588645935059, + "step": 2900 + }, + { + "epoch": 0.33, + "learning_rate": 2.0272737914081704e-07, + "logits/chosen": -3.3193087577819824, + "logits/rejected": -3.1440491676330566, + "logps/chosen": -223.26614379882812, + "logps/rejected": -278.0645751953125, + "loss": 0.2752, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00713968463242054, + "rewards/margins": 1.2909389734268188, + "rewards/rejected": -1.2837992906570435, + "step": 2901 + }, + { + "epoch": 0.33, + "learning_rate": 2.026922626711928e-07, + "logits/chosen": -3.4273622035980225, + "logits/rejected": -3.6122689247131348, + "logps/chosen": -272.2105407714844, + "logps/rejected": -250.69073486328125, + "loss": 0.5938, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3840213418006897, + "rewards/margins": 1.9926505088806152, + "rewards/rejected": -2.3766720294952393, + "step": 2902 + }, + { + "epoch": 0.33, + "learning_rate": 2.0265714620156852e-07, + "logits/chosen": -3.8571419715881348, + "logits/rejected": -3.744837760925293, + "logps/chosen": -90.60382080078125, + "logps/rejected": -115.12057495117188, + "loss": 0.4365, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07765614986419678, + "rewards/margins": 0.9016141295433044, + "rewards/rejected": -0.8239579200744629, + "step": 2903 + }, + { + "epoch": 0.33, + "learning_rate": 2.0262202973194428e-07, + "logits/chosen": -3.0325510501861572, + "logits/rejected": -3.0662951469421387, + "logps/chosen": -174.23849487304688, + "logps/rejected": -239.7551727294922, + "loss": 0.5201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08118686825037003, + "rewards/margins": 1.1486828327178955, + "rewards/rejected": -1.2298697233200073, + "step": 2904 + }, + { + "epoch": 0.33, + "learning_rate": 2.0258691326232e-07, + "logits/chosen": -3.1456851959228516, + "logits/rejected": -2.981926918029785, + "logps/chosen": -417.8323974609375, + "logps/rejected": -232.88133239746094, + "loss": 0.2142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12147698551416397, + "rewards/margins": 2.208392858505249, + "rewards/rejected": -2.3298697471618652, + "step": 2905 + }, + { + "epoch": 0.34, + "learning_rate": 2.0255179679269576e-07, + "logits/chosen": -3.0123066902160645, + "logits/rejected": -3.046659469604492, + "logps/chosen": -111.80816650390625, + "logps/rejected": -237.41744995117188, + "loss": 0.4294, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2278384268283844, + "rewards/margins": 1.8048381805419922, + "rewards/rejected": -1.5769997835159302, + "step": 2906 + }, + { + "epoch": 0.34, + "learning_rate": 2.025166803230715e-07, + "logits/chosen": -2.841749429702759, + "logits/rejected": -2.9171857833862305, + "logps/chosen": -163.79049682617188, + "logps/rejected": -271.8253173828125, + "loss": 0.4686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5389413237571716, + "rewards/margins": 1.8660411834716797, + "rewards/rejected": -2.404982566833496, + "step": 2907 + }, + { + "epoch": 0.34, + "learning_rate": 2.0248156385344724e-07, + "logits/chosen": -3.1888015270233154, + "logits/rejected": -3.3762166500091553, + "logps/chosen": -308.487548828125, + "logps/rejected": -282.2408142089844, + "loss": 0.2995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2578979730606079, + "rewards/margins": 1.6379125118255615, + "rewards/rejected": -1.8958104848861694, + "step": 2908 + }, + { + "epoch": 0.34, + "learning_rate": 2.02446447383823e-07, + "logits/chosen": -3.409684181213379, + "logits/rejected": -3.088074207305908, + "logps/chosen": -231.34664916992188, + "logps/rejected": -263.800048828125, + "loss": 0.2925, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09666433930397034, + "rewards/margins": 1.5264886617660522, + "rewards/rejected": -1.4298243522644043, + "step": 2909 + }, + { + "epoch": 0.34, + "learning_rate": 2.0241133091419877e-07, + "logits/chosen": -3.5555026531219482, + "logits/rejected": -3.3858587741851807, + "logps/chosen": -220.3841552734375, + "logps/rejected": -182.65028381347656, + "loss": 0.2386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10781025141477585, + "rewards/margins": 1.9294021129608154, + "rewards/rejected": -2.037212371826172, + "step": 2910 + }, + { + "epoch": 0.34, + "learning_rate": 2.023762144445745e-07, + "logits/chosen": -3.437788248062134, + "logits/rejected": -3.441528558731079, + "logps/chosen": -246.73094177246094, + "logps/rejected": -155.72535705566406, + "loss": 0.7647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05477520823478699, + "rewards/margins": 0.8151040077209473, + "rewards/rejected": -0.8698792457580566, + "step": 2911 + }, + { + "epoch": 0.34, + "learning_rate": 2.0234109797495025e-07, + "logits/chosen": -2.5301971435546875, + "logits/rejected": -2.4919474124908447, + "logps/chosen": -139.75106811523438, + "logps/rejected": -274.443115234375, + "loss": 0.5712, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05151505768299103, + "rewards/margins": 0.9389746189117432, + "rewards/rejected": -0.8874596357345581, + "step": 2912 + }, + { + "epoch": 0.34, + "learning_rate": 2.0230598150532598e-07, + "logits/chosen": -2.386244297027588, + "logits/rejected": -2.549412727355957, + "logps/chosen": -285.9289245605469, + "logps/rejected": -249.79061889648438, + "loss": 0.6758, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.001665368676185608, + "rewards/margins": 0.6432605385780334, + "rewards/rejected": -0.6449258923530579, + "step": 2913 + }, + { + "epoch": 0.34, + "learning_rate": 2.0227086503570174e-07, + "logits/chosen": -3.2651190757751465, + "logits/rejected": -3.1326780319213867, + "logps/chosen": -137.72396850585938, + "logps/rejected": -200.672119140625, + "loss": 0.3184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36109378933906555, + "rewards/margins": 1.7345033884048462, + "rewards/rejected": -2.0955970287323, + "step": 2914 + }, + { + "epoch": 0.34, + "learning_rate": 2.022357485660775e-07, + "logits/chosen": -3.6490721702575684, + "logits/rejected": -3.4258580207824707, + "logps/chosen": -328.4866943359375, + "logps/rejected": -218.25198364257812, + "loss": 0.3498, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14412955939769745, + "rewards/margins": 2.362576723098755, + "rewards/rejected": -2.218447208404541, + "step": 2915 + }, + { + "epoch": 0.34, + "learning_rate": 2.0220063209645322e-07, + "logits/chosen": -3.072421073913574, + "logits/rejected": -3.2418439388275146, + "logps/chosen": -306.8154602050781, + "logps/rejected": -353.8634948730469, + "loss": 0.1439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8371804356575012, + "rewards/margins": 3.1967976093292236, + "rewards/rejected": -2.359617233276367, + "step": 2916 + }, + { + "epoch": 0.34, + "learning_rate": 2.0216551562682897e-07, + "logits/chosen": -3.10593318939209, + "logits/rejected": -2.9724957942962646, + "logps/chosen": -271.1985778808594, + "logps/rejected": -103.64981079101562, + "loss": 0.5302, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18109387159347534, + "rewards/margins": 0.8321651816368103, + "rewards/rejected": -0.651071310043335, + "step": 2917 + }, + { + "epoch": 0.34, + "learning_rate": 2.0213039915720472e-07, + "logits/chosen": -3.32285737991333, + "logits/rejected": -3.1761317253112793, + "logps/chosen": -497.54815673828125, + "logps/rejected": -296.56829833984375, + "loss": 0.6467, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.563912570476532, + "rewards/margins": 0.5004069805145264, + "rewards/rejected": -1.0643194913864136, + "step": 2918 + }, + { + "epoch": 0.34, + "learning_rate": 2.0209528268758045e-07, + "logits/chosen": -3.2541615962982178, + "logits/rejected": -3.3115899562835693, + "logps/chosen": -221.63436889648438, + "logps/rejected": -352.35357666015625, + "loss": 0.4507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08433705568313599, + "rewards/margins": 0.8270418047904968, + "rewards/rejected": -0.9113788604736328, + "step": 2919 + }, + { + "epoch": 0.34, + "learning_rate": 2.020601662179562e-07, + "logits/chosen": -3.9774343967437744, + "logits/rejected": -3.86190128326416, + "logps/chosen": -194.93002319335938, + "logps/rejected": -212.81338500976562, + "loss": 0.6828, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14803776144981384, + "rewards/margins": 1.3772728443145752, + "rewards/rejected": -1.2292351722717285, + "step": 2920 + }, + { + "epoch": 0.34, + "learning_rate": 2.0202504974833193e-07, + "logits/chosen": -3.4205026626586914, + "logits/rejected": -3.3348300457000732, + "logps/chosen": -295.4635925292969, + "logps/rejected": -240.77664184570312, + "loss": 0.356, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27936410903930664, + "rewards/margins": 1.2246732711791992, + "rewards/rejected": -1.5040373802185059, + "step": 2921 + }, + { + "epoch": 0.34, + "learning_rate": 2.0198993327870771e-07, + "logits/chosen": -3.842554807662964, + "logits/rejected": -3.4636380672454834, + "logps/chosen": -298.9164123535156, + "logps/rejected": -276.25439453125, + "loss": 0.4523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43982210755348206, + "rewards/margins": 1.3990267515182495, + "rewards/rejected": -1.8388487100601196, + "step": 2922 + }, + { + "epoch": 0.34, + "learning_rate": 2.0195481680908347e-07, + "logits/chosen": -3.4936327934265137, + "logits/rejected": -3.4551618099212646, + "logps/chosen": -144.1200714111328, + "logps/rejected": -198.9249267578125, + "loss": 0.5079, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4545038342475891, + "rewards/margins": 1.5161478519439697, + "rewards/rejected": -1.9706518650054932, + "step": 2923 + }, + { + "epoch": 0.34, + "learning_rate": 2.019197003394592e-07, + "logits/chosen": -3.250945568084717, + "logits/rejected": -3.1881096363067627, + "logps/chosen": -372.4384460449219, + "logps/rejected": -301.46795654296875, + "loss": 0.2915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15509256720542908, + "rewards/margins": 1.5211576223373413, + "rewards/rejected": -1.6762502193450928, + "step": 2924 + }, + { + "epoch": 0.34, + "learning_rate": 2.0188458386983495e-07, + "logits/chosen": -2.9653918743133545, + "logits/rejected": -3.219026565551758, + "logps/chosen": -314.8522644042969, + "logps/rejected": -172.25094604492188, + "loss": 0.7247, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28413155674934387, + "rewards/margins": 0.38722193241119385, + "rewards/rejected": -0.6713534593582153, + "step": 2925 + }, + { + "epoch": 0.34, + "learning_rate": 2.018494674002107e-07, + "logits/chosen": -3.0218026638031006, + "logits/rejected": -3.0762972831726074, + "logps/chosen": -278.5806579589844, + "logps/rejected": -171.37179565429688, + "loss": 0.6272, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6304113864898682, + "rewards/margins": 0.28319215774536133, + "rewards/rejected": -0.9136036038398743, + "step": 2926 + }, + { + "epoch": 0.34, + "learning_rate": 2.0181435093058643e-07, + "logits/chosen": -3.6375489234924316, + "logits/rejected": -3.9481396675109863, + "logps/chosen": -114.65579223632812, + "logps/rejected": -227.4328155517578, + "loss": 0.3098, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36168745160102844, + "rewards/margins": 2.165419340133667, + "rewards/rejected": -2.527106761932373, + "step": 2927 + }, + { + "epoch": 0.34, + "learning_rate": 2.0177923446096218e-07, + "logits/chosen": -2.845045804977417, + "logits/rejected": -3.15944504737854, + "logps/chosen": -194.98345947265625, + "logps/rejected": -196.55674743652344, + "loss": 0.3986, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16046234965324402, + "rewards/margins": 2.023402690887451, + "rewards/rejected": -1.8629405498504639, + "step": 2928 + }, + { + "epoch": 0.34, + "learning_rate": 2.017441179913379e-07, + "logits/chosen": -2.2045390605926514, + "logits/rejected": -2.417449951171875, + "logps/chosen": -433.2235412597656, + "logps/rejected": -398.8790283203125, + "loss": 0.6651, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19060079753398895, + "rewards/margins": 0.44331440329551697, + "rewards/rejected": -0.2527135908603668, + "step": 2929 + }, + { + "epoch": 0.34, + "learning_rate": 2.0170900152171367e-07, + "logits/chosen": -2.8293135166168213, + "logits/rejected": -2.6707863807678223, + "logps/chosen": -434.1829833984375, + "logps/rejected": -325.0392150878906, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11470673978328705, + "rewards/margins": 2.8220791816711426, + "rewards/rejected": -2.9367856979370117, + "step": 2930 + }, + { + "epoch": 0.34, + "learning_rate": 2.0167388505208942e-07, + "logits/chosen": -3.193936347961426, + "logits/rejected": -3.1344683170318604, + "logps/chosen": -361.7147521972656, + "logps/rejected": -324.0749816894531, + "loss": 0.2182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.020264655351638794, + "rewards/margins": 1.947543740272522, + "rewards/rejected": -1.927278995513916, + "step": 2931 + }, + { + "epoch": 0.34, + "learning_rate": 2.0163876858246515e-07, + "logits/chosen": -3.257889747619629, + "logits/rejected": -3.6183388233184814, + "logps/chosen": -107.74793243408203, + "logps/rejected": -217.03079223632812, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46542614698410034, + "rewards/margins": 1.4531677961349487, + "rewards/rejected": -1.9185938835144043, + "step": 2932 + }, + { + "epoch": 0.34, + "learning_rate": 2.0160365211284093e-07, + "logits/chosen": -3.3156652450561523, + "logits/rejected": -3.4656808376312256, + "logps/chosen": -233.94534301757812, + "logps/rejected": -146.61886596679688, + "loss": 0.5259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38281339406967163, + "rewards/margins": 0.6853949427604675, + "rewards/rejected": -1.0682082176208496, + "step": 2933 + }, + { + "epoch": 0.34, + "learning_rate": 2.0156853564321668e-07, + "logits/chosen": -3.140315532684326, + "logits/rejected": -3.1714439392089844, + "logps/chosen": -209.2906036376953, + "logps/rejected": -198.14663696289062, + "loss": 0.2815, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6778032183647156, + "rewards/margins": 1.5302822589874268, + "rewards/rejected": -0.8524790406227112, + "step": 2934 + }, + { + "epoch": 0.34, + "learning_rate": 2.015334191735924e-07, + "logits/chosen": -2.9553558826446533, + "logits/rejected": -2.9042861461639404, + "logps/chosen": -132.9278564453125, + "logps/rejected": -103.35874938964844, + "loss": 0.7497, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03138389438390732, + "rewards/margins": 0.488581120967865, + "rewards/rejected": -0.5199650526046753, + "step": 2935 + }, + { + "epoch": 0.34, + "learning_rate": 2.0149830270396816e-07, + "logits/chosen": -3.115084648132324, + "logits/rejected": -2.6558589935302734, + "logps/chosen": -267.0437316894531, + "logps/rejected": -221.674560546875, + "loss": 0.4587, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1873985081911087, + "rewards/margins": 0.8836804032325745, + "rewards/rejected": -0.696281909942627, + "step": 2936 + }, + { + "epoch": 0.34, + "learning_rate": 2.014631862343439e-07, + "logits/chosen": -3.6395370960235596, + "logits/rejected": -3.6731014251708984, + "logps/chosen": -194.2936553955078, + "logps/rejected": -232.82534790039062, + "loss": 1.0017, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04591737687587738, + "rewards/margins": 0.19130980968475342, + "rewards/rejected": -0.237227201461792, + "step": 2937 + }, + { + "epoch": 0.34, + "learning_rate": 2.0142806976471964e-07, + "logits/chosen": -3.3354387283325195, + "logits/rejected": -3.511521339416504, + "logps/chosen": -222.4091339111328, + "logps/rejected": -269.0169982910156, + "loss": 0.3779, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.044893428683280945, + "rewards/margins": 1.6682689189910889, + "rewards/rejected": -1.7131624221801758, + "step": 2938 + }, + { + "epoch": 0.34, + "learning_rate": 2.013929532950954e-07, + "logits/chosen": -2.6988158226013184, + "logits/rejected": -2.481468677520752, + "logps/chosen": -165.83949279785156, + "logps/rejected": -208.2683868408203, + "loss": 0.3038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09846772253513336, + "rewards/margins": 1.9758474826812744, + "rewards/rejected": -2.074315071105957, + "step": 2939 + }, + { + "epoch": 0.34, + "learning_rate": 2.0135783682547112e-07, + "logits/chosen": -3.2513375282287598, + "logits/rejected": -3.3446426391601562, + "logps/chosen": -185.20703125, + "logps/rejected": -226.14825439453125, + "loss": 0.2151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33747759461402893, + "rewards/margins": 2.4196553230285645, + "rewards/rejected": -2.0821776390075684, + "step": 2940 + }, + { + "epoch": 0.34, + "learning_rate": 2.0132272035584688e-07, + "logits/chosen": -3.3286094665527344, + "logits/rejected": -3.4731173515319824, + "logps/chosen": -111.169921875, + "logps/rejected": -226.41802978515625, + "loss": 0.3047, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3943968415260315, + "rewards/margins": 3.1738436222076416, + "rewards/rejected": -3.5682406425476074, + "step": 2941 + }, + { + "epoch": 0.34, + "learning_rate": 2.0128760388622266e-07, + "logits/chosen": -3.021522283554077, + "logits/rejected": -3.2482199668884277, + "logps/chosen": -90.61788177490234, + "logps/rejected": -223.24044799804688, + "loss": 0.1685, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2776203751564026, + "rewards/margins": 3.1941661834716797, + "rewards/rejected": -2.916545867919922, + "step": 2942 + }, + { + "epoch": 0.34, + "learning_rate": 2.0125248741659836e-07, + "logits/chosen": -3.879267930984497, + "logits/rejected": -3.4690473079681396, + "logps/chosen": -345.91107177734375, + "logps/rejected": -193.40048217773438, + "loss": 0.3728, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0711236298084259, + "rewards/margins": 1.0579986572265625, + "rewards/rejected": -0.986875057220459, + "step": 2943 + }, + { + "epoch": 0.34, + "learning_rate": 2.0121737094697414e-07, + "logits/chosen": -3.5829286575317383, + "logits/rejected": -3.644287586212158, + "logps/chosen": -79.34429931640625, + "logps/rejected": -163.7261199951172, + "loss": 0.2724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3883410096168518, + "rewards/margins": 2.1744675636291504, + "rewards/rejected": -1.7861266136169434, + "step": 2944 + }, + { + "epoch": 0.34, + "learning_rate": 2.0118225447734987e-07, + "logits/chosen": -3.2257795333862305, + "logits/rejected": -3.5706822872161865, + "logps/chosen": -86.93380737304688, + "logps/rejected": -153.52951049804688, + "loss": 0.6394, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020923465490341187, + "rewards/margins": 0.8440307378768921, + "rewards/rejected": -0.8231073021888733, + "step": 2945 + }, + { + "epoch": 0.34, + "learning_rate": 2.0114713800772562e-07, + "logits/chosen": -3.0329060554504395, + "logits/rejected": -2.8970446586608887, + "logps/chosen": -260.33770751953125, + "logps/rejected": -193.12661743164062, + "loss": 0.4413, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0754384770989418, + "rewards/margins": 0.7200766801834106, + "rewards/rejected": -0.644638180732727, + "step": 2946 + }, + { + "epoch": 0.34, + "learning_rate": 2.0111202153810137e-07, + "logits/chosen": -2.883592367172241, + "logits/rejected": -2.868070602416992, + "logps/chosen": -205.02223205566406, + "logps/rejected": -204.54791259765625, + "loss": 0.5112, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2792278826236725, + "rewards/margins": 0.8101165294647217, + "rewards/rejected": -1.0893443822860718, + "step": 2947 + }, + { + "epoch": 0.34, + "learning_rate": 2.010769050684771e-07, + "logits/chosen": -2.8097195625305176, + "logits/rejected": -3.152843952178955, + "logps/chosen": -139.98876953125, + "logps/rejected": -241.2833251953125, + "loss": 0.3774, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.027085930109024048, + "rewards/margins": 2.2503609657287598, + "rewards/rejected": -2.277446746826172, + "step": 2948 + }, + { + "epoch": 0.34, + "learning_rate": 2.0104178859885286e-07, + "logits/chosen": -3.2890512943267822, + "logits/rejected": -3.800539970397949, + "logps/chosen": -122.72566986083984, + "logps/rejected": -220.52108764648438, + "loss": 0.3549, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.293498158454895, + "rewards/margins": 2.4187824726104736, + "rewards/rejected": -2.125284194946289, + "step": 2949 + }, + { + "epoch": 0.34, + "learning_rate": 2.0100667212922858e-07, + "logits/chosen": -2.557565689086914, + "logits/rejected": -2.4764654636383057, + "logps/chosen": -384.3209228515625, + "logps/rejected": -367.75927734375, + "loss": 0.1625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3155340850353241, + "rewards/margins": 2.6300878524780273, + "rewards/rejected": -2.945621967315674, + "step": 2950 + }, + { + "epoch": 0.34, + "learning_rate": 2.0097155565960434e-07, + "logits/chosen": -2.464866876602173, + "logits/rejected": -2.6863346099853516, + "logps/chosen": -482.4538879394531, + "logps/rejected": -279.73681640625, + "loss": 0.174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1314268261194229, + "rewards/margins": 2.312380313873291, + "rewards/rejected": -2.1809535026550293, + "step": 2951 + }, + { + "epoch": 0.34, + "learning_rate": 2.009364391899801e-07, + "logits/chosen": -2.910322666168213, + "logits/rejected": -3.2480764389038086, + "logps/chosen": -247.26763916015625, + "logps/rejected": -376.4541015625, + "loss": 0.2103, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5165860652923584, + "rewards/margins": 3.5483169555664062, + "rewards/rejected": -3.031731128692627, + "step": 2952 + }, + { + "epoch": 0.34, + "learning_rate": 2.0090132272035582e-07, + "logits/chosen": -3.5618886947631836, + "logits/rejected": -3.307736396789551, + "logps/chosen": -283.1102600097656, + "logps/rejected": -222.1205596923828, + "loss": 0.6479, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3213953971862793, + "rewards/margins": 1.54739511013031, + "rewards/rejected": -1.868790626525879, + "step": 2953 + }, + { + "epoch": 0.34, + "learning_rate": 2.0086620625073157e-07, + "logits/chosen": -3.276106834411621, + "logits/rejected": -3.2900655269622803, + "logps/chosen": -274.07421875, + "logps/rejected": -205.03900146484375, + "loss": 0.3969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10438403487205505, + "rewards/margins": 1.4987530708312988, + "rewards/rejected": -1.6031370162963867, + "step": 2954 + }, + { + "epoch": 0.34, + "learning_rate": 2.0083108978110735e-07, + "logits/chosen": -3.2050347328186035, + "logits/rejected": -3.2259111404418945, + "logps/chosen": -212.81309509277344, + "logps/rejected": -217.99159240722656, + "loss": 0.4604, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21151524782180786, + "rewards/margins": 0.7038230895996094, + "rewards/rejected": -0.915338397026062, + "step": 2955 + }, + { + "epoch": 0.34, + "learning_rate": 2.0079597331148308e-07, + "logits/chosen": -3.0979113578796387, + "logits/rejected": -3.2714247703552246, + "logps/chosen": -166.32839965820312, + "logps/rejected": -222.6524200439453, + "loss": 0.2415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1351582109928131, + "rewards/margins": 2.383890151977539, + "rewards/rejected": -2.5190484523773193, + "step": 2956 + }, + { + "epoch": 0.34, + "learning_rate": 2.0076085684185883e-07, + "logits/chosen": -3.4065027236938477, + "logits/rejected": -3.238586187362671, + "logps/chosen": -320.4075012207031, + "logps/rejected": -179.87060546875, + "loss": 0.8869, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9620587825775146, + "rewards/margins": 0.7493278384208679, + "rewards/rejected": -1.711386799812317, + "step": 2957 + }, + { + "epoch": 0.34, + "learning_rate": 2.0072574037223456e-07, + "logits/chosen": -3.345984935760498, + "logits/rejected": -3.1638994216918945, + "logps/chosen": -254.9220733642578, + "logps/rejected": -217.00027465820312, + "loss": 0.2314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01831601932644844, + "rewards/margins": 1.9321069717407227, + "rewards/rejected": -1.9504231214523315, + "step": 2958 + }, + { + "epoch": 0.34, + "learning_rate": 2.0069062390261032e-07, + "logits/chosen": -3.2762582302093506, + "logits/rejected": -3.6065890789031982, + "logps/chosen": -201.4008026123047, + "logps/rejected": -246.94248962402344, + "loss": 0.3326, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13006329536437988, + "rewards/margins": 2.4494426250457764, + "rewards/rejected": -2.3193793296813965, + "step": 2959 + }, + { + "epoch": 0.34, + "learning_rate": 2.0065550743298607e-07, + "logits/chosen": -3.7167484760284424, + "logits/rejected": -3.4912686347961426, + "logps/chosen": -225.8983917236328, + "logps/rejected": -260.1999816894531, + "loss": 0.5, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5673426985740662, + "rewards/margins": 2.505617618560791, + "rewards/rejected": -3.0729598999023438, + "step": 2960 + }, + { + "epoch": 0.34, + "learning_rate": 2.006203909633618e-07, + "logits/chosen": -2.3851637840270996, + "logits/rejected": -2.2815890312194824, + "logps/chosen": -324.37042236328125, + "logps/rejected": -377.2200012207031, + "loss": 0.4755, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5461332201957703, + "rewards/margins": 0.6332523822784424, + "rewards/rejected": -1.1793855428695679, + "step": 2961 + }, + { + "epoch": 0.34, + "learning_rate": 2.0058527449373755e-07, + "logits/chosen": -3.7728652954101562, + "logits/rejected": -3.8157835006713867, + "logps/chosen": -225.0650634765625, + "logps/rejected": -191.16131591796875, + "loss": 0.5564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32488149404525757, + "rewards/margins": 1.783473253250122, + "rewards/rejected": -2.1083548069000244, + "step": 2962 + }, + { + "epoch": 0.34, + "learning_rate": 2.005501580241133e-07, + "logits/chosen": -3.6742606163024902, + "logits/rejected": -3.884150266647339, + "logps/chosen": -190.41400146484375, + "logps/rejected": -343.18048095703125, + "loss": 0.2513, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7148770093917847, + "rewards/margins": 2.357071876525879, + "rewards/rejected": -1.6421949863433838, + "step": 2963 + }, + { + "epoch": 0.34, + "learning_rate": 2.0051504155448903e-07, + "logits/chosen": -2.3819708824157715, + "logits/rejected": -2.454144239425659, + "logps/chosen": -463.3072204589844, + "logps/rejected": -444.0306701660156, + "loss": 0.414, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7822462320327759, + "rewards/margins": 1.340709924697876, + "rewards/rejected": -0.5584636926651001, + "step": 2964 + }, + { + "epoch": 0.34, + "learning_rate": 2.0047992508486479e-07, + "logits/chosen": -3.45698618888855, + "logits/rejected": -3.4458649158477783, + "logps/chosen": -177.67025756835938, + "logps/rejected": -293.7791442871094, + "loss": 0.31, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3144713342189789, + "rewards/margins": 1.9426510334014893, + "rewards/rejected": -2.25712251663208, + "step": 2965 + }, + { + "epoch": 0.34, + "learning_rate": 2.004448086152405e-07, + "logits/chosen": -3.2309508323669434, + "logits/rejected": -3.1886708736419678, + "logps/chosen": -285.8022155761719, + "logps/rejected": -278.4993896484375, + "loss": 0.4845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021058499813079834, + "rewards/margins": 0.8736082315444946, + "rewards/rejected": -0.8946667909622192, + "step": 2966 + }, + { + "epoch": 0.34, + "learning_rate": 2.004096921456163e-07, + "logits/chosen": -2.5895845890045166, + "logits/rejected": -2.450655937194824, + "logps/chosen": -323.84771728515625, + "logps/rejected": -358.38043212890625, + "loss": 0.3924, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4293551445007324, + "rewards/margins": 1.597538709640503, + "rewards/rejected": -1.168183445930481, + "step": 2967 + }, + { + "epoch": 0.34, + "learning_rate": 2.0037457567599205e-07, + "logits/chosen": -3.168048858642578, + "logits/rejected": -2.779696464538574, + "logps/chosen": -429.7923278808594, + "logps/rejected": -248.91915893554688, + "loss": 0.6578, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03198336809873581, + "rewards/margins": 0.9688706398010254, + "rewards/rejected": -1.0008540153503418, + "step": 2968 + }, + { + "epoch": 0.34, + "learning_rate": 2.0033945920636777e-07, + "logits/chosen": -3.5022358894348145, + "logits/rejected": -3.593702793121338, + "logps/chosen": -232.06930541992188, + "logps/rejected": -204.57720947265625, + "loss": 0.5591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03298419713973999, + "rewards/margins": 1.623997449874878, + "rewards/rejected": -1.6569815874099731, + "step": 2969 + }, + { + "epoch": 0.34, + "learning_rate": 2.0030434273674353e-07, + "logits/chosen": -2.515444755554199, + "logits/rejected": -2.611987829208374, + "logps/chosen": -437.18853759765625, + "logps/rejected": -484.5002136230469, + "loss": 0.6607, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20678915083408356, + "rewards/margins": 0.6823543310165405, + "rewards/rejected": -0.4755651652812958, + "step": 2970 + }, + { + "epoch": 0.34, + "learning_rate": 2.0026922626711928e-07, + "logits/chosen": -3.229118824005127, + "logits/rejected": -2.9941413402557373, + "logps/chosen": -257.7070007324219, + "logps/rejected": -258.85931396484375, + "loss": 0.4121, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.009428739547729492, + "rewards/margins": 1.835546612739563, + "rewards/rejected": -1.8449753522872925, + "step": 2971 + }, + { + "epoch": 0.34, + "learning_rate": 2.00234109797495e-07, + "logits/chosen": -2.910989999771118, + "logits/rejected": -2.967456340789795, + "logps/chosen": -362.3610534667969, + "logps/rejected": -354.69659423828125, + "loss": 0.2608, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19265328347682953, + "rewards/margins": 2.3764946460723877, + "rewards/rejected": -2.1838412284851074, + "step": 2972 + }, + { + "epoch": 0.34, + "learning_rate": 2.0019899332787076e-07, + "logits/chosen": -3.767831325531006, + "logits/rejected": -3.752877712249756, + "logps/chosen": -297.3468017578125, + "logps/rejected": -234.800537109375, + "loss": 0.3336, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04232408106327057, + "rewards/margins": 2.679588794708252, + "rewards/rejected": -2.7219130992889404, + "step": 2973 + }, + { + "epoch": 0.34, + "learning_rate": 2.001638768582465e-07, + "logits/chosen": -2.808736801147461, + "logits/rejected": -2.622533082962036, + "logps/chosen": -447.11077880859375, + "logps/rejected": -370.8930358886719, + "loss": 0.6322, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3884313404560089, + "rewards/margins": 0.7366093397140503, + "rewards/rejected": -1.1250405311584473, + "step": 2974 + }, + { + "epoch": 0.34, + "learning_rate": 2.0012876038862224e-07, + "logits/chosen": -2.6631946563720703, + "logits/rejected": -2.606398344039917, + "logps/chosen": -444.72528076171875, + "logps/rejected": -342.0912170410156, + "loss": 0.2375, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08098623156547546, + "rewards/margins": 1.9411709308624268, + "rewards/rejected": -1.860184669494629, + "step": 2975 + }, + { + "epoch": 0.34, + "learning_rate": 2.0009364391899802e-07, + "logits/chosen": -2.4911019802093506, + "logits/rejected": -2.389383554458618, + "logps/chosen": -409.98248291015625, + "logps/rejected": -230.0225067138672, + "loss": 0.3638, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04305548965930939, + "rewards/margins": 1.7912445068359375, + "rewards/rejected": -1.8343000411987305, + "step": 2976 + }, + { + "epoch": 0.34, + "learning_rate": 2.0005852744937373e-07, + "logits/chosen": -3.051130533218384, + "logits/rejected": -3.1382083892822266, + "logps/chosen": -285.3281555175781, + "logps/rejected": -179.24974060058594, + "loss": 0.3509, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7016482949256897, + "rewards/margins": 1.540708303451538, + "rewards/rejected": -2.242356538772583, + "step": 2977 + }, + { + "epoch": 0.34, + "learning_rate": 2.000234109797495e-07, + "logits/chosen": -3.412571668624878, + "logits/rejected": -2.9769859313964844, + "logps/chosen": -272.42523193359375, + "logps/rejected": -270.3815612792969, + "loss": 0.263, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07033814489841461, + "rewards/margins": 2.4317941665649414, + "rewards/rejected": -2.3614561557769775, + "step": 2978 + }, + { + "epoch": 0.34, + "learning_rate": 1.9998829451012526e-07, + "logits/chosen": -3.3354339599609375, + "logits/rejected": -3.3684403896331787, + "logps/chosen": -213.25857543945312, + "logps/rejected": -192.07913208007812, + "loss": 0.4038, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35450178384780884, + "rewards/margins": 1.3645257949829102, + "rewards/rejected": -1.010023832321167, + "step": 2979 + }, + { + "epoch": 0.34, + "learning_rate": 1.99953178040501e-07, + "logits/chosen": -2.2966232299804688, + "logits/rejected": -2.5412094593048096, + "logps/chosen": -148.21197509765625, + "logps/rejected": -104.93280029296875, + "loss": 0.4159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03637028485536575, + "rewards/margins": 0.8655422329902649, + "rewards/rejected": -0.9019125699996948, + "step": 2980 + }, + { + "epoch": 0.34, + "learning_rate": 1.9991806157087674e-07, + "logits/chosen": -2.4457528591156006, + "logits/rejected": -2.567152500152588, + "logps/chosen": -334.0098571777344, + "logps/rejected": -285.9653625488281, + "loss": 0.4737, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04161853343248367, + "rewards/margins": 1.2551064491271973, + "rewards/rejected": -1.2134878635406494, + "step": 2981 + }, + { + "epoch": 0.34, + "learning_rate": 1.9988294510125247e-07, + "logits/chosen": -2.269549608230591, + "logits/rejected": -2.194660186767578, + "logps/chosen": -351.0200500488281, + "logps/rejected": -302.31170654296875, + "loss": 0.2038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7515136003494263, + "rewards/margins": 1.7577385902404785, + "rewards/rejected": -1.0062249898910522, + "step": 2982 + }, + { + "epoch": 0.34, + "learning_rate": 1.9984782863162822e-07, + "logits/chosen": -2.849526882171631, + "logits/rejected": -2.9044618606567383, + "logps/chosen": -142.3116912841797, + "logps/rejected": -178.07489013671875, + "loss": 0.504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3033895194530487, + "rewards/margins": 1.4080030918121338, + "rewards/rejected": -1.7113924026489258, + "step": 2983 + }, + { + "epoch": 0.34, + "learning_rate": 1.9981271216200398e-07, + "logits/chosen": -2.6064300537109375, + "logits/rejected": -2.7185001373291016, + "logps/chosen": -223.63357543945312, + "logps/rejected": -320.416748046875, + "loss": 0.4903, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35416316986083984, + "rewards/margins": 1.364677906036377, + "rewards/rejected": -1.7188410758972168, + "step": 2984 + }, + { + "epoch": 0.34, + "learning_rate": 1.997775956923797e-07, + "logits/chosen": -3.716830253601074, + "logits/rejected": -3.910639524459839, + "logps/chosen": -256.9998779296875, + "logps/rejected": -315.8363952636719, + "loss": 0.3436, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5190137028694153, + "rewards/margins": 2.1939942836761475, + "rewards/rejected": -1.6749805212020874, + "step": 2985 + }, + { + "epoch": 0.34, + "learning_rate": 1.9974247922275546e-07, + "logits/chosen": -2.975961685180664, + "logits/rejected": -3.188007354736328, + "logps/chosen": -289.7637023925781, + "logps/rejected": -246.95179748535156, + "loss": 0.2917, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29284995794296265, + "rewards/margins": 1.8309082984924316, + "rewards/rejected": -1.5380582809448242, + "step": 2986 + }, + { + "epoch": 0.34, + "learning_rate": 1.9970736275313124e-07, + "logits/chosen": -3.8261470794677734, + "logits/rejected": -3.8902313709259033, + "logps/chosen": -365.32080078125, + "logps/rejected": -369.98876953125, + "loss": 0.308, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1501438319683075, + "rewards/margins": 2.6673123836517334, + "rewards/rejected": -2.5171687602996826, + "step": 2987 + }, + { + "epoch": 0.34, + "learning_rate": 1.9967224628350694e-07, + "logits/chosen": -2.8611676692962646, + "logits/rejected": -2.8606300354003906, + "logps/chosen": -201.0721435546875, + "logps/rejected": -329.27618408203125, + "loss": 0.4768, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3275659680366516, + "rewards/margins": 0.7490256428718567, + "rewards/rejected": -1.0765914916992188, + "step": 2988 + }, + { + "epoch": 0.34, + "learning_rate": 1.9963712981388272e-07, + "logits/chosen": -3.416454792022705, + "logits/rejected": -3.345167398452759, + "logps/chosen": -69.67001342773438, + "logps/rejected": -111.84070587158203, + "loss": 0.4732, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07749253511428833, + "rewards/margins": 1.2552928924560547, + "rewards/rejected": -1.3327854871749878, + "step": 2989 + }, + { + "epoch": 0.34, + "learning_rate": 1.9960201334425845e-07, + "logits/chosen": -3.3993122577667236, + "logits/rejected": -3.369974136352539, + "logps/chosen": -197.49288940429688, + "logps/rejected": -242.44815063476562, + "loss": 0.1099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40783071517944336, + "rewards/margins": 3.1466867923736572, + "rewards/rejected": -2.738856077194214, + "step": 2990 + }, + { + "epoch": 0.34, + "learning_rate": 1.995668968746342e-07, + "logits/chosen": -3.722980499267578, + "logits/rejected": -3.53766131401062, + "logps/chosen": -242.56549072265625, + "logps/rejected": -155.88417053222656, + "loss": 0.4759, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18551580607891083, + "rewards/margins": 1.6566330194473267, + "rewards/rejected": -1.842148780822754, + "step": 2991 + }, + { + "epoch": 0.34, + "learning_rate": 1.9953178040500995e-07, + "logits/chosen": -3.0331616401672363, + "logits/rejected": -2.9508485794067383, + "logps/chosen": -301.06365966796875, + "logps/rejected": -183.7648468017578, + "loss": 0.5323, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21685844659805298, + "rewards/margins": 2.0872159004211426, + "rewards/rejected": -2.304074287414551, + "step": 2992 + }, + { + "epoch": 0.35, + "learning_rate": 1.9949666393538568e-07, + "logits/chosen": -2.5392539501190186, + "logits/rejected": -2.625774621963501, + "logps/chosen": -325.6302185058594, + "logps/rejected": -347.7114562988281, + "loss": 0.2214, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05564439296722412, + "rewards/margins": 1.881001353263855, + "rewards/rejected": -1.8253567218780518, + "step": 2993 + }, + { + "epoch": 0.35, + "learning_rate": 1.9946154746576144e-07, + "logits/chosen": -3.414804458618164, + "logits/rejected": -3.583747386932373, + "logps/chosen": -298.76922607421875, + "logps/rejected": -261.234619140625, + "loss": 0.4232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.590377926826477, + "rewards/margins": 1.2065370082855225, + "rewards/rejected": -1.7969149351119995, + "step": 2994 + }, + { + "epoch": 0.35, + "learning_rate": 1.9942643099613716e-07, + "logits/chosen": -3.7196402549743652, + "logits/rejected": -3.6113600730895996, + "logps/chosen": -237.99790954589844, + "logps/rejected": -267.6648254394531, + "loss": 0.3164, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2398047149181366, + "rewards/margins": 1.9074733257293701, + "rewards/rejected": -2.14727783203125, + "step": 2995 + }, + { + "epoch": 0.35, + "learning_rate": 1.9939131452651292e-07, + "logits/chosen": -3.3375420570373535, + "logits/rejected": -3.3888843059539795, + "logps/chosen": -276.5927429199219, + "logps/rejected": -267.129150390625, + "loss": 0.1549, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2998148202896118, + "rewards/margins": 2.171598434448242, + "rewards/rejected": -1.8717834949493408, + "step": 2996 + }, + { + "epoch": 0.35, + "learning_rate": 1.9935619805688867e-07, + "logits/chosen": -2.497130870819092, + "logits/rejected": -2.3048453330993652, + "logps/chosen": -380.0819091796875, + "logps/rejected": -299.1247863769531, + "loss": 0.2903, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29808443784713745, + "rewards/margins": 1.9018582105636597, + "rewards/rejected": -2.1999425888061523, + "step": 2997 + }, + { + "epoch": 0.35, + "learning_rate": 1.993210815872644e-07, + "logits/chosen": -3.046635150909424, + "logits/rejected": -2.855534076690674, + "logps/chosen": -310.72760009765625, + "logps/rejected": -285.21832275390625, + "loss": 0.5567, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13025318086147308, + "rewards/margins": 0.7707851529121399, + "rewards/rejected": -0.6405320763587952, + "step": 2998 + }, + { + "epoch": 0.35, + "learning_rate": 1.9928596511764015e-07, + "logits/chosen": -3.0353543758392334, + "logits/rejected": -3.0603725910186768, + "logps/chosen": -209.21337890625, + "logps/rejected": -329.19598388671875, + "loss": 0.3971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1573580801486969, + "rewards/margins": 1.5026507377624512, + "rewards/rejected": -1.6600087881088257, + "step": 2999 + }, + { + "epoch": 0.35, + "learning_rate": 1.9925084864801593e-07, + "logits/chosen": -3.063105583190918, + "logits/rejected": -2.6062874794006348, + "logps/chosen": -168.02154541015625, + "logps/rejected": -242.42494201660156, + "loss": 0.246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.255886048078537, + "rewards/margins": 1.5238447189331055, + "rewards/rejected": -1.7797307968139648, + "step": 3000 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -2.8445253372192383, + "eval_logits/rejected": -2.806975841522217, + "eval_logps/chosen": -293.6763610839844, + "eval_logps/rejected": -235.7855682373047, + "eval_loss": 0.44277361035346985, + "eval_rewards/accuracies": 0.7714285850524902, + "eval_rewards/chosen": 0.037860333919525146, + "eval_rewards/margins": 1.1821558475494385, + "eval_rewards/rejected": -1.144295334815979, + "eval_runtime": 32.7716, + "eval_samples_per_second": 2.136, + "eval_steps_per_second": 1.068, + "step": 3000 + }, + { + "epoch": 0.35, + "learning_rate": 1.9921573217839166e-07, + "logits/chosen": -2.8027710914611816, + "logits/rejected": -2.887425422668457, + "logps/chosen": -513.3043823242188, + "logps/rejected": -385.4537353515625, + "loss": 0.4562, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.41076868772506714, + "rewards/margins": 1.4678168296813965, + "rewards/rejected": -1.0570480823516846, + "step": 3001 + }, + { + "epoch": 0.35, + "learning_rate": 1.9918061570876741e-07, + "logits/chosen": -3.318390130996704, + "logits/rejected": -3.3696188926696777, + "logps/chosen": -111.37016296386719, + "logps/rejected": -224.27706909179688, + "loss": 0.3032, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2539695203304291, + "rewards/margins": 1.6461830139160156, + "rewards/rejected": -1.3922135829925537, + "step": 3002 + }, + { + "epoch": 0.35, + "learning_rate": 1.9914549923914314e-07, + "logits/chosen": -3.136902332305908, + "logits/rejected": -3.3772804737091064, + "logps/chosen": -337.8256530761719, + "logps/rejected": -275.40423583984375, + "loss": 0.6767, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6138310432434082, + "rewards/margins": 1.276790976524353, + "rewards/rejected": -1.8906221389770508, + "step": 3003 + }, + { + "epoch": 0.35, + "learning_rate": 1.991103827695189e-07, + "logits/chosen": -3.433337926864624, + "logits/rejected": -3.3356785774230957, + "logps/chosen": -181.3197021484375, + "logps/rejected": -209.333740234375, + "loss": 0.2586, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13835421204566956, + "rewards/margins": 2.54742431640625, + "rewards/rejected": -2.6857783794403076, + "step": 3004 + }, + { + "epoch": 0.35, + "learning_rate": 1.9907526629989465e-07, + "logits/chosen": -2.821763515472412, + "logits/rejected": -2.961116313934326, + "logps/chosen": -174.74969482421875, + "logps/rejected": -251.42413330078125, + "loss": 0.4287, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02719070017337799, + "rewards/margins": 2.9332656860351562, + "rewards/rejected": -2.960456371307373, + "step": 3005 + }, + { + "epoch": 0.35, + "learning_rate": 1.9904014983027038e-07, + "logits/chosen": -3.7379512786865234, + "logits/rejected": -3.584634780883789, + "logps/chosen": -405.835205078125, + "logps/rejected": -380.30316162109375, + "loss": 0.4206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20924147963523865, + "rewards/margins": 1.1872947216033936, + "rewards/rejected": -1.3965362310409546, + "step": 3006 + }, + { + "epoch": 0.35, + "learning_rate": 1.9900503336064613e-07, + "logits/chosen": -3.562121868133545, + "logits/rejected": -3.8115479946136475, + "logps/chosen": -166.93214416503906, + "logps/rejected": -213.4400634765625, + "loss": 0.4442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08170342445373535, + "rewards/margins": 1.6551244258880615, + "rewards/rejected": -1.5734210014343262, + "step": 3007 + }, + { + "epoch": 0.35, + "learning_rate": 1.9896991689102188e-07, + "logits/chosen": -2.8363289833068848, + "logits/rejected": -2.9709248542785645, + "logps/chosen": -372.1715087890625, + "logps/rejected": -255.58438110351562, + "loss": 0.6478, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6143717765808105, + "rewards/margins": 1.224979043006897, + "rewards/rejected": -1.839350938796997, + "step": 3008 + }, + { + "epoch": 0.35, + "learning_rate": 1.989348004213976e-07, + "logits/chosen": -3.510218858718872, + "logits/rejected": -3.4929113388061523, + "logps/chosen": -224.03868103027344, + "logps/rejected": -275.6429748535156, + "loss": 0.3077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.294406533241272, + "rewards/margins": 1.5911123752593994, + "rewards/rejected": -1.8855189085006714, + "step": 3009 + }, + { + "epoch": 0.35, + "learning_rate": 1.988996839517734e-07, + "logits/chosen": -2.471229076385498, + "logits/rejected": -2.4347286224365234, + "logps/chosen": -398.85101318359375, + "logps/rejected": -286.23248291015625, + "loss": 0.359, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36383056640625, + "rewards/margins": 2.1161303520202637, + "rewards/rejected": -1.7522996664047241, + "step": 3010 + }, + { + "epoch": 0.35, + "learning_rate": 1.988645674821491e-07, + "logits/chosen": -2.3877522945404053, + "logits/rejected": -2.4750397205352783, + "logps/chosen": -327.6005859375, + "logps/rejected": -271.2843017578125, + "loss": 0.7386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6632050275802612, + "rewards/margins": 0.38292232155799866, + "rewards/rejected": -1.046127438545227, + "step": 3011 + }, + { + "epoch": 0.35, + "learning_rate": 1.9882945101252487e-07, + "logits/chosen": -2.903801918029785, + "logits/rejected": -2.8651628494262695, + "logps/chosen": -516.5899658203125, + "logps/rejected": -389.511962890625, + "loss": 0.274, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6114081144332886, + "rewards/margins": 2.1922669410705566, + "rewards/rejected": -2.8036751747131348, + "step": 3012 + }, + { + "epoch": 0.35, + "learning_rate": 1.9879433454290063e-07, + "logits/chosen": -2.432929515838623, + "logits/rejected": -2.4705395698547363, + "logps/chosen": -474.18035888671875, + "logps/rejected": -308.600341796875, + "loss": 0.3269, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21352744102478027, + "rewards/margins": 1.6352531909942627, + "rewards/rejected": -1.421725869178772, + "step": 3013 + }, + { + "epoch": 0.35, + "learning_rate": 1.9875921807327635e-07, + "logits/chosen": -3.472501754760742, + "logits/rejected": -3.217576503753662, + "logps/chosen": -340.12335205078125, + "logps/rejected": -302.1429748535156, + "loss": 0.3035, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1616649627685547, + "rewards/margins": 2.697509765625, + "rewards/rejected": -2.5358448028564453, + "step": 3014 + }, + { + "epoch": 0.35, + "learning_rate": 1.987241016036521e-07, + "logits/chosen": -2.5437166690826416, + "logits/rejected": -2.7228076457977295, + "logps/chosen": -285.1317443847656, + "logps/rejected": -120.3369140625, + "loss": 0.4389, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005285590887069702, + "rewards/margins": 0.9945621490478516, + "rewards/rejected": -0.9892765879631042, + "step": 3015 + }, + { + "epoch": 0.35, + "learning_rate": 1.9868898513402786e-07, + "logits/chosen": -3.5898420810699463, + "logits/rejected": -3.2219769954681396, + "logps/chosen": -334.88385009765625, + "logps/rejected": -252.46917724609375, + "loss": 0.2284, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.484081506729126, + "rewards/margins": 2.456275463104248, + "rewards/rejected": -1.972193956375122, + "step": 3016 + }, + { + "epoch": 0.35, + "learning_rate": 1.986538686644036e-07, + "logits/chosen": -3.281723976135254, + "logits/rejected": -2.991319417953491, + "logps/chosen": -205.6256103515625, + "logps/rejected": -149.90499877929688, + "loss": 0.501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18950071930885315, + "rewards/margins": 1.5170499086380005, + "rewards/rejected": -1.7065507173538208, + "step": 3017 + }, + { + "epoch": 0.35, + "learning_rate": 1.9861875219477934e-07, + "logits/chosen": -2.635406970977783, + "logits/rejected": -3.1753180027008057, + "logps/chosen": -284.2916564941406, + "logps/rejected": -303.6649169921875, + "loss": 0.2292, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13679316639900208, + "rewards/margins": 2.396989107131958, + "rewards/rejected": -2.260195732116699, + "step": 3018 + }, + { + "epoch": 0.35, + "learning_rate": 1.9858363572515507e-07, + "logits/chosen": -2.655086040496826, + "logits/rejected": -2.872951030731201, + "logps/chosen": -215.7069854736328, + "logps/rejected": -294.8112487792969, + "loss": 0.7569, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6652940511703491, + "rewards/margins": 0.6008734703063965, + "rewards/rejected": -1.2661676406860352, + "step": 3019 + }, + { + "epoch": 0.35, + "learning_rate": 1.9854851925553082e-07, + "logits/chosen": -3.1013875007629395, + "logits/rejected": -3.3349311351776123, + "logps/chosen": -117.22346496582031, + "logps/rejected": -258.43963623046875, + "loss": 0.3789, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4288647770881653, + "rewards/margins": 2.8140647411346436, + "rewards/rejected": -3.242929697036743, + "step": 3020 + }, + { + "epoch": 0.35, + "learning_rate": 1.985134027859066e-07, + "logits/chosen": -2.500993013381958, + "logits/rejected": -3.0212295055389404, + "logps/chosen": -202.703857421875, + "logps/rejected": -208.21218872070312, + "loss": 0.2429, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2056315839290619, + "rewards/margins": 2.046143054962158, + "rewards/rejected": -1.8405113220214844, + "step": 3021 + }, + { + "epoch": 0.35, + "learning_rate": 1.984782863162823e-07, + "logits/chosen": -3.098226547241211, + "logits/rejected": -3.1351306438446045, + "logps/chosen": -339.45343017578125, + "logps/rejected": -303.92236328125, + "loss": 1.8897, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.9420448541641235, + "rewards/margins": -0.21989381313323975, + "rewards/rejected": -1.7221510410308838, + "step": 3022 + }, + { + "epoch": 0.35, + "learning_rate": 1.9844316984665809e-07, + "logits/chosen": -2.996962547302246, + "logits/rejected": -2.885087490081787, + "logps/chosen": -418.7176818847656, + "logps/rejected": -286.46240234375, + "loss": 0.1713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5985704660415649, + "rewards/margins": 2.354692220687866, + "rewards/rejected": -1.7561216354370117, + "step": 3023 + }, + { + "epoch": 0.35, + "learning_rate": 1.9840805337703384e-07, + "logits/chosen": -2.605712413787842, + "logits/rejected": -2.691016674041748, + "logps/chosen": -283.2939453125, + "logps/rejected": -332.7919921875, + "loss": 0.8818, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4866686463356018, + "rewards/margins": -0.034748926758766174, + "rewards/rejected": -0.451919823884964, + "step": 3024 + }, + { + "epoch": 0.35, + "learning_rate": 1.9837293690740957e-07, + "logits/chosen": -3.6237730979919434, + "logits/rejected": -3.3434765338897705, + "logps/chosen": -260.3717956542969, + "logps/rejected": -224.75294494628906, + "loss": 0.3323, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1063823401927948, + "rewards/margins": 2.3508050441741943, + "rewards/rejected": -2.244422435760498, + "step": 3025 + }, + { + "epoch": 0.35, + "learning_rate": 1.9833782043778532e-07, + "logits/chosen": -3.303894281387329, + "logits/rejected": -3.2161800861358643, + "logps/chosen": -146.5291748046875, + "logps/rejected": -194.72665405273438, + "loss": 0.417, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16204451024532318, + "rewards/margins": 1.404057264328003, + "rewards/rejected": -1.5661019086837769, + "step": 3026 + }, + { + "epoch": 0.35, + "learning_rate": 1.9830270396816105e-07, + "logits/chosen": -2.1165835857391357, + "logits/rejected": -2.270214080810547, + "logps/chosen": -280.33050537109375, + "logps/rejected": -210.07708740234375, + "loss": 0.6495, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4028017520904541, + "rewards/margins": 0.8968319892883301, + "rewards/rejected": -1.2996337413787842, + "step": 3027 + }, + { + "epoch": 0.35, + "learning_rate": 1.982675874985368e-07, + "logits/chosen": -3.563145160675049, + "logits/rejected": -3.32793927192688, + "logps/chosen": -201.4566192626953, + "logps/rejected": -114.2402114868164, + "loss": 0.4588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20357412099838257, + "rewards/margins": 1.0731282234191895, + "rewards/rejected": -1.2767024040222168, + "step": 3028 + }, + { + "epoch": 0.35, + "learning_rate": 1.9823247102891256e-07, + "logits/chosen": -3.150799512863159, + "logits/rejected": -2.8476860523223877, + "logps/chosen": -292.8707580566406, + "logps/rejected": -285.7276306152344, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8865633010864258, + "rewards/margins": 2.21144962310791, + "rewards/rejected": -1.3248863220214844, + "step": 3029 + }, + { + "epoch": 0.35, + "learning_rate": 1.9819735455928828e-07, + "logits/chosen": -2.52286434173584, + "logits/rejected": -2.74586820602417, + "logps/chosen": -362.01422119140625, + "logps/rejected": -241.31552124023438, + "loss": 0.337, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07482895255088806, + "rewards/margins": 1.8971986770629883, + "rewards/rejected": -1.8223698139190674, + "step": 3030 + }, + { + "epoch": 0.35, + "learning_rate": 1.9816223808966404e-07, + "logits/chosen": -3.05952787399292, + "logits/rejected": -3.217639684677124, + "logps/chosen": -291.5096740722656, + "logps/rejected": -207.1548309326172, + "loss": 0.4365, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04779055714607239, + "rewards/margins": 0.8655898571014404, + "rewards/rejected": -0.9133803248405457, + "step": 3031 + }, + { + "epoch": 0.35, + "learning_rate": 1.9812712162003982e-07, + "logits/chosen": -3.334317684173584, + "logits/rejected": -3.2033066749572754, + "logps/chosen": -354.0831298828125, + "logps/rejected": -286.35302734375, + "loss": 0.321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36995968222618103, + "rewards/margins": 1.5233728885650635, + "rewards/rejected": -1.15341317653656, + "step": 3032 + }, + { + "epoch": 0.35, + "learning_rate": 1.9809200515041552e-07, + "logits/chosen": -3.110982894897461, + "logits/rejected": -2.9442169666290283, + "logps/chosen": -382.1273193359375, + "logps/rejected": -326.48187255859375, + "loss": 0.4691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1010933518409729, + "rewards/margins": 0.8019925951957703, + "rewards/rejected": -0.9030859470367432, + "step": 3033 + }, + { + "epoch": 0.35, + "learning_rate": 1.980568886807913e-07, + "logits/chosen": -3.4457716941833496, + "logits/rejected": -3.2462565898895264, + "logps/chosen": -322.18768310546875, + "logps/rejected": -311.78790283203125, + "loss": 0.4558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6141657829284668, + "rewards/margins": 0.9861765503883362, + "rewards/rejected": -1.6003423929214478, + "step": 3034 + }, + { + "epoch": 0.35, + "learning_rate": 1.9802177221116703e-07, + "logits/chosen": -3.6390295028686523, + "logits/rejected": -3.5001792907714844, + "logps/chosen": -237.57666015625, + "logps/rejected": -231.55169677734375, + "loss": 0.2649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18183813989162445, + "rewards/margins": 2.0608084201812744, + "rewards/rejected": -2.2426466941833496, + "step": 3035 + }, + { + "epoch": 0.35, + "learning_rate": 1.9798665574154278e-07, + "logits/chosen": -2.9726481437683105, + "logits/rejected": -2.9976768493652344, + "logps/chosen": -310.0140380859375, + "logps/rejected": -191.63180541992188, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19015823304653168, + "rewards/margins": 1.5200804471969604, + "rewards/rejected": -1.3299221992492676, + "step": 3036 + }, + { + "epoch": 0.35, + "learning_rate": 1.9795153927191853e-07, + "logits/chosen": -2.87605357170105, + "logits/rejected": -2.79805326461792, + "logps/chosen": -199.61354064941406, + "logps/rejected": -315.37994384765625, + "loss": 0.2231, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0459071546792984, + "rewards/margins": 2.6719188690185547, + "rewards/rejected": -2.626011848449707, + "step": 3037 + }, + { + "epoch": 0.35, + "learning_rate": 1.9791642280229426e-07, + "logits/chosen": -3.4407434463500977, + "logits/rejected": -3.536858320236206, + "logps/chosen": -188.2257843017578, + "logps/rejected": -272.03668212890625, + "loss": 0.269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19547808170318604, + "rewards/margins": 1.6404101848602295, + "rewards/rejected": -1.835888147354126, + "step": 3038 + }, + { + "epoch": 0.35, + "learning_rate": 1.9788130633267001e-07, + "logits/chosen": -3.929490089416504, + "logits/rejected": -4.022392749786377, + "logps/chosen": -227.07415771484375, + "logps/rejected": -263.71087646484375, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20810705423355103, + "rewards/margins": 0.9839985966682434, + "rewards/rejected": -1.1921056509017944, + "step": 3039 + }, + { + "epoch": 0.35, + "learning_rate": 1.9784618986304577e-07, + "logits/chosen": -2.611607313156128, + "logits/rejected": -2.630568504333496, + "logps/chosen": -310.46295166015625, + "logps/rejected": -265.02227783203125, + "loss": 0.2031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14055156707763672, + "rewards/margins": 2.5918312072753906, + "rewards/rejected": -2.7323827743530273, + "step": 3040 + }, + { + "epoch": 0.35, + "learning_rate": 1.978110733934215e-07, + "logits/chosen": -3.4036474227905273, + "logits/rejected": -3.2815630435943604, + "logps/chosen": -249.08258056640625, + "logps/rejected": -235.0412139892578, + "loss": 0.2034, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0114627406001091, + "rewards/margins": 2.777311325073242, + "rewards/rejected": -2.7658488750457764, + "step": 3041 + }, + { + "epoch": 0.35, + "learning_rate": 1.9777595692379725e-07, + "logits/chosen": -2.9080467224121094, + "logits/rejected": -2.978524684906006, + "logps/chosen": -167.72547912597656, + "logps/rejected": -242.87387084960938, + "loss": 0.6104, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17639777064323425, + "rewards/margins": 1.325622797012329, + "rewards/rejected": -1.149225115776062, + "step": 3042 + }, + { + "epoch": 0.35, + "learning_rate": 1.9774084045417298e-07, + "logits/chosen": -3.295597553253174, + "logits/rejected": -3.4926910400390625, + "logps/chosen": -207.98135375976562, + "logps/rejected": -234.22433471679688, + "loss": 0.4896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11429326236248016, + "rewards/margins": 1.1010631322860718, + "rewards/rejected": -0.9867699146270752, + "step": 3043 + }, + { + "epoch": 0.35, + "learning_rate": 1.9770572398454876e-07, + "logits/chosen": -3.2846922874450684, + "logits/rejected": -3.254024028778076, + "logps/chosen": -388.8541564941406, + "logps/rejected": -210.4857940673828, + "loss": 0.4166, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7120705246925354, + "rewards/margins": 1.476340413093567, + "rewards/rejected": -0.7642698287963867, + "step": 3044 + }, + { + "epoch": 0.35, + "learning_rate": 1.976706075149245e-07, + "logits/chosen": -2.8282506465911865, + "logits/rejected": -2.755333423614502, + "logps/chosen": -354.48809814453125, + "logps/rejected": -272.2617492675781, + "loss": 0.5649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09444017708301544, + "rewards/margins": 0.8484975099563599, + "rewards/rejected": -0.9429377317428589, + "step": 3045 + }, + { + "epoch": 0.35, + "learning_rate": 1.9763549104530024e-07, + "logits/chosen": -2.671649932861328, + "logits/rejected": -2.8197784423828125, + "logps/chosen": -223.82540893554688, + "logps/rejected": -479.8037414550781, + "loss": 0.1874, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03378252685070038, + "rewards/margins": 2.399121046066284, + "rewards/rejected": -2.3653385639190674, + "step": 3046 + }, + { + "epoch": 0.35, + "learning_rate": 1.97600374575676e-07, + "logits/chosen": -3.122842788696289, + "logits/rejected": -3.284548759460449, + "logps/chosen": -191.75709533691406, + "logps/rejected": -241.4578399658203, + "loss": 0.2279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2843462824821472, + "rewards/margins": 1.9698700904846191, + "rewards/rejected": -1.6855237483978271, + "step": 3047 + }, + { + "epoch": 0.35, + "learning_rate": 1.9756525810605172e-07, + "logits/chosen": -3.865755319595337, + "logits/rejected": -3.6041152477264404, + "logps/chosen": -244.45639038085938, + "logps/rejected": -217.13330078125, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0404282808303833, + "rewards/margins": 1.0206739902496338, + "rewards/rejected": -0.9802457094192505, + "step": 3048 + }, + { + "epoch": 0.35, + "learning_rate": 1.9753014163642747e-07, + "logits/chosen": -3.6419577598571777, + "logits/rejected": -3.9636166095733643, + "logps/chosen": -151.43231201171875, + "logps/rejected": -233.0259552001953, + "loss": 0.2829, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17550142109394073, + "rewards/margins": 3.123033285140991, + "rewards/rejected": -2.9475317001342773, + "step": 3049 + }, + { + "epoch": 0.35, + "learning_rate": 1.9749502516680323e-07, + "logits/chosen": -3.2303709983825684, + "logits/rejected": -3.3413503170013428, + "logps/chosen": -422.3179016113281, + "logps/rejected": -252.9170379638672, + "loss": 0.2598, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20782317221164703, + "rewards/margins": 3.006639003753662, + "rewards/rejected": -2.798815965652466, + "step": 3050 + }, + { + "epoch": 0.35, + "learning_rate": 1.9745990869717896e-07, + "logits/chosen": -3.103703260421753, + "logits/rejected": -3.0119826793670654, + "logps/chosen": -215.5072021484375, + "logps/rejected": -300.2403259277344, + "loss": 0.522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6085277795791626, + "rewards/margins": 1.361037254333496, + "rewards/rejected": -1.9695651531219482, + "step": 3051 + }, + { + "epoch": 0.35, + "learning_rate": 1.974247922275547e-07, + "logits/chosen": -2.995516538619995, + "logits/rejected": -3.27770733833313, + "logps/chosen": -190.18251037597656, + "logps/rejected": -276.3632507324219, + "loss": 0.5397, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15863284468650818, + "rewards/margins": 0.639234185218811, + "rewards/rejected": -0.797866940498352, + "step": 3052 + }, + { + "epoch": 0.35, + "learning_rate": 1.9738967575793046e-07, + "logits/chosen": -3.341181755065918, + "logits/rejected": -3.3497276306152344, + "logps/chosen": -424.89697265625, + "logps/rejected": -303.04144287109375, + "loss": 0.3799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1870366781949997, + "rewards/margins": 1.6448566913604736, + "rewards/rejected": -1.8318934440612793, + "step": 3053 + }, + { + "epoch": 0.35, + "learning_rate": 1.973545592883062e-07, + "logits/chosen": -2.8119359016418457, + "logits/rejected": -2.788180112838745, + "logps/chosen": -208.33493041992188, + "logps/rejected": -329.6491394042969, + "loss": 0.3586, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3023321032524109, + "rewards/margins": 2.8718509674072266, + "rewards/rejected": -2.569518804550171, + "step": 3054 + }, + { + "epoch": 0.35, + "learning_rate": 1.9731944281868197e-07, + "logits/chosen": -2.9192004203796387, + "logits/rejected": -2.935391902923584, + "logps/chosen": -222.40679931640625, + "logps/rejected": -261.94488525390625, + "loss": 0.4417, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0614902637898922, + "rewards/margins": 2.0149471759796143, + "rewards/rejected": -1.953456997871399, + "step": 3055 + }, + { + "epoch": 0.35, + "learning_rate": 1.9728432634905767e-07, + "logits/chosen": -3.262298822402954, + "logits/rejected": -3.2442245483398438, + "logps/chosen": -304.89141845703125, + "logps/rejected": -313.369873046875, + "loss": 0.4883, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3718996047973633, + "rewards/margins": 2.149317502975464, + "rewards/rejected": -2.5212173461914062, + "step": 3056 + }, + { + "epoch": 0.35, + "learning_rate": 1.9724920987943345e-07, + "logits/chosen": -2.8789186477661133, + "logits/rejected": -3.0166006088256836, + "logps/chosen": -294.1700134277344, + "logps/rejected": -367.06231689453125, + "loss": 0.3106, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3389640152454376, + "rewards/margins": 2.8315038681030273, + "rewards/rejected": -2.492539644241333, + "step": 3057 + }, + { + "epoch": 0.35, + "learning_rate": 1.972140934098092e-07, + "logits/chosen": -2.9572980403900146, + "logits/rejected": -2.84912109375, + "logps/chosen": -319.00738525390625, + "logps/rejected": -287.9696960449219, + "loss": 0.1747, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14011840522289276, + "rewards/margins": 2.1210193634033203, + "rewards/rejected": -1.9809010028839111, + "step": 3058 + }, + { + "epoch": 0.35, + "learning_rate": 1.9717897694018493e-07, + "logits/chosen": -2.8451247215270996, + "logits/rejected": -2.8435986042022705, + "logps/chosen": -296.18280029296875, + "logps/rejected": -241.6285400390625, + "loss": 0.5452, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42781952023506165, + "rewards/margins": 1.4644172191619873, + "rewards/rejected": -1.8922367095947266, + "step": 3059 + }, + { + "epoch": 0.35, + "learning_rate": 1.971438604705607e-07, + "logits/chosen": -1.9019265174865723, + "logits/rejected": -2.157865047454834, + "logps/chosen": -415.7928771972656, + "logps/rejected": -224.9405059814453, + "loss": 0.4648, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08197470754384995, + "rewards/margins": 0.9937095046043396, + "rewards/rejected": -0.911734938621521, + "step": 3060 + }, + { + "epoch": 0.35, + "learning_rate": 1.9710874400093644e-07, + "logits/chosen": -3.0277099609375, + "logits/rejected": -3.2907814979553223, + "logps/chosen": -310.419677734375, + "logps/rejected": -261.60980224609375, + "loss": 0.1682, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3358491063117981, + "rewards/margins": 2.2276926040649414, + "rewards/rejected": -1.8918434381484985, + "step": 3061 + }, + { + "epoch": 0.35, + "learning_rate": 1.9707362753131217e-07, + "logits/chosen": -2.665870189666748, + "logits/rejected": -2.9369471073150635, + "logps/chosen": -289.2737731933594, + "logps/rejected": -165.29689025878906, + "loss": 0.543, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21771953999996185, + "rewards/margins": 1.5365526676177979, + "rewards/rejected": -1.3188331127166748, + "step": 3062 + }, + { + "epoch": 0.35, + "learning_rate": 1.9703851106168792e-07, + "logits/chosen": -2.779461145401001, + "logits/rejected": -2.6709160804748535, + "logps/chosen": -360.2775573730469, + "logps/rejected": -201.90740966796875, + "loss": 0.4382, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11120958626270294, + "rewards/margins": 1.4007362127304077, + "rewards/rejected": -1.2895267009735107, + "step": 3063 + }, + { + "epoch": 0.35, + "learning_rate": 1.9700339459206365e-07, + "logits/chosen": -3.459818124771118, + "logits/rejected": -3.352565288543701, + "logps/chosen": -228.70184326171875, + "logps/rejected": -218.30648803710938, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04656504467129707, + "rewards/margins": 1.7341384887695312, + "rewards/rejected": -1.7807034254074097, + "step": 3064 + }, + { + "epoch": 0.35, + "learning_rate": 1.969682781224394e-07, + "logits/chosen": -3.332549571990967, + "logits/rejected": -3.393594741821289, + "logps/chosen": -393.3575439453125, + "logps/rejected": -221.424072265625, + "loss": 0.5811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24022534489631653, + "rewards/margins": 0.8009299039840698, + "rewards/rejected": -1.0411553382873535, + "step": 3065 + }, + { + "epoch": 0.35, + "learning_rate": 1.9693316165281518e-07, + "logits/chosen": -3.2575490474700928, + "logits/rejected": -3.3155391216278076, + "logps/chosen": -298.7373046875, + "logps/rejected": -280.0418395996094, + "loss": 0.1466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5184282064437866, + "rewards/margins": 2.8983774185180664, + "rewards/rejected": -2.3799490928649902, + "step": 3066 + }, + { + "epoch": 0.35, + "learning_rate": 1.9689804518319088e-07, + "logits/chosen": -3.0950751304626465, + "logits/rejected": -3.161099433898926, + "logps/chosen": -260.45660400390625, + "logps/rejected": -213.88665771484375, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04338574409484863, + "rewards/margins": 2.7250099182128906, + "rewards/rejected": -2.7683956623077393, + "step": 3067 + }, + { + "epoch": 0.35, + "learning_rate": 1.9686292871356666e-07, + "logits/chosen": -3.430783271789551, + "logits/rejected": -3.4712109565734863, + "logps/chosen": -225.34194946289062, + "logps/rejected": -227.66732788085938, + "loss": 0.7103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.658412754535675, + "rewards/margins": 1.080237627029419, + "rewards/rejected": -1.7386503219604492, + "step": 3068 + }, + { + "epoch": 0.35, + "learning_rate": 1.9682781224394242e-07, + "logits/chosen": -2.6669554710388184, + "logits/rejected": -2.6647467613220215, + "logps/chosen": -208.25653076171875, + "logps/rejected": -458.4106140136719, + "loss": 0.4452, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04271111264824867, + "rewards/margins": 1.5185856819152832, + "rewards/rejected": -1.475874662399292, + "step": 3069 + }, + { + "epoch": 0.35, + "learning_rate": 1.9679269577431815e-07, + "logits/chosen": -2.8078761100769043, + "logits/rejected": -2.862076997756958, + "logps/chosen": -198.1163787841797, + "logps/rejected": -218.91244506835938, + "loss": 0.3516, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37441080808639526, + "rewards/margins": 2.130362033843994, + "rewards/rejected": -1.7559514045715332, + "step": 3070 + }, + { + "epoch": 0.35, + "learning_rate": 1.967575793046939e-07, + "logits/chosen": -3.2165818214416504, + "logits/rejected": -3.3916678428649902, + "logps/chosen": -340.52783203125, + "logps/rejected": -224.23562622070312, + "loss": 0.5458, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4872143268585205, + "rewards/margins": 1.549790620803833, + "rewards/rejected": -2.0370049476623535, + "step": 3071 + }, + { + "epoch": 0.35, + "learning_rate": 1.9672246283506963e-07, + "logits/chosen": -2.727433443069458, + "logits/rejected": -2.8505280017852783, + "logps/chosen": -320.86407470703125, + "logps/rejected": -320.6701354980469, + "loss": 0.3794, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3218567371368408, + "rewards/margins": 1.528525471687317, + "rewards/rejected": -1.2066688537597656, + "step": 3072 + }, + { + "epoch": 0.35, + "learning_rate": 1.9668734636544538e-07, + "logits/chosen": -2.8702425956726074, + "logits/rejected": -2.890501022338867, + "logps/chosen": -315.35821533203125, + "logps/rejected": -260.9658203125, + "loss": 0.3229, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6542035341262817, + "rewards/margins": 1.5954463481903076, + "rewards/rejected": -2.2496497631073, + "step": 3073 + }, + { + "epoch": 0.35, + "learning_rate": 1.9665222989582113e-07, + "logits/chosen": -3.4884254932403564, + "logits/rejected": -3.5635218620300293, + "logps/chosen": -320.4202575683594, + "logps/rejected": -212.19674682617188, + "loss": 0.2118, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5530683994293213, + "rewards/margins": 2.5857884883880615, + "rewards/rejected": -2.0327200889587402, + "step": 3074 + }, + { + "epoch": 0.35, + "learning_rate": 1.9661711342619686e-07, + "logits/chosen": -3.0132694244384766, + "logits/rejected": -2.898167133331299, + "logps/chosen": -304.4390869140625, + "logps/rejected": -215.00189208984375, + "loss": 0.3254, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10484334081411362, + "rewards/margins": 1.3833073377609253, + "rewards/rejected": -1.488150715827942, + "step": 3075 + }, + { + "epoch": 0.35, + "learning_rate": 1.9658199695657262e-07, + "logits/chosen": -3.6952781677246094, + "logits/rejected": -4.015458106994629, + "logps/chosen": -182.2537078857422, + "logps/rejected": -251.23565673828125, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5762869119644165, + "rewards/margins": 0.47806811332702637, + "rewards/rejected": -1.0543550252914429, + "step": 3076 + }, + { + "epoch": 0.35, + "learning_rate": 1.965468804869484e-07, + "logits/chosen": -3.045768976211548, + "logits/rejected": -3.1276774406433105, + "logps/chosen": -230.03053283691406, + "logps/rejected": -202.70664978027344, + "loss": 0.5215, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4107796847820282, + "rewards/margins": 0.9814106225967407, + "rewards/rejected": -1.3921902179718018, + "step": 3077 + }, + { + "epoch": 0.35, + "learning_rate": 1.9651176401732412e-07, + "logits/chosen": -2.740459442138672, + "logits/rejected": -2.6614506244659424, + "logps/chosen": -375.5497741699219, + "logps/rejected": -314.50128173828125, + "loss": 0.1655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6683952808380127, + "rewards/margins": 2.7857348918914795, + "rewards/rejected": -2.1173393726348877, + "step": 3078 + }, + { + "epoch": 0.35, + "learning_rate": 1.9647664754769988e-07, + "logits/chosen": -2.8291549682617188, + "logits/rejected": -3.107804298400879, + "logps/chosen": -174.18634033203125, + "logps/rejected": -193.68040466308594, + "loss": 0.3792, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32329416275024414, + "rewards/margins": 1.0610005855560303, + "rewards/rejected": -0.7377064824104309, + "step": 3079 + }, + { + "epoch": 0.36, + "learning_rate": 1.964415310780756e-07, + "logits/chosen": -3.1709063053131104, + "logits/rejected": -3.124007225036621, + "logps/chosen": -185.44290161132812, + "logps/rejected": -224.39195251464844, + "loss": 0.2682, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08008065819740295, + "rewards/margins": 1.6290040016174316, + "rewards/rejected": -1.548923373222351, + "step": 3080 + }, + { + "epoch": 0.36, + "learning_rate": 1.9640641460845136e-07, + "logits/chosen": -2.4712629318237305, + "logits/rejected": -2.3469796180725098, + "logps/chosen": -94.00013732910156, + "logps/rejected": -164.3089141845703, + "loss": 0.4605, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3907009959220886, + "rewards/margins": 0.9797440767288208, + "rewards/rejected": -1.3704450130462646, + "step": 3081 + }, + { + "epoch": 0.36, + "learning_rate": 1.963712981388271e-07, + "logits/chosen": -2.5381901264190674, + "logits/rejected": -2.5858469009399414, + "logps/chosen": -279.7504577636719, + "logps/rejected": -310.41522216796875, + "loss": 0.3587, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.012198060750961304, + "rewards/margins": 2.116105556488037, + "rewards/rejected": -2.103907585144043, + "step": 3082 + }, + { + "epoch": 0.36, + "learning_rate": 1.9633618166920284e-07, + "logits/chosen": -3.471062660217285, + "logits/rejected": -3.4846184253692627, + "logps/chosen": -203.15170288085938, + "logps/rejected": -177.05755615234375, + "loss": 0.5395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5098321437835693, + "rewards/margins": 1.0100224018096924, + "rewards/rejected": -1.5198546648025513, + "step": 3083 + }, + { + "epoch": 0.36, + "learning_rate": 1.963010651995786e-07, + "logits/chosen": -2.7988507747650146, + "logits/rejected": -2.8256518840789795, + "logps/chosen": -287.4083251953125, + "logps/rejected": -241.5149383544922, + "loss": 0.5701, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5783542394638062, + "rewards/margins": 1.3916542530059814, + "rewards/rejected": -0.8133001327514648, + "step": 3084 + }, + { + "epoch": 0.36, + "learning_rate": 1.9626594872995435e-07, + "logits/chosen": -2.4115376472473145, + "logits/rejected": -2.4969100952148438, + "logps/chosen": -327.6260986328125, + "logps/rejected": -332.6960754394531, + "loss": 0.4141, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08624743670225143, + "rewards/margins": 1.687882900238037, + "rewards/rejected": -1.7741303443908691, + "step": 3085 + }, + { + "epoch": 0.36, + "learning_rate": 1.9623083226033008e-07, + "logits/chosen": -3.42287015914917, + "logits/rejected": -3.5188708305358887, + "logps/chosen": -163.1253204345703, + "logps/rejected": -332.4408264160156, + "loss": 0.63, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5863954424858093, + "rewards/margins": 1.00204336643219, + "rewards/rejected": -1.5884387493133545, + "step": 3086 + }, + { + "epoch": 0.36, + "learning_rate": 1.9619571579070583e-07, + "logits/chosen": -3.7319681644439697, + "logits/rejected": -3.7785820960998535, + "logps/chosen": -265.24932861328125, + "logps/rejected": -245.22450256347656, + "loss": 0.4261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5445310473442078, + "rewards/margins": 1.6153206825256348, + "rewards/rejected": -2.1598517894744873, + "step": 3087 + }, + { + "epoch": 0.36, + "learning_rate": 1.9616059932108156e-07, + "logits/chosen": -2.8471860885620117, + "logits/rejected": -3.107409954071045, + "logps/chosen": -281.7643737792969, + "logps/rejected": -209.75204467773438, + "loss": 0.2052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5447835326194763, + "rewards/margins": 2.7791929244995117, + "rewards/rejected": -2.2344093322753906, + "step": 3088 + }, + { + "epoch": 0.36, + "learning_rate": 1.9612548285145734e-07, + "logits/chosen": -3.0530147552490234, + "logits/rejected": -2.825230598449707, + "logps/chosen": -186.46145629882812, + "logps/rejected": -155.78109741210938, + "loss": 0.3455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31543001532554626, + "rewards/margins": 2.172807216644287, + "rewards/rejected": -2.4882373809814453, + "step": 3089 + }, + { + "epoch": 0.36, + "learning_rate": 1.960903663818331e-07, + "logits/chosen": -2.751537561416626, + "logits/rejected": -2.6698997020721436, + "logps/chosen": -254.18115234375, + "logps/rejected": -220.85328674316406, + "loss": 0.3314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24336227774620056, + "rewards/margins": 1.1776258945465088, + "rewards/rejected": -0.9342636466026306, + "step": 3090 + }, + { + "epoch": 0.36, + "learning_rate": 1.9605524991220882e-07, + "logits/chosen": -3.1359548568725586, + "logits/rejected": -2.998751163482666, + "logps/chosen": -290.08148193359375, + "logps/rejected": -242.88125610351562, + "loss": 0.3102, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3427727222442627, + "rewards/margins": 1.8069825172424316, + "rewards/rejected": -1.4642099142074585, + "step": 3091 + }, + { + "epoch": 0.36, + "learning_rate": 1.9602013344258457e-07, + "logits/chosen": -2.9210681915283203, + "logits/rejected": -2.9084177017211914, + "logps/chosen": -346.00054931640625, + "logps/rejected": -326.629638671875, + "loss": 0.4096, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24618424475193024, + "rewards/margins": 2.0851447582244873, + "rewards/rejected": -1.8389604091644287, + "step": 3092 + }, + { + "epoch": 0.36, + "learning_rate": 1.959850169729603e-07, + "logits/chosen": -2.930842161178589, + "logits/rejected": -2.80279541015625, + "logps/chosen": -449.075439453125, + "logps/rejected": -228.6827850341797, + "loss": 0.9606, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7300905585289001, + "rewards/margins": 0.2501145601272583, + "rewards/rejected": -0.9802049994468689, + "step": 3093 + }, + { + "epoch": 0.36, + "learning_rate": 1.9594990050333605e-07, + "logits/chosen": -2.839263439178467, + "logits/rejected": -2.80460786819458, + "logps/chosen": -324.3180847167969, + "logps/rejected": -178.81686401367188, + "loss": 0.445, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19470174610614777, + "rewards/margins": 1.5245707035064697, + "rewards/rejected": -1.329869031906128, + "step": 3094 + }, + { + "epoch": 0.36, + "learning_rate": 1.959147840337118e-07, + "logits/chosen": -3.0211663246154785, + "logits/rejected": -3.2875208854675293, + "logps/chosen": -258.13104248046875, + "logps/rejected": -175.353515625, + "loss": 0.5279, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3289003372192383, + "rewards/margins": 0.7249034643173218, + "rewards/rejected": -0.39600318670272827, + "step": 3095 + }, + { + "epoch": 0.36, + "learning_rate": 1.9587966756408753e-07, + "logits/chosen": -3.7746894359588623, + "logits/rejected": -3.701540470123291, + "logps/chosen": -87.59601593017578, + "logps/rejected": -114.17941284179688, + "loss": 0.4657, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27062538266181946, + "rewards/margins": 1.0857970714569092, + "rewards/rejected": -1.3564224243164062, + "step": 3096 + }, + { + "epoch": 0.36, + "learning_rate": 1.958445510944633e-07, + "logits/chosen": -2.119966983795166, + "logits/rejected": -2.189887762069702, + "logps/chosen": -334.360595703125, + "logps/rejected": -364.8055725097656, + "loss": 0.3091, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.369712233543396, + "rewards/margins": 1.367830514907837, + "rewards/rejected": -0.9981181621551514, + "step": 3097 + }, + { + "epoch": 0.36, + "learning_rate": 1.9580943462483904e-07, + "logits/chosen": -2.4852919578552246, + "logits/rejected": -2.3092923164367676, + "logps/chosen": -354.326171875, + "logps/rejected": -394.84991455078125, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19075548648834229, + "rewards/margins": 3.3216190338134766, + "rewards/rejected": -3.130863666534424, + "step": 3098 + }, + { + "epoch": 0.36, + "learning_rate": 1.9577431815521477e-07, + "logits/chosen": -2.86698579788208, + "logits/rejected": -2.7887377738952637, + "logps/chosen": -266.2457580566406, + "logps/rejected": -174.58917236328125, + "loss": 0.3057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1278223991394043, + "rewards/margins": 2.1043291091918945, + "rewards/rejected": -2.232151508331299, + "step": 3099 + }, + { + "epoch": 0.36, + "learning_rate": 1.9573920168559055e-07, + "logits/chosen": -3.0276951789855957, + "logits/rejected": -3.058997631072998, + "logps/chosen": -157.2912139892578, + "logps/rejected": -208.0201873779297, + "loss": 0.4033, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1304360032081604, + "rewards/margins": 1.4164899587631226, + "rewards/rejected": -1.2860538959503174, + "step": 3100 + }, + { + "epoch": 0.36, + "learning_rate": 1.9570408521596625e-07, + "logits/chosen": -3.5753822326660156, + "logits/rejected": -3.2642714977264404, + "logps/chosen": -123.91609191894531, + "logps/rejected": -142.77993774414062, + "loss": 0.5112, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6242440342903137, + "rewards/margins": 1.4839280843734741, + "rewards/rejected": -2.1081719398498535, + "step": 3101 + }, + { + "epoch": 0.36, + "learning_rate": 1.9566896874634203e-07, + "logits/chosen": -3.2596631050109863, + "logits/rejected": -3.2124130725860596, + "logps/chosen": -278.4791259765625, + "logps/rejected": -184.82432556152344, + "loss": 0.34, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3263780474662781, + "rewards/margins": 1.3903692960739136, + "rewards/rejected": -1.7167472839355469, + "step": 3102 + }, + { + "epoch": 0.36, + "learning_rate": 1.9563385227671778e-07, + "logits/chosen": -3.1468048095703125, + "logits/rejected": -2.9663121700286865, + "logps/chosen": -227.12924194335938, + "logps/rejected": -371.0093994140625, + "loss": 0.3283, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17177477478981018, + "rewards/margins": 1.9644711017608643, + "rewards/rejected": -1.7926963567733765, + "step": 3103 + }, + { + "epoch": 0.36, + "learning_rate": 1.955987358070935e-07, + "logits/chosen": -2.4530160427093506, + "logits/rejected": -2.5940890312194824, + "logps/chosen": -243.99659729003906, + "logps/rejected": -257.55047607421875, + "loss": 0.4068, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08054850995540619, + "rewards/margins": 1.1997195482254028, + "rewards/rejected": -1.2802680730819702, + "step": 3104 + }, + { + "epoch": 0.36, + "learning_rate": 1.9556361933746927e-07, + "logits/chosen": -3.117814064025879, + "logits/rejected": -3.1777310371398926, + "logps/chosen": -170.281005859375, + "logps/rejected": -217.00865173339844, + "loss": 0.3274, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2771066427230835, + "rewards/margins": 1.8005781173706055, + "rewards/rejected": -1.523471474647522, + "step": 3105 + }, + { + "epoch": 0.36, + "learning_rate": 1.9552850286784502e-07, + "logits/chosen": -2.468282461166382, + "logits/rejected": -2.6449267864227295, + "logps/chosen": -231.47715759277344, + "logps/rejected": -193.7936248779297, + "loss": 0.4764, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.026336491107940674, + "rewards/margins": 1.1085357666015625, + "rewards/rejected": -1.1348721981048584, + "step": 3106 + }, + { + "epoch": 0.36, + "learning_rate": 1.9549338639822075e-07, + "logits/chosen": -3.772731304168701, + "logits/rejected": -3.5305869579315186, + "logps/chosen": -189.17623901367188, + "logps/rejected": -265.4544677734375, + "loss": 0.5189, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6102039813995361, + "rewards/margins": 0.9858118295669556, + "rewards/rejected": -1.5960159301757812, + "step": 3107 + }, + { + "epoch": 0.36, + "learning_rate": 1.954582699285965e-07, + "logits/chosen": -2.8213677406311035, + "logits/rejected": -2.987582206726074, + "logps/chosen": -193.61802673339844, + "logps/rejected": -239.55477905273438, + "loss": 0.3603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10257388651371002, + "rewards/margins": 1.5944665670394897, + "rewards/rejected": -1.6970404386520386, + "step": 3108 + }, + { + "epoch": 0.36, + "learning_rate": 1.9542315345897223e-07, + "logits/chosen": -3.5685129165649414, + "logits/rejected": -3.624213933944702, + "logps/chosen": -114.85082244873047, + "logps/rejected": -187.01666259765625, + "loss": 0.3148, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01748856157064438, + "rewards/margins": 2.4503862857818604, + "rewards/rejected": -2.4328978061676025, + "step": 3109 + }, + { + "epoch": 0.36, + "learning_rate": 1.9538803698934798e-07, + "logits/chosen": -2.980478286743164, + "logits/rejected": -3.362539291381836, + "logps/chosen": -158.89834594726562, + "logps/rejected": -207.892578125, + "loss": 0.2631, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3749452233314514, + "rewards/margins": 3.1507649421691895, + "rewards/rejected": -2.775819778442383, + "step": 3110 + }, + { + "epoch": 0.36, + "learning_rate": 1.9535292051972376e-07, + "logits/chosen": -2.8349218368530273, + "logits/rejected": -2.8050320148468018, + "logps/chosen": -397.3338623046875, + "logps/rejected": -170.21737670898438, + "loss": 0.72, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6490936279296875, + "rewards/margins": 1.0060296058654785, + "rewards/rejected": -1.655123233795166, + "step": 3111 + }, + { + "epoch": 0.36, + "learning_rate": 1.953178040500995e-07, + "logits/chosen": -2.6955342292785645, + "logits/rejected": -2.5296480655670166, + "logps/chosen": -200.2220001220703, + "logps/rejected": -276.84991455078125, + "loss": 0.6075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2893408238887787, + "rewards/margins": 0.6289680004119873, + "rewards/rejected": -0.9183087944984436, + "step": 3112 + }, + { + "epoch": 0.36, + "learning_rate": 1.9528268758047524e-07, + "logits/chosen": -2.84305739402771, + "logits/rejected": -2.9804935455322266, + "logps/chosen": -269.8381652832031, + "logps/rejected": -260.12884521484375, + "loss": 0.5668, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3733377158641815, + "rewards/margins": 1.8349182605743408, + "rewards/rejected": -2.2082557678222656, + "step": 3113 + }, + { + "epoch": 0.36, + "learning_rate": 1.95247571110851e-07, + "logits/chosen": -3.008105754852295, + "logits/rejected": -3.419436454772949, + "logps/chosen": -168.17922973632812, + "logps/rejected": -127.64070129394531, + "loss": 0.4402, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20922866463661194, + "rewards/margins": 1.341770052909851, + "rewards/rejected": -1.1325414180755615, + "step": 3114 + }, + { + "epoch": 0.36, + "learning_rate": 1.9521245464122673e-07, + "logits/chosen": -2.782423973083496, + "logits/rejected": -2.849886655807495, + "logps/chosen": -264.4804382324219, + "logps/rejected": -356.5912780761719, + "loss": 0.3319, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4575228691101074, + "rewards/margins": 1.7113043069839478, + "rewards/rejected": -1.2537814378738403, + "step": 3115 + }, + { + "epoch": 0.36, + "learning_rate": 1.9517733817160248e-07, + "logits/chosen": -2.6934237480163574, + "logits/rejected": -2.544127941131592, + "logps/chosen": -239.30445861816406, + "logps/rejected": -247.96731567382812, + "loss": 0.3127, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08455897122621536, + "rewards/margins": 1.6608185768127441, + "rewards/rejected": -1.5762596130371094, + "step": 3116 + }, + { + "epoch": 0.36, + "learning_rate": 1.951422217019782e-07, + "logits/chosen": -3.177844524383545, + "logits/rejected": -3.2488391399383545, + "logps/chosen": -284.8919677734375, + "logps/rejected": -206.64675903320312, + "loss": 0.1555, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32208263874053955, + "rewards/margins": 2.090003728866577, + "rewards/rejected": -1.7679210901260376, + "step": 3117 + }, + { + "epoch": 0.36, + "learning_rate": 1.9510710523235396e-07, + "logits/chosen": -3.1049277782440186, + "logits/rejected": -2.7169547080993652, + "logps/chosen": -346.8677062988281, + "logps/rejected": -336.5568542480469, + "loss": 0.3661, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10120445489883423, + "rewards/margins": 1.9424926042556763, + "rewards/rejected": -1.8412880897521973, + "step": 3118 + }, + { + "epoch": 0.36, + "learning_rate": 1.9507198876272971e-07, + "logits/chosen": -3.155133008956909, + "logits/rejected": -3.333463668823242, + "logps/chosen": -246.06948852539062, + "logps/rejected": -306.27447509765625, + "loss": 0.2175, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.427146852016449, + "rewards/margins": 3.072503089904785, + "rewards/rejected": -2.6453564167022705, + "step": 3119 + }, + { + "epoch": 0.36, + "learning_rate": 1.9503687229310544e-07, + "logits/chosen": -2.935206890106201, + "logits/rejected": -2.903000593185425, + "logps/chosen": -282.84906005859375, + "logps/rejected": -199.02740478515625, + "loss": 0.6734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3533051013946533, + "rewards/margins": 0.7861587405204773, + "rewards/rejected": -1.1394637823104858, + "step": 3120 + }, + { + "epoch": 0.36, + "learning_rate": 1.950017558234812e-07, + "logits/chosen": -3.081047773361206, + "logits/rejected": -3.1772549152374268, + "logps/chosen": -341.4259948730469, + "logps/rejected": -377.9747009277344, + "loss": 0.1865, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6126015186309814, + "rewards/margins": 2.7089593410491943, + "rewards/rejected": -2.096357822418213, + "step": 3121 + }, + { + "epoch": 0.36, + "learning_rate": 1.9496663935385698e-07, + "logits/chosen": -3.3274025917053223, + "logits/rejected": -3.181914806365967, + "logps/chosen": -320.32696533203125, + "logps/rejected": -213.698974609375, + "loss": 0.3359, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6988868713378906, + "rewards/margins": 2.239280939102173, + "rewards/rejected": -1.5403938293457031, + "step": 3122 + }, + { + "epoch": 0.36, + "learning_rate": 1.949315228842327e-07, + "logits/chosen": -2.8600902557373047, + "logits/rejected": -3.1306629180908203, + "logps/chosen": -203.5533447265625, + "logps/rejected": -228.5997772216797, + "loss": 0.2809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12238869816064835, + "rewards/margins": 1.7218592166900635, + "rewards/rejected": -1.8442476987838745, + "step": 3123 + }, + { + "epoch": 0.36, + "learning_rate": 1.9489640641460846e-07, + "logits/chosen": -1.9985876083374023, + "logits/rejected": -2.338883876800537, + "logps/chosen": -321.4471740722656, + "logps/rejected": -234.20289611816406, + "loss": 0.224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09489896893501282, + "rewards/margins": 1.5055862665176392, + "rewards/rejected": -1.4106873273849487, + "step": 3124 + }, + { + "epoch": 0.36, + "learning_rate": 1.9486128994498418e-07, + "logits/chosen": -3.3073911666870117, + "logits/rejected": -3.0754551887512207, + "logps/chosen": -303.9153747558594, + "logps/rejected": -255.05633544921875, + "loss": 0.2527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17883943021297455, + "rewards/margins": 2.0495049953460693, + "rewards/rejected": -2.228344440460205, + "step": 3125 + }, + { + "epoch": 0.36, + "learning_rate": 1.9482617347535994e-07, + "logits/chosen": -2.7084317207336426, + "logits/rejected": -2.904076337814331, + "logps/chosen": -286.6516418457031, + "logps/rejected": -288.10247802734375, + "loss": 0.541, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7445558309555054, + "rewards/margins": 1.1072702407836914, + "rewards/rejected": -1.8518260717391968, + "step": 3126 + }, + { + "epoch": 0.36, + "learning_rate": 1.947910570057357e-07, + "logits/chosen": -3.025954246520996, + "logits/rejected": -3.003652572631836, + "logps/chosen": -124.65135955810547, + "logps/rejected": -272.06390380859375, + "loss": 0.4778, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12165818363428116, + "rewards/margins": 1.2299613952636719, + "rewards/rejected": -1.3516194820404053, + "step": 3127 + }, + { + "epoch": 0.36, + "learning_rate": 1.9475594053611142e-07, + "logits/chosen": -3.5752532482147217, + "logits/rejected": -3.469583034515381, + "logps/chosen": -284.8741149902344, + "logps/rejected": -279.1533203125, + "loss": 0.7118, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0186142735183239, + "rewards/margins": 0.5026312470436096, + "rewards/rejected": -0.521245539188385, + "step": 3128 + }, + { + "epoch": 0.36, + "learning_rate": 1.9472082406648717e-07, + "logits/chosen": -1.9693684577941895, + "logits/rejected": -1.7743173837661743, + "logps/chosen": -362.24920654296875, + "logps/rejected": -359.17877197265625, + "loss": 0.5973, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6720864772796631, + "rewards/margins": 0.8427870273590088, + "rewards/rejected": -1.5148735046386719, + "step": 3129 + }, + { + "epoch": 0.36, + "learning_rate": 1.9468570759686293e-07, + "logits/chosen": -2.61252498626709, + "logits/rejected": -2.764272451400757, + "logps/chosen": -289.86468505859375, + "logps/rejected": -241.3604278564453, + "loss": 0.5395, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5666022300720215, + "rewards/margins": 0.7167096734046936, + "rewards/rejected": -1.2833119630813599, + "step": 3130 + }, + { + "epoch": 0.36, + "learning_rate": 1.9465059112723865e-07, + "logits/chosen": -3.1690239906311035, + "logits/rejected": -3.3270647525787354, + "logps/chosen": -177.7681121826172, + "logps/rejected": -356.51983642578125, + "loss": 0.2839, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19588251411914825, + "rewards/margins": 2.389726400375366, + "rewards/rejected": -2.1938438415527344, + "step": 3131 + }, + { + "epoch": 0.36, + "learning_rate": 1.946154746576144e-07, + "logits/chosen": -3.2140862941741943, + "logits/rejected": -3.5098836421966553, + "logps/chosen": -195.50234985351562, + "logps/rejected": -327.5115661621094, + "loss": 0.3091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4320847690105438, + "rewards/margins": 2.2779173851013184, + "rewards/rejected": -2.7100021839141846, + "step": 3132 + }, + { + "epoch": 0.36, + "learning_rate": 1.9458035818799014e-07, + "logits/chosen": -3.3292362689971924, + "logits/rejected": -3.2177438735961914, + "logps/chosen": -160.10154724121094, + "logps/rejected": -196.58212280273438, + "loss": 1.3432, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.791459560394287, + "rewards/margins": -0.3153592348098755, + "rewards/rejected": -1.4761004447937012, + "step": 3133 + }, + { + "epoch": 0.36, + "learning_rate": 1.9454524171836592e-07, + "logits/chosen": -2.882235050201416, + "logits/rejected": -2.893958806991577, + "logps/chosen": -258.2135009765625, + "logps/rejected": -222.9864044189453, + "loss": 0.6598, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15996044874191284, + "rewards/margins": 1.0880732536315918, + "rewards/rejected": -1.2480336427688599, + "step": 3134 + }, + { + "epoch": 0.36, + "learning_rate": 1.9451012524874167e-07, + "logits/chosen": -2.6734280586242676, + "logits/rejected": -2.736083507537842, + "logps/chosen": -314.3001708984375, + "logps/rejected": -264.7532653808594, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44309449195861816, + "rewards/margins": 2.706721544265747, + "rewards/rejected": -2.26362681388855, + "step": 3135 + }, + { + "epoch": 0.36, + "learning_rate": 1.944750087791174e-07, + "logits/chosen": -3.4513635635375977, + "logits/rejected": -3.6545827388763428, + "logps/chosen": -214.97882080078125, + "logps/rejected": -235.15771484375, + "loss": 0.274, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.026938125491142273, + "rewards/margins": 2.167647361755371, + "rewards/rejected": -2.1945855617523193, + "step": 3136 + }, + { + "epoch": 0.36, + "learning_rate": 1.9443989230949315e-07, + "logits/chosen": -2.933361530303955, + "logits/rejected": -2.754458427429199, + "logps/chosen": -197.4517364501953, + "logps/rejected": -224.07888793945312, + "loss": 0.273, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4978392422199249, + "rewards/margins": 1.8580803871154785, + "rewards/rejected": -1.360241174697876, + "step": 3137 + }, + { + "epoch": 0.36, + "learning_rate": 1.9440477583986888e-07, + "logits/chosen": -2.9276881217956543, + "logits/rejected": -3.0481343269348145, + "logps/chosen": -320.54998779296875, + "logps/rejected": -163.0902557373047, + "loss": 0.3918, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17877639830112457, + "rewards/margins": 0.9948813319206238, + "rewards/rejected": -1.173657774925232, + "step": 3138 + }, + { + "epoch": 0.36, + "learning_rate": 1.9436965937024463e-07, + "logits/chosen": -4.334774494171143, + "logits/rejected": -3.926363945007324, + "logps/chosen": -255.1059112548828, + "logps/rejected": -215.308349609375, + "loss": 0.482, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2157684564590454, + "rewards/margins": 1.1830332279205322, + "rewards/rejected": -1.398801565170288, + "step": 3139 + }, + { + "epoch": 0.36, + "learning_rate": 1.9433454290062039e-07, + "logits/chosen": -2.6784138679504395, + "logits/rejected": -2.4433908462524414, + "logps/chosen": -345.3836975097656, + "logps/rejected": -345.91033935546875, + "loss": 0.5368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.613369345664978, + "rewards/margins": 1.7982168197631836, + "rewards/rejected": -2.411586284637451, + "step": 3140 + }, + { + "epoch": 0.36, + "learning_rate": 1.9429942643099611e-07, + "logits/chosen": -2.617370128631592, + "logits/rejected": -2.732225179672241, + "logps/chosen": -253.665283203125, + "logps/rejected": -193.13133239746094, + "loss": 0.2801, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04861985146999359, + "rewards/margins": 1.7302783727645874, + "rewards/rejected": -1.681658387184143, + "step": 3141 + }, + { + "epoch": 0.36, + "learning_rate": 1.9426430996137187e-07, + "logits/chosen": -3.260280132293701, + "logits/rejected": -3.184082508087158, + "logps/chosen": -316.1820373535156, + "logps/rejected": -195.85411071777344, + "loss": 0.5302, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5305548906326294, + "rewards/margins": 1.2422001361846924, + "rewards/rejected": -1.7727551460266113, + "step": 3142 + }, + { + "epoch": 0.36, + "learning_rate": 1.9422919349174762e-07, + "logits/chosen": -3.3487188816070557, + "logits/rejected": -3.4055697917938232, + "logps/chosen": -252.9063720703125, + "logps/rejected": -308.38653564453125, + "loss": 0.3402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11965171247720718, + "rewards/margins": 1.3292714357376099, + "rewards/rejected": -1.448923110961914, + "step": 3143 + }, + { + "epoch": 0.36, + "learning_rate": 1.9419407702212335e-07, + "logits/chosen": -2.840327739715576, + "logits/rejected": -2.553577423095703, + "logps/chosen": -277.79779052734375, + "logps/rejected": -272.0838317871094, + "loss": 0.7271, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07231229543685913, + "rewards/margins": 0.6255306005477905, + "rewards/rejected": -0.6978428959846497, + "step": 3144 + }, + { + "epoch": 0.36, + "learning_rate": 1.9415896055249913e-07, + "logits/chosen": -3.9872355461120605, + "logits/rejected": -4.02241325378418, + "logps/chosen": -267.6069030761719, + "logps/rejected": -216.90277099609375, + "loss": 0.4144, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31662413477897644, + "rewards/margins": 1.290988564491272, + "rewards/rejected": -1.6076128482818604, + "step": 3145 + }, + { + "epoch": 0.36, + "learning_rate": 1.9412384408287486e-07, + "logits/chosen": -2.714205265045166, + "logits/rejected": -2.5255908966064453, + "logps/chosen": -336.0819396972656, + "logps/rejected": -185.05406188964844, + "loss": 0.7168, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21295166015625, + "rewards/margins": 0.35085803270339966, + "rewards/rejected": -0.5638096928596497, + "step": 3146 + }, + { + "epoch": 0.36, + "learning_rate": 1.940887276132506e-07, + "logits/chosen": -2.165637493133545, + "logits/rejected": -2.294268846511841, + "logps/chosen": -397.3463134765625, + "logps/rejected": -523.36279296875, + "loss": 0.6833, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35481566190719604, + "rewards/margins": 0.8367919325828552, + "rewards/rejected": -1.1916077136993408, + "step": 3147 + }, + { + "epoch": 0.36, + "learning_rate": 1.9405361114362636e-07, + "logits/chosen": -2.8628482818603516, + "logits/rejected": -2.515326499938965, + "logps/chosen": -417.503173828125, + "logps/rejected": -268.97613525390625, + "loss": 0.402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3466891348361969, + "rewards/margins": 1.3490190505981445, + "rewards/rejected": -1.6957082748413086, + "step": 3148 + }, + { + "epoch": 0.36, + "learning_rate": 1.940184946740021e-07, + "logits/chosen": -4.020050525665283, + "logits/rejected": -3.744053602218628, + "logps/chosen": -277.6413269042969, + "logps/rejected": -253.1767578125, + "loss": 0.2919, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4056010842323303, + "rewards/margins": 2.1107869148254395, + "rewards/rejected": -2.516387939453125, + "step": 3149 + }, + { + "epoch": 0.36, + "learning_rate": 1.9398337820437785e-07, + "logits/chosen": -2.723170042037964, + "logits/rejected": -2.948158025741577, + "logps/chosen": -256.3647155761719, + "logps/rejected": -292.14044189453125, + "loss": 0.2604, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15477971732616425, + "rewards/margins": 1.7812004089355469, + "rewards/rejected": -1.6264207363128662, + "step": 3150 + }, + { + "epoch": 0.36, + "learning_rate": 1.939482617347536e-07, + "logits/chosen": -3.267343521118164, + "logits/rejected": -3.414912700653076, + "logps/chosen": -303.08807373046875, + "logps/rejected": -327.85272216796875, + "loss": 0.371, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09791743755340576, + "rewards/margins": 2.0082571506500244, + "rewards/rejected": -1.910339593887329, + "step": 3151 + }, + { + "epoch": 0.36, + "learning_rate": 1.9391314526512933e-07, + "logits/chosen": -3.181365966796875, + "logits/rejected": -3.0855886936187744, + "logps/chosen": -278.80596923828125, + "logps/rejected": -187.27603149414062, + "loss": 0.2912, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1476357877254486, + "rewards/margins": 1.5447094440460205, + "rewards/rejected": -1.692345380783081, + "step": 3152 + }, + { + "epoch": 0.36, + "learning_rate": 1.9387802879550508e-07, + "logits/chosen": -2.430910587310791, + "logits/rejected": -2.4548614025115967, + "logps/chosen": -491.0486755371094, + "logps/rejected": -325.77783203125, + "loss": 0.8776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7084676623344421, + "rewards/margins": 0.39052680134773254, + "rewards/rejected": -1.098994493484497, + "step": 3153 + }, + { + "epoch": 0.36, + "learning_rate": 1.938429123258808e-07, + "logits/chosen": -3.0407025814056396, + "logits/rejected": -2.7807254791259766, + "logps/chosen": -291.38995361328125, + "logps/rejected": -177.69686889648438, + "loss": 0.3467, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2521824240684509, + "rewards/margins": 1.6012303829193115, + "rewards/rejected": -1.3490480184555054, + "step": 3154 + }, + { + "epoch": 0.36, + "learning_rate": 1.9380779585625656e-07, + "logits/chosen": -2.9747557640075684, + "logits/rejected": -2.933706283569336, + "logps/chosen": -241.34112548828125, + "logps/rejected": -180.63107299804688, + "loss": 0.2545, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09966646134853363, + "rewards/margins": 2.479768753051758, + "rewards/rejected": -2.3801023960113525, + "step": 3155 + }, + { + "epoch": 0.36, + "learning_rate": 1.9377267938663234e-07, + "logits/chosen": -2.959115505218506, + "logits/rejected": -3.2778992652893066, + "logps/chosen": -270.78936767578125, + "logps/rejected": -419.3648986816406, + "loss": 0.445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7614113092422485, + "rewards/margins": 1.7615286111831665, + "rewards/rejected": -2.522939920425415, + "step": 3156 + }, + { + "epoch": 0.36, + "learning_rate": 1.9373756291700807e-07, + "logits/chosen": -2.664050817489624, + "logits/rejected": -2.7092819213867188, + "logps/chosen": -196.298828125, + "logps/rejected": -324.38543701171875, + "loss": 0.3798, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13760371506214142, + "rewards/margins": 1.2768279314041138, + "rewards/rejected": -1.1392244100570679, + "step": 3157 + }, + { + "epoch": 0.36, + "learning_rate": 1.9370244644738382e-07, + "logits/chosen": -2.862964391708374, + "logits/rejected": -3.007152557373047, + "logps/chosen": -323.246826171875, + "logps/rejected": -166.35379028320312, + "loss": 0.4695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.010654762387275696, + "rewards/margins": 1.7380387783050537, + "rewards/rejected": -1.7486937046051025, + "step": 3158 + }, + { + "epoch": 0.36, + "learning_rate": 1.9366732997775958e-07, + "logits/chosen": -3.801161766052246, + "logits/rejected": -3.822605609893799, + "logps/chosen": -177.2981414794922, + "logps/rejected": -164.15139770507812, + "loss": 0.5274, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7948575019836426, + "rewards/margins": 0.8318825960159302, + "rewards/rejected": -1.6267402172088623, + "step": 3159 + }, + { + "epoch": 0.36, + "learning_rate": 1.936322135081353e-07, + "logits/chosen": -3.1773176193237305, + "logits/rejected": -2.96525239944458, + "logps/chosen": -365.242431640625, + "logps/rejected": -307.3572692871094, + "loss": 0.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0967455580830574, + "rewards/margins": 0.6376952528953552, + "rewards/rejected": -0.5409497618675232, + "step": 3160 + }, + { + "epoch": 0.36, + "learning_rate": 1.9359709703851106e-07, + "logits/chosen": -3.4648280143737793, + "logits/rejected": -3.7581849098205566, + "logps/chosen": -357.2449951171875, + "logps/rejected": -314.943115234375, + "loss": 0.3883, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.59961998462677, + "rewards/margins": 1.2466429471969604, + "rewards/rejected": -1.8462629318237305, + "step": 3161 + }, + { + "epoch": 0.36, + "learning_rate": 1.9356198056888679e-07, + "logits/chosen": -2.6313295364379883, + "logits/rejected": -2.792285442352295, + "logps/chosen": -135.06724548339844, + "logps/rejected": -217.89158630371094, + "loss": 0.6147, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0711510181427002, + "rewards/margins": 0.39882540702819824, + "rewards/rejected": -0.46997642517089844, + "step": 3162 + }, + { + "epoch": 0.36, + "learning_rate": 1.9352686409926254e-07, + "logits/chosen": -3.9241766929626465, + "logits/rejected": -3.8566627502441406, + "logps/chosen": -284.4398193359375, + "logps/rejected": -352.3221435546875, + "loss": 0.368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5513325929641724, + "rewards/margins": 2.3936877250671387, + "rewards/rejected": -2.9450201988220215, + "step": 3163 + }, + { + "epoch": 0.36, + "learning_rate": 1.934917476296383e-07, + "logits/chosen": -3.068753480911255, + "logits/rejected": -3.1875836849212646, + "logps/chosen": -301.783203125, + "logps/rejected": -233.98565673828125, + "loss": 0.2578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6921709775924683, + "rewards/margins": 1.975440502166748, + "rewards/rejected": -1.2832696437835693, + "step": 3164 + }, + { + "epoch": 0.36, + "learning_rate": 1.9345663116001402e-07, + "logits/chosen": -3.226672887802124, + "logits/rejected": -3.318633556365967, + "logps/chosen": -269.446533203125, + "logps/rejected": -286.0015869140625, + "loss": 0.2603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26339155435562134, + "rewards/margins": 1.6534026861190796, + "rewards/rejected": -1.9167943000793457, + "step": 3165 + }, + { + "epoch": 0.36, + "learning_rate": 1.9342151469038977e-07, + "logits/chosen": -3.206882953643799, + "logits/rejected": -3.2786176204681396, + "logps/chosen": -173.13641357421875, + "logps/rejected": -165.844482421875, + "loss": 0.3851, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.051384419202804565, + "rewards/margins": 1.1042113304138184, + "rewards/rejected": -1.052827000617981, + "step": 3166 + }, + { + "epoch": 0.37, + "learning_rate": 1.9338639822076556e-07, + "logits/chosen": -2.49845027923584, + "logits/rejected": -2.826868772506714, + "logps/chosen": -403.8288879394531, + "logps/rejected": -257.8197021484375, + "loss": 0.3645, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6386094093322754, + "rewards/margins": 1.3163683414459229, + "rewards/rejected": -0.6777588725090027, + "step": 3167 + }, + { + "epoch": 0.37, + "learning_rate": 1.9335128175114128e-07, + "logits/chosen": -3.104853868484497, + "logits/rejected": -3.050532102584839, + "logps/chosen": -351.00360107421875, + "logps/rejected": -203.71531677246094, + "loss": 0.1741, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30171069502830505, + "rewards/margins": 2.229830741882324, + "rewards/rejected": -1.9281201362609863, + "step": 3168 + }, + { + "epoch": 0.37, + "learning_rate": 1.9331616528151704e-07, + "logits/chosen": -2.3206558227539062, + "logits/rejected": -2.5209836959838867, + "logps/chosen": -318.3055725097656, + "logps/rejected": -270.863037109375, + "loss": 0.2066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.663703203201294, + "rewards/margins": 2.0448946952819824, + "rewards/rejected": -1.381191611289978, + "step": 3169 + }, + { + "epoch": 0.37, + "learning_rate": 1.9328104881189276e-07, + "logits/chosen": -2.5549418926239014, + "logits/rejected": -2.7501368522644043, + "logps/chosen": -204.6936798095703, + "logps/rejected": -335.27996826171875, + "loss": 0.5318, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3278293311595917, + "rewards/margins": 1.1896642446517944, + "rewards/rejected": -1.5174936056137085, + "step": 3170 + }, + { + "epoch": 0.37, + "learning_rate": 1.9324593234226852e-07, + "logits/chosen": -2.3297410011291504, + "logits/rejected": -2.4476966857910156, + "logps/chosen": -198.59457397460938, + "logps/rejected": -205.01426696777344, + "loss": 0.9792, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0319730043411255, + "rewards/margins": -0.0736834704875946, + "rewards/rejected": -0.958289623260498, + "step": 3171 + }, + { + "epoch": 0.37, + "learning_rate": 1.9321081587264427e-07, + "logits/chosen": -3.3408331871032715, + "logits/rejected": -3.54144287109375, + "logps/chosen": -156.85028076171875, + "logps/rejected": -168.27459716796875, + "loss": 0.557, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4681437611579895, + "rewards/margins": 0.7855842113494873, + "rewards/rejected": -1.2537280321121216, + "step": 3172 + }, + { + "epoch": 0.37, + "learning_rate": 1.9317569940302e-07, + "logits/chosen": -2.618577718734741, + "logits/rejected": -2.7099623680114746, + "logps/chosen": -251.7499237060547, + "logps/rejected": -211.65570068359375, + "loss": 0.2359, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5767526030540466, + "rewards/margins": 3.0526134967803955, + "rewards/rejected": -2.475860834121704, + "step": 3173 + }, + { + "epoch": 0.37, + "learning_rate": 1.9314058293339575e-07, + "logits/chosen": -2.828080177307129, + "logits/rejected": -2.793627977371216, + "logps/chosen": -323.42999267578125, + "logps/rejected": -282.9430236816406, + "loss": 0.3621, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18002460896968842, + "rewards/margins": 1.5425063371658325, + "rewards/rejected": -1.3624818325042725, + "step": 3174 + }, + { + "epoch": 0.37, + "learning_rate": 1.931054664637715e-07, + "logits/chosen": -2.6242740154266357, + "logits/rejected": -2.8956551551818848, + "logps/chosen": -252.06881713867188, + "logps/rejected": -245.21841430664062, + "loss": 0.2947, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3929525315761566, + "rewards/margins": 1.5374438762664795, + "rewards/rejected": -1.930396318435669, + "step": 3175 + }, + { + "epoch": 0.37, + "learning_rate": 1.9307034999414723e-07, + "logits/chosen": -3.0584704875946045, + "logits/rejected": -3.1872775554656982, + "logps/chosen": -237.08355712890625, + "logps/rejected": -160.3933868408203, + "loss": 0.5175, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09294568002223969, + "rewards/margins": 1.720481038093567, + "rewards/rejected": -1.8134267330169678, + "step": 3176 + }, + { + "epoch": 0.37, + "learning_rate": 1.93035233524523e-07, + "logits/chosen": -3.177743911743164, + "logits/rejected": -3.0853703022003174, + "logps/chosen": -126.09711456298828, + "logps/rejected": -126.9111557006836, + "loss": 0.6389, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13913212716579437, + "rewards/margins": 0.45784538984298706, + "rewards/rejected": -0.5969774723052979, + "step": 3177 + }, + { + "epoch": 0.37, + "learning_rate": 1.9300011705489872e-07, + "logits/chosen": -2.8288044929504395, + "logits/rejected": -2.655651092529297, + "logps/chosen": -429.07073974609375, + "logps/rejected": -254.28530883789062, + "loss": 0.1468, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5850380659103394, + "rewards/margins": 2.4018473625183105, + "rewards/rejected": -1.8168094158172607, + "step": 3178 + }, + { + "epoch": 0.37, + "learning_rate": 1.929650005852745e-07, + "logits/chosen": -3.537522077560425, + "logits/rejected": -3.495020866394043, + "logps/chosen": -222.0142364501953, + "logps/rejected": -217.59652709960938, + "loss": 0.8089, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0027263164520264, + "rewards/margins": 0.10398638248443604, + "rewards/rejected": -1.1067125797271729, + "step": 3179 + }, + { + "epoch": 0.37, + "learning_rate": 1.9292988411565025e-07, + "logits/chosen": -2.921863317489624, + "logits/rejected": -2.872988224029541, + "logps/chosen": -320.1180419921875, + "logps/rejected": -260.6642761230469, + "loss": 0.1433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29767608642578125, + "rewards/margins": 2.4440011978149414, + "rewards/rejected": -2.7416772842407227, + "step": 3180 + }, + { + "epoch": 0.37, + "learning_rate": 1.9289476764602598e-07, + "logits/chosen": -3.510861873626709, + "logits/rejected": -3.541411876678467, + "logps/chosen": -232.81268310546875, + "logps/rejected": -241.0210723876953, + "loss": 0.2038, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2568121552467346, + "rewards/margins": 2.2625627517700195, + "rewards/rejected": -2.0057506561279297, + "step": 3181 + }, + { + "epoch": 0.37, + "learning_rate": 1.9285965117640173e-07, + "logits/chosen": -3.2083821296691895, + "logits/rejected": -3.5233545303344727, + "logps/chosen": -157.1383056640625, + "logps/rejected": -169.42965698242188, + "loss": 0.4002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07776683568954468, + "rewards/margins": 1.0177593231201172, + "rewards/rejected": -1.095526099205017, + "step": 3182 + }, + { + "epoch": 0.37, + "learning_rate": 1.9282453470677748e-07, + "logits/chosen": -2.787046432495117, + "logits/rejected": -2.9413928985595703, + "logps/chosen": -157.8968505859375, + "logps/rejected": -239.31356811523438, + "loss": 0.3314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2068593055009842, + "rewards/margins": 1.6255970001220703, + "rewards/rejected": -1.418737769126892, + "step": 3183 + }, + { + "epoch": 0.37, + "learning_rate": 1.927894182371532e-07, + "logits/chosen": -2.4852118492126465, + "logits/rejected": -2.5678765773773193, + "logps/chosen": -251.90386962890625, + "logps/rejected": -297.1863708496094, + "loss": 0.2944, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08986397087574005, + "rewards/margins": 1.9219286441802979, + "rewards/rejected": -2.0117926597595215, + "step": 3184 + }, + { + "epoch": 0.37, + "learning_rate": 1.9275430176752897e-07, + "logits/chosen": -2.8091835975646973, + "logits/rejected": -3.001106023788452, + "logps/chosen": -269.08966064453125, + "logps/rejected": -262.691162109375, + "loss": 0.2466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13353192806243896, + "rewards/margins": 2.324127674102783, + "rewards/rejected": -2.1905956268310547, + "step": 3185 + }, + { + "epoch": 0.37, + "learning_rate": 1.927191852979047e-07, + "logits/chosen": -2.685307025909424, + "logits/rejected": -3.134504795074463, + "logps/chosen": -197.0176544189453, + "logps/rejected": -163.26231384277344, + "loss": 0.6256, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17461934685707092, + "rewards/margins": 0.9495357275009155, + "rewards/rejected": -0.7749163508415222, + "step": 3186 + }, + { + "epoch": 0.37, + "learning_rate": 1.9268406882828045e-07, + "logits/chosen": -3.203638792037964, + "logits/rejected": -2.848695993423462, + "logps/chosen": -126.86032104492188, + "logps/rejected": -203.45050048828125, + "loss": 0.3792, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7413145899772644, + "rewards/margins": 0.9371476769447327, + "rewards/rejected": -1.6784621477127075, + "step": 3187 + }, + { + "epoch": 0.37, + "learning_rate": 1.926489523586562e-07, + "logits/chosen": -2.9424662590026855, + "logits/rejected": -3.1661322116851807, + "logps/chosen": -183.78929138183594, + "logps/rejected": -233.72328186035156, + "loss": 0.2321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22609272599220276, + "rewards/margins": 2.498617172241211, + "rewards/rejected": -2.272524356842041, + "step": 3188 + }, + { + "epoch": 0.37, + "learning_rate": 1.9261383588903193e-07, + "logits/chosen": -3.0470893383026123, + "logits/rejected": -3.0329556465148926, + "logps/chosen": -158.0448760986328, + "logps/rejected": -215.769287109375, + "loss": 0.3073, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.4080276489257812e-05, + "rewards/margins": 2.050638198852539, + "rewards/rejected": -2.0506138801574707, + "step": 3189 + }, + { + "epoch": 0.37, + "learning_rate": 1.925787194194077e-07, + "logits/chosen": -3.0375590324401855, + "logits/rejected": -3.0227859020233154, + "logps/chosen": -310.0751647949219, + "logps/rejected": -328.383544921875, + "loss": 0.6103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19008426368236542, + "rewards/margins": 0.7001814246177673, + "rewards/rejected": -0.8902656435966492, + "step": 3190 + }, + { + "epoch": 0.37, + "learning_rate": 1.9254360294978344e-07, + "logits/chosen": -2.3290019035339355, + "logits/rejected": -2.441476345062256, + "logps/chosen": -296.9434814453125, + "logps/rejected": -315.8758544921875, + "loss": 0.4435, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3532309830188751, + "rewards/margins": 1.5018521547317505, + "rewards/rejected": -1.1486213207244873, + "step": 3191 + }, + { + "epoch": 0.37, + "learning_rate": 1.925084864801592e-07, + "logits/chosen": -2.2796630859375, + "logits/rejected": -2.489521026611328, + "logps/chosen": -396.0833435058594, + "logps/rejected": -218.4396209716797, + "loss": 0.4627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13185662031173706, + "rewards/margins": 1.1018154621124268, + "rewards/rejected": -1.233672022819519, + "step": 3192 + }, + { + "epoch": 0.37, + "learning_rate": 1.9247337001053494e-07, + "logits/chosen": -3.558927297592163, + "logits/rejected": -3.430616855621338, + "logps/chosen": -221.1109161376953, + "logps/rejected": -241.5007781982422, + "loss": 0.1454, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23434436321258545, + "rewards/margins": 4.832926273345947, + "rewards/rejected": -4.598581790924072, + "step": 3193 + }, + { + "epoch": 0.37, + "learning_rate": 1.9243825354091067e-07, + "logits/chosen": -3.2861223220825195, + "logits/rejected": -3.1551432609558105, + "logps/chosen": -99.10779571533203, + "logps/rejected": -114.55242156982422, + "loss": 0.3638, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14754851162433624, + "rewards/margins": 1.79312264919281, + "rewards/rejected": -1.6455740928649902, + "step": 3194 + }, + { + "epoch": 0.37, + "learning_rate": 1.9240313707128642e-07, + "logits/chosen": -2.242636203765869, + "logits/rejected": -2.3370909690856934, + "logps/chosen": -451.64892578125, + "logps/rejected": -248.69992065429688, + "loss": 0.2934, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08344341814517975, + "rewards/margins": 1.6113696098327637, + "rewards/rejected": -1.5279263257980347, + "step": 3195 + }, + { + "epoch": 0.37, + "learning_rate": 1.9236802060166218e-07, + "logits/chosen": -3.7533931732177734, + "logits/rejected": -3.269853115081787, + "logps/chosen": -384.52545166015625, + "logps/rejected": -331.7982177734375, + "loss": 0.3781, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.025455381721258163, + "rewards/margins": 1.0872496366500854, + "rewards/rejected": -1.0617942810058594, + "step": 3196 + }, + { + "epoch": 0.37, + "learning_rate": 1.923329041320379e-07, + "logits/chosen": -3.019721746444702, + "logits/rejected": -3.1479649543762207, + "logps/chosen": -210.34909057617188, + "logps/rejected": -228.738525390625, + "loss": 0.3425, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15386933088302612, + "rewards/margins": 1.859704613685608, + "rewards/rejected": -1.7058351039886475, + "step": 3197 + }, + { + "epoch": 0.37, + "learning_rate": 1.9229778766241366e-07, + "logits/chosen": -2.7576911449432373, + "logits/rejected": -2.8671483993530273, + "logps/chosen": -296.6002502441406, + "logps/rejected": -192.82598876953125, + "loss": 0.3087, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21972621977329254, + "rewards/margins": 1.6246566772460938, + "rewards/rejected": -1.4049303531646729, + "step": 3198 + }, + { + "epoch": 0.37, + "learning_rate": 1.922626711927894e-07, + "logits/chosen": -3.7675492763519287, + "logits/rejected": -3.9964122772216797, + "logps/chosen": -176.26800537109375, + "logps/rejected": -227.17747497558594, + "loss": 0.2093, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31062066555023193, + "rewards/margins": 2.37013578414917, + "rewards/rejected": -2.0595154762268066, + "step": 3199 + }, + { + "epoch": 0.37, + "learning_rate": 1.9222755472316514e-07, + "logits/chosen": -3.2625598907470703, + "logits/rejected": -3.0744194984436035, + "logps/chosen": -394.7248229980469, + "logps/rejected": -332.4236145019531, + "loss": 0.3813, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12685956060886383, + "rewards/margins": 1.0682594776153564, + "rewards/rejected": -1.1951191425323486, + "step": 3200 + }, + { + "epoch": 0.37, + "learning_rate": 1.9219243825354092e-07, + "logits/chosen": -3.2528815269470215, + "logits/rejected": -3.1706550121307373, + "logps/chosen": -351.19891357421875, + "logps/rejected": -313.12933349609375, + "loss": 0.4284, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.53228759765625, + "rewards/margins": 1.420660138130188, + "rewards/rejected": -1.9529476165771484, + "step": 3201 + }, + { + "epoch": 0.37, + "learning_rate": 1.9215732178391665e-07, + "logits/chosen": -3.1220457553863525, + "logits/rejected": -3.3700711727142334, + "logps/chosen": -251.88401794433594, + "logps/rejected": -196.0518798828125, + "loss": 0.4355, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6162028312683105, + "rewards/margins": 1.6659945249557495, + "rewards/rejected": -2.2821974754333496, + "step": 3202 + }, + { + "epoch": 0.37, + "learning_rate": 1.921222053142924e-07, + "logits/chosen": -3.1230826377868652, + "logits/rejected": -2.8391857147216797, + "logps/chosen": -381.2209777832031, + "logps/rejected": -312.4192810058594, + "loss": 0.2594, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07837282121181488, + "rewards/margins": 2.351102828979492, + "rewards/rejected": -2.2727298736572266, + "step": 3203 + }, + { + "epoch": 0.37, + "learning_rate": 1.9208708884466816e-07, + "logits/chosen": -3.2833290100097656, + "logits/rejected": -3.3133749961853027, + "logps/chosen": -214.62765502929688, + "logps/rejected": -201.61557006835938, + "loss": 0.5971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06315914541482925, + "rewards/margins": 1.0348143577575684, + "rewards/rejected": -1.0979735851287842, + "step": 3204 + }, + { + "epoch": 0.37, + "learning_rate": 1.9205197237504388e-07, + "logits/chosen": -2.581875801086426, + "logits/rejected": -2.335019111633301, + "logps/chosen": -262.3916015625, + "logps/rejected": -333.5521545410156, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2767653465270996e-05, + "rewards/margins": 0.36479470133781433, + "rewards/rejected": -0.3648275136947632, + "step": 3205 + }, + { + "epoch": 0.37, + "learning_rate": 1.9201685590541964e-07, + "logits/chosen": -2.985363245010376, + "logits/rejected": -2.973149538040161, + "logps/chosen": -403.019287109375, + "logps/rejected": -222.37109375, + "loss": 0.4155, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7481920719146729, + "rewards/margins": 1.3837943077087402, + "rewards/rejected": -2.131986141204834, + "step": 3206 + }, + { + "epoch": 0.37, + "learning_rate": 1.9198173943579537e-07, + "logits/chosen": -2.712921380996704, + "logits/rejected": -2.682893753051758, + "logps/chosen": -296.4322814941406, + "logps/rejected": -310.62030029296875, + "loss": 0.263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3677060306072235, + "rewards/margins": 2.4026906490325928, + "rewards/rejected": -2.7703967094421387, + "step": 3207 + }, + { + "epoch": 0.37, + "learning_rate": 1.9194662296617112e-07, + "logits/chosen": -3.2227590084075928, + "logits/rejected": -3.538888454437256, + "logps/chosen": -179.6095428466797, + "logps/rejected": -202.28724670410156, + "loss": 0.1674, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1451857089996338, + "rewards/margins": 2.2049641609191895, + "rewards/rejected": -2.0597784519195557, + "step": 3208 + }, + { + "epoch": 0.37, + "learning_rate": 1.9191150649654687e-07, + "logits/chosen": -2.466966390609741, + "logits/rejected": -2.716768980026245, + "logps/chosen": -297.9671630859375, + "logps/rejected": -233.03448486328125, + "loss": 0.4269, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23534274101257324, + "rewards/margins": 1.1087921857833862, + "rewards/rejected": -0.873449444770813, + "step": 3209 + }, + { + "epoch": 0.37, + "learning_rate": 1.918763900269226e-07, + "logits/chosen": -2.7078092098236084, + "logits/rejected": -2.448582649230957, + "logps/chosen": -291.03289794921875, + "logps/rejected": -255.4500732421875, + "loss": 0.273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10297869145870209, + "rewards/margins": 1.7684916257858276, + "rewards/rejected": -1.665513038635254, + "step": 3210 + }, + { + "epoch": 0.37, + "learning_rate": 1.9184127355729835e-07, + "logits/chosen": -3.6057567596435547, + "logits/rejected": -3.57731294631958, + "logps/chosen": -196.20156860351562, + "logps/rejected": -94.17867279052734, + "loss": 0.5884, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2778235077857971, + "rewards/margins": 0.842933714389801, + "rewards/rejected": -1.1207573413848877, + "step": 3211 + }, + { + "epoch": 0.37, + "learning_rate": 1.9180615708767413e-07, + "logits/chosen": -2.54561448097229, + "logits/rejected": -2.239021062850952, + "logps/chosen": -292.28204345703125, + "logps/rejected": -340.16009521484375, + "loss": 0.1683, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1486508995294571, + "rewards/margins": 2.8863699436187744, + "rewards/rejected": -2.7377190589904785, + "step": 3212 + }, + { + "epoch": 0.37, + "learning_rate": 1.9177104061804986e-07, + "logits/chosen": -3.4201903343200684, + "logits/rejected": -3.356454849243164, + "logps/chosen": -123.1936264038086, + "logps/rejected": -186.9005889892578, + "loss": 0.3934, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5036709904670715, + "rewards/margins": 1.2099732160568237, + "rewards/rejected": -1.71364426612854, + "step": 3213 + }, + { + "epoch": 0.37, + "learning_rate": 1.9173592414842562e-07, + "logits/chosen": -3.868863105773926, + "logits/rejected": -3.5191736221313477, + "logps/chosen": -274.5214538574219, + "logps/rejected": -279.2172546386719, + "loss": 0.4968, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4927408695220947, + "rewards/margins": 0.8367164731025696, + "rewards/rejected": -0.3439755439758301, + "step": 3214 + }, + { + "epoch": 0.37, + "learning_rate": 1.9170080767880134e-07, + "logits/chosen": -3.4469943046569824, + "logits/rejected": -3.4305765628814697, + "logps/chosen": -79.89260864257812, + "logps/rejected": -150.68055725097656, + "loss": 0.4172, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3113442659378052, + "rewards/margins": 1.696946144104004, + "rewards/rejected": -1.3856017589569092, + "step": 3215 + }, + { + "epoch": 0.37, + "learning_rate": 1.916656912091771e-07, + "logits/chosen": -3.140932083129883, + "logits/rejected": -3.209458827972412, + "logps/chosen": -453.2861633300781, + "logps/rejected": -381.22442626953125, + "loss": 0.1949, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42390185594558716, + "rewards/margins": 2.575244903564453, + "rewards/rejected": -2.1513431072235107, + "step": 3216 + }, + { + "epoch": 0.37, + "learning_rate": 1.9163057473955285e-07, + "logits/chosen": -2.546959161758423, + "logits/rejected": -2.6946377754211426, + "logps/chosen": -317.20208740234375, + "logps/rejected": -279.99334716796875, + "loss": 0.3014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1507071554660797, + "rewards/margins": 2.0866451263427734, + "rewards/rejected": -2.237352132797241, + "step": 3217 + }, + { + "epoch": 0.37, + "learning_rate": 1.9159545826992858e-07, + "logits/chosen": -3.326563835144043, + "logits/rejected": -3.102933883666992, + "logps/chosen": -295.2079772949219, + "logps/rejected": -267.52801513671875, + "loss": 0.4642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4555734395980835, + "rewards/margins": 1.3516632318496704, + "rewards/rejected": -1.8072367906570435, + "step": 3218 + }, + { + "epoch": 0.37, + "learning_rate": 1.9156034180030433e-07, + "logits/chosen": -3.2117018699645996, + "logits/rejected": -2.8602612018585205, + "logps/chosen": -407.1167907714844, + "logps/rejected": -316.2675476074219, + "loss": 0.3929, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6173150539398193, + "rewards/margins": 1.251056432723999, + "rewards/rejected": -1.8683714866638184, + "step": 3219 + }, + { + "epoch": 0.37, + "learning_rate": 1.9152522533068009e-07, + "logits/chosen": -3.4387295246124268, + "logits/rejected": -3.4039721488952637, + "logps/chosen": -483.8855895996094, + "logps/rejected": -264.09417724609375, + "loss": 0.2339, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.531195342540741, + "rewards/margins": 2.5248587131500244, + "rewards/rejected": -1.9936631917953491, + "step": 3220 + }, + { + "epoch": 0.37, + "learning_rate": 1.9149010886105581e-07, + "logits/chosen": -3.2351975440979004, + "logits/rejected": -3.429494619369507, + "logps/chosen": -119.4773178100586, + "logps/rejected": -181.87091064453125, + "loss": 0.3195, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08006696403026581, + "rewards/margins": 2.6682989597320557, + "rewards/rejected": -2.5882320404052734, + "step": 3221 + }, + { + "epoch": 0.37, + "learning_rate": 1.9145499239143157e-07, + "logits/chosen": -2.8644163608551025, + "logits/rejected": -2.9814255237579346, + "logps/chosen": -305.98089599609375, + "logps/rejected": -212.46640014648438, + "loss": 0.505, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2272948920726776, + "rewards/margins": 1.0894325971603394, + "rewards/rejected": -1.3167275190353394, + "step": 3222 + }, + { + "epoch": 0.37, + "learning_rate": 1.914198759218073e-07, + "logits/chosen": -3.14992618560791, + "logits/rejected": -3.4204845428466797, + "logps/chosen": -352.255126953125, + "logps/rejected": -361.21490478515625, + "loss": 0.5094, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2552977204322815, + "rewards/margins": 0.9476374387741089, + "rewards/rejected": -0.6923396587371826, + "step": 3223 + }, + { + "epoch": 0.37, + "learning_rate": 1.9138475945218307e-07, + "logits/chosen": -2.758504629135132, + "logits/rejected": -2.6833691596984863, + "logps/chosen": -588.2627563476562, + "logps/rejected": -384.4261169433594, + "loss": 0.3157, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1297607421875, + "rewards/margins": 1.599515438079834, + "rewards/rejected": -1.4697548151016235, + "step": 3224 + }, + { + "epoch": 0.37, + "learning_rate": 1.9134964298255883e-07, + "logits/chosen": -3.8217179775238037, + "logits/rejected": -3.8644299507141113, + "logps/chosen": -186.1505126953125, + "logps/rejected": -173.45523071289062, + "loss": 0.4233, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18376606702804565, + "rewards/margins": 1.1243703365325928, + "rewards/rejected": -1.3081363439559937, + "step": 3225 + }, + { + "epoch": 0.37, + "learning_rate": 1.9131452651293456e-07, + "logits/chosen": -3.2060937881469727, + "logits/rejected": -3.176862955093384, + "logps/chosen": -286.36358642578125, + "logps/rejected": -277.86297607421875, + "loss": 0.7765, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6222411394119263, + "rewards/margins": 0.7616034746170044, + "rewards/rejected": -1.3838446140289307, + "step": 3226 + }, + { + "epoch": 0.37, + "learning_rate": 1.912794100433103e-07, + "logits/chosen": -2.6943438053131104, + "logits/rejected": -2.6788463592529297, + "logps/chosen": -406.9605712890625, + "logps/rejected": -218.1900634765625, + "loss": 0.384, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1339031159877777, + "rewards/margins": 1.5504209995269775, + "rewards/rejected": -1.416517972946167, + "step": 3227 + }, + { + "epoch": 0.37, + "learning_rate": 1.9124429357368606e-07, + "logits/chosen": -3.1584818363189697, + "logits/rejected": -3.2305140495300293, + "logps/chosen": -276.3624572753906, + "logps/rejected": -271.6645202636719, + "loss": 0.3265, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3623605966567993, + "rewards/margins": 2.2047200202941895, + "rewards/rejected": -2.5670809745788574, + "step": 3228 + }, + { + "epoch": 0.37, + "learning_rate": 1.912091771040618e-07, + "logits/chosen": -3.479188919067383, + "logits/rejected": -3.4197165966033936, + "logps/chosen": -142.38247680664062, + "logps/rejected": -176.1259765625, + "loss": 0.4916, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43982207775115967, + "rewards/margins": 2.0081262588500977, + "rewards/rejected": -2.447948455810547, + "step": 3229 + }, + { + "epoch": 0.37, + "learning_rate": 1.9117406063443755e-07, + "logits/chosen": -3.4238433837890625, + "logits/rejected": -2.7327935695648193, + "logps/chosen": -302.55841064453125, + "logps/rejected": -130.51016235351562, + "loss": 0.4101, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17379309237003326, + "rewards/margins": 1.0282922983169556, + "rewards/rejected": -1.2020853757858276, + "step": 3230 + }, + { + "epoch": 0.37, + "learning_rate": 1.9113894416481327e-07, + "logits/chosen": -2.588366746902466, + "logits/rejected": -2.4745826721191406, + "logps/chosen": -284.635498046875, + "logps/rejected": -244.81593322753906, + "loss": 0.516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29966822266578674, + "rewards/margins": 1.046379566192627, + "rewards/rejected": -1.3460476398468018, + "step": 3231 + }, + { + "epoch": 0.37, + "learning_rate": 1.9110382769518903e-07, + "logits/chosen": -3.3810057640075684, + "logits/rejected": -3.381730794906616, + "logps/chosen": -173.1231689453125, + "logps/rejected": -182.25955200195312, + "loss": 0.6605, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.060542285442352295, + "rewards/margins": 0.6775486469268799, + "rewards/rejected": -0.7380909323692322, + "step": 3232 + }, + { + "epoch": 0.37, + "learning_rate": 1.910687112255648e-07, + "logits/chosen": -3.0645012855529785, + "logits/rejected": -3.495469093322754, + "logps/chosen": -328.2294006347656, + "logps/rejected": -399.32452392578125, + "loss": 0.3123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2017148733139038, + "rewards/margins": 1.4950721263885498, + "rewards/rejected": -1.6967869997024536, + "step": 3233 + }, + { + "epoch": 0.37, + "learning_rate": 1.910335947559405e-07, + "logits/chosen": -3.646066904067993, + "logits/rejected": -3.6076340675354004, + "logps/chosen": -120.30828857421875, + "logps/rejected": -193.1534881591797, + "loss": 0.2496, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33989855647087097, + "rewards/margins": 2.6305253505706787, + "rewards/rejected": -2.2906270027160645, + "step": 3234 + }, + { + "epoch": 0.37, + "learning_rate": 1.909984782863163e-07, + "logits/chosen": -3.2925965785980225, + "logits/rejected": -2.952096462249756, + "logps/chosen": -303.0674743652344, + "logps/rejected": -432.8258056640625, + "loss": 0.2326, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15167789161205292, + "rewards/margins": 1.9019863605499268, + "rewards/rejected": -1.7503085136413574, + "step": 3235 + }, + { + "epoch": 0.37, + "learning_rate": 1.9096336181669202e-07, + "logits/chosen": -2.8985912799835205, + "logits/rejected": -2.9279794692993164, + "logps/chosen": -149.3248291015625, + "logps/rejected": -259.2939453125, + "loss": 0.3726, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5507509708404541, + "rewards/margins": 2.6153724193573, + "rewards/rejected": -3.166123390197754, + "step": 3236 + }, + { + "epoch": 0.37, + "learning_rate": 1.9092824534706777e-07, + "logits/chosen": -2.7447702884674072, + "logits/rejected": -2.529634714126587, + "logps/chosen": -372.3981018066406, + "logps/rejected": -370.15618896484375, + "loss": 0.2146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18551182746887207, + "rewards/margins": 1.5035548210144043, + "rewards/rejected": -1.6890666484832764, + "step": 3237 + }, + { + "epoch": 0.37, + "learning_rate": 1.9089312887744352e-07, + "logits/chosen": -3.1495883464813232, + "logits/rejected": -3.0153558254241943, + "logps/chosen": -330.2779541015625, + "logps/rejected": -249.9334716796875, + "loss": 0.2788, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2555235028266907, + "rewards/margins": 1.4679946899414062, + "rewards/rejected": -1.7235183715820312, + "step": 3238 + }, + { + "epoch": 0.37, + "learning_rate": 1.9085801240781925e-07, + "logits/chosen": -2.6077158451080322, + "logits/rejected": -2.6612613201141357, + "logps/chosen": -169.96072387695312, + "logps/rejected": -207.2468719482422, + "loss": 0.488, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10723993182182312, + "rewards/margins": 0.9653011560440063, + "rewards/rejected": -1.0725409984588623, + "step": 3239 + }, + { + "epoch": 0.37, + "learning_rate": 1.90822895938195e-07, + "logits/chosen": -2.372718572616577, + "logits/rejected": -2.3203091621398926, + "logps/chosen": -183.3533172607422, + "logps/rejected": -325.9496154785156, + "loss": 0.2134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19972002506256104, + "rewards/margins": 2.1437699794769287, + "rewards/rejected": -2.3434898853302, + "step": 3240 + }, + { + "epoch": 0.37, + "learning_rate": 1.9078777946857076e-07, + "logits/chosen": -3.1283442974090576, + "logits/rejected": -3.0133469104766846, + "logps/chosen": -245.3660430908203, + "logps/rejected": -273.9266662597656, + "loss": 0.6642, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6120586395263672, + "rewards/margins": 0.6677566766738892, + "rewards/rejected": -1.2798153162002563, + "step": 3241 + }, + { + "epoch": 0.37, + "learning_rate": 1.9075266299894649e-07, + "logits/chosen": -3.1479129791259766, + "logits/rejected": -3.1463518142700195, + "logps/chosen": -449.2508544921875, + "logps/rejected": -273.6101379394531, + "loss": 0.1929, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22827793657779694, + "rewards/margins": 2.315734624862671, + "rewards/rejected": -2.087456703186035, + "step": 3242 + }, + { + "epoch": 0.37, + "learning_rate": 1.9071754652932224e-07, + "logits/chosen": -3.03220534324646, + "logits/rejected": -3.182962417602539, + "logps/chosen": -253.18397521972656, + "logps/rejected": -317.7624816894531, + "loss": 0.2952, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0026053711771965027, + "rewards/margins": 2.8622312545776367, + "rewards/rejected": -2.859625816345215, + "step": 3243 + }, + { + "epoch": 0.37, + "learning_rate": 1.9068243005969797e-07, + "logits/chosen": -2.888859510421753, + "logits/rejected": -3.1244959831237793, + "logps/chosen": -241.65017700195312, + "logps/rejected": -349.09881591796875, + "loss": 0.2942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.016743693500757217, + "rewards/margins": 3.190858840942383, + "rewards/rejected": -3.2076027393341064, + "step": 3244 + }, + { + "epoch": 0.37, + "learning_rate": 1.9064731359007372e-07, + "logits/chosen": -2.896139144897461, + "logits/rejected": -2.836839437484741, + "logps/chosen": -331.42730712890625, + "logps/rejected": -174.92251586914062, + "loss": 0.3901, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18573100864887238, + "rewards/margins": 1.3835103511810303, + "rewards/rejected": -1.5692414045333862, + "step": 3245 + }, + { + "epoch": 0.37, + "learning_rate": 1.906121971204495e-07, + "logits/chosen": -3.3594613075256348, + "logits/rejected": -3.548163414001465, + "logps/chosen": -112.77045440673828, + "logps/rejected": -197.39797973632812, + "loss": 0.2408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3001018166542053, + "rewards/margins": 1.817267894744873, + "rewards/rejected": -1.5171661376953125, + "step": 3246 + }, + { + "epoch": 0.37, + "learning_rate": 1.9057708065082523e-07, + "logits/chosen": -3.5678553581237793, + "logits/rejected": -3.436237335205078, + "logps/chosen": -267.3758544921875, + "logps/rejected": -264.5841369628906, + "loss": 0.6619, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5816099047660828, + "rewards/margins": 1.607773780822754, + "rewards/rejected": -2.1893835067749023, + "step": 3247 + }, + { + "epoch": 0.37, + "learning_rate": 1.9054196418120098e-07, + "logits/chosen": -2.9175918102264404, + "logits/rejected": -2.8050460815429688, + "logps/chosen": -126.16608428955078, + "logps/rejected": -160.1228790283203, + "loss": 0.3407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20938964188098907, + "rewards/margins": 1.9538230895996094, + "rewards/rejected": -2.163212776184082, + "step": 3248 + }, + { + "epoch": 0.37, + "learning_rate": 1.9050684771157674e-07, + "logits/chosen": -3.511631965637207, + "logits/rejected": -3.0988759994506836, + "logps/chosen": -118.99443054199219, + "logps/rejected": -158.79519653320312, + "loss": 0.5502, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25886887311935425, + "rewards/margins": 0.9568430185317993, + "rewards/rejected": -0.6979742050170898, + "step": 3249 + }, + { + "epoch": 0.37, + "learning_rate": 1.9047173124195246e-07, + "logits/chosen": -3.471848964691162, + "logits/rejected": -3.287520408630371, + "logps/chosen": -266.87237548828125, + "logps/rejected": -211.9257354736328, + "loss": 0.2189, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03501468151807785, + "rewards/margins": 1.7859731912612915, + "rewards/rejected": -1.7509586811065674, + "step": 3250 + }, + { + "epoch": 0.37, + "learning_rate": 1.9043661477232822e-07, + "logits/chosen": -3.8052945137023926, + "logits/rejected": -3.6344809532165527, + "logps/chosen": -230.63682556152344, + "logps/rejected": -255.5173797607422, + "loss": 0.2833, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2045193910598755, + "rewards/margins": 3.4252254962921143, + "rewards/rejected": -3.2207062244415283, + "step": 3251 + }, + { + "epoch": 0.37, + "learning_rate": 1.9040149830270394e-07, + "logits/chosen": -2.6787590980529785, + "logits/rejected": -2.616711139678955, + "logps/chosen": -348.7457275390625, + "logps/rejected": -270.74237060546875, + "loss": 0.2097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19256411492824554, + "rewards/margins": 1.8544222116470337, + "rewards/rejected": -2.0469861030578613, + "step": 3252 + }, + { + "epoch": 0.38, + "learning_rate": 1.903663818330797e-07, + "logits/chosen": -3.0587100982666016, + "logits/rejected": -3.1048736572265625, + "logps/chosen": -184.961669921875, + "logps/rejected": -290.9366149902344, + "loss": 0.2589, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23616443574428558, + "rewards/margins": 3.8599915504455566, + "rewards/rejected": -3.6238269805908203, + "step": 3253 + }, + { + "epoch": 0.38, + "learning_rate": 1.9033126536345545e-07, + "logits/chosen": -3.1983935832977295, + "logits/rejected": -3.4304795265197754, + "logps/chosen": -163.0640411376953, + "logps/rejected": -186.475341796875, + "loss": 0.658, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5146329998970032, + "rewards/margins": 1.099518060684204, + "rewards/rejected": -1.6141510009765625, + "step": 3254 + }, + { + "epoch": 0.38, + "learning_rate": 1.9029614889383118e-07, + "logits/chosen": -2.4606666564941406, + "logits/rejected": -2.5099620819091797, + "logps/chosen": -309.9217529296875, + "logps/rejected": -276.42474365234375, + "loss": 0.349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22629261016845703, + "rewards/margins": 1.2124695777893066, + "rewards/rejected": -1.4387621879577637, + "step": 3255 + }, + { + "epoch": 0.38, + "learning_rate": 1.9026103242420693e-07, + "logits/chosen": -3.728325128555298, + "logits/rejected": -3.4861292839050293, + "logps/chosen": -186.12265014648438, + "logps/rejected": -196.19287109375, + "loss": 0.4437, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6350020170211792, + "rewards/margins": 2.339303970336914, + "rewards/rejected": -2.974306106567383, + "step": 3256 + }, + { + "epoch": 0.38, + "learning_rate": 1.9022591595458271e-07, + "logits/chosen": -3.241154193878174, + "logits/rejected": -3.0962798595428467, + "logps/chosen": -413.33782958984375, + "logps/rejected": -260.85430908203125, + "loss": 0.1589, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6960663795471191, + "rewards/margins": 2.5299081802368164, + "rewards/rejected": -1.8338415622711182, + "step": 3257 + }, + { + "epoch": 0.38, + "learning_rate": 1.9019079948495844e-07, + "logits/chosen": -2.176119804382324, + "logits/rejected": -2.141174077987671, + "logps/chosen": -394.9962158203125, + "logps/rejected": -331.6242370605469, + "loss": 0.233, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6488301157951355, + "rewards/margins": 1.909548044204712, + "rewards/rejected": -1.2607176303863525, + "step": 3258 + }, + { + "epoch": 0.38, + "learning_rate": 1.901556830153342e-07, + "logits/chosen": -3.1232521533966064, + "logits/rejected": -3.1128177642822266, + "logps/chosen": -162.72921752929688, + "logps/rejected": -283.23638916015625, + "loss": 0.3529, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.015923619270324707, + "rewards/margins": 2.4470670223236084, + "rewards/rejected": -2.431143283843994, + "step": 3259 + }, + { + "epoch": 0.38, + "learning_rate": 1.9012056654570992e-07, + "logits/chosen": -3.2230913639068604, + "logits/rejected": -3.1792685985565186, + "logps/chosen": -159.42027282714844, + "logps/rejected": -262.39483642578125, + "loss": 0.5293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43209201097488403, + "rewards/margins": 0.7172941565513611, + "rewards/rejected": -1.1493862867355347, + "step": 3260 + }, + { + "epoch": 0.38, + "learning_rate": 1.9008545007608568e-07, + "logits/chosen": -3.58026385307312, + "logits/rejected": -3.494145154953003, + "logps/chosen": -169.50152587890625, + "logps/rejected": -193.07992553710938, + "loss": 0.2871, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.615800678730011, + "rewards/margins": 1.4299113750457764, + "rewards/rejected": -2.0457119941711426, + "step": 3261 + }, + { + "epoch": 0.38, + "learning_rate": 1.9005033360646143e-07, + "logits/chosen": -2.9540281295776367, + "logits/rejected": -3.2339887619018555, + "logps/chosen": -383.470947265625, + "logps/rejected": -237.59872436523438, + "loss": 0.259, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30963021516799927, + "rewards/margins": 2.005920171737671, + "rewards/rejected": -1.6962900161743164, + "step": 3262 + }, + { + "epoch": 0.38, + "learning_rate": 1.9001521713683716e-07, + "logits/chosen": -3.5184195041656494, + "logits/rejected": -3.2213165760040283, + "logps/chosen": -282.95556640625, + "logps/rejected": -244.77105712890625, + "loss": 0.5145, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09156637638807297, + "rewards/margins": 0.8782737851142883, + "rewards/rejected": -0.9698401689529419, + "step": 3263 + }, + { + "epoch": 0.38, + "learning_rate": 1.899801006672129e-07, + "logits/chosen": -2.4168806076049805, + "logits/rejected": -2.424252986907959, + "logps/chosen": -339.12725830078125, + "logps/rejected": -286.3655700683594, + "loss": 0.5886, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5175842046737671, + "rewards/margins": 0.7221792936325073, + "rewards/rejected": -1.2397634983062744, + "step": 3264 + }, + { + "epoch": 0.38, + "learning_rate": 1.8994498419758867e-07, + "logits/chosen": -2.749178409576416, + "logits/rejected": -2.38816499710083, + "logps/chosen": -572.865478515625, + "logps/rejected": -271.8323974609375, + "loss": 0.3617, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.689293384552002, + "rewards/margins": 1.9038830995559692, + "rewards/rejected": -1.2145897150039673, + "step": 3265 + }, + { + "epoch": 0.38, + "learning_rate": 1.899098677279644e-07, + "logits/chosen": -3.4367265701293945, + "logits/rejected": -3.70025634765625, + "logps/chosen": -63.923343658447266, + "logps/rejected": -142.77394104003906, + "loss": 0.3483, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06515637040138245, + "rewards/margins": 1.7900564670562744, + "rewards/rejected": -1.7249001264572144, + "step": 3266 + }, + { + "epoch": 0.38, + "learning_rate": 1.8987475125834017e-07, + "logits/chosen": -2.6999218463897705, + "logits/rejected": -2.6521661281585693, + "logps/chosen": -130.17149353027344, + "logps/rejected": -212.41921997070312, + "loss": 0.3818, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.001506030559539795, + "rewards/margins": 1.44063138961792, + "rewards/rejected": -1.4391252994537354, + "step": 3267 + }, + { + "epoch": 0.38, + "learning_rate": 1.8983963478871587e-07, + "logits/chosen": -3.3422892093658447, + "logits/rejected": -2.9832239151000977, + "logps/chosen": -389.35394287109375, + "logps/rejected": -366.3259582519531, + "loss": 0.2645, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19954347610473633, + "rewards/margins": 1.9503593444824219, + "rewards/rejected": -1.7508158683776855, + "step": 3268 + }, + { + "epoch": 0.38, + "learning_rate": 1.8980451831909165e-07, + "logits/chosen": -3.8020968437194824, + "logits/rejected": -3.816239356994629, + "logps/chosen": -239.0646209716797, + "logps/rejected": -233.32949829101562, + "loss": 0.3473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3341495990753174, + "rewards/margins": 1.4127821922302246, + "rewards/rejected": -1.0786325931549072, + "step": 3269 + }, + { + "epoch": 0.38, + "learning_rate": 1.897694018494674e-07, + "logits/chosen": -2.644350528717041, + "logits/rejected": -2.427365779876709, + "logps/chosen": -273.2784729003906, + "logps/rejected": -319.6897888183594, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2436281144618988, + "rewards/margins": 1.0179781913757324, + "rewards/rejected": -1.2616063356399536, + "step": 3270 + }, + { + "epoch": 0.38, + "learning_rate": 1.8973428537984314e-07, + "logits/chosen": -2.8985328674316406, + "logits/rejected": -2.7413225173950195, + "logps/chosen": -337.0899963378906, + "logps/rejected": -204.57928466796875, + "loss": 0.574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7751039862632751, + "rewards/margins": 0.9183584451675415, + "rewards/rejected": -1.6934623718261719, + "step": 3271 + }, + { + "epoch": 0.38, + "learning_rate": 1.896991689102189e-07, + "logits/chosen": -3.141148090362549, + "logits/rejected": -3.1746315956115723, + "logps/chosen": -262.5524597167969, + "logps/rejected": -167.91156005859375, + "loss": 0.4441, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8348259925842285, + "rewards/margins": 2.5769550800323486, + "rewards/rejected": -1.7421290874481201, + "step": 3272 + }, + { + "epoch": 0.38, + "learning_rate": 1.8966405244059464e-07, + "logits/chosen": -3.045548915863037, + "logits/rejected": -3.4743990898132324, + "logps/chosen": -204.1428680419922, + "logps/rejected": -284.0826721191406, + "loss": 0.8844, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05128922313451767, + "rewards/margins": 0.23392778635025024, + "rewards/rejected": -0.18263855576515198, + "step": 3273 + }, + { + "epoch": 0.38, + "learning_rate": 1.8962893597097037e-07, + "logits/chosen": -2.5927233695983887, + "logits/rejected": -2.7178752422332764, + "logps/chosen": -216.53982543945312, + "logps/rejected": -270.5825500488281, + "loss": 0.2049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11347056925296783, + "rewards/margins": 2.8802762031555176, + "rewards/rejected": -2.76680588722229, + "step": 3274 + }, + { + "epoch": 0.38, + "learning_rate": 1.8959381950134612e-07, + "logits/chosen": -3.5513343811035156, + "logits/rejected": -3.615835189819336, + "logps/chosen": -191.34625244140625, + "logps/rejected": -233.92471313476562, + "loss": 0.7685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7832320928573608, + "rewards/margins": 1.2523956298828125, + "rewards/rejected": -2.035627603530884, + "step": 3275 + }, + { + "epoch": 0.38, + "learning_rate": 1.8955870303172185e-07, + "logits/chosen": -2.82460880279541, + "logits/rejected": -2.5946860313415527, + "logps/chosen": -385.426025390625, + "logps/rejected": -193.66998291015625, + "loss": 0.4562, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.023731417953968048, + "rewards/margins": 1.501528024673462, + "rewards/rejected": -1.4777965545654297, + "step": 3276 + }, + { + "epoch": 0.38, + "learning_rate": 1.895235865620976e-07, + "logits/chosen": -2.7012245655059814, + "logits/rejected": -2.9006242752075195, + "logps/chosen": -313.27178955078125, + "logps/rejected": -230.33370971679688, + "loss": 0.2867, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48774468898773193, + "rewards/margins": 1.5161164999008179, + "rewards/rejected": -1.028371810913086, + "step": 3277 + }, + { + "epoch": 0.38, + "learning_rate": 1.8948847009247339e-07, + "logits/chosen": -3.764421224594116, + "logits/rejected": -4.064428329467773, + "logps/chosen": -162.78953552246094, + "logps/rejected": -212.28662109375, + "loss": 0.3659, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.533346951007843, + "rewards/margins": 1.8389813899993896, + "rewards/rejected": -1.3056344985961914, + "step": 3278 + }, + { + "epoch": 0.38, + "learning_rate": 1.894533536228491e-07, + "logits/chosen": -3.533937931060791, + "logits/rejected": -3.724313735961914, + "logps/chosen": -202.3952178955078, + "logps/rejected": -204.21546936035156, + "loss": 0.3533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0708017647266388, + "rewards/margins": 1.7498159408569336, + "rewards/rejected": -1.6790142059326172, + "step": 3279 + }, + { + "epoch": 0.38, + "learning_rate": 1.8941823715322487e-07, + "logits/chosen": -3.6773629188537598, + "logits/rejected": -3.9739575386047363, + "logps/chosen": -174.67254638671875, + "logps/rejected": -215.70721435546875, + "loss": 0.547, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20336365699768066, + "rewards/margins": 1.1152557134628296, + "rewards/rejected": -1.3186193704605103, + "step": 3280 + }, + { + "epoch": 0.38, + "learning_rate": 1.893831206836006e-07, + "logits/chosen": -2.8084702491760254, + "logits/rejected": -3.1349902153015137, + "logps/chosen": -168.08966064453125, + "logps/rejected": -272.7445983886719, + "loss": 0.2446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.193680539727211, + "rewards/margins": 1.890684723854065, + "rewards/rejected": -2.0843653678894043, + "step": 3281 + }, + { + "epoch": 0.38, + "learning_rate": 1.8934800421397635e-07, + "logits/chosen": -3.5066637992858887, + "logits/rejected": -3.101552724838257, + "logps/chosen": -569.9415893554688, + "logps/rejected": -285.5455017089844, + "loss": 1.45, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.009317398071289, + "rewards/margins": 0.7138168811798096, + "rewards/rejected": -1.7231343984603882, + "step": 3282 + }, + { + "epoch": 0.38, + "learning_rate": 1.893128877443521e-07, + "logits/chosen": -3.3261404037475586, + "logits/rejected": -3.154379367828369, + "logps/chosen": -425.2516784667969, + "logps/rejected": -282.50958251953125, + "loss": 0.334, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5665931105613708, + "rewards/margins": 1.9900599718093872, + "rewards/rejected": -1.4234668016433716, + "step": 3283 + }, + { + "epoch": 0.38, + "learning_rate": 1.8927777127472783e-07, + "logits/chosen": -3.350087881088257, + "logits/rejected": -3.349928855895996, + "logps/chosen": -190.12283325195312, + "logps/rejected": -154.02249145507812, + "loss": 0.8403, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8056511282920837, + "rewards/margins": 0.15024809539318085, + "rewards/rejected": -0.9558992385864258, + "step": 3284 + }, + { + "epoch": 0.38, + "learning_rate": 1.8924265480510358e-07, + "logits/chosen": -3.7102129459381104, + "logits/rejected": -3.347630739212036, + "logps/chosen": -202.17469787597656, + "logps/rejected": -193.7458038330078, + "loss": 0.4459, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030800748616456985, + "rewards/margins": 0.9983671307563782, + "rewards/rejected": -1.029167890548706, + "step": 3285 + }, + { + "epoch": 0.38, + "learning_rate": 1.8920753833547934e-07, + "logits/chosen": -3.7895452976226807, + "logits/rejected": -3.3033580780029297, + "logps/chosen": -367.5447082519531, + "logps/rejected": -319.93499755859375, + "loss": 0.2828, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19426314532756805, + "rewards/margins": 1.879339337348938, + "rewards/rejected": -1.685076117515564, + "step": 3286 + }, + { + "epoch": 0.38, + "learning_rate": 1.8917242186585506e-07, + "logits/chosen": -2.6906075477600098, + "logits/rejected": -2.460141658782959, + "logps/chosen": -288.1797180175781, + "logps/rejected": -218.65945434570312, + "loss": 0.2706, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3034425973892212, + "rewards/margins": 2.683164358139038, + "rewards/rejected": -2.3797216415405273, + "step": 3287 + }, + { + "epoch": 0.38, + "learning_rate": 1.8913730539623082e-07, + "logits/chosen": -3.032623767852783, + "logits/rejected": -3.402095079421997, + "logps/chosen": -375.28570556640625, + "logps/rejected": -320.8821716308594, + "loss": 0.5842, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09593716263771057, + "rewards/margins": 0.6738909482955933, + "rewards/rejected": -0.5779538154602051, + "step": 3288 + }, + { + "epoch": 0.38, + "learning_rate": 1.8910218892660655e-07, + "logits/chosen": -2.9447529315948486, + "logits/rejected": -2.9131171703338623, + "logps/chosen": -219.88800048828125, + "logps/rejected": -269.6672058105469, + "loss": 0.6481, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.861519455909729, + "rewards/margins": 0.6639772057533264, + "rewards/rejected": -1.5254967212677002, + "step": 3289 + }, + { + "epoch": 0.38, + "learning_rate": 1.890670724569823e-07, + "logits/chosen": -3.296943187713623, + "logits/rejected": -3.4018473625183105, + "logps/chosen": -467.2939147949219, + "logps/rejected": -227.11643981933594, + "loss": 0.2872, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03399544954299927, + "rewards/margins": 1.8533363342285156, + "rewards/rejected": -1.8193409442901611, + "step": 3290 + }, + { + "epoch": 0.38, + "learning_rate": 1.8903195598735808e-07, + "logits/chosen": -2.2892041206359863, + "logits/rejected": -2.437941074371338, + "logps/chosen": -474.3783264160156, + "logps/rejected": -494.00457763671875, + "loss": 0.5919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2495562732219696, + "rewards/margins": 1.4580421447753906, + "rewards/rejected": -1.7075984477996826, + "step": 3291 + }, + { + "epoch": 0.38, + "learning_rate": 1.889968395177338e-07, + "logits/chosen": -2.958592176437378, + "logits/rejected": -3.1123037338256836, + "logps/chosen": -283.4925231933594, + "logps/rejected": -296.1649169921875, + "loss": 0.4877, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5664748549461365, + "rewards/margins": 1.1059414148330688, + "rewards/rejected": -1.6724162101745605, + "step": 3292 + }, + { + "epoch": 0.38, + "learning_rate": 1.8896172304810956e-07, + "logits/chosen": -3.092756748199463, + "logits/rejected": -2.7519452571868896, + "logps/chosen": -398.3504943847656, + "logps/rejected": -432.1631164550781, + "loss": 0.4874, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11511875689029694, + "rewards/margins": 1.6519618034362793, + "rewards/rejected": -1.5368430614471436, + "step": 3293 + }, + { + "epoch": 0.38, + "learning_rate": 1.8892660657848532e-07, + "logits/chosen": -2.7783358097076416, + "logits/rejected": -3.1314947605133057, + "logps/chosen": -170.7704315185547, + "logps/rejected": -155.86134338378906, + "loss": 0.357, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6245697736740112, + "rewards/margins": 1.5900131464004517, + "rewards/rejected": -0.9654433131217957, + "step": 3294 + }, + { + "epoch": 0.38, + "learning_rate": 1.8889149010886104e-07, + "logits/chosen": -3.5736119747161865, + "logits/rejected": -3.4967470169067383, + "logps/chosen": -253.67776489257812, + "logps/rejected": -166.7464141845703, + "loss": 0.4302, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3428748846054077, + "rewards/margins": 0.9962511658668518, + "rewards/rejected": -1.3391261100769043, + "step": 3295 + }, + { + "epoch": 0.38, + "learning_rate": 1.888563736392368e-07, + "logits/chosen": -3.1321921348571777, + "logits/rejected": -3.1431524753570557, + "logps/chosen": -302.3958435058594, + "logps/rejected": -138.30589294433594, + "loss": 0.4415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44807320833206177, + "rewards/margins": 0.8661787509918213, + "rewards/rejected": -1.3142518997192383, + "step": 3296 + }, + { + "epoch": 0.38, + "learning_rate": 1.8882125716961252e-07, + "logits/chosen": -3.398437976837158, + "logits/rejected": -3.717824935913086, + "logps/chosen": -202.67575073242188, + "logps/rejected": -278.493408203125, + "loss": 0.3292, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3177153170108795, + "rewards/margins": 2.3680624961853027, + "rewards/rejected": -2.050347328186035, + "step": 3297 + }, + { + "epoch": 0.38, + "learning_rate": 1.8878614069998828e-07, + "logits/chosen": -3.6961684226989746, + "logits/rejected": -3.661113977432251, + "logps/chosen": -197.9889373779297, + "logps/rejected": -243.475341796875, + "loss": 0.3915, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.024217352271080017, + "rewards/margins": 1.6054434776306152, + "rewards/rejected": -1.6296608448028564, + "step": 3298 + }, + { + "epoch": 0.38, + "learning_rate": 1.8875102423036403e-07, + "logits/chosen": -3.4814982414245605, + "logits/rejected": -3.3964343070983887, + "logps/chosen": -311.9060974121094, + "logps/rejected": -251.25796508789062, + "loss": 0.6792, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00868266075849533, + "rewards/margins": 0.6992079615592957, + "rewards/rejected": -0.7078907489776611, + "step": 3299 + }, + { + "epoch": 0.38, + "learning_rate": 1.8871590776073976e-07, + "logits/chosen": -3.337003707885742, + "logits/rejected": -3.4357104301452637, + "logps/chosen": -126.36668395996094, + "logps/rejected": -159.96090698242188, + "loss": 0.1661, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4924886226654053, + "rewards/margins": 2.4605462551116943, + "rewards/rejected": -1.96805739402771, + "step": 3300 + }, + { + "epoch": 0.38, + "learning_rate": 1.8868079129111554e-07, + "logits/chosen": -3.319650173187256, + "logits/rejected": -3.018343925476074, + "logps/chosen": -329.4004821777344, + "logps/rejected": -144.05502319335938, + "loss": 0.518, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7958881258964539, + "rewards/margins": 0.984311580657959, + "rewards/rejected": -1.7801995277404785, + "step": 3301 + }, + { + "epoch": 0.38, + "learning_rate": 1.886456748214913e-07, + "logits/chosen": -3.367759943008423, + "logits/rejected": -3.259202480316162, + "logps/chosen": -341.15313720703125, + "logps/rejected": -376.1211242675781, + "loss": 0.3514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07507112622261047, + "rewards/margins": 2.022493839263916, + "rewards/rejected": -2.097564935684204, + "step": 3302 + }, + { + "epoch": 0.38, + "learning_rate": 1.8861055835186702e-07, + "logits/chosen": -3.0694632530212402, + "logits/rejected": -3.2107715606689453, + "logps/chosen": -258.094482421875, + "logps/rejected": -270.24658203125, + "loss": 0.3799, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45277899503707886, + "rewards/margins": 1.5561782121658325, + "rewards/rejected": -2.0089573860168457, + "step": 3303 + }, + { + "epoch": 0.38, + "learning_rate": 1.8857544188224277e-07, + "logits/chosen": -3.002181053161621, + "logits/rejected": -3.022519826889038, + "logps/chosen": -273.4798278808594, + "logps/rejected": -330.8305969238281, + "loss": 0.9528, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.613883912563324, + "rewards/margins": 0.42271921038627625, + "rewards/rejected": -1.0366029739379883, + "step": 3304 + }, + { + "epoch": 0.38, + "learning_rate": 1.885403254126185e-07, + "logits/chosen": -3.3664557933807373, + "logits/rejected": -3.200284004211426, + "logps/chosen": -222.54446411132812, + "logps/rejected": -227.20130920410156, + "loss": 0.5178, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03998154401779175, + "rewards/margins": 1.129910945892334, + "rewards/rejected": -1.1698925495147705, + "step": 3305 + }, + { + "epoch": 0.38, + "learning_rate": 1.8850520894299426e-07, + "logits/chosen": -3.821786642074585, + "logits/rejected": -3.862496852874756, + "logps/chosen": -254.02020263671875, + "logps/rejected": -368.3092956542969, + "loss": 0.2136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23757895827293396, + "rewards/margins": 3.0252878665924072, + "rewards/rejected": -3.262866973876953, + "step": 3306 + }, + { + "epoch": 0.38, + "learning_rate": 1.8847009247337e-07, + "logits/chosen": -3.1608047485351562, + "logits/rejected": -3.0649266242980957, + "logps/chosen": -275.3810729980469, + "logps/rejected": -256.4399108886719, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03167836368083954, + "rewards/margins": 2.303206205368042, + "rewards/rejected": -2.3348846435546875, + "step": 3307 + }, + { + "epoch": 0.38, + "learning_rate": 1.8843497600374574e-07, + "logits/chosen": -3.1852426528930664, + "logits/rejected": -3.001401424407959, + "logps/chosen": -197.1796875, + "logps/rejected": -186.481689453125, + "loss": 0.2805, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.089028000831604, + "rewards/margins": 2.0856070518493652, + "rewards/rejected": -2.174635171890259, + "step": 3308 + }, + { + "epoch": 0.38, + "learning_rate": 1.883998595341215e-07, + "logits/chosen": -2.9746756553649902, + "logits/rejected": -3.350696086883545, + "logps/chosen": -167.04098510742188, + "logps/rejected": -193.2460479736328, + "loss": 0.5921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1360245645046234, + "rewards/margins": 0.7690789103507996, + "rewards/rejected": -0.6330543756484985, + "step": 3309 + }, + { + "epoch": 0.38, + "learning_rate": 1.8836474306449724e-07, + "logits/chosen": -3.5847833156585693, + "logits/rejected": -3.4554836750030518, + "logps/chosen": -204.88865661621094, + "logps/rejected": -183.84750366210938, + "loss": 0.2218, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02667609415948391, + "rewards/margins": 2.6765217781066895, + "rewards/rejected": -2.649845600128174, + "step": 3310 + }, + { + "epoch": 0.38, + "learning_rate": 1.8832962659487297e-07, + "logits/chosen": -3.1703643798828125, + "logits/rejected": -3.0827040672302246, + "logps/chosen": -235.3411102294922, + "logps/rejected": -165.66087341308594, + "loss": 0.311, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21020975708961487, + "rewards/margins": 1.6115771532058716, + "rewards/rejected": -1.401367425918579, + "step": 3311 + }, + { + "epoch": 0.38, + "learning_rate": 1.8829451012524875e-07, + "logits/chosen": -2.3361592292785645, + "logits/rejected": -2.603630781173706, + "logps/chosen": -327.1177673339844, + "logps/rejected": -256.0703430175781, + "loss": 0.3917, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21818825602531433, + "rewards/margins": 1.1059364080429077, + "rewards/rejected": -1.3241246938705444, + "step": 3312 + }, + { + "epoch": 0.38, + "learning_rate": 1.8825939365562445e-07, + "logits/chosen": -3.5835318565368652, + "logits/rejected": -3.33602237701416, + "logps/chosen": -198.59976196289062, + "logps/rejected": -329.790771484375, + "loss": 0.5615, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10201224684715271, + "rewards/margins": 2.3762001991271973, + "rewards/rejected": -2.2741878032684326, + "step": 3313 + }, + { + "epoch": 0.38, + "learning_rate": 1.8822427718600023e-07, + "logits/chosen": -3.6227874755859375, + "logits/rejected": -3.7840752601623535, + "logps/chosen": -245.32415771484375, + "logps/rejected": -192.49459838867188, + "loss": 0.3795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2593570351600647, + "rewards/margins": 2.066713333129883, + "rewards/rejected": -2.3260703086853027, + "step": 3314 + }, + { + "epoch": 0.38, + "learning_rate": 1.88189160716376e-07, + "logits/chosen": -2.963606357574463, + "logits/rejected": -3.1620724201202393, + "logps/chosen": -315.806640625, + "logps/rejected": -321.217041015625, + "loss": 0.4938, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.554469883441925, + "rewards/margins": 1.31089448928833, + "rewards/rejected": -0.7564246654510498, + "step": 3315 + }, + { + "epoch": 0.38, + "learning_rate": 1.8815404424675171e-07, + "logits/chosen": -3.6104540824890137, + "logits/rejected": -3.508883476257324, + "logps/chosen": -221.0469970703125, + "logps/rejected": -322.13067626953125, + "loss": 0.2489, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16562509536743164, + "rewards/margins": 1.9476971626281738, + "rewards/rejected": -2.1133222579956055, + "step": 3316 + }, + { + "epoch": 0.38, + "learning_rate": 1.8811892777712747e-07, + "logits/chosen": -2.955089569091797, + "logits/rejected": -3.1377112865448, + "logps/chosen": -357.8824157714844, + "logps/rejected": -314.53961181640625, + "loss": 0.3232, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1293627917766571, + "rewards/margins": 2.441380262374878, + "rewards/rejected": -2.3120179176330566, + "step": 3317 + }, + { + "epoch": 0.38, + "learning_rate": 1.8808381130750322e-07, + "logits/chosen": -1.7877302169799805, + "logits/rejected": -1.9313466548919678, + "logps/chosen": -372.9151306152344, + "logps/rejected": -259.4946594238281, + "loss": 0.5156, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021996498107910156, + "rewards/margins": 1.2861825227737427, + "rewards/rejected": -1.2641860246658325, + "step": 3318 + }, + { + "epoch": 0.38, + "learning_rate": 1.8804869483787895e-07, + "logits/chosen": -3.0780375003814697, + "logits/rejected": -3.2801733016967773, + "logps/chosen": -459.59814453125, + "logps/rejected": -324.6257019042969, + "loss": 0.4136, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36141911149024963, + "rewards/margins": 2.255362033843994, + "rewards/rejected": -1.8939428329467773, + "step": 3319 + }, + { + "epoch": 0.38, + "learning_rate": 1.880135783682547e-07, + "logits/chosen": -2.8265552520751953, + "logits/rejected": -2.706533432006836, + "logps/chosen": -271.15338134765625, + "logps/rejected": -210.36805725097656, + "loss": 0.5894, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4487724006175995, + "rewards/margins": 0.7796653509140015, + "rewards/rejected": -1.2284376621246338, + "step": 3320 + }, + { + "epoch": 0.38, + "learning_rate": 1.8797846189863043e-07, + "logits/chosen": -2.788576602935791, + "logits/rejected": -2.840190887451172, + "logps/chosen": -252.01242065429688, + "logps/rejected": -175.6400604248047, + "loss": 0.2582, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45263272523880005, + "rewards/margins": 2.2169642448425293, + "rewards/rejected": -1.764331579208374, + "step": 3321 + }, + { + "epoch": 0.38, + "learning_rate": 1.8794334542900619e-07, + "logits/chosen": -3.044832944869995, + "logits/rejected": -3.179853916168213, + "logps/chosen": -249.2722930908203, + "logps/rejected": -153.52249145507812, + "loss": 0.4038, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03893648087978363, + "rewards/margins": 1.8686343431472778, + "rewards/rejected": -1.829697847366333, + "step": 3322 + }, + { + "epoch": 0.38, + "learning_rate": 1.8790822895938197e-07, + "logits/chosen": -3.148273229598999, + "logits/rejected": -3.119884490966797, + "logps/chosen": -316.1927795410156, + "logps/rejected": -268.95556640625, + "loss": 0.3877, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07867990434169769, + "rewards/margins": 1.0090584754943848, + "rewards/rejected": -0.9303786158561707, + "step": 3323 + }, + { + "epoch": 0.38, + "learning_rate": 1.8787311248975767e-07, + "logits/chosen": -3.249379873275757, + "logits/rejected": -3.315068244934082, + "logps/chosen": -152.01266479492188, + "logps/rejected": -138.59857177734375, + "loss": 0.3809, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41191452741622925, + "rewards/margins": 1.2860987186431885, + "rewards/rejected": -0.8741841912269592, + "step": 3324 + }, + { + "epoch": 0.38, + "learning_rate": 1.8783799602013345e-07, + "logits/chosen": -3.19000244140625, + "logits/rejected": -3.237889289855957, + "logps/chosen": -261.4431457519531, + "logps/rejected": -175.7769775390625, + "loss": 0.3493, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40924137830734253, + "rewards/margins": 1.1121751070022583, + "rewards/rejected": -0.7029337882995605, + "step": 3325 + }, + { + "epoch": 0.38, + "learning_rate": 1.878028795505092e-07, + "logits/chosen": -3.0472631454467773, + "logits/rejected": -3.134939432144165, + "logps/chosen": -288.6307067871094, + "logps/rejected": -355.745361328125, + "loss": 0.8361, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3885285556316376, + "rewards/margins": 0.8718860149383545, + "rewards/rejected": -1.2604146003723145, + "step": 3326 + }, + { + "epoch": 0.38, + "learning_rate": 1.8776776308088493e-07, + "logits/chosen": -3.550727367401123, + "logits/rejected": -3.308666944503784, + "logps/chosen": -222.9954833984375, + "logps/rejected": -203.93783569335938, + "loss": 0.5892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43419745564460754, + "rewards/margins": 1.0974576473236084, + "rewards/rejected": -1.5316550731658936, + "step": 3327 + }, + { + "epoch": 0.38, + "learning_rate": 1.8773264661126068e-07, + "logits/chosen": -2.3440239429473877, + "logits/rejected": -2.456005573272705, + "logps/chosen": -401.47705078125, + "logps/rejected": -282.2621765136719, + "loss": 0.3423, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2311014086008072, + "rewards/margins": 2.3919286727905273, + "rewards/rejected": -2.160827398300171, + "step": 3328 + }, + { + "epoch": 0.38, + "learning_rate": 1.876975301416364e-07, + "logits/chosen": -3.332796573638916, + "logits/rejected": -2.918513536453247, + "logps/chosen": -256.4441833496094, + "logps/rejected": -156.6804962158203, + "loss": 0.5172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8432498574256897, + "rewards/margins": 0.8773359060287476, + "rewards/rejected": -1.720585823059082, + "step": 3329 + }, + { + "epoch": 0.38, + "learning_rate": 1.8766241367201216e-07, + "logits/chosen": -3.5055770874023438, + "logits/rejected": -3.5941367149353027, + "logps/chosen": -425.8380126953125, + "logps/rejected": -350.90826416015625, + "loss": 0.4307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.397757351398468, + "rewards/margins": 2.2431368827819824, + "rewards/rejected": -2.6408944129943848, + "step": 3330 + }, + { + "epoch": 0.38, + "learning_rate": 1.8762729720238792e-07, + "logits/chosen": -2.747596263885498, + "logits/rejected": -2.6344313621520996, + "logps/chosen": -438.404296875, + "logps/rejected": -256.55108642578125, + "loss": 0.1778, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46374815702438354, + "rewards/margins": 1.9566004276275635, + "rewards/rejected": -1.4928523302078247, + "step": 3331 + }, + { + "epoch": 0.38, + "learning_rate": 1.8759218073276364e-07, + "logits/chosen": -3.1625051498413086, + "logits/rejected": -3.0398197174072266, + "logps/chosen": -247.79641723632812, + "logps/rejected": -221.09149169921875, + "loss": 0.2824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10505138337612152, + "rewards/margins": 1.5449755191802979, + "rewards/rejected": -1.650026798248291, + "step": 3332 + }, + { + "epoch": 0.38, + "learning_rate": 1.875570642631394e-07, + "logits/chosen": -3.540853977203369, + "logits/rejected": -3.352689504623413, + "logps/chosen": -303.53240966796875, + "logps/rejected": -280.15130615234375, + "loss": 0.6413, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4182283580303192, + "rewards/margins": 1.3621182441711426, + "rewards/rejected": -0.943889856338501, + "step": 3333 + }, + { + "epoch": 0.38, + "learning_rate": 1.8752194779351513e-07, + "logits/chosen": -3.3047995567321777, + "logits/rejected": -3.3412954807281494, + "logps/chosen": -158.80426025390625, + "logps/rejected": -185.94676208496094, + "loss": 0.2036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1461183726787567, + "rewards/margins": 2.3584775924682617, + "rewards/rejected": -2.2123589515686035, + "step": 3334 + }, + { + "epoch": 0.38, + "learning_rate": 1.874868313238909e-07, + "logits/chosen": -3.198099136352539, + "logits/rejected": -2.8970463275909424, + "logps/chosen": -209.2010498046875, + "logps/rejected": -178.73695373535156, + "loss": 0.5164, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10556148737668991, + "rewards/margins": 1.478636384010315, + "rewards/rejected": -1.3730748891830444, + "step": 3335 + }, + { + "epoch": 0.38, + "learning_rate": 1.8745171485426666e-07, + "logits/chosen": -2.6294567584991455, + "logits/rejected": -2.634214401245117, + "logps/chosen": -401.44219970703125, + "logps/rejected": -327.6624755859375, + "loss": 0.3827, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06505288183689117, + "rewards/margins": 1.1446897983551025, + "rewards/rejected": -1.079636812210083, + "step": 3336 + }, + { + "epoch": 0.38, + "learning_rate": 1.874165983846424e-07, + "logits/chosen": -2.9999094009399414, + "logits/rejected": -3.298171043395996, + "logps/chosen": -246.87286376953125, + "logps/rejected": -262.9950256347656, + "loss": 0.6166, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4159756004810333, + "rewards/margins": 1.1566616296768188, + "rewards/rejected": -1.5726372003555298, + "step": 3337 + }, + { + "epoch": 0.38, + "learning_rate": 1.8738148191501814e-07, + "logits/chosen": -3.141669750213623, + "logits/rejected": -2.815613269805908, + "logps/chosen": -198.24844360351562, + "logps/rejected": -353.8513488769531, + "loss": 0.2832, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18997150659561157, + "rewards/margins": 1.7183077335357666, + "rewards/rejected": -1.5283362865447998, + "step": 3338 + }, + { + "epoch": 0.38, + "learning_rate": 1.873463654453939e-07, + "logits/chosen": -2.634459972381592, + "logits/rejected": -2.607105016708374, + "logps/chosen": -300.10382080078125, + "logps/rejected": -222.62738037109375, + "loss": 0.4495, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03068874031305313, + "rewards/margins": 0.785773754119873, + "rewards/rejected": -0.816462516784668, + "step": 3339 + }, + { + "epoch": 0.39, + "learning_rate": 1.8731124897576962e-07, + "logits/chosen": -2.624997615814209, + "logits/rejected": -2.5127382278442383, + "logps/chosen": -249.82827758789062, + "logps/rejected": -276.1859130859375, + "loss": 0.6184, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08863720297813416, + "rewards/margins": 0.6619340181350708, + "rewards/rejected": -0.573296844959259, + "step": 3340 + }, + { + "epoch": 0.39, + "learning_rate": 1.8727613250614538e-07, + "logits/chosen": -2.7503695487976074, + "logits/rejected": -2.4913058280944824, + "logps/chosen": -373.00689697265625, + "logps/rejected": -292.4007873535156, + "loss": 0.4127, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.27091163396835327, + "rewards/margins": 1.3581360578536987, + "rewards/rejected": -1.0872244834899902, + "step": 3341 + }, + { + "epoch": 0.39, + "learning_rate": 1.872410160365211e-07, + "logits/chosen": -2.5320582389831543, + "logits/rejected": -2.530043601989746, + "logps/chosen": -215.9588623046875, + "logps/rejected": -255.28515625, + "loss": 0.5106, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.018528848886489868, + "rewards/margins": 2.9273273944854736, + "rewards/rejected": -2.9458560943603516, + "step": 3342 + }, + { + "epoch": 0.39, + "learning_rate": 1.8720589956689686e-07, + "logits/chosen": -3.311363697052002, + "logits/rejected": -3.3521389961242676, + "logps/chosen": -302.8426208496094, + "logps/rejected": -294.8957214355469, + "loss": 0.3976, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04431906342506409, + "rewards/margins": 1.8754771947860718, + "rewards/rejected": -1.831157922744751, + "step": 3343 + }, + { + "epoch": 0.39, + "learning_rate": 1.871707830972726e-07, + "logits/chosen": -3.40177845954895, + "logits/rejected": -3.2989864349365234, + "logps/chosen": -334.1148681640625, + "logps/rejected": -276.3096923828125, + "loss": 0.3485, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017334014177322388, + "rewards/margins": 2.0643246173858643, + "rewards/rejected": -2.0469906330108643, + "step": 3344 + }, + { + "epoch": 0.39, + "learning_rate": 1.8713566662764834e-07, + "logits/chosen": -2.217054843902588, + "logits/rejected": -2.5100722312927246, + "logps/chosen": -247.73190307617188, + "logps/rejected": -167.6239471435547, + "loss": 0.6107, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.353685200214386, + "rewards/margins": 1.0655039548873901, + "rewards/rejected": -0.7118187546730042, + "step": 3345 + }, + { + "epoch": 0.39, + "learning_rate": 1.8710055015802412e-07, + "logits/chosen": -2.178075075149536, + "logits/rejected": -2.1576452255249023, + "logps/chosen": -302.20660400390625, + "logps/rejected": -269.6806945800781, + "loss": 0.5126, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11869420111179352, + "rewards/margins": 1.1605371236801147, + "rewards/rejected": -1.279231309890747, + "step": 3346 + }, + { + "epoch": 0.39, + "learning_rate": 1.8706543368839987e-07, + "logits/chosen": -2.257629632949829, + "logits/rejected": -2.4049737453460693, + "logps/chosen": -288.4828186035156, + "logps/rejected": -236.7098388671875, + "loss": 0.3712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2058131992816925, + "rewards/margins": 1.362931728363037, + "rewards/rejected": -1.5687451362609863, + "step": 3347 + }, + { + "epoch": 0.39, + "learning_rate": 1.870303172187756e-07, + "logits/chosen": -3.8431026935577393, + "logits/rejected": -3.504307746887207, + "logps/chosen": -195.62246704101562, + "logps/rejected": -165.6244659423828, + "loss": 0.8627, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9455024600028992, + "rewards/margins": 0.38038721680641174, + "rewards/rejected": -1.3258895874023438, + "step": 3348 + }, + { + "epoch": 0.39, + "learning_rate": 1.8699520074915135e-07, + "logits/chosen": -3.0325000286102295, + "logits/rejected": -3.3411097526550293, + "logps/chosen": -343.81414794921875, + "logps/rejected": -287.38677978515625, + "loss": 0.3539, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7902094125747681, + "rewards/margins": 1.3343619108200073, + "rewards/rejected": -2.1245713233947754, + "step": 3349 + }, + { + "epoch": 0.39, + "learning_rate": 1.8696008427952708e-07, + "logits/chosen": -3.4896583557128906, + "logits/rejected": -3.298124313354492, + "logps/chosen": -271.431884765625, + "logps/rejected": -188.68923950195312, + "loss": 0.8103, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7972546815872192, + "rewards/margins": 0.2501929998397827, + "rewards/rejected": -1.047447681427002, + "step": 3350 + }, + { + "epoch": 0.39, + "learning_rate": 1.8692496780990284e-07, + "logits/chosen": -3.444488048553467, + "logits/rejected": -3.2263903617858887, + "logps/chosen": -149.27359008789062, + "logps/rejected": -232.07749938964844, + "loss": 0.4041, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3413795232772827, + "rewards/margins": 1.3979203701019287, + "rewards/rejected": -1.056540846824646, + "step": 3351 + }, + { + "epoch": 0.39, + "learning_rate": 1.868898513402786e-07, + "logits/chosen": -3.362281560897827, + "logits/rejected": -3.167978525161743, + "logps/chosen": -286.9901428222656, + "logps/rejected": -197.0650177001953, + "loss": 0.4225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2425033450126648, + "rewards/margins": 1.3410884141921997, + "rewards/rejected": -1.5835916996002197, + "step": 3352 + }, + { + "epoch": 0.39, + "learning_rate": 1.8685473487065432e-07, + "logits/chosen": -3.1142430305480957, + "logits/rejected": -3.0576536655426025, + "logps/chosen": -329.6435241699219, + "logps/rejected": -333.5663146972656, + "loss": 0.6718, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.780022919178009, + "rewards/margins": 0.30121511220932007, + "rewards/rejected": -1.081238031387329, + "step": 3353 + }, + { + "epoch": 0.39, + "learning_rate": 1.8681961840103007e-07, + "logits/chosen": -3.5235843658447266, + "logits/rejected": -3.370985269546509, + "logps/chosen": -186.47865295410156, + "logps/rejected": -166.6328887939453, + "loss": 0.4251, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1238468587398529, + "rewards/margins": 1.4607198238372803, + "rewards/rejected": -1.336872935295105, + "step": 3354 + }, + { + "epoch": 0.39, + "learning_rate": 1.8678450193140582e-07, + "logits/chosen": -2.620180606842041, + "logits/rejected": -2.7486674785614014, + "logps/chosen": -138.8900909423828, + "logps/rejected": -282.7743835449219, + "loss": 0.4456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12089106440544128, + "rewards/margins": 1.9135546684265137, + "rewards/rejected": -2.0344457626342773, + "step": 3355 + }, + { + "epoch": 0.39, + "learning_rate": 1.8674938546178155e-07, + "logits/chosen": -3.702939033508301, + "logits/rejected": -3.7201905250549316, + "logps/chosen": -227.0693359375, + "logps/rejected": -319.1444091796875, + "loss": 0.5938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8549561500549316, + "rewards/margins": 1.06113862991333, + "rewards/rejected": -1.9160947799682617, + "step": 3356 + }, + { + "epoch": 0.39, + "learning_rate": 1.8671426899215733e-07, + "logits/chosen": -3.2310216426849365, + "logits/rejected": -3.7196450233459473, + "logps/chosen": -177.603759765625, + "logps/rejected": -244.32870483398438, + "loss": 0.2985, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.414431095123291, + "rewards/margins": 2.636556625366211, + "rewards/rejected": -2.22212553024292, + "step": 3357 + }, + { + "epoch": 0.39, + "learning_rate": 1.8667915252253303e-07, + "logits/chosen": -2.5573692321777344, + "logits/rejected": -2.7154250144958496, + "logps/chosen": -417.4433898925781, + "logps/rejected": -195.0287628173828, + "loss": 0.4056, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22184929251670837, + "rewards/margins": 1.4430923461914062, + "rewards/rejected": -1.221243143081665, + "step": 3358 + }, + { + "epoch": 0.39, + "learning_rate": 1.866440360529088e-07, + "logits/chosen": -3.3796443939208984, + "logits/rejected": -3.373166799545288, + "logps/chosen": -286.3492736816406, + "logps/rejected": -225.70712280273438, + "loss": 0.5381, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18584302067756653, + "rewards/margins": 1.0121279954910278, + "rewards/rejected": -1.1979711055755615, + "step": 3359 + }, + { + "epoch": 0.39, + "learning_rate": 1.8660891958328457e-07, + "logits/chosen": -2.8926033973693848, + "logits/rejected": -2.9248368740081787, + "logps/chosen": -439.7306823730469, + "logps/rejected": -297.16912841796875, + "loss": 0.0978, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1083266735076904, + "rewards/margins": 3.0434398651123047, + "rewards/rejected": -1.9351134300231934, + "step": 3360 + }, + { + "epoch": 0.39, + "learning_rate": 1.865738031136603e-07, + "logits/chosen": -2.720067262649536, + "logits/rejected": -2.682982921600342, + "logps/chosen": -400.87786865234375, + "logps/rejected": -284.9651794433594, + "loss": 0.4286, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31100887060165405, + "rewards/margins": 1.0271086692810059, + "rewards/rejected": -1.3381175994873047, + "step": 3361 + }, + { + "epoch": 0.39, + "learning_rate": 1.8653868664403605e-07, + "logits/chosen": -2.1397862434387207, + "logits/rejected": -2.1891415119171143, + "logps/chosen": -208.695556640625, + "logps/rejected": -238.9937286376953, + "loss": 0.2815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09859620034694672, + "rewards/margins": 2.2002899646759033, + "rewards/rejected": -2.2988858222961426, + "step": 3362 + }, + { + "epoch": 0.39, + "learning_rate": 1.865035701744118e-07, + "logits/chosen": -2.6101458072662354, + "logits/rejected": -2.943380355834961, + "logps/chosen": -317.00274658203125, + "logps/rejected": -381.96014404296875, + "loss": 0.2555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07327142357826233, + "rewards/margins": 2.0773184299468994, + "rewards/rejected": -2.150589942932129, + "step": 3363 + }, + { + "epoch": 0.39, + "learning_rate": 1.8646845370478753e-07, + "logits/chosen": -3.013810157775879, + "logits/rejected": -3.291996479034424, + "logps/chosen": -269.6859130859375, + "logps/rejected": -278.3575744628906, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23223423957824707, + "rewards/margins": 2.323197841644287, + "rewards/rejected": -2.090963363647461, + "step": 3364 + }, + { + "epoch": 0.39, + "learning_rate": 1.8643333723516328e-07, + "logits/chosen": -3.0004775524139404, + "logits/rejected": -2.9746522903442383, + "logps/chosen": -195.71728515625, + "logps/rejected": -351.3665771484375, + "loss": 0.2832, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29655030369758606, + "rewards/margins": 1.7599458694458008, + "rewards/rejected": -1.4633957147598267, + "step": 3365 + }, + { + "epoch": 0.39, + "learning_rate": 1.86398220765539e-07, + "logits/chosen": -2.5990443229675293, + "logits/rejected": -2.7844252586364746, + "logps/chosen": -370.12939453125, + "logps/rejected": -264.8372497558594, + "loss": 0.3081, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2996978759765625, + "rewards/margins": 1.8699290752410889, + "rewards/rejected": -1.570231318473816, + "step": 3366 + }, + { + "epoch": 0.39, + "learning_rate": 1.8636310429591476e-07, + "logits/chosen": -2.7141056060791016, + "logits/rejected": -2.564105987548828, + "logps/chosen": -134.06741333007812, + "logps/rejected": -263.7242431640625, + "loss": 1.004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.055227622389793396, + "rewards/margins": 0.3429550230503082, + "rewards/rejected": -0.39818263053894043, + "step": 3367 + }, + { + "epoch": 0.39, + "learning_rate": 1.8632798782629054e-07, + "logits/chosen": -2.9558935165405273, + "logits/rejected": -2.7172017097473145, + "logps/chosen": -300.5688781738281, + "logps/rejected": -295.043212890625, + "loss": 0.2361, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4470914602279663, + "rewards/margins": 3.5402493476867676, + "rewards/rejected": -3.0931577682495117, + "step": 3368 + }, + { + "epoch": 0.39, + "learning_rate": 1.8629287135666627e-07, + "logits/chosen": -3.631865978240967, + "logits/rejected": -3.37953519821167, + "logps/chosen": -140.4248809814453, + "logps/rejected": -96.6922607421875, + "loss": 0.6306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6538541316986084, + "rewards/margins": 0.5482792258262634, + "rewards/rejected": -1.2021334171295166, + "step": 3369 + }, + { + "epoch": 0.39, + "learning_rate": 1.8625775488704203e-07, + "logits/chosen": -2.8829598426818848, + "logits/rejected": -3.028331756591797, + "logps/chosen": -219.79953002929688, + "logps/rejected": -194.84095764160156, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2292080819606781, + "rewards/margins": 2.234205722808838, + "rewards/rejected": -2.004997730255127, + "step": 3370 + }, + { + "epoch": 0.39, + "learning_rate": 1.8622263841741778e-07, + "logits/chosen": -2.3341972827911377, + "logits/rejected": -2.4414947032928467, + "logps/chosen": -204.51611328125, + "logps/rejected": -239.00143432617188, + "loss": 0.2477, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35514748096466064, + "rewards/margins": 1.5296046733856201, + "rewards/rejected": -1.1744571924209595, + "step": 3371 + }, + { + "epoch": 0.39, + "learning_rate": 1.861875219477935e-07, + "logits/chosen": -2.627225160598755, + "logits/rejected": -2.6667089462280273, + "logps/chosen": -554.605712890625, + "logps/rejected": -419.80242919921875, + "loss": 0.2777, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1647966504096985, + "rewards/margins": 2.3659350872039795, + "rewards/rejected": -2.201138496398926, + "step": 3372 + }, + { + "epoch": 0.39, + "learning_rate": 1.8615240547816926e-07, + "logits/chosen": -2.8291306495666504, + "logits/rejected": -2.8740427494049072, + "logps/chosen": -400.13616943359375, + "logps/rejected": -280.4493408203125, + "loss": 0.6007, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12412129342556, + "rewards/margins": 1.0625388622283936, + "rewards/rejected": -1.1866602897644043, + "step": 3373 + }, + { + "epoch": 0.39, + "learning_rate": 1.86117289008545e-07, + "logits/chosen": -3.0686099529266357, + "logits/rejected": -3.1982178688049316, + "logps/chosen": -247.5400390625, + "logps/rejected": -229.33641052246094, + "loss": 0.3827, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24513068795204163, + "rewards/margins": 1.2210686206817627, + "rewards/rejected": -0.9759379029273987, + "step": 3374 + }, + { + "epoch": 0.39, + "learning_rate": 1.8608217253892074e-07, + "logits/chosen": -2.6899802684783936, + "logits/rejected": -2.7356677055358887, + "logps/chosen": -147.55157470703125, + "logps/rejected": -205.00828552246094, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14768216013908386, + "rewards/margins": 1.979715347290039, + "rewards/rejected": -2.1273975372314453, + "step": 3375 + }, + { + "epoch": 0.39, + "learning_rate": 1.860470560692965e-07, + "logits/chosen": -3.8911726474761963, + "logits/rejected": -4.114433765411377, + "logps/chosen": -177.40090942382812, + "logps/rejected": -209.9423828125, + "loss": 0.4896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5649049878120422, + "rewards/margins": 2.5308401584625244, + "rewards/rejected": -3.095745086669922, + "step": 3376 + }, + { + "epoch": 0.39, + "learning_rate": 1.8601193959967222e-07, + "logits/chosen": -2.7695517539978027, + "logits/rejected": -2.738558053970337, + "logps/chosen": -286.91607666015625, + "logps/rejected": -199.46896362304688, + "loss": 0.3297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03962108492851257, + "rewards/margins": 1.0827757120132446, + "rewards/rejected": -1.1223968267440796, + "step": 3377 + }, + { + "epoch": 0.39, + "learning_rate": 1.8597682313004798e-07, + "logits/chosen": -2.427811622619629, + "logits/rejected": -2.4038844108581543, + "logps/chosen": -369.1319580078125, + "logps/rejected": -308.73876953125, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03487871587276459, + "rewards/margins": 1.2280731201171875, + "rewards/rejected": -1.1931943893432617, + "step": 3378 + }, + { + "epoch": 0.39, + "learning_rate": 1.859417066604237e-07, + "logits/chosen": -2.4350507259368896, + "logits/rejected": -2.372783899307251, + "logps/chosen": -367.0918884277344, + "logps/rejected": -340.97662353515625, + "loss": 0.5742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1634221076965332, + "rewards/margins": 1.012915015220642, + "rewards/rejected": -1.1763370037078857, + "step": 3379 + }, + { + "epoch": 0.39, + "learning_rate": 1.8590659019079949e-07, + "logits/chosen": -2.682093620300293, + "logits/rejected": -2.9124908447265625, + "logps/chosen": -173.0694122314453, + "logps/rejected": -275.68914794921875, + "loss": 0.3007, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2074582278728485, + "rewards/margins": 3.350914239883423, + "rewards/rejected": -3.143455982208252, + "step": 3380 + }, + { + "epoch": 0.39, + "learning_rate": 1.8587147372117524e-07, + "logits/chosen": -3.0590505599975586, + "logits/rejected": -3.3552021980285645, + "logps/chosen": -234.07838439941406, + "logps/rejected": -289.84600830078125, + "loss": 0.3092, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1857576221227646, + "rewards/margins": 2.1563804149627686, + "rewards/rejected": -1.9706226587295532, + "step": 3381 + }, + { + "epoch": 0.39, + "learning_rate": 1.8583635725155097e-07, + "logits/chosen": -3.3385438919067383, + "logits/rejected": -3.1695470809936523, + "logps/chosen": -155.19070434570312, + "logps/rejected": -276.621337890625, + "loss": 0.2268, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09304594993591309, + "rewards/margins": 3.2190189361572266, + "rewards/rejected": -3.1259727478027344, + "step": 3382 + }, + { + "epoch": 0.39, + "learning_rate": 1.8580124078192672e-07, + "logits/chosen": -3.629153251647949, + "logits/rejected": -3.4300785064697266, + "logps/chosen": -499.6724853515625, + "logps/rejected": -278.95361328125, + "loss": 0.3443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4883619546890259, + "rewards/margins": 1.6910970211029053, + "rewards/rejected": -2.1794590950012207, + "step": 3383 + }, + { + "epoch": 0.39, + "learning_rate": 1.8576612431230247e-07, + "logits/chosen": -2.231999397277832, + "logits/rejected": -2.3044333457946777, + "logps/chosen": -304.4563903808594, + "logps/rejected": -261.8544921875, + "loss": 0.4567, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24544525146484375, + "rewards/margins": 1.9181358814239502, + "rewards/rejected": -2.163581371307373, + "step": 3384 + }, + { + "epoch": 0.39, + "learning_rate": 1.857310078426782e-07, + "logits/chosen": -2.8477489948272705, + "logits/rejected": -2.8152599334716797, + "logps/chosen": -200.8197784423828, + "logps/rejected": -267.375, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37946486473083496, + "rewards/margins": 0.18810680508613586, + "rewards/rejected": -0.567571759223938, + "step": 3385 + }, + { + "epoch": 0.39, + "learning_rate": 1.8569589137305396e-07, + "logits/chosen": -3.839646339416504, + "logits/rejected": -3.7602250576019287, + "logps/chosen": -135.00042724609375, + "logps/rejected": -175.22726440429688, + "loss": 0.436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16968408226966858, + "rewards/margins": 1.6109760999679565, + "rewards/rejected": -1.7806601524353027, + "step": 3386 + }, + { + "epoch": 0.39, + "learning_rate": 1.8566077490342968e-07, + "logits/chosen": -2.488852024078369, + "logits/rejected": -2.615156888961792, + "logps/chosen": -227.81906127929688, + "logps/rejected": -220.7418212890625, + "loss": 0.6238, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012204080820083618, + "rewards/margins": 0.808881938457489, + "rewards/rejected": -0.796677827835083, + "step": 3387 + }, + { + "epoch": 0.39, + "learning_rate": 1.8562565843380544e-07, + "logits/chosen": -3.6475515365600586, + "logits/rejected": -3.4200499057769775, + "logps/chosen": -272.94586181640625, + "logps/rejected": -214.51524353027344, + "loss": 0.7092, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6195650696754456, + "rewards/margins": 0.14976060390472412, + "rewards/rejected": -0.7693256735801697, + "step": 3388 + }, + { + "epoch": 0.39, + "learning_rate": 1.855905419641812e-07, + "logits/chosen": -2.964409828186035, + "logits/rejected": -3.0110695362091064, + "logps/chosen": -193.103759765625, + "logps/rejected": -265.68994140625, + "loss": 0.3673, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30437523126602173, + "rewards/margins": 3.1547951698303223, + "rewards/rejected": -2.8504199981689453, + "step": 3389 + }, + { + "epoch": 0.39, + "learning_rate": 1.8555542549455692e-07, + "logits/chosen": -3.060431957244873, + "logits/rejected": -2.884021282196045, + "logps/chosen": -243.8343505859375, + "logps/rejected": -312.174560546875, + "loss": 0.2885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6447136402130127, + "rewards/margins": 2.2804665565490723, + "rewards/rejected": -2.925179958343506, + "step": 3390 + }, + { + "epoch": 0.39, + "learning_rate": 1.855203090249327e-07, + "logits/chosen": -3.038705825805664, + "logits/rejected": -2.8781962394714355, + "logps/chosen": -329.5721435546875, + "logps/rejected": -197.72413635253906, + "loss": 0.4405, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11811991035938263, + "rewards/margins": 1.1736043691635132, + "rewards/rejected": -1.055484414100647, + "step": 3391 + }, + { + "epoch": 0.39, + "learning_rate": 1.8548519255530845e-07, + "logits/chosen": -3.295572280883789, + "logits/rejected": -3.2315125465393066, + "logps/chosen": -301.0611572265625, + "logps/rejected": -287.15594482421875, + "loss": 0.3519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5227702856063843, + "rewards/margins": 1.3830246925354004, + "rewards/rejected": -1.9057950973510742, + "step": 3392 + }, + { + "epoch": 0.39, + "learning_rate": 1.8545007608568418e-07, + "logits/chosen": -2.9663784503936768, + "logits/rejected": -3.289010524749756, + "logps/chosen": -98.59745788574219, + "logps/rejected": -177.1878662109375, + "loss": 0.1957, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1801941692829132, + "rewards/margins": 2.462151527404785, + "rewards/rejected": -2.2819571495056152, + "step": 3393 + }, + { + "epoch": 0.39, + "learning_rate": 1.8541495961605993e-07, + "logits/chosen": -2.7384355068206787, + "logits/rejected": -2.58261775970459, + "logps/chosen": -149.14356994628906, + "logps/rejected": -212.92825317382812, + "loss": 0.3339, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10349922627210617, + "rewards/margins": 2.724579095840454, + "rewards/rejected": -2.828078508377075, + "step": 3394 + }, + { + "epoch": 0.39, + "learning_rate": 1.8537984314643566e-07, + "logits/chosen": -3.217405319213867, + "logits/rejected": -3.0452003479003906, + "logps/chosen": -286.14044189453125, + "logps/rejected": -265.0211181640625, + "loss": 0.7018, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41152554750442505, + "rewards/margins": 0.3327970504760742, + "rewards/rejected": -0.744322657585144, + "step": 3395 + }, + { + "epoch": 0.39, + "learning_rate": 1.8534472667681141e-07, + "logits/chosen": -3.1047143936157227, + "logits/rejected": -3.174858331680298, + "logps/chosen": -212.82627868652344, + "logps/rejected": -123.75166320800781, + "loss": 0.4077, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22732719779014587, + "rewards/margins": 1.6396843194961548, + "rewards/rejected": -1.4123570919036865, + "step": 3396 + }, + { + "epoch": 0.39, + "learning_rate": 1.8530961020718717e-07, + "logits/chosen": -3.093470335006714, + "logits/rejected": -3.0289719104766846, + "logps/chosen": -279.82989501953125, + "logps/rejected": -364.48529052734375, + "loss": 0.4333, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05530785024166107, + "rewards/margins": 2.0141072273254395, + "rewards/rejected": -1.9587993621826172, + "step": 3397 + }, + { + "epoch": 0.39, + "learning_rate": 1.852744937375629e-07, + "logits/chosen": -3.001474380493164, + "logits/rejected": -3.060197353363037, + "logps/chosen": -322.37078857421875, + "logps/rejected": -351.6236267089844, + "loss": 0.285, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39066553115844727, + "rewards/margins": 2.1257545948028564, + "rewards/rejected": -2.5164201259613037, + "step": 3398 + }, + { + "epoch": 0.39, + "learning_rate": 1.8523937726793865e-07, + "logits/chosen": -4.070487976074219, + "logits/rejected": -3.6121044158935547, + "logps/chosen": -418.24908447265625, + "logps/rejected": -237.89312744140625, + "loss": 0.1945, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18658122420310974, + "rewards/margins": 2.391826629638672, + "rewards/rejected": -2.5784077644348145, + "step": 3399 + }, + { + "epoch": 0.39, + "learning_rate": 1.852042607983144e-07, + "logits/chosen": -2.8050506114959717, + "logits/rejected": -3.3182506561279297, + "logps/chosen": -347.0992431640625, + "logps/rejected": -157.43162536621094, + "loss": 0.3915, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12073326110839844, + "rewards/margins": 1.359837532043457, + "rewards/rejected": -1.2391042709350586, + "step": 3400 + }, + { + "epoch": 0.39, + "learning_rate": 1.8516914432869013e-07, + "logits/chosen": -3.655104637145996, + "logits/rejected": -3.44968318939209, + "logps/chosen": -286.94415283203125, + "logps/rejected": -227.41424560546875, + "loss": 0.3225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32168933749198914, + "rewards/margins": 1.2877914905548096, + "rewards/rejected": -1.6094807386398315, + "step": 3401 + }, + { + "epoch": 0.39, + "learning_rate": 1.851340278590659e-07, + "logits/chosen": -2.6493191719055176, + "logits/rejected": -2.5444202423095703, + "logps/chosen": -277.2802429199219, + "logps/rejected": -232.4818115234375, + "loss": 0.1712, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13050682842731476, + "rewards/margins": 2.367535352706909, + "rewards/rejected": -2.2370285987854004, + "step": 3402 + }, + { + "epoch": 0.39, + "learning_rate": 1.8509891138944164e-07, + "logits/chosen": -3.004007339477539, + "logits/rejected": -3.000755786895752, + "logps/chosen": -174.97518920898438, + "logps/rejected": -272.045166015625, + "loss": 0.6057, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5065417289733887, + "rewards/margins": 0.756436824798584, + "rewards/rejected": -1.2629785537719727, + "step": 3403 + }, + { + "epoch": 0.39, + "learning_rate": 1.850637949198174e-07, + "logits/chosen": -3.5222623348236084, + "logits/rejected": -3.2390434741973877, + "logps/chosen": -234.52227783203125, + "logps/rejected": -183.23561096191406, + "loss": 0.4366, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.023716449737548828, + "rewards/margins": 1.4353289604187012, + "rewards/rejected": -1.4116125106811523, + "step": 3404 + }, + { + "epoch": 0.39, + "learning_rate": 1.8502867845019315e-07, + "logits/chosen": -2.6648526191711426, + "logits/rejected": -3.119711399078369, + "logps/chosen": -295.4226989746094, + "logps/rejected": -249.0082244873047, + "loss": 0.4064, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07682031393051147, + "rewards/margins": 1.5970805883407593, + "rewards/rejected": -1.6739009618759155, + "step": 3405 + }, + { + "epoch": 0.39, + "learning_rate": 1.8499356198056887e-07, + "logits/chosen": -2.5974478721618652, + "logits/rejected": -2.548638105392456, + "logps/chosen": -344.97052001953125, + "logps/rejected": -248.51914978027344, + "loss": 0.2161, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3117366433143616, + "rewards/margins": 2.454721450805664, + "rewards/rejected": -2.1429848670959473, + "step": 3406 + }, + { + "epoch": 0.39, + "learning_rate": 1.8495844551094463e-07, + "logits/chosen": -3.1767420768737793, + "logits/rejected": -3.5475704669952393, + "logps/chosen": -214.65811157226562, + "logps/rejected": -239.67112731933594, + "loss": 0.2957, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23205077648162842, + "rewards/margins": 2.327207326889038, + "rewards/rejected": -2.09515643119812, + "step": 3407 + }, + { + "epoch": 0.39, + "learning_rate": 1.8492332904132038e-07, + "logits/chosen": -4.057716369628906, + "logits/rejected": -4.095120429992676, + "logps/chosen": -294.60040283203125, + "logps/rejected": -363.2543029785156, + "loss": 0.2342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04292702674865723, + "rewards/margins": 2.290642499923706, + "rewards/rejected": -2.3335695266723633, + "step": 3408 + }, + { + "epoch": 0.39, + "learning_rate": 1.848882125716961e-07, + "logits/chosen": -2.7262916564941406, + "logits/rejected": -2.8955628871917725, + "logps/chosen": -149.23635864257812, + "logps/rejected": -152.49020385742188, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.015595519915223122, + "rewards/margins": 0.8295466303825378, + "rewards/rejected": -0.845142126083374, + "step": 3409 + }, + { + "epoch": 0.39, + "learning_rate": 1.8485309610207186e-07, + "logits/chosen": -2.4477405548095703, + "logits/rejected": -2.5379374027252197, + "logps/chosen": -320.546142578125, + "logps/rejected": -291.97137451171875, + "loss": 0.4202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06467342376708984, + "rewards/margins": 1.3058156967163086, + "rewards/rejected": -1.3704891204833984, + "step": 3410 + }, + { + "epoch": 0.39, + "learning_rate": 1.848179796324476e-07, + "logits/chosen": -4.032327651977539, + "logits/rejected": -3.812528371810913, + "logps/chosen": -284.2132873535156, + "logps/rejected": -228.83348083496094, + "loss": 0.4528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5192509889602661, + "rewards/margins": 1.3175041675567627, + "rewards/rejected": -1.8367552757263184, + "step": 3411 + }, + { + "epoch": 0.39, + "learning_rate": 1.8478286316282334e-07, + "logits/chosen": -2.609930992126465, + "logits/rejected": -2.7362887859344482, + "logps/chosen": -284.83740234375, + "logps/rejected": -198.05532836914062, + "loss": 0.3028, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6130197644233704, + "rewards/margins": 2.0226969718933105, + "rewards/rejected": -1.409677267074585, + "step": 3412 + }, + { + "epoch": 0.39, + "learning_rate": 1.8474774669319912e-07, + "logits/chosen": -3.4208579063415527, + "logits/rejected": -3.596730947494507, + "logps/chosen": -137.12066650390625, + "logps/rejected": -167.757568359375, + "loss": 0.349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17109420895576477, + "rewards/margins": 1.3649487495422363, + "rewards/rejected": -1.5360430479049683, + "step": 3413 + }, + { + "epoch": 0.39, + "learning_rate": 1.8471263022357485e-07, + "logits/chosen": -3.343433141708374, + "logits/rejected": -3.1570382118225098, + "logps/chosen": -240.90579223632812, + "logps/rejected": -194.027099609375, + "loss": 0.1756, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04245595633983612, + "rewards/margins": 2.2222485542297363, + "rewards/rejected": -2.179792642593384, + "step": 3414 + }, + { + "epoch": 0.39, + "learning_rate": 1.846775137539506e-07, + "logits/chosen": -3.061934232711792, + "logits/rejected": -2.7414865493774414, + "logps/chosen": -209.19700622558594, + "logps/rejected": -230.2843017578125, + "loss": 0.5469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1764589250087738, + "rewards/margins": 0.608339786529541, + "rewards/rejected": -0.7847987413406372, + "step": 3415 + }, + { + "epoch": 0.39, + "learning_rate": 1.8464239728432636e-07, + "logits/chosen": -3.14156174659729, + "logits/rejected": -3.2964727878570557, + "logps/chosen": -310.32318115234375, + "logps/rejected": -256.63250732421875, + "loss": 0.4196, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.026013828814029694, + "rewards/margins": 1.2330732345581055, + "rewards/rejected": -1.2070592641830444, + "step": 3416 + }, + { + "epoch": 0.39, + "learning_rate": 1.8460728081470209e-07, + "logits/chosen": -2.322971820831299, + "logits/rejected": -2.3209328651428223, + "logps/chosen": -200.73098754882812, + "logps/rejected": -250.62603759765625, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.013542119413614273, + "rewards/margins": 0.9380231499671936, + "rewards/rejected": -0.9515652656555176, + "step": 3417 + }, + { + "epoch": 0.39, + "learning_rate": 1.8457216434507784e-07, + "logits/chosen": -3.3814640045166016, + "logits/rejected": -3.225640058517456, + "logps/chosen": -222.65309143066406, + "logps/rejected": -244.70159912109375, + "loss": 0.7562, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0886484384536743, + "rewards/margins": 1.0527050495147705, + "rewards/rejected": -2.1413536071777344, + "step": 3418 + }, + { + "epoch": 0.39, + "learning_rate": 1.8453704787545357e-07, + "logits/chosen": -3.8868417739868164, + "logits/rejected": -3.726973533630371, + "logps/chosen": -220.2620391845703, + "logps/rejected": -229.06588745117188, + "loss": 0.3496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0012073889374732971, + "rewards/margins": 2.6934709548950195, + "rewards/rejected": -2.69467830657959, + "step": 3419 + }, + { + "epoch": 0.39, + "learning_rate": 1.8450193140582932e-07, + "logits/chosen": -2.560168743133545, + "logits/rejected": -2.5692739486694336, + "logps/chosen": -242.5186767578125, + "logps/rejected": -231.54470825195312, + "loss": 0.8903, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9504167437553406, + "rewards/margins": 0.329061895608902, + "rewards/rejected": -1.279478669166565, + "step": 3420 + }, + { + "epoch": 0.39, + "learning_rate": 1.8446681493620508e-07, + "logits/chosen": -2.688538074493408, + "logits/rejected": -2.659205913543701, + "logps/chosen": -184.4125213623047, + "logps/rejected": -174.36456298828125, + "loss": 0.9505, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1642804145812988, + "rewards/margins": 0.6069768667221069, + "rewards/rejected": -1.7712574005126953, + "step": 3421 + }, + { + "epoch": 0.39, + "learning_rate": 1.844316984665808e-07, + "logits/chosen": -2.7949483394622803, + "logits/rejected": -2.58508038520813, + "logps/chosen": -494.8498840332031, + "logps/rejected": -289.1013488769531, + "loss": 0.3411, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1843445897102356, + "rewards/margins": 1.3441722393035889, + "rewards/rejected": -1.5285168886184692, + "step": 3422 + }, + { + "epoch": 0.39, + "learning_rate": 1.8439658199695656e-07, + "logits/chosen": -2.9284255504608154, + "logits/rejected": -3.0344367027282715, + "logps/chosen": -141.4424285888672, + "logps/rejected": -253.0063018798828, + "loss": 0.3252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1213291585445404, + "rewards/margins": 2.977273941040039, + "rewards/rejected": -3.0986032485961914, + "step": 3423 + }, + { + "epoch": 0.39, + "learning_rate": 1.8436146552733234e-07, + "logits/chosen": -2.9407036304473877, + "logits/rejected": -2.6752398014068604, + "logps/chosen": -319.13568115234375, + "logps/rejected": -310.0118103027344, + "loss": 0.407, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38937199115753174, + "rewards/margins": 1.0542962551116943, + "rewards/rejected": -0.6649242043495178, + "step": 3424 + }, + { + "epoch": 0.39, + "learning_rate": 1.8432634905770806e-07, + "logits/chosen": -2.160059928894043, + "logits/rejected": -2.2293734550476074, + "logps/chosen": -371.50390625, + "logps/rejected": -349.00189208984375, + "loss": 0.2931, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10690009593963623, + "rewards/margins": 3.0078330039978027, + "rewards/rejected": -3.1147329807281494, + "step": 3425 + }, + { + "epoch": 0.39, + "learning_rate": 1.8429123258808382e-07, + "logits/chosen": -2.670339822769165, + "logits/rejected": -2.7568917274475098, + "logps/chosen": -311.570068359375, + "logps/rejected": -395.3167724609375, + "loss": 0.5987, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18331420421600342, + "rewards/margins": 0.5684347748756409, + "rewards/rejected": -0.38512054085731506, + "step": 3426 + }, + { + "epoch": 0.4, + "learning_rate": 1.8425611611845955e-07, + "logits/chosen": -3.1106386184692383, + "logits/rejected": -2.9441208839416504, + "logps/chosen": -161.59664916992188, + "logps/rejected": -213.43841552734375, + "loss": 0.367, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25946155190467834, + "rewards/margins": 1.148641586303711, + "rewards/rejected": -0.889180064201355, + "step": 3427 + }, + { + "epoch": 0.4, + "learning_rate": 1.842209996488353e-07, + "logits/chosen": -3.5887045860290527, + "logits/rejected": -3.5745785236358643, + "logps/chosen": -255.00872802734375, + "logps/rejected": -195.31399536132812, + "loss": 0.6525, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4729490578174591, + "rewards/margins": 0.9715657234191895, + "rewards/rejected": -1.4445146322250366, + "step": 3428 + }, + { + "epoch": 0.4, + "learning_rate": 1.8418588317921105e-07, + "logits/chosen": -3.4735629558563232, + "logits/rejected": -3.0894460678100586, + "logps/chosen": -148.03395080566406, + "logps/rejected": -200.5955810546875, + "loss": 0.3102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2623097002506256, + "rewards/margins": 1.661289095878601, + "rewards/rejected": -1.9235990047454834, + "step": 3429 + }, + { + "epoch": 0.4, + "learning_rate": 1.8415076670958678e-07, + "logits/chosen": -3.4379820823669434, + "logits/rejected": -3.3982369899749756, + "logps/chosen": -157.95657348632812, + "logps/rejected": -231.3470001220703, + "loss": 0.5098, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15168511867523193, + "rewards/margins": 1.4541528224945068, + "rewards/rejected": -1.6058378219604492, + "step": 3430 + }, + { + "epoch": 0.4, + "learning_rate": 1.8411565023996253e-07, + "logits/chosen": -2.6997365951538086, + "logits/rejected": -2.529655933380127, + "logps/chosen": -304.3262939453125, + "logps/rejected": -292.0592346191406, + "loss": 0.3637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06559017300605774, + "rewards/margins": 2.0350987911224365, + "rewards/rejected": -2.100688934326172, + "step": 3431 + }, + { + "epoch": 0.4, + "learning_rate": 1.8408053377033826e-07, + "logits/chosen": -3.7115635871887207, + "logits/rejected": -3.5179336071014404, + "logps/chosen": -359.3333740234375, + "logps/rejected": -260.69757080078125, + "loss": 0.676, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6465620994567871, + "rewards/margins": 1.2208415269851685, + "rewards/rejected": -1.8674036264419556, + "step": 3432 + }, + { + "epoch": 0.4, + "learning_rate": 1.8404541730071402e-07, + "logits/chosen": -3.126112222671509, + "logits/rejected": -3.272031545639038, + "logps/chosen": -341.91534423828125, + "logps/rejected": -235.974365234375, + "loss": 0.2768, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1963905245065689, + "rewards/margins": 2.88618540763855, + "rewards/rejected": -3.082575798034668, + "step": 3433 + }, + { + "epoch": 0.4, + "learning_rate": 1.8401030083108977e-07, + "logits/chosen": -3.420992374420166, + "logits/rejected": -3.3470547199249268, + "logps/chosen": -401.2160949707031, + "logps/rejected": -374.7069396972656, + "loss": 0.3688, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24546414613723755, + "rewards/margins": 1.626028060913086, + "rewards/rejected": -1.3805640935897827, + "step": 3434 + }, + { + "epoch": 0.4, + "learning_rate": 1.839751843614655e-07, + "logits/chosen": -3.233541250228882, + "logits/rejected": -3.424907922744751, + "logps/chosen": -333.7332458496094, + "logps/rejected": -289.154541015625, + "loss": 0.1087, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4791027009487152, + "rewards/margins": 2.737766742706299, + "rewards/rejected": -2.258664131164551, + "step": 3435 + }, + { + "epoch": 0.4, + "learning_rate": 1.8394006789184128e-07, + "logits/chosen": -3.577627182006836, + "logits/rejected": -3.5800490379333496, + "logps/chosen": -253.7808837890625, + "logps/rejected": -295.1513671875, + "loss": 0.5732, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25326278805732727, + "rewards/margins": 0.9296380281448364, + "rewards/rejected": -1.1829009056091309, + "step": 3436 + }, + { + "epoch": 0.4, + "learning_rate": 1.8390495142221703e-07, + "logits/chosen": -2.829408645629883, + "logits/rejected": -2.527937412261963, + "logps/chosen": -308.918701171875, + "logps/rejected": -309.6043701171875, + "loss": 0.3252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18247976899147034, + "rewards/margins": 1.5011465549468994, + "rewards/rejected": -1.6836262941360474, + "step": 3437 + }, + { + "epoch": 0.4, + "learning_rate": 1.8386983495259276e-07, + "logits/chosen": -2.8233695030212402, + "logits/rejected": -2.999967098236084, + "logps/chosen": -182.94113159179688, + "logps/rejected": -326.7904357910156, + "loss": 0.6618, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5578591227531433, + "rewards/margins": 0.7877384424209595, + "rewards/rejected": -1.3455976247787476, + "step": 3438 + }, + { + "epoch": 0.4, + "learning_rate": 1.838347184829685e-07, + "logits/chosen": -3.5705630779266357, + "logits/rejected": -3.539708137512207, + "logps/chosen": -222.5360107421875, + "logps/rejected": -236.65380859375, + "loss": 0.2035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33820006251335144, + "rewards/margins": 2.008821964263916, + "rewards/rejected": -1.6706221103668213, + "step": 3439 + }, + { + "epoch": 0.4, + "learning_rate": 1.8379960201334424e-07, + "logits/chosen": -3.048891544342041, + "logits/rejected": -2.9751343727111816, + "logps/chosen": -97.4214096069336, + "logps/rejected": -145.11106872558594, + "loss": 0.4746, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4986240565776825, + "rewards/margins": 0.7183965444564819, + "rewards/rejected": -1.2170205116271973, + "step": 3440 + }, + { + "epoch": 0.4, + "learning_rate": 1.8376448554372e-07, + "logits/chosen": -3.4824090003967285, + "logits/rejected": -3.376521110534668, + "logps/chosen": -293.71234130859375, + "logps/rejected": -243.0302276611328, + "loss": 0.3933, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12637832760810852, + "rewards/margins": 1.6994874477386475, + "rewards/rejected": -1.8258657455444336, + "step": 3441 + }, + { + "epoch": 0.4, + "learning_rate": 1.8372936907409575e-07, + "logits/chosen": -2.820589303970337, + "logits/rejected": -2.6469178199768066, + "logps/chosen": -107.77979278564453, + "logps/rejected": -210.06214904785156, + "loss": 0.7545, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5823734402656555, + "rewards/margins": 0.4806463122367859, + "rewards/rejected": -1.0630197525024414, + "step": 3442 + }, + { + "epoch": 0.4, + "learning_rate": 1.8369425260447148e-07, + "logits/chosen": -3.081399917602539, + "logits/rejected": -3.029306650161743, + "logps/chosen": -284.517333984375, + "logps/rejected": -267.10015869140625, + "loss": 0.3037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04415444657206535, + "rewards/margins": 2.6443567276000977, + "rewards/rejected": -2.688511371612549, + "step": 3443 + }, + { + "epoch": 0.4, + "learning_rate": 1.8365913613484723e-07, + "logits/chosen": -3.756296157836914, + "logits/rejected": -3.645860433578491, + "logps/chosen": -220.17391967773438, + "logps/rejected": -229.02725219726562, + "loss": 0.3976, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42754799127578735, + "rewards/margins": 1.1632983684539795, + "rewards/rejected": -0.7357503771781921, + "step": 3444 + }, + { + "epoch": 0.4, + "learning_rate": 1.8362401966522298e-07, + "logits/chosen": -2.9803335666656494, + "logits/rejected": -3.0743279457092285, + "logps/chosen": -193.96510314941406, + "logps/rejected": -273.0840759277344, + "loss": 0.3767, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1567777693271637, + "rewards/margins": 1.538785457611084, + "rewards/rejected": -1.3820075988769531, + "step": 3445 + }, + { + "epoch": 0.4, + "learning_rate": 1.835889031955987e-07, + "logits/chosen": -2.8084921836853027, + "logits/rejected": -3.286257743835449, + "logps/chosen": -179.17327880859375, + "logps/rejected": -257.85699462890625, + "loss": 0.5309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0890406146645546, + "rewards/margins": 1.8660002946853638, + "rewards/rejected": -1.9550409317016602, + "step": 3446 + }, + { + "epoch": 0.4, + "learning_rate": 1.835537867259745e-07, + "logits/chosen": -3.215855598449707, + "logits/rejected": -2.7233681678771973, + "logps/chosen": -433.71429443359375, + "logps/rejected": -362.7952880859375, + "loss": 0.1659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5932531356811523, + "rewards/margins": 2.6540896892547607, + "rewards/rejected": -2.0608367919921875, + "step": 3447 + }, + { + "epoch": 0.4, + "learning_rate": 1.8351867025635022e-07, + "logits/chosen": -3.5963690280914307, + "logits/rejected": -3.6705715656280518, + "logps/chosen": -168.9079132080078, + "logps/rejected": -136.09437561035156, + "loss": 0.762, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5231258273124695, + "rewards/margins": 0.8437231779098511, + "rewards/rejected": -1.3668489456176758, + "step": 3448 + }, + { + "epoch": 0.4, + "learning_rate": 1.8348355378672597e-07, + "logits/chosen": -3.612478494644165, + "logits/rejected": -3.0914835929870605, + "logps/chosen": -476.5375061035156, + "logps/rejected": -362.8228759765625, + "loss": 0.2293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2538681626319885, + "rewards/margins": 2.132351875305176, + "rewards/rejected": -2.3862199783325195, + "step": 3449 + }, + { + "epoch": 0.4, + "learning_rate": 1.8344843731710173e-07, + "logits/chosen": -3.289804458618164, + "logits/rejected": -3.276055335998535, + "logps/chosen": -142.82647705078125, + "logps/rejected": -131.37258911132812, + "loss": 0.5562, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19650857150554657, + "rewards/margins": 1.370190143585205, + "rewards/rejected": -1.1736814975738525, + "step": 3450 + }, + { + "epoch": 0.4, + "learning_rate": 1.8341332084747745e-07, + "logits/chosen": -3.6379916667938232, + "logits/rejected": -3.678070306777954, + "logps/chosen": -205.85464477539062, + "logps/rejected": -208.7236328125, + "loss": 0.3036, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12542441487312317, + "rewards/margins": 1.9577301740646362, + "rewards/rejected": -1.8323057889938354, + "step": 3451 + }, + { + "epoch": 0.4, + "learning_rate": 1.833782043778532e-07, + "logits/chosen": -2.6341700553894043, + "logits/rejected": -2.497532367706299, + "logps/chosen": -179.85658264160156, + "logps/rejected": -341.32855224609375, + "loss": 0.2213, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14233654737472534, + "rewards/margins": 2.1341073513031006, + "rewards/rejected": -1.9917707443237305, + "step": 3452 + }, + { + "epoch": 0.4, + "learning_rate": 1.8334308790822896e-07, + "logits/chosen": -3.6272635459899902, + "logits/rejected": -3.214503049850464, + "logps/chosen": -367.9060363769531, + "logps/rejected": -237.32644653320312, + "loss": 0.3114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.000913769006729126, + "rewards/margins": 1.2735432386398315, + "rewards/rejected": -1.2744569778442383, + "step": 3453 + }, + { + "epoch": 0.4, + "learning_rate": 1.833079714386047e-07, + "logits/chosen": -2.830575942993164, + "logits/rejected": -2.723090887069702, + "logps/chosen": -319.3262023925781, + "logps/rejected": -242.7493896484375, + "loss": 0.431, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21272200345993042, + "rewards/margins": 1.3225533962249756, + "rewards/rejected": -1.10983145236969, + "step": 3454 + }, + { + "epoch": 0.4, + "learning_rate": 1.8327285496898044e-07, + "logits/chosen": -4.272207260131836, + "logits/rejected": -3.5548930168151855, + "logps/chosen": -246.58929443359375, + "logps/rejected": -136.9527587890625, + "loss": 0.329, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2594621181488037, + "rewards/margins": 1.41023588180542, + "rewards/rejected": -1.1507737636566162, + "step": 3455 + }, + { + "epoch": 0.4, + "learning_rate": 1.8323773849935617e-07, + "logits/chosen": -3.0373036861419678, + "logits/rejected": -3.3966822624206543, + "logps/chosen": -116.74152374267578, + "logps/rejected": -246.48388671875, + "loss": 0.2496, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04930911958217621, + "rewards/margins": 3.7068915367126465, + "rewards/rejected": -3.6575822830200195, + "step": 3456 + }, + { + "epoch": 0.4, + "learning_rate": 1.8320262202973192e-07, + "logits/chosen": -2.892296075820923, + "logits/rejected": -2.6429786682128906, + "logps/chosen": -266.3183898925781, + "logps/rejected": -236.55799865722656, + "loss": 0.5891, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.647473931312561, + "rewards/margins": 0.7106512784957886, + "rewards/rejected": -1.35812509059906, + "step": 3457 + }, + { + "epoch": 0.4, + "learning_rate": 1.831675055601077e-07, + "logits/chosen": -2.6327829360961914, + "logits/rejected": -2.696831226348877, + "logps/chosen": -168.52471923828125, + "logps/rejected": -153.287109375, + "loss": 0.2145, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6071206331253052, + "rewards/margins": 2.1361184120178223, + "rewards/rejected": -1.528997778892517, + "step": 3458 + }, + { + "epoch": 0.4, + "learning_rate": 1.8313238909048343e-07, + "logits/chosen": -2.7673087120056152, + "logits/rejected": -2.692427396774292, + "logps/chosen": -482.3166198730469, + "logps/rejected": -351.4071960449219, + "loss": 0.3819, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0673666000366211, + "rewards/margins": 1.567945122718811, + "rewards/rejected": -1.50057852268219, + "step": 3459 + }, + { + "epoch": 0.4, + "learning_rate": 1.8309727262085918e-07, + "logits/chosen": -3.606419563293457, + "logits/rejected": -3.5960752964019775, + "logps/chosen": -179.07508850097656, + "logps/rejected": -248.56195068359375, + "loss": 0.2682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4236631989479065, + "rewards/margins": 1.633461594581604, + "rewards/rejected": -2.0571248531341553, + "step": 3460 + }, + { + "epoch": 0.4, + "learning_rate": 1.8306215615123494e-07, + "logits/chosen": -2.8672056198120117, + "logits/rejected": -2.6310644149780273, + "logps/chosen": -369.20330810546875, + "logps/rejected": -239.59381103515625, + "loss": 0.522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2249467521905899, + "rewards/margins": 0.8697970509529114, + "rewards/rejected": -1.0947437286376953, + "step": 3461 + }, + { + "epoch": 0.4, + "learning_rate": 1.8302703968161067e-07, + "logits/chosen": -3.337010622024536, + "logits/rejected": -3.4404327869415283, + "logps/chosen": -386.5289306640625, + "logps/rejected": -214.51809692382812, + "loss": 0.4047, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12233559787273407, + "rewards/margins": 1.9801111221313477, + "rewards/rejected": -1.8577754497528076, + "step": 3462 + }, + { + "epoch": 0.4, + "learning_rate": 1.8299192321198642e-07, + "logits/chosen": -2.7257258892059326, + "logits/rejected": -2.9512267112731934, + "logps/chosen": -186.25701904296875, + "logps/rejected": -314.9561767578125, + "loss": 0.3029, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11656700074672699, + "rewards/margins": 1.6411206722259521, + "rewards/rejected": -1.5245535373687744, + "step": 3463 + }, + { + "epoch": 0.4, + "learning_rate": 1.8295680674236215e-07, + "logits/chosen": -2.292220115661621, + "logits/rejected": -2.544412612915039, + "logps/chosen": -260.32073974609375, + "logps/rejected": -299.4255676269531, + "loss": 0.3648, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41026222705841064, + "rewards/margins": 1.7526750564575195, + "rewards/rejected": -1.3424127101898193, + "step": 3464 + }, + { + "epoch": 0.4, + "learning_rate": 1.829216902727379e-07, + "logits/chosen": -3.08674955368042, + "logits/rejected": -2.9405272006988525, + "logps/chosen": -258.9977722167969, + "logps/rejected": -232.78359985351562, + "loss": 0.2712, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12193387001752853, + "rewards/margins": 1.6900193691253662, + "rewards/rejected": -1.5680854320526123, + "step": 3465 + }, + { + "epoch": 0.4, + "learning_rate": 1.8288657380311365e-07, + "logits/chosen": -2.3859076499938965, + "logits/rejected": -2.606198310852051, + "logps/chosen": -277.888427734375, + "logps/rejected": -326.7913818359375, + "loss": 0.6194, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2726787328720093, + "rewards/margins": 1.7269638776779175, + "rewards/rejected": -1.4542850255966187, + "step": 3466 + }, + { + "epoch": 0.4, + "learning_rate": 1.8285145733348938e-07, + "logits/chosen": -3.2121658325195312, + "logits/rejected": -3.276298761367798, + "logps/chosen": -351.5593566894531, + "logps/rejected": -320.284912109375, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07429036498069763, + "rewards/margins": 2.455681562423706, + "rewards/rejected": -2.3813910484313965, + "step": 3467 + }, + { + "epoch": 0.4, + "learning_rate": 1.8281634086386514e-07, + "logits/chosen": -3.138625144958496, + "logits/rejected": -3.0182228088378906, + "logps/chosen": -228.5332489013672, + "logps/rejected": -195.54931640625, + "loss": 0.2832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15359792113304138, + "rewards/margins": 1.688624620437622, + "rewards/rejected": -1.8422224521636963, + "step": 3468 + }, + { + "epoch": 0.4, + "learning_rate": 1.8278122439424092e-07, + "logits/chosen": -3.1214916706085205, + "logits/rejected": -3.3553481101989746, + "logps/chosen": -308.2829895019531, + "logps/rejected": -228.91238403320312, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5386403799057007, + "rewards/margins": 1.3041132688522339, + "rewards/rejected": -1.8427536487579346, + "step": 3469 + }, + { + "epoch": 0.4, + "learning_rate": 1.8274610792461664e-07, + "logits/chosen": -3.2479937076568604, + "logits/rejected": -3.0262136459350586, + "logps/chosen": -261.2726745605469, + "logps/rejected": -312.3704528808594, + "loss": 0.8092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1200781911611557, + "rewards/margins": 0.7003231048583984, + "rewards/rejected": -0.8204012513160706, + "step": 3470 + }, + { + "epoch": 0.4, + "learning_rate": 1.827109914549924e-07, + "logits/chosen": -3.1713786125183105, + "logits/rejected": -3.632185220718384, + "logps/chosen": -171.10513305664062, + "logps/rejected": -350.1403503417969, + "loss": 0.1547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6436076164245605, + "rewards/margins": 3.031108856201172, + "rewards/rejected": -2.3875010013580322, + "step": 3471 + }, + { + "epoch": 0.4, + "learning_rate": 1.8267587498536813e-07, + "logits/chosen": -3.0028862953186035, + "logits/rejected": -2.6542558670043945, + "logps/chosen": -405.6163330078125, + "logps/rejected": -358.9677429199219, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5053099393844604, + "rewards/margins": 2.334343194961548, + "rewards/rejected": -1.8290331363677979, + "step": 3472 + }, + { + "epoch": 0.4, + "learning_rate": 1.8264075851574388e-07, + "logits/chosen": -2.9319562911987305, + "logits/rejected": -2.919583559036255, + "logps/chosen": -287.0363464355469, + "logps/rejected": -248.58811950683594, + "loss": 0.1516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28877294063568115, + "rewards/margins": 2.155003070831299, + "rewards/rejected": -1.8662300109863281, + "step": 3473 + }, + { + "epoch": 0.4, + "learning_rate": 1.8260564204611963e-07, + "logits/chosen": -3.0969252586364746, + "logits/rejected": -2.9206998348236084, + "logps/chosen": -323.7716064453125, + "logps/rejected": -247.12319946289062, + "loss": 0.2542, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4837474524974823, + "rewards/margins": 2.381845474243164, + "rewards/rejected": -1.898098111152649, + "step": 3474 + }, + { + "epoch": 0.4, + "learning_rate": 1.8257052557649536e-07, + "logits/chosen": -2.482708215713501, + "logits/rejected": -2.549679756164551, + "logps/chosen": -498.50634765625, + "logps/rejected": -556.5753784179688, + "loss": 0.3497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04774561524391174, + "rewards/margins": 2.6559560298919678, + "rewards/rejected": -2.7037017345428467, + "step": 3475 + }, + { + "epoch": 0.4, + "learning_rate": 1.8253540910687111e-07, + "logits/chosen": -3.0582573413848877, + "logits/rejected": -2.9603753089904785, + "logps/chosen": -309.6461181640625, + "logps/rejected": -296.5096130371094, + "loss": 0.4161, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07479983568191528, + "rewards/margins": 1.4507852792739868, + "rewards/rejected": -1.3759855031967163, + "step": 3476 + }, + { + "epoch": 0.4, + "learning_rate": 1.8250029263724684e-07, + "logits/chosen": -3.2805378437042236, + "logits/rejected": -3.347604274749756, + "logps/chosen": -276.556884765625, + "logps/rejected": -318.2950134277344, + "loss": 0.1449, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5710178017616272, + "rewards/margins": 2.6123299598693848, + "rewards/rejected": -2.0413122177124023, + "step": 3477 + }, + { + "epoch": 0.4, + "learning_rate": 1.824651761676226e-07, + "logits/chosen": -3.6501729488372803, + "logits/rejected": -3.6750926971435547, + "logps/chosen": -183.37741088867188, + "logps/rejected": -213.5255889892578, + "loss": 0.4588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5190547704696655, + "rewards/margins": 1.7534277439117432, + "rewards/rejected": -2.2724826335906982, + "step": 3478 + }, + { + "epoch": 0.4, + "learning_rate": 1.8243005969799835e-07, + "logits/chosen": -3.0983777046203613, + "logits/rejected": -3.025247097015381, + "logps/chosen": -178.22421264648438, + "logps/rejected": -174.6618194580078, + "loss": 0.6634, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31720882654190063, + "rewards/margins": 0.32789894938468933, + "rewards/rejected": -0.6451078057289124, + "step": 3479 + }, + { + "epoch": 0.4, + "learning_rate": 1.8239494322837408e-07, + "logits/chosen": -3.4038310050964355, + "logits/rejected": -3.725478410720825, + "logps/chosen": -394.9437255859375, + "logps/rejected": -322.6715087890625, + "loss": 0.2465, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07618457078933716, + "rewards/margins": 1.709977626800537, + "rewards/rejected": -1.7861621379852295, + "step": 3480 + }, + { + "epoch": 0.4, + "learning_rate": 1.8235982675874986e-07, + "logits/chosen": -2.801156997680664, + "logits/rejected": -2.628204345703125, + "logps/chosen": -257.4753723144531, + "logps/rejected": -278.3416442871094, + "loss": 0.1718, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2723209857940674, + "rewards/margins": 2.5469841957092285, + "rewards/rejected": -2.274663209915161, + "step": 3481 + }, + { + "epoch": 0.4, + "learning_rate": 1.823247102891256e-07, + "logits/chosen": -2.965620517730713, + "logits/rejected": -3.482375144958496, + "logps/chosen": -191.66055297851562, + "logps/rejected": -169.75970458984375, + "loss": 0.8041, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.432248055934906, + "rewards/margins": 0.6079539656639099, + "rewards/rejected": -1.040202021598816, + "step": 3482 + }, + { + "epoch": 0.4, + "learning_rate": 1.8228959381950134e-07, + "logits/chosen": -2.315192222595215, + "logits/rejected": -2.3022992610931396, + "logps/chosen": -547.8173217773438, + "logps/rejected": -330.9700622558594, + "loss": 0.4336, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7364358901977539, + "rewards/margins": 1.5207734107971191, + "rewards/rejected": -0.78433758020401, + "step": 3483 + }, + { + "epoch": 0.4, + "learning_rate": 1.822544773498771e-07, + "logits/chosen": -3.3228416442871094, + "logits/rejected": -3.421426296234131, + "logps/chosen": -323.55810546875, + "logps/rejected": -186.70529174804688, + "loss": 0.2926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07875534892082214, + "rewards/margins": 1.877289056777954, + "rewards/rejected": -1.956044316291809, + "step": 3484 + }, + { + "epoch": 0.4, + "learning_rate": 1.8221936088025282e-07, + "logits/chosen": -2.8166258335113525, + "logits/rejected": -3.010793685913086, + "logps/chosen": -326.30487060546875, + "logps/rejected": -313.98291015625, + "loss": 0.6684, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2543906271457672, + "rewards/margins": 0.664979875087738, + "rewards/rejected": -0.9193704724311829, + "step": 3485 + }, + { + "epoch": 0.4, + "learning_rate": 1.8218424441062857e-07, + "logits/chosen": -3.0913450717926025, + "logits/rejected": -3.230609893798828, + "logps/chosen": -205.50894165039062, + "logps/rejected": -176.6011505126953, + "loss": 0.5844, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4192887842655182, + "rewards/margins": 1.156830072402954, + "rewards/rejected": -1.5761187076568604, + "step": 3486 + }, + { + "epoch": 0.4, + "learning_rate": 1.8214912794100433e-07, + "logits/chosen": -3.2586112022399902, + "logits/rejected": -3.238257884979248, + "logps/chosen": -252.6163787841797, + "logps/rejected": -182.692138671875, + "loss": 0.3995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03619527816772461, + "rewards/margins": 1.1175763607025146, + "rewards/rejected": -1.1537716388702393, + "step": 3487 + }, + { + "epoch": 0.4, + "learning_rate": 1.8211401147138005e-07, + "logits/chosen": -3.536797046661377, + "logits/rejected": -3.7977352142333984, + "logps/chosen": -130.3854217529297, + "logps/rejected": -238.85816955566406, + "loss": 0.3964, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6975823044776917, + "rewards/margins": 1.4764883518218994, + "rewards/rejected": -0.778905987739563, + "step": 3488 + }, + { + "epoch": 0.4, + "learning_rate": 1.820788950017558e-07, + "logits/chosen": -2.8117165565490723, + "logits/rejected": -2.8562541007995605, + "logps/chosen": -226.33157348632812, + "logps/rejected": -223.89236450195312, + "loss": 0.5455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3207423985004425, + "rewards/margins": 1.2906781435012817, + "rewards/rejected": -1.6114205121994019, + "step": 3489 + }, + { + "epoch": 0.4, + "learning_rate": 1.820437785321316e-07, + "logits/chosen": -2.6005659103393555, + "logits/rejected": -2.570387125015259, + "logps/chosen": -155.2984161376953, + "logps/rejected": -155.80609130859375, + "loss": 0.2867, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1323653757572174, + "rewards/margins": 1.7298380136489868, + "rewards/rejected": -1.5974726676940918, + "step": 3490 + }, + { + "epoch": 0.4, + "learning_rate": 1.820086620625073e-07, + "logits/chosen": -3.620378017425537, + "logits/rejected": -3.3464744091033936, + "logps/chosen": -167.00424194335938, + "logps/rejected": -125.95811462402344, + "loss": 0.459, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4857735335826874, + "rewards/margins": 0.8232100009918213, + "rewards/rejected": -1.308983564376831, + "step": 3491 + }, + { + "epoch": 0.4, + "learning_rate": 1.8197354559288307e-07, + "logits/chosen": -3.0018129348754883, + "logits/rejected": -2.81488037109375, + "logps/chosen": -274.6968994140625, + "logps/rejected": -263.5760803222656, + "loss": 0.2185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04472966492176056, + "rewards/margins": 2.2247681617736816, + "rewards/rejected": -2.1800384521484375, + "step": 3492 + }, + { + "epoch": 0.4, + "learning_rate": 1.819384291232588e-07, + "logits/chosen": -3.2814111709594727, + "logits/rejected": -3.0376269817352295, + "logps/chosen": -334.3322448730469, + "logps/rejected": -227.94834899902344, + "loss": 0.5195, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14294324815273285, + "rewards/margins": 0.8109804391860962, + "rewards/rejected": -0.953923761844635, + "step": 3493 + }, + { + "epoch": 0.4, + "learning_rate": 1.8190331265363455e-07, + "logits/chosen": -3.5348424911499023, + "logits/rejected": -3.389540910720825, + "logps/chosen": -227.6029052734375, + "logps/rejected": -176.49974060058594, + "loss": 0.3104, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6147162914276123, + "rewards/margins": 1.8974261283874512, + "rewards/rejected": -1.2827098369598389, + "step": 3494 + }, + { + "epoch": 0.4, + "learning_rate": 1.818681961840103e-07, + "logits/chosen": -3.398228168487549, + "logits/rejected": -3.0416505336761475, + "logps/chosen": -181.53073120117188, + "logps/rejected": -218.158203125, + "loss": 0.7362, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5922834873199463, + "rewards/margins": 0.39503419399261475, + "rewards/rejected": -0.987317681312561, + "step": 3495 + }, + { + "epoch": 0.4, + "learning_rate": 1.8183307971438603e-07, + "logits/chosen": -3.3147096633911133, + "logits/rejected": -3.261153221130371, + "logps/chosen": -199.16162109375, + "logps/rejected": -177.08108520507812, + "loss": 0.5358, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3324744701385498, + "rewards/margins": 1.4164992570877075, + "rewards/rejected": -1.7489736080169678, + "step": 3496 + }, + { + "epoch": 0.4, + "learning_rate": 1.8179796324476179e-07, + "logits/chosen": -2.738931894302368, + "logits/rejected": -2.635317802429199, + "logps/chosen": -293.00030517578125, + "logps/rejected": -207.85572814941406, + "loss": 0.5644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2505229413509369, + "rewards/margins": 0.7619376182556152, + "rewards/rejected": -0.511414647102356, + "step": 3497 + }, + { + "epoch": 0.4, + "learning_rate": 1.8176284677513754e-07, + "logits/chosen": -2.78204345703125, + "logits/rejected": -2.776705503463745, + "logps/chosen": -190.83224487304688, + "logps/rejected": -174.32786560058594, + "loss": 0.5097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.360834538936615, + "rewards/margins": 0.8450148105621338, + "rewards/rejected": -1.2058494091033936, + "step": 3498 + }, + { + "epoch": 0.4, + "learning_rate": 1.8172773030551327e-07, + "logits/chosen": -3.341421604156494, + "logits/rejected": -3.09098219871521, + "logps/chosen": -231.8680419921875, + "logps/rejected": -219.88687133789062, + "loss": 0.3338, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10879823565483093, + "rewards/margins": 1.5189762115478516, + "rewards/rejected": -1.6277744770050049, + "step": 3499 + }, + { + "epoch": 0.4, + "learning_rate": 1.8169261383588902e-07, + "logits/chosen": -3.3972723484039307, + "logits/rejected": -3.31160044670105, + "logps/chosen": -203.02529907226562, + "logps/rejected": -124.50965118408203, + "loss": 0.5684, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7365705966949463, + "rewards/margins": 0.7156797647476196, + "rewards/rejected": -1.452250361442566, + "step": 3500 + }, + { + "epoch": 0.4, + "learning_rate": 1.8165749736626475e-07, + "logits/chosen": -2.3249521255493164, + "logits/rejected": -2.3845362663269043, + "logps/chosen": -295.1923828125, + "logps/rejected": -236.73013305664062, + "loss": 0.4975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6185683608055115, + "rewards/margins": 1.1536791324615479, + "rewards/rejected": -1.7722474336624146, + "step": 3501 + }, + { + "epoch": 0.4, + "learning_rate": 1.816223808966405e-07, + "logits/chosen": -2.95945405960083, + "logits/rejected": -3.520936965942383, + "logps/chosen": -161.14306640625, + "logps/rejected": -253.6451416015625, + "loss": 0.3041, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2552975118160248, + "rewards/margins": 3.4290122985839844, + "rewards/rejected": -3.1737148761749268, + "step": 3502 + }, + { + "epoch": 0.4, + "learning_rate": 1.8158726442701628e-07, + "logits/chosen": -2.876594066619873, + "logits/rejected": -2.8193209171295166, + "logps/chosen": -261.20098876953125, + "logps/rejected": -282.892333984375, + "loss": 0.5641, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3670169711112976, + "rewards/margins": 0.5241342782974243, + "rewards/rejected": -0.8911513090133667, + "step": 3503 + }, + { + "epoch": 0.4, + "learning_rate": 1.81552147957392e-07, + "logits/chosen": -2.8715567588806152, + "logits/rejected": -2.938143730163574, + "logps/chosen": -240.77210998535156, + "logps/rejected": -131.92938232421875, + "loss": 0.3856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3410758674144745, + "rewards/margins": 1.1967374086380005, + "rewards/rejected": -1.5378131866455078, + "step": 3504 + }, + { + "epoch": 0.4, + "learning_rate": 1.8151703148776776e-07, + "logits/chosen": -2.33023738861084, + "logits/rejected": -2.721149206161499, + "logps/chosen": -314.65509033203125, + "logps/rejected": -266.1240234375, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0778169259428978, + "rewards/margins": 1.386824607849121, + "rewards/rejected": -1.4646415710449219, + "step": 3505 + }, + { + "epoch": 0.4, + "learning_rate": 1.8148191501814352e-07, + "logits/chosen": -3.210291862487793, + "logits/rejected": -3.298276424407959, + "logps/chosen": -231.75164794921875, + "logps/rejected": -184.5130615234375, + "loss": 0.3288, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4118233919143677, + "rewards/margins": 2.3243932723999023, + "rewards/rejected": -1.9125698804855347, + "step": 3506 + }, + { + "epoch": 0.4, + "learning_rate": 1.8144679854851925e-07, + "logits/chosen": -3.4119420051574707, + "logits/rejected": -3.175459384918213, + "logps/chosen": -377.87127685546875, + "logps/rejected": -238.89112854003906, + "loss": 0.3367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22446171939373016, + "rewards/margins": 1.336361289024353, + "rewards/rejected": -1.5608230829238892, + "step": 3507 + }, + { + "epoch": 0.4, + "learning_rate": 1.81411682078895e-07, + "logits/chosen": -3.3921639919281006, + "logits/rejected": -3.4882702827453613, + "logps/chosen": -272.9784851074219, + "logps/rejected": -405.99853515625, + "loss": 0.6273, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009888291358947754, + "rewards/margins": 1.6388428211212158, + "rewards/rejected": -1.6289546489715576, + "step": 3508 + }, + { + "epoch": 0.4, + "learning_rate": 1.8137656560927073e-07, + "logits/chosen": -3.625364065170288, + "logits/rejected": -3.4376864433288574, + "logps/chosen": -154.8473358154297, + "logps/rejected": -182.18690490722656, + "loss": 0.3963, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.024679839611053467, + "rewards/margins": 1.5191617012023926, + "rewards/rejected": -1.5438413619995117, + "step": 3509 + }, + { + "epoch": 0.4, + "learning_rate": 1.8134144913964648e-07, + "logits/chosen": -3.633683919906616, + "logits/rejected": -3.8273935317993164, + "logps/chosen": -275.23907470703125, + "logps/rejected": -370.685546875, + "loss": 0.6129, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32291507720947266, + "rewards/margins": 0.8851771354675293, + "rewards/rejected": -1.208092212677002, + "step": 3510 + }, + { + "epoch": 0.4, + "learning_rate": 1.8130633267002223e-07, + "logits/chosen": -2.595123767852783, + "logits/rejected": -2.3078722953796387, + "logps/chosen": -302.73455810546875, + "logps/rejected": -248.29824829101562, + "loss": 0.527, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2803901433944702, + "rewards/margins": 0.771223247051239, + "rewards/rejected": -1.051613450050354, + "step": 3511 + }, + { + "epoch": 0.4, + "learning_rate": 1.8127121620039796e-07, + "logits/chosen": -2.610584259033203, + "logits/rejected": -2.9112067222595215, + "logps/chosen": -340.950927734375, + "logps/rejected": -282.0301513671875, + "loss": 0.4101, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2517627477645874, + "rewards/margins": 1.2492716312408447, + "rewards/rejected": -0.9975088238716125, + "step": 3512 + }, + { + "epoch": 0.4, + "learning_rate": 1.8123609973077372e-07, + "logits/chosen": -3.429264545440674, + "logits/rejected": -3.6341028213500977, + "logps/chosen": -149.20033264160156, + "logps/rejected": -275.7229309082031, + "loss": 0.1171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17710228264331818, + "rewards/margins": 2.8497495651245117, + "rewards/rejected": -2.672646999359131, + "step": 3513 + }, + { + "epoch": 0.41, + "learning_rate": 1.812009832611495e-07, + "logits/chosen": -3.748776435852051, + "logits/rejected": -3.8666112422943115, + "logps/chosen": -166.33421325683594, + "logps/rejected": -190.65948486328125, + "loss": 0.6712, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5440917015075684, + "rewards/margins": 0.28966575860977173, + "rewards/rejected": -0.8337573409080505, + "step": 3514 + }, + { + "epoch": 0.41, + "learning_rate": 1.8116586679152522e-07, + "logits/chosen": -2.9627506732940674, + "logits/rejected": -3.121913433074951, + "logps/chosen": -291.54486083984375, + "logps/rejected": -150.74749755859375, + "loss": 0.5496, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7176952958106995, + "rewards/margins": 0.8127733469009399, + "rewards/rejected": -1.5304685831069946, + "step": 3515 + }, + { + "epoch": 0.41, + "learning_rate": 1.8113075032190098e-07, + "logits/chosen": -3.2746124267578125, + "logits/rejected": -3.2562255859375, + "logps/chosen": -131.16357421875, + "logps/rejected": -203.6732177734375, + "loss": 0.3891, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.024155285209417343, + "rewards/margins": 1.4734981060028076, + "rewards/rejected": -1.4976534843444824, + "step": 3516 + }, + { + "epoch": 0.41, + "learning_rate": 1.810956338522767e-07, + "logits/chosen": -3.247891902923584, + "logits/rejected": -3.107728958129883, + "logps/chosen": -262.46990966796875, + "logps/rejected": -263.36395263671875, + "loss": 0.4809, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23728737235069275, + "rewards/margins": 1.7863588333129883, + "rewards/rejected": -2.023646116256714, + "step": 3517 + }, + { + "epoch": 0.41, + "learning_rate": 1.8106051738265246e-07, + "logits/chosen": -3.0262341499328613, + "logits/rejected": -2.7744410037994385, + "logps/chosen": -179.73501586914062, + "logps/rejected": -331.4676208496094, + "loss": 0.3731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3168907165527344, + "rewards/margins": 2.8446624279022217, + "rewards/rejected": -3.161553144454956, + "step": 3518 + }, + { + "epoch": 0.41, + "learning_rate": 1.810254009130282e-07, + "logits/chosen": -3.222296714782715, + "logits/rejected": -3.4267783164978027, + "logps/chosen": -304.1332702636719, + "logps/rejected": -256.58868408203125, + "loss": 0.1085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4557272791862488, + "rewards/margins": 3.2894203662872314, + "rewards/rejected": -2.833693027496338, + "step": 3519 + }, + { + "epoch": 0.41, + "learning_rate": 1.8099028444340394e-07, + "logits/chosen": -3.3049049377441406, + "logits/rejected": -3.351945400238037, + "logps/chosen": -295.232177734375, + "logps/rejected": -168.79864501953125, + "loss": 0.3549, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.351114422082901, + "rewards/margins": 1.7252130508422852, + "rewards/rejected": -1.374098539352417, + "step": 3520 + }, + { + "epoch": 0.41, + "learning_rate": 1.809551679737797e-07, + "logits/chosen": -3.6326072216033936, + "logits/rejected": -3.6233863830566406, + "logps/chosen": -184.19166564941406, + "logps/rejected": -251.66482543945312, + "loss": 0.2333, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13720515370368958, + "rewards/margins": 2.3914060592651367, + "rewards/rejected": -2.2542009353637695, + "step": 3521 + }, + { + "epoch": 0.41, + "learning_rate": 1.8092005150415542e-07, + "logits/chosen": -3.7643675804138184, + "logits/rejected": -3.722980499267578, + "logps/chosen": -172.59310913085938, + "logps/rejected": -228.09353637695312, + "loss": 0.196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20166811347007751, + "rewards/margins": 3.226357936859131, + "rewards/rejected": -3.0246901512145996, + "step": 3522 + }, + { + "epoch": 0.41, + "learning_rate": 1.8088493503453117e-07, + "logits/chosen": -3.0340662002563477, + "logits/rejected": -3.3380446434020996, + "logps/chosen": -189.09100341796875, + "logps/rejected": -273.3052978515625, + "loss": 0.1255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44025278091430664, + "rewards/margins": 3.1986234188079834, + "rewards/rejected": -2.758370876312256, + "step": 3523 + }, + { + "epoch": 0.41, + "learning_rate": 1.8084981856490696e-07, + "logits/chosen": -3.426180839538574, + "logits/rejected": -2.935577869415283, + "logps/chosen": -259.528076171875, + "logps/rejected": -346.6033020019531, + "loss": 0.4706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4800933599472046, + "rewards/margins": 0.9394630789756775, + "rewards/rejected": -1.4195563793182373, + "step": 3524 + }, + { + "epoch": 0.41, + "learning_rate": 1.8081470209528266e-07, + "logits/chosen": -2.9008960723876953, + "logits/rejected": -2.846498489379883, + "logps/chosen": -263.61151123046875, + "logps/rejected": -349.6339416503906, + "loss": 0.8014, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2338694930076599, + "rewards/margins": 0.1784113198518753, + "rewards/rejected": -0.412280797958374, + "step": 3525 + }, + { + "epoch": 0.41, + "learning_rate": 1.8077958562565844e-07, + "logits/chosen": -3.256047248840332, + "logits/rejected": -3.243262767791748, + "logps/chosen": -268.6332092285156, + "logps/rejected": -332.2796936035156, + "loss": 0.3, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3255889415740967, + "rewards/margins": 2.0704267024993896, + "rewards/rejected": -1.744837760925293, + "step": 3526 + }, + { + "epoch": 0.41, + "learning_rate": 1.807444691560342e-07, + "logits/chosen": -2.972883939743042, + "logits/rejected": -3.1339945793151855, + "logps/chosen": -96.5503158569336, + "logps/rejected": -188.99761962890625, + "loss": 0.5467, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025065936148166656, + "rewards/margins": 0.5969998240470886, + "rewards/rejected": -0.5719338655471802, + "step": 3527 + }, + { + "epoch": 0.41, + "learning_rate": 1.8070935268640992e-07, + "logits/chosen": -3.1618940830230713, + "logits/rejected": -2.9313883781433105, + "logps/chosen": -312.5709228515625, + "logps/rejected": -298.4981689453125, + "loss": 0.8775, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8528946042060852, + "rewards/margins": 0.6155151128768921, + "rewards/rejected": -1.4684096574783325, + "step": 3528 + }, + { + "epoch": 0.41, + "learning_rate": 1.8067423621678567e-07, + "logits/chosen": -2.730638027191162, + "logits/rejected": -2.5746774673461914, + "logps/chosen": -334.9827575683594, + "logps/rejected": -292.0713806152344, + "loss": 0.5312, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5346776843070984, + "rewards/margins": 1.5166630744934082, + "rewards/rejected": -0.9819853901863098, + "step": 3529 + }, + { + "epoch": 0.41, + "learning_rate": 1.806391197471614e-07, + "logits/chosen": -2.695577621459961, + "logits/rejected": -2.7087061405181885, + "logps/chosen": -305.7821960449219, + "logps/rejected": -233.46151733398438, + "loss": 0.2069, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1624404489994049, + "rewards/margins": 2.9837310314178467, + "rewards/rejected": -2.8212904930114746, + "step": 3530 + }, + { + "epoch": 0.41, + "learning_rate": 1.8060400327753715e-07, + "logits/chosen": -3.1361608505249023, + "logits/rejected": -2.916029453277588, + "logps/chosen": -320.7582092285156, + "logps/rejected": -185.6348419189453, + "loss": 0.3569, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.040156908333301544, + "rewards/margins": 1.3215934038162231, + "rewards/rejected": -1.281436562538147, + "step": 3531 + }, + { + "epoch": 0.41, + "learning_rate": 1.805688868079129e-07, + "logits/chosen": -2.852931022644043, + "logits/rejected": -2.800537109375, + "logps/chosen": -257.8814392089844, + "logps/rejected": -314.03094482421875, + "loss": 0.5626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04305630177259445, + "rewards/margins": 1.6780409812927246, + "rewards/rejected": -1.7210972309112549, + "step": 3532 + }, + { + "epoch": 0.41, + "learning_rate": 1.8053377033828863e-07, + "logits/chosen": -3.238154888153076, + "logits/rejected": -3.3996310234069824, + "logps/chosen": -310.34698486328125, + "logps/rejected": -235.49856567382812, + "loss": 0.2456, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6982054710388184, + "rewards/margins": 2.295017719268799, + "rewards/rejected": -1.5968122482299805, + "step": 3533 + }, + { + "epoch": 0.41, + "learning_rate": 1.804986538686644e-07, + "logits/chosen": -2.837984800338745, + "logits/rejected": -3.119460105895996, + "logps/chosen": -267.6942138671875, + "logps/rejected": -272.9362487792969, + "loss": 0.2073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07215878367424011, + "rewards/margins": 2.4345455169677734, + "rewards/rejected": -2.506704330444336, + "step": 3534 + }, + { + "epoch": 0.41, + "learning_rate": 1.8046353739904017e-07, + "logits/chosen": -2.6964151859283447, + "logits/rejected": -2.881702184677124, + "logps/chosen": -341.4447021484375, + "logps/rejected": -274.589111328125, + "loss": 0.3022, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6412876844406128, + "rewards/margins": 1.657568097114563, + "rewards/rejected": -1.0162804126739502, + "step": 3535 + }, + { + "epoch": 0.41, + "learning_rate": 1.8042842092941587e-07, + "logits/chosen": -2.972026824951172, + "logits/rejected": -3.1504030227661133, + "logps/chosen": -240.920166015625, + "logps/rejected": -268.59295654296875, + "loss": 0.5927, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.038548290729522705, + "rewards/margins": 0.7191018462181091, + "rewards/rejected": -0.7576501369476318, + "step": 3536 + }, + { + "epoch": 0.41, + "learning_rate": 1.8039330445979165e-07, + "logits/chosen": -2.6403419971466064, + "logits/rejected": -2.7194504737854004, + "logps/chosen": -320.1580505371094, + "logps/rejected": -171.84414672851562, + "loss": 0.6675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3283534646034241, + "rewards/margins": 0.5059452652931213, + "rewards/rejected": -0.8342987298965454, + "step": 3537 + }, + { + "epoch": 0.41, + "learning_rate": 1.8035818799016738e-07, + "logits/chosen": -2.829660415649414, + "logits/rejected": -3.1425180435180664, + "logps/chosen": -297.35687255859375, + "logps/rejected": -259.4021301269531, + "loss": 0.5342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4285831153392792, + "rewards/margins": 0.7417243123054504, + "rewards/rejected": -1.1703073978424072, + "step": 3538 + }, + { + "epoch": 0.41, + "learning_rate": 1.8032307152054313e-07, + "logits/chosen": -3.6579337120056152, + "logits/rejected": -3.167973518371582, + "logps/chosen": -269.94598388671875, + "logps/rejected": -185.7836151123047, + "loss": 0.4202, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14276990294456482, + "rewards/margins": 2.0528042316436768, + "rewards/rejected": -1.910034418106079, + "step": 3539 + }, + { + "epoch": 0.41, + "learning_rate": 1.8028795505091888e-07, + "logits/chosen": -3.599830150604248, + "logits/rejected": -3.450124502182007, + "logps/chosen": -171.15365600585938, + "logps/rejected": -245.3795166015625, + "loss": 0.4425, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07464843988418579, + "rewards/margins": 1.3255510330200195, + "rewards/rejected": -1.250902533531189, + "step": 3540 + }, + { + "epoch": 0.41, + "learning_rate": 1.802528385812946e-07, + "logits/chosen": -3.2925190925598145, + "logits/rejected": -3.050676107406616, + "logps/chosen": -186.185791015625, + "logps/rejected": -280.36651611328125, + "loss": 0.4658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28264689445495605, + "rewards/margins": 1.1117125749588013, + "rewards/rejected": -1.3943595886230469, + "step": 3541 + }, + { + "epoch": 0.41, + "learning_rate": 1.8021772211167037e-07, + "logits/chosen": -3.0572071075439453, + "logits/rejected": -2.9025843143463135, + "logps/chosen": -325.84466552734375, + "logps/rejected": -357.8912353515625, + "loss": 0.4736, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2489316463470459, + "rewards/margins": 0.8641751408576965, + "rewards/rejected": -0.6152435541152954, + "step": 3542 + }, + { + "epoch": 0.41, + "learning_rate": 1.8018260564204612e-07, + "logits/chosen": -3.3201866149902344, + "logits/rejected": -3.6926050186157227, + "logps/chosen": -180.25425720214844, + "logps/rejected": -418.7351989746094, + "loss": 0.1679, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24111074209213257, + "rewards/margins": 4.011889934539795, + "rewards/rejected": -4.253000736236572, + "step": 3543 + }, + { + "epoch": 0.41, + "learning_rate": 1.8014748917242185e-07, + "logits/chosen": -3.4852776527404785, + "logits/rejected": -3.5088112354278564, + "logps/chosen": -225.91748046875, + "logps/rejected": -394.4898681640625, + "loss": 0.1458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9017502069473267, + "rewards/margins": 4.778542995452881, + "rewards/rejected": -3.8767926692962646, + "step": 3544 + }, + { + "epoch": 0.41, + "learning_rate": 1.801123727027976e-07, + "logits/chosen": -3.647865056991577, + "logits/rejected": -3.666170597076416, + "logps/chosen": -198.337646484375, + "logps/rejected": -202.23406982421875, + "loss": 0.1704, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2849385738372803, + "rewards/margins": 2.9037084579467773, + "rewards/rejected": -2.618769884109497, + "step": 3545 + }, + { + "epoch": 0.41, + "learning_rate": 1.8007725623317333e-07, + "logits/chosen": -3.5994811058044434, + "logits/rejected": -3.402290105819702, + "logps/chosen": -493.35211181640625, + "logps/rejected": -249.50746154785156, + "loss": 0.2979, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03033638373017311, + "rewards/margins": 2.3456578254699707, + "rewards/rejected": -2.315321445465088, + "step": 3546 + }, + { + "epoch": 0.41, + "learning_rate": 1.8004213976354908e-07, + "logits/chosen": -3.00870680809021, + "logits/rejected": -2.7908101081848145, + "logps/chosen": -261.6490478515625, + "logps/rejected": -243.97512817382812, + "loss": 0.5655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2533054053783417, + "rewards/margins": 1.5803775787353516, + "rewards/rejected": -1.8336832523345947, + "step": 3547 + }, + { + "epoch": 0.41, + "learning_rate": 1.8000702329392486e-07, + "logits/chosen": -2.3727807998657227, + "logits/rejected": -2.5971624851226807, + "logps/chosen": -340.977783203125, + "logps/rejected": -287.73504638671875, + "loss": 0.3063, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3282092213630676, + "rewards/margins": 1.6488394737243652, + "rewards/rejected": -1.3206300735473633, + "step": 3548 + }, + { + "epoch": 0.41, + "learning_rate": 1.799719068243006e-07, + "logits/chosen": -3.045339584350586, + "logits/rejected": -3.395671844482422, + "logps/chosen": -314.1576843261719, + "logps/rejected": -296.9385986328125, + "loss": 0.8336, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.671963632106781, + "rewards/margins": 2.199672222137451, + "rewards/rejected": -2.8716354370117188, + "step": 3549 + }, + { + "epoch": 0.41, + "learning_rate": 1.7993679035467634e-07, + "logits/chosen": -3.4149508476257324, + "logits/rejected": -3.393261432647705, + "logps/chosen": -181.16526794433594, + "logps/rejected": -187.52818298339844, + "loss": 0.3526, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8653045296669006, + "rewards/margins": 2.1019089221954346, + "rewards/rejected": -1.2366042137145996, + "step": 3550 + }, + { + "epoch": 0.41, + "learning_rate": 1.799016738850521e-07, + "logits/chosen": -3.4932665824890137, + "logits/rejected": -3.396778106689453, + "logps/chosen": -445.5292053222656, + "logps/rejected": -223.22637939453125, + "loss": 0.2947, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12282834202051163, + "rewards/margins": 1.9155761003494263, + "rewards/rejected": -2.0384044647216797, + "step": 3551 + }, + { + "epoch": 0.41, + "learning_rate": 1.7986655741542782e-07, + "logits/chosen": -3.0920093059539795, + "logits/rejected": -2.52260160446167, + "logps/chosen": -480.3854064941406, + "logps/rejected": -315.7304382324219, + "loss": 0.5029, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2985793948173523, + "rewards/margins": 0.8601231575012207, + "rewards/rejected": -1.1587024927139282, + "step": 3552 + }, + { + "epoch": 0.41, + "learning_rate": 1.7983144094580358e-07, + "logits/chosen": -3.444403886795044, + "logits/rejected": -3.4472296237945557, + "logps/chosen": -244.19346618652344, + "logps/rejected": -232.25140380859375, + "loss": 0.5606, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0017276853322982788, + "rewards/margins": 0.7963016629219055, + "rewards/rejected": -0.794573962688446, + "step": 3553 + }, + { + "epoch": 0.41, + "learning_rate": 1.797963244761793e-07, + "logits/chosen": -3.444579601287842, + "logits/rejected": -3.6395111083984375, + "logps/chosen": -266.3729248046875, + "logps/rejected": -278.0374755859375, + "loss": 0.2473, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19648362696170807, + "rewards/margins": 1.9721672534942627, + "rewards/rejected": -1.7756834030151367, + "step": 3554 + }, + { + "epoch": 0.41, + "learning_rate": 1.7976120800655506e-07, + "logits/chosen": -3.6600799560546875, + "logits/rejected": -3.525402307510376, + "logps/chosen": -378.540283203125, + "logps/rejected": -248.02169799804688, + "loss": 0.5229, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1571844518184662, + "rewards/margins": 1.1672313213348389, + "rewards/rejected": -1.0100469589233398, + "step": 3555 + }, + { + "epoch": 0.41, + "learning_rate": 1.7972609153693081e-07, + "logits/chosen": -3.34706711769104, + "logits/rejected": -3.369274854660034, + "logps/chosen": -157.0826416015625, + "logps/rejected": -130.1790771484375, + "loss": 0.6179, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29796284437179565, + "rewards/margins": 0.8686269521713257, + "rewards/rejected": -1.1665897369384766, + "step": 3556 + }, + { + "epoch": 0.41, + "learning_rate": 1.7969097506730654e-07, + "logits/chosen": -3.1978306770324707, + "logits/rejected": -3.237532138824463, + "logps/chosen": -383.8458251953125, + "logps/rejected": -388.5544738769531, + "loss": 0.362, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05847259610891342, + "rewards/margins": 1.5209019184112549, + "rewards/rejected": -1.4624292850494385, + "step": 3557 + }, + { + "epoch": 0.41, + "learning_rate": 1.7965585859768232e-07, + "logits/chosen": -2.982764959335327, + "logits/rejected": -2.586331844329834, + "logps/chosen": -258.25738525390625, + "logps/rejected": -280.41949462890625, + "loss": 0.612, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2329237163066864, + "rewards/margins": 1.338122844696045, + "rewards/rejected": -1.1051990985870361, + "step": 3558 + }, + { + "epoch": 0.41, + "learning_rate": 1.7962074212805808e-07, + "logits/chosen": -3.6966843605041504, + "logits/rejected": -3.742447853088379, + "logps/chosen": -301.4953308105469, + "logps/rejected": -374.72808837890625, + "loss": 0.1556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5347863435745239, + "rewards/margins": 2.4585700035095215, + "rewards/rejected": -2.993356466293335, + "step": 3559 + }, + { + "epoch": 0.41, + "learning_rate": 1.795856256584338e-07, + "logits/chosen": -3.516669273376465, + "logits/rejected": -3.243940591812134, + "logps/chosen": -191.81639099121094, + "logps/rejected": -162.83828735351562, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3171611428260803, + "rewards/margins": 2.252139091491699, + "rewards/rejected": -1.9349777698516846, + "step": 3560 + }, + { + "epoch": 0.41, + "learning_rate": 1.7955050918880956e-07, + "logits/chosen": -3.2200703620910645, + "logits/rejected": -2.896397829055786, + "logps/chosen": -377.1133117675781, + "logps/rejected": -352.603759765625, + "loss": 0.7102, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009619921445846558, + "rewards/margins": 1.5102200508117676, + "rewards/rejected": -1.5198400020599365, + "step": 3561 + }, + { + "epoch": 0.41, + "learning_rate": 1.7951539271918528e-07, + "logits/chosen": -2.9030487537384033, + "logits/rejected": -3.216236114501953, + "logps/chosen": -317.156982421875, + "logps/rejected": -229.81228637695312, + "loss": 0.649, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6754428744316101, + "rewards/margins": 1.6627082824707031, + "rewards/rejected": -2.338150978088379, + "step": 3562 + }, + { + "epoch": 0.41, + "learning_rate": 1.7948027624956104e-07, + "logits/chosen": -3.3277573585510254, + "logits/rejected": -3.6093921661376953, + "logps/chosen": -112.35592651367188, + "logps/rejected": -235.80731201171875, + "loss": 0.764, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14807671308517456, + "rewards/margins": 1.7362470626831055, + "rewards/rejected": -1.8843238353729248, + "step": 3563 + }, + { + "epoch": 0.41, + "learning_rate": 1.794451597799368e-07, + "logits/chosen": -2.754390239715576, + "logits/rejected": -2.4703493118286133, + "logps/chosen": -148.1443328857422, + "logps/rejected": -265.64007568359375, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1317780613899231, + "rewards/margins": 1.9318079948425293, + "rewards/rejected": -2.0635862350463867, + "step": 3564 + }, + { + "epoch": 0.41, + "learning_rate": 1.7941004331031252e-07, + "logits/chosen": -3.497138738632202, + "logits/rejected": -3.274937152862549, + "logps/chosen": -155.88491821289062, + "logps/rejected": -208.86766052246094, + "loss": 0.3431, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34379512071609497, + "rewards/margins": 1.904645562171936, + "rewards/rejected": -1.5608505010604858, + "step": 3565 + }, + { + "epoch": 0.41, + "learning_rate": 1.7937492684068827e-07, + "logits/chosen": -3.0342555046081543, + "logits/rejected": -3.0455422401428223, + "logps/chosen": -353.06744384765625, + "logps/rejected": -405.1575927734375, + "loss": 0.3666, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0693933516740799, + "rewards/margins": 2.194253921508789, + "rewards/rejected": -2.1248605251312256, + "step": 3566 + }, + { + "epoch": 0.41, + "learning_rate": 1.7933981037106403e-07, + "logits/chosen": -2.927114725112915, + "logits/rejected": -2.704010009765625, + "logps/chosen": -232.83078002929688, + "logps/rejected": -276.80255126953125, + "loss": 0.5269, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0649225041270256, + "rewards/margins": 1.9575031995773315, + "rewards/rejected": -1.8925807476043701, + "step": 3567 + }, + { + "epoch": 0.41, + "learning_rate": 1.7930469390143975e-07, + "logits/chosen": -3.120628833770752, + "logits/rejected": -3.0751068592071533, + "logps/chosen": -282.2817687988281, + "logps/rejected": -229.83148193359375, + "loss": 0.3624, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09228713810443878, + "rewards/margins": 1.308052897453308, + "rewards/rejected": -1.4003400802612305, + "step": 3568 + }, + { + "epoch": 0.41, + "learning_rate": 1.7926957743181553e-07, + "logits/chosen": -3.197434663772583, + "logits/rejected": -3.1529765129089355, + "logps/chosen": -235.1815185546875, + "logps/rejected": -247.06492614746094, + "loss": 0.4944, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48043420910835266, + "rewards/margins": 1.019172191619873, + "rewards/rejected": -1.4996063709259033, + "step": 3569 + }, + { + "epoch": 0.41, + "learning_rate": 1.7923446096219124e-07, + "logits/chosen": -3.244213104248047, + "logits/rejected": -3.5436325073242188, + "logps/chosen": -139.10414123535156, + "logps/rejected": -166.80296325683594, + "loss": 0.2089, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14913733303546906, + "rewards/margins": 2.1221837997436523, + "rewards/rejected": -1.9730464220046997, + "step": 3570 + }, + { + "epoch": 0.41, + "learning_rate": 1.7919934449256702e-07, + "logits/chosen": -3.663511276245117, + "logits/rejected": -3.7316360473632812, + "logps/chosen": -140.53834533691406, + "logps/rejected": -241.27655029296875, + "loss": 0.1211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.721221923828125, + "rewards/margins": 3.1858344078063965, + "rewards/rejected": -2.4646124839782715, + "step": 3571 + }, + { + "epoch": 0.41, + "learning_rate": 1.7916422802294277e-07, + "logits/chosen": -2.329477310180664, + "logits/rejected": -2.4047067165374756, + "logps/chosen": -271.47418212890625, + "logps/rejected": -278.38818359375, + "loss": 0.354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2950696051120758, + "rewards/margins": 1.474135160446167, + "rewards/rejected": -1.76920485496521, + "step": 3572 + }, + { + "epoch": 0.41, + "learning_rate": 1.791291115533185e-07, + "logits/chosen": -3.5097479820251465, + "logits/rejected": -3.0119855403900146, + "logps/chosen": -306.4727783203125, + "logps/rejected": -197.92445373535156, + "loss": 0.2785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28227612376213074, + "rewards/margins": 1.8367836475372314, + "rewards/rejected": -2.1190598011016846, + "step": 3573 + }, + { + "epoch": 0.41, + "learning_rate": 1.7909399508369425e-07, + "logits/chosen": -3.6271681785583496, + "logits/rejected": -3.247648000717163, + "logps/chosen": -339.15997314453125, + "logps/rejected": -238.498291015625, + "loss": 0.676, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0342439413070679, + "rewards/margins": 0.9076406359672546, + "rewards/rejected": -1.9418846368789673, + "step": 3574 + }, + { + "epoch": 0.41, + "learning_rate": 1.7905887861406998e-07, + "logits/chosen": -2.9825615882873535, + "logits/rejected": -3.0818192958831787, + "logps/chosen": -412.22479248046875, + "logps/rejected": -321.6934509277344, + "loss": 0.8278, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9805960059165955, + "rewards/margins": 0.8124275207519531, + "rewards/rejected": -1.7930233478546143, + "step": 3575 + }, + { + "epoch": 0.41, + "learning_rate": 1.7902376214444573e-07, + "logits/chosen": -3.1296074390411377, + "logits/rejected": -3.39495587348938, + "logps/chosen": -184.06619262695312, + "logps/rejected": -319.42950439453125, + "loss": 0.3087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.003612454980611801, + "rewards/margins": 3.005953311920166, + "rewards/rejected": -3.009565591812134, + "step": 3576 + }, + { + "epoch": 0.41, + "learning_rate": 1.7898864567482149e-07, + "logits/chosen": -2.3476271629333496, + "logits/rejected": -2.5994365215301514, + "logps/chosen": -227.3824462890625, + "logps/rejected": -255.1779327392578, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06441403925418854, + "rewards/margins": 2.780094861984253, + "rewards/rejected": -2.844508647918701, + "step": 3577 + }, + { + "epoch": 0.41, + "learning_rate": 1.789535292051972e-07, + "logits/chosen": -2.42329740524292, + "logits/rejected": -2.829495668411255, + "logps/chosen": -282.9150390625, + "logps/rejected": -226.6349334716797, + "loss": 0.5613, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0601639449596405, + "rewards/margins": 1.396545171737671, + "rewards/rejected": -1.4567091464996338, + "step": 3578 + }, + { + "epoch": 0.41, + "learning_rate": 1.7891841273557297e-07, + "logits/chosen": -3.555476427078247, + "logits/rejected": -3.6050782203674316, + "logps/chosen": -251.6819610595703, + "logps/rejected": -362.14373779296875, + "loss": 0.3381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46539461612701416, + "rewards/margins": 2.855320453643799, + "rewards/rejected": -3.3207151889801025, + "step": 3579 + }, + { + "epoch": 0.41, + "learning_rate": 1.7888329626594875e-07, + "logits/chosen": -3.1268386840820312, + "logits/rejected": -2.943819284439087, + "logps/chosen": -390.14190673828125, + "logps/rejected": -163.991455078125, + "loss": 0.5289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49210745096206665, + "rewards/margins": 0.7762027978897095, + "rewards/rejected": -1.2683101892471313, + "step": 3580 + }, + { + "epoch": 0.41, + "learning_rate": 1.7884817979632445e-07, + "logits/chosen": -3.3787033557891846, + "logits/rejected": -3.035946846008301, + "logps/chosen": -146.97305297851562, + "logps/rejected": -202.9108123779297, + "loss": 0.5315, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06655339151620865, + "rewards/margins": 1.419156789779663, + "rewards/rejected": -1.3526034355163574, + "step": 3581 + }, + { + "epoch": 0.41, + "learning_rate": 1.7881306332670023e-07, + "logits/chosen": -2.947458267211914, + "logits/rejected": -2.6708731651306152, + "logps/chosen": -304.565673828125, + "logps/rejected": -176.69415283203125, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3028818964958191, + "rewards/margins": 1.72908616065979, + "rewards/rejected": -2.031968116760254, + "step": 3582 + }, + { + "epoch": 0.41, + "learning_rate": 1.7877794685707596e-07, + "logits/chosen": -2.65566349029541, + "logits/rejected": -2.5557210445404053, + "logps/chosen": -431.9498291015625, + "logps/rejected": -304.64019775390625, + "loss": 0.4772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41195517778396606, + "rewards/margins": 1.254706859588623, + "rewards/rejected": -1.6666619777679443, + "step": 3583 + }, + { + "epoch": 0.41, + "learning_rate": 1.787428303874517e-07, + "logits/chosen": -3.0637857913970947, + "logits/rejected": -3.0384857654571533, + "logps/chosen": -167.08755493164062, + "logps/rejected": -156.80093383789062, + "loss": 0.6778, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5385434627532959, + "rewards/margins": 0.9841570258140564, + "rewards/rejected": -1.522700548171997, + "step": 3584 + }, + { + "epoch": 0.41, + "learning_rate": 1.7870771391782746e-07, + "logits/chosen": -3.107322931289673, + "logits/rejected": -2.8449246883392334, + "logps/chosen": -361.24346923828125, + "logps/rejected": -272.8359375, + "loss": 0.6084, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02595856785774231, + "rewards/margins": 0.957190990447998, + "rewards/rejected": -0.9312323927879333, + "step": 3585 + }, + { + "epoch": 0.41, + "learning_rate": 1.786725974482032e-07, + "logits/chosen": -3.466798782348633, + "logits/rejected": -3.1818337440490723, + "logps/chosen": -299.8273010253906, + "logps/rejected": -281.3395080566406, + "loss": 0.7945, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.746163547039032, + "rewards/margins": 1.2158188819885254, + "rewards/rejected": -1.9619824886322021, + "step": 3586 + }, + { + "epoch": 0.41, + "learning_rate": 1.7863748097857894e-07, + "logits/chosen": -2.857988119125366, + "logits/rejected": -3.1148555278778076, + "logps/chosen": -192.47434997558594, + "logps/rejected": -215.66024780273438, + "loss": 0.2889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16386285424232483, + "rewards/margins": 2.3750767707824707, + "rewards/rejected": -2.5389397144317627, + "step": 3587 + }, + { + "epoch": 0.41, + "learning_rate": 1.786023645089547e-07, + "logits/chosen": -2.8350830078125, + "logits/rejected": -3.085730791091919, + "logps/chosen": -332.0268249511719, + "logps/rejected": -253.9791717529297, + "loss": 0.2441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08461606502532959, + "rewards/margins": 3.1047372817993164, + "rewards/rejected": -3.1893532276153564, + "step": 3588 + }, + { + "epoch": 0.41, + "learning_rate": 1.7856724803933043e-07, + "logits/chosen": -3.738708257675171, + "logits/rejected": -3.707634687423706, + "logps/chosen": -347.7174072265625, + "logps/rejected": -179.05740356445312, + "loss": 0.5806, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.038574814796447754, + "rewards/margins": 1.0742621421813965, + "rewards/rejected": -1.0356874465942383, + "step": 3589 + }, + { + "epoch": 0.41, + "learning_rate": 1.7853213156970618e-07, + "logits/chosen": -2.3237740993499756, + "logits/rejected": -2.5790634155273438, + "logps/chosen": -445.1953430175781, + "logps/rejected": -381.7630615234375, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5266571044921875, + "rewards/margins": 1.8418421745300293, + "rewards/rejected": -1.3151849508285522, + "step": 3590 + }, + { + "epoch": 0.41, + "learning_rate": 1.784970151000819e-07, + "logits/chosen": -2.8597424030303955, + "logits/rejected": -2.7405660152435303, + "logps/chosen": -153.71580505371094, + "logps/rejected": -251.53204345703125, + "loss": 0.2879, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13036289811134338, + "rewards/margins": 1.8805550336837769, + "rewards/rejected": -1.7501921653747559, + "step": 3591 + }, + { + "epoch": 0.41, + "learning_rate": 1.784618986304577e-07, + "logits/chosen": -2.4824888706207275, + "logits/rejected": -2.765441656112671, + "logps/chosen": -331.6485595703125, + "logps/rejected": -375.02642822265625, + "loss": 0.5276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0673438012599945, + "rewards/margins": 1.6951868534088135, + "rewards/rejected": -1.76253080368042, + "step": 3592 + }, + { + "epoch": 0.41, + "learning_rate": 1.7842678216083344e-07, + "logits/chosen": -2.984872341156006, + "logits/rejected": -3.0500099658966064, + "logps/chosen": -437.6054382324219, + "logps/rejected": -493.95220947265625, + "loss": 0.497, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4914587140083313, + "rewards/margins": 0.8893882632255554, + "rewards/rejected": -1.3808469772338867, + "step": 3593 + }, + { + "epoch": 0.41, + "learning_rate": 1.7839166569120917e-07, + "logits/chosen": -3.6472840309143066, + "logits/rejected": -3.6511573791503906, + "logps/chosen": -247.15640258789062, + "logps/rejected": -214.354736328125, + "loss": 0.494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37301820516586304, + "rewards/margins": 1.428259015083313, + "rewards/rejected": -1.8012771606445312, + "step": 3594 + }, + { + "epoch": 0.41, + "learning_rate": 1.7835654922158492e-07, + "logits/chosen": -3.2311148643493652, + "logits/rejected": -3.4611332416534424, + "logps/chosen": -244.40472412109375, + "logps/rejected": -243.2731475830078, + "loss": 0.4239, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3222489058971405, + "rewards/margins": 1.739262580871582, + "rewards/rejected": -1.4170136451721191, + "step": 3595 + }, + { + "epoch": 0.41, + "learning_rate": 1.7832143275196068e-07, + "logits/chosen": -3.2573282718658447, + "logits/rejected": -3.61942982673645, + "logps/chosen": -313.6560974121094, + "logps/rejected": -289.79425048828125, + "loss": 0.4369, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12074257433414459, + "rewards/margins": 1.6523836851119995, + "rewards/rejected": -1.5316412448883057, + "step": 3596 + }, + { + "epoch": 0.41, + "learning_rate": 1.782863162823364e-07, + "logits/chosen": -3.2022879123687744, + "logits/rejected": -3.108163356781006, + "logps/chosen": -221.5350799560547, + "logps/rejected": -230.35195922851562, + "loss": 0.4047, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21291349828243256, + "rewards/margins": 1.4624825716018677, + "rewards/rejected": -1.249569058418274, + "step": 3597 + }, + { + "epoch": 0.41, + "learning_rate": 1.7825119981271216e-07, + "logits/chosen": -3.629181385040283, + "logits/rejected": -3.476834774017334, + "logps/chosen": -208.03732299804688, + "logps/rejected": -194.42080688476562, + "loss": 0.3724, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11159797012805939, + "rewards/margins": 1.0596052408218384, + "rewards/rejected": -0.9480072855949402, + "step": 3598 + }, + { + "epoch": 0.41, + "learning_rate": 1.7821608334308789e-07, + "logits/chosen": -3.6371102333068848, + "logits/rejected": -3.6974854469299316, + "logps/chosen": -247.7490997314453, + "logps/rejected": -332.53143310546875, + "loss": 0.2142, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6866492629051208, + "rewards/margins": 3.1044065952301025, + "rewards/rejected": -2.417757034301758, + "step": 3599 + }, + { + "epoch": 0.42, + "learning_rate": 1.7818096687346364e-07, + "logits/chosen": -2.1971964836120605, + "logits/rejected": -2.1297290325164795, + "logps/chosen": -332.1491394042969, + "logps/rejected": -222.23915100097656, + "loss": 0.2869, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015263639390468597, + "rewards/margins": 1.3945873975753784, + "rewards/rejected": -1.3793237209320068, + "step": 3600 + }, + { + "epoch": 0.42, + "learning_rate": 1.781458504038394e-07, + "logits/chosen": -3.243313789367676, + "logits/rejected": -2.9705090522766113, + "logps/chosen": -244.26766967773438, + "logps/rejected": -229.92703247070312, + "loss": 0.5011, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3790815472602844, + "rewards/margins": 0.8491986393928528, + "rewards/rejected": -1.2282801866531372, + "step": 3601 + }, + { + "epoch": 0.42, + "learning_rate": 1.7811073393421512e-07, + "logits/chosen": -3.493864059448242, + "logits/rejected": -3.3653013706207275, + "logps/chosen": -254.73472595214844, + "logps/rejected": -272.8605041503906, + "loss": 0.3372, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37540706992149353, + "rewards/margins": 1.1201491355895996, + "rewards/rejected": -0.7447421550750732, + "step": 3602 + }, + { + "epoch": 0.42, + "learning_rate": 1.780756174645909e-07, + "logits/chosen": -2.4711759090423584, + "logits/rejected": -2.2874808311462402, + "logps/chosen": -136.7311553955078, + "logps/rejected": -369.6774597167969, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2830653786659241, + "rewards/margins": 2.8032593727111816, + "rewards/rejected": -2.5201942920684814, + "step": 3603 + }, + { + "epoch": 0.42, + "learning_rate": 1.7804050099496665e-07, + "logits/chosen": -3.1284470558166504, + "logits/rejected": -3.427149772644043, + "logps/chosen": -165.9254608154297, + "logps/rejected": -252.3822021484375, + "loss": 0.2473, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23782917857170105, + "rewards/margins": 2.6813595294952393, + "rewards/rejected": -2.919188976287842, + "step": 3604 + }, + { + "epoch": 0.42, + "learning_rate": 1.7800538452534238e-07, + "logits/chosen": -3.4391682147979736, + "logits/rejected": -3.4757349491119385, + "logps/chosen": -303.8683776855469, + "logps/rejected": -316.22369384765625, + "loss": 0.3487, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11527447402477264, + "rewards/margins": 1.7317123413085938, + "rewards/rejected": -1.8469866514205933, + "step": 3605 + }, + { + "epoch": 0.42, + "learning_rate": 1.7797026805571814e-07, + "logits/chosen": -3.324484348297119, + "logits/rejected": -3.269904613494873, + "logps/chosen": -275.097900390625, + "logps/rejected": -203.29835510253906, + "loss": 0.3669, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.013506542891263962, + "rewards/margins": 2.093096971511841, + "rewards/rejected": -2.0795905590057373, + "step": 3606 + }, + { + "epoch": 0.42, + "learning_rate": 1.7793515158609386e-07, + "logits/chosen": -2.802933692932129, + "logits/rejected": -2.6279678344726562, + "logps/chosen": -127.04118347167969, + "logps/rejected": -227.58680725097656, + "loss": 0.4356, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07712316513061523, + "rewards/margins": 1.8498387336730957, + "rewards/rejected": -1.7727155685424805, + "step": 3607 + }, + { + "epoch": 0.42, + "learning_rate": 1.7790003511646962e-07, + "logits/chosen": -3.744924545288086, + "logits/rejected": -3.696375846862793, + "logps/chosen": -202.2759246826172, + "logps/rejected": -252.5869140625, + "loss": 0.1905, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4990552067756653, + "rewards/margins": 3.599550724029541, + "rewards/rejected": -3.1004955768585205, + "step": 3608 + }, + { + "epoch": 0.42, + "learning_rate": 1.7786491864684537e-07, + "logits/chosen": -3.4189586639404297, + "logits/rejected": -3.3847310543060303, + "logps/chosen": -283.3072204589844, + "logps/rejected": -291.08740234375, + "loss": 0.5317, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23952540755271912, + "rewards/margins": 1.0005695819854736, + "rewards/rejected": -0.7610442042350769, + "step": 3609 + }, + { + "epoch": 0.42, + "learning_rate": 1.778298021772211e-07, + "logits/chosen": -3.8750076293945312, + "logits/rejected": -4.246096134185791, + "logps/chosen": -143.98519897460938, + "logps/rejected": -244.53274536132812, + "loss": 0.3754, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4130411446094513, + "rewards/margins": 1.1894339323043823, + "rewards/rejected": -1.6024751663208008, + "step": 3610 + }, + { + "epoch": 0.42, + "learning_rate": 1.7779468570759685e-07, + "logits/chosen": -2.4477384090423584, + "logits/rejected": -2.4438869953155518, + "logps/chosen": -179.17491149902344, + "logps/rejected": -186.32838439941406, + "loss": 0.4397, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05304726958274841, + "rewards/margins": 1.2747142314910889, + "rewards/rejected": -1.2216670513153076, + "step": 3611 + }, + { + "epoch": 0.42, + "learning_rate": 1.777595692379726e-07, + "logits/chosen": -3.761974811553955, + "logits/rejected": -3.787919521331787, + "logps/chosen": -272.86517333984375, + "logps/rejected": -204.9129180908203, + "loss": 0.2827, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03332391381263733, + "rewards/margins": 2.135557174682617, + "rewards/rejected": -2.1022331714630127, + "step": 3612 + }, + { + "epoch": 0.42, + "learning_rate": 1.7772445276834833e-07, + "logits/chosen": -3.0092782974243164, + "logits/rejected": -2.890505313873291, + "logps/chosen": -261.05487060546875, + "logps/rejected": -240.9141845703125, + "loss": 0.3473, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17746275663375854, + "rewards/margins": 1.3008074760437012, + "rewards/rejected": -1.4782702922821045, + "step": 3613 + }, + { + "epoch": 0.42, + "learning_rate": 1.7768933629872411e-07, + "logits/chosen": -2.469113826751709, + "logits/rejected": -2.7553484439849854, + "logps/chosen": -164.67059326171875, + "logps/rejected": -298.8160400390625, + "loss": 0.2287, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06380872428417206, + "rewards/margins": 2.438086986541748, + "rewards/rejected": -2.3742780685424805, + "step": 3614 + }, + { + "epoch": 0.42, + "learning_rate": 1.7765421982909981e-07, + "logits/chosen": -3.7470250129699707, + "logits/rejected": -3.323604106903076, + "logps/chosen": -335.26123046875, + "logps/rejected": -183.23675537109375, + "loss": 0.1695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7733514904975891, + "rewards/margins": 2.5403060913085938, + "rewards/rejected": -1.7669545412063599, + "step": 3615 + }, + { + "epoch": 0.42, + "learning_rate": 1.776191033594756e-07, + "logits/chosen": -3.0998876094818115, + "logits/rejected": -3.125283718109131, + "logps/chosen": -219.47897338867188, + "logps/rejected": -226.8091583251953, + "loss": 0.165, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32330378890037537, + "rewards/margins": 2.430572509765625, + "rewards/rejected": -2.107268810272217, + "step": 3616 + }, + { + "epoch": 0.42, + "learning_rate": 1.7758398688985135e-07, + "logits/chosen": -3.2517032623291016, + "logits/rejected": -2.769226551055908, + "logps/chosen": -408.4369201660156, + "logps/rejected": -312.8028259277344, + "loss": 0.3757, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3774459660053253, + "rewards/margins": 1.129897117614746, + "rewards/rejected": -1.507343053817749, + "step": 3617 + }, + { + "epoch": 0.42, + "learning_rate": 1.7754887042022708e-07, + "logits/chosen": -2.463773488998413, + "logits/rejected": -2.4494080543518066, + "logps/chosen": -453.77117919921875, + "logps/rejected": -403.610107421875, + "loss": 0.3074, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16539503633975983, + "rewards/margins": 2.214940309524536, + "rewards/rejected": -2.0495450496673584, + "step": 3618 + }, + { + "epoch": 0.42, + "learning_rate": 1.7751375395060283e-07, + "logits/chosen": -3.2914929389953613, + "logits/rejected": -3.1912124156951904, + "logps/chosen": -322.12652587890625, + "logps/rejected": -205.80499267578125, + "loss": 0.4387, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19578562676906586, + "rewards/margins": 1.6542820930480957, + "rewards/rejected": -1.4584966897964478, + "step": 3619 + }, + { + "epoch": 0.42, + "learning_rate": 1.7747863748097856e-07, + "logits/chosen": -3.265730619430542, + "logits/rejected": -2.859041929244995, + "logps/chosen": -283.0668640136719, + "logps/rejected": -135.1713409423828, + "loss": 0.5815, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22154748439788818, + "rewards/margins": 1.3742036819458008, + "rewards/rejected": -1.5957510471343994, + "step": 3620 + }, + { + "epoch": 0.42, + "learning_rate": 1.774435210113543e-07, + "logits/chosen": -3.135930299758911, + "logits/rejected": -3.2024154663085938, + "logps/chosen": -235.9925994873047, + "logps/rejected": -241.52487182617188, + "loss": 0.3178, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02126297727227211, + "rewards/margins": 1.5163897275924683, + "rewards/rejected": -1.495126724243164, + "step": 3621 + }, + { + "epoch": 0.42, + "learning_rate": 1.7740840454173007e-07, + "logits/chosen": -2.8377175331115723, + "logits/rejected": -2.809696674346924, + "logps/chosen": -174.07395935058594, + "logps/rejected": -127.28224182128906, + "loss": 0.6494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0016875974833965302, + "rewards/margins": 0.5058482885360718, + "rewards/rejected": -0.5075358152389526, + "step": 3622 + }, + { + "epoch": 0.42, + "learning_rate": 1.773732880721058e-07, + "logits/chosen": -3.0956342220306396, + "logits/rejected": -2.9850006103515625, + "logps/chosen": -287.33197021484375, + "logps/rejected": -269.5440673828125, + "loss": 0.3462, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22540880739688873, + "rewards/margins": 1.6704620122909546, + "rewards/rejected": -1.4450531005859375, + "step": 3623 + }, + { + "epoch": 0.42, + "learning_rate": 1.7733817160248155e-07, + "logits/chosen": -2.9730210304260254, + "logits/rejected": -2.8924403190612793, + "logps/chosen": -185.6951904296875, + "logps/rejected": -281.78009033203125, + "loss": 0.669, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1997338831424713, + "rewards/margins": 0.3705267012119293, + "rewards/rejected": -0.5702605843544006, + "step": 3624 + }, + { + "epoch": 0.42, + "learning_rate": 1.7730305513285733e-07, + "logits/chosen": -3.105058431625366, + "logits/rejected": -2.7510719299316406, + "logps/chosen": -169.9664764404297, + "logps/rejected": -163.7920379638672, + "loss": 0.6777, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5565968751907349, + "rewards/margins": 0.8129980564117432, + "rewards/rejected": -1.369594931602478, + "step": 3625 + }, + { + "epoch": 0.42, + "learning_rate": 1.7726793866323305e-07, + "logits/chosen": -3.0943500995635986, + "logits/rejected": -3.062509536743164, + "logps/chosen": -350.2023010253906, + "logps/rejected": -318.0776062011719, + "loss": 0.4032, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3525112271308899, + "rewards/margins": 1.302563190460205, + "rewards/rejected": -0.95005202293396, + "step": 3626 + }, + { + "epoch": 0.42, + "learning_rate": 1.772328221936088e-07, + "logits/chosen": -3.3769869804382324, + "logits/rejected": -3.070962905883789, + "logps/chosen": -308.0557861328125, + "logps/rejected": -298.95318603515625, + "loss": 0.2062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2304621934890747, + "rewards/margins": 2.0559091567993164, + "rewards/rejected": -2.2863712310791016, + "step": 3627 + }, + { + "epoch": 0.42, + "learning_rate": 1.7719770572398454e-07, + "logits/chosen": -3.4546990394592285, + "logits/rejected": -3.5186803340911865, + "logps/chosen": -246.6385040283203, + "logps/rejected": -301.3353271484375, + "loss": 0.2091, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.43586647510528564, + "rewards/margins": 2.5885744094848633, + "rewards/rejected": -2.152707815170288, + "step": 3628 + }, + { + "epoch": 0.42, + "learning_rate": 1.771625892543603e-07, + "logits/chosen": -3.2144250869750977, + "logits/rejected": -3.158358573913574, + "logps/chosen": -132.92147827148438, + "logps/rejected": -206.49127197265625, + "loss": 0.4905, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06636463105678558, + "rewards/margins": 0.9211452007293701, + "rewards/rejected": -0.8547805547714233, + "step": 3629 + }, + { + "epoch": 0.42, + "learning_rate": 1.7712747278473604e-07, + "logits/chosen": -3.1416573524475098, + "logits/rejected": -3.600766897201538, + "logps/chosen": -210.75302124023438, + "logps/rejected": -163.16444396972656, + "loss": 0.3516, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08937221765518188, + "rewards/margins": 1.8864513635635376, + "rewards/rejected": -1.797079086303711, + "step": 3630 + }, + { + "epoch": 0.42, + "learning_rate": 1.7709235631511177e-07, + "logits/chosen": -2.612319231033325, + "logits/rejected": -2.4941697120666504, + "logps/chosen": -374.9217529296875, + "logps/rejected": -263.1956787109375, + "loss": 0.5677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15920697152614594, + "rewards/margins": 0.7866968512535095, + "rewards/rejected": -0.9459038376808167, + "step": 3631 + }, + { + "epoch": 0.42, + "learning_rate": 1.7705723984548752e-07, + "logits/chosen": -3.0926594734191895, + "logits/rejected": -3.273958206176758, + "logps/chosen": -258.765380859375, + "logps/rejected": -291.8808898925781, + "loss": 0.4476, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34579548239707947, + "rewards/margins": 1.5180823802947998, + "rewards/rejected": -1.172287106513977, + "step": 3632 + }, + { + "epoch": 0.42, + "learning_rate": 1.7702212337586328e-07, + "logits/chosen": -2.363273859024048, + "logits/rejected": -2.3719887733459473, + "logps/chosen": -400.58843994140625, + "logps/rejected": -380.9510498046875, + "loss": 0.2131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.010603904724121094, + "rewards/margins": 1.9839978218078613, + "rewards/rejected": -1.9946014881134033, + "step": 3633 + }, + { + "epoch": 0.42, + "learning_rate": 1.76987006906239e-07, + "logits/chosen": -3.0332791805267334, + "logits/rejected": -3.0523829460144043, + "logps/chosen": -170.6876983642578, + "logps/rejected": -393.2889404296875, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.490287184715271, + "rewards/margins": 3.7241032123565674, + "rewards/rejected": -3.233815908432007, + "step": 3634 + }, + { + "epoch": 0.42, + "learning_rate": 1.7695189043661476e-07, + "logits/chosen": -2.860816478729248, + "logits/rejected": -2.9145326614379883, + "logps/chosen": -201.06004333496094, + "logps/rejected": -176.80810546875, + "loss": 0.2935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23176246881484985, + "rewards/margins": 1.7282990217208862, + "rewards/rejected": -1.9600615501403809, + "step": 3635 + }, + { + "epoch": 0.42, + "learning_rate": 1.769167739669905e-07, + "logits/chosen": -2.9824328422546387, + "logits/rejected": -2.9698352813720703, + "logps/chosen": -298.8978576660156, + "logps/rejected": -255.1248016357422, + "loss": 0.457, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47917526960372925, + "rewards/margins": 0.95234215259552, + "rewards/rejected": -0.473166823387146, + "step": 3636 + }, + { + "epoch": 0.42, + "learning_rate": 1.7688165749736627e-07, + "logits/chosen": -2.799117088317871, + "logits/rejected": -2.8473410606384277, + "logps/chosen": -339.62542724609375, + "logps/rejected": -250.97140502929688, + "loss": 0.3787, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03334202617406845, + "rewards/margins": 1.5134683847427368, + "rewards/rejected": -1.4801263809204102, + "step": 3637 + }, + { + "epoch": 0.42, + "learning_rate": 1.7684654102774202e-07, + "logits/chosen": -2.3478775024414062, + "logits/rejected": -2.2873880863189697, + "logps/chosen": -362.37628173828125, + "logps/rejected": -432.9797058105469, + "loss": 0.3857, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20124129951000214, + "rewards/margins": 1.730050802230835, + "rewards/rejected": -1.5288094282150269, + "step": 3638 + }, + { + "epoch": 0.42, + "learning_rate": 1.7681142455811775e-07, + "logits/chosen": -3.220346689224243, + "logits/rejected": -2.5427706241607666, + "logps/chosen": -358.1689758300781, + "logps/rejected": -267.6650390625, + "loss": 0.3799, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16996780037879944, + "rewards/margins": 1.4173362255096436, + "rewards/rejected": -1.5873042345046997, + "step": 3639 + }, + { + "epoch": 0.42, + "learning_rate": 1.767763080884935e-07, + "logits/chosen": -2.6814520359039307, + "logits/rejected": -2.8254384994506836, + "logps/chosen": -302.58636474609375, + "logps/rejected": -260.7044677734375, + "loss": 0.4382, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1532062143087387, + "rewards/margins": 1.005242943763733, + "rewards/rejected": -0.852036714553833, + "step": 3640 + }, + { + "epoch": 0.42, + "learning_rate": 1.7674119161886926e-07, + "logits/chosen": -2.943485736846924, + "logits/rejected": -3.073014259338379, + "logps/chosen": -281.6152648925781, + "logps/rejected": -239.97113037109375, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011068761348724365, + "rewards/margins": 1.0024497509002686, + "rewards/rejected": -0.9913809299468994, + "step": 3641 + }, + { + "epoch": 0.42, + "learning_rate": 1.7670607514924498e-07, + "logits/chosen": -2.772709846496582, + "logits/rejected": -2.927123546600342, + "logps/chosen": -245.40646362304688, + "logps/rejected": -274.51153564453125, + "loss": 0.7098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.76863694190979, + "rewards/margins": 0.78693687915802, + "rewards/rejected": -1.55557382106781, + "step": 3642 + }, + { + "epoch": 0.42, + "learning_rate": 1.7667095867962074e-07, + "logits/chosen": -3.081103563308716, + "logits/rejected": -2.770125389099121, + "logps/chosen": -268.05950927734375, + "logps/rejected": -306.35455322265625, + "loss": 0.2429, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5047484040260315, + "rewards/margins": 1.88785719871521, + "rewards/rejected": -1.3831087350845337, + "step": 3643 + }, + { + "epoch": 0.42, + "learning_rate": 1.7663584220999646e-07, + "logits/chosen": -3.140497922897339, + "logits/rejected": -2.9025254249572754, + "logps/chosen": -274.5437927246094, + "logps/rejected": -251.7563018798828, + "loss": 0.4712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.753960132598877, + "rewards/margins": 1.5107910633087158, + "rewards/rejected": -2.264751434326172, + "step": 3644 + }, + { + "epoch": 0.42, + "learning_rate": 1.7660072574037222e-07, + "logits/chosen": -2.6147029399871826, + "logits/rejected": -3.006087303161621, + "logps/chosen": -215.32363891601562, + "logps/rejected": -314.00701904296875, + "loss": 0.3476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4955264925956726, + "rewards/margins": 1.5818054676055908, + "rewards/rejected": -2.077331781387329, + "step": 3645 + }, + { + "epoch": 0.42, + "learning_rate": 1.7656560927074797e-07, + "logits/chosen": -2.649563789367676, + "logits/rejected": -2.567958354949951, + "logps/chosen": -188.53775024414062, + "logps/rejected": -249.00657653808594, + "loss": 0.546, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0882013812661171, + "rewards/margins": 1.824887752532959, + "rewards/rejected": -1.9130892753601074, + "step": 3646 + }, + { + "epoch": 0.42, + "learning_rate": 1.765304928011237e-07, + "logits/chosen": -2.613126754760742, + "logits/rejected": -2.917285203933716, + "logps/chosen": -287.8216247558594, + "logps/rejected": -122.5135726928711, + "loss": 0.3272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12070082873106003, + "rewards/margins": 1.2106047868728638, + "rewards/rejected": -1.0899039506912231, + "step": 3647 + }, + { + "epoch": 0.42, + "learning_rate": 1.7649537633149948e-07, + "logits/chosen": -3.3094234466552734, + "logits/rejected": -3.559028148651123, + "logps/chosen": -151.65423583984375, + "logps/rejected": -204.50338745117188, + "loss": 0.4381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2810271978378296, + "rewards/margins": 2.122706890106201, + "rewards/rejected": -2.403733968734741, + "step": 3648 + }, + { + "epoch": 0.42, + "learning_rate": 1.7646025986187523e-07, + "logits/chosen": -3.695383071899414, + "logits/rejected": -3.711465835571289, + "logps/chosen": -185.64743041992188, + "logps/rejected": -198.56048583984375, + "loss": 0.3539, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03897445276379585, + "rewards/margins": 1.7048319578170776, + "rewards/rejected": -1.7438063621520996, + "step": 3649 + }, + { + "epoch": 0.42, + "learning_rate": 1.7642514339225096e-07, + "logits/chosen": -2.4696390628814697, + "logits/rejected": -2.4563019275665283, + "logps/chosen": -312.1109924316406, + "logps/rejected": -380.2071533203125, + "loss": 0.2734, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.014812782406806946, + "rewards/margins": 2.9819231033325195, + "rewards/rejected": -2.9671101570129395, + "step": 3650 + }, + { + "epoch": 0.42, + "learning_rate": 1.7639002692262672e-07, + "logits/chosen": -3.054286479949951, + "logits/rejected": -3.4297707080841064, + "logps/chosen": -171.6527557373047, + "logps/rejected": -212.00408935546875, + "loss": 0.2795, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6143932938575745, + "rewards/margins": 2.265759229660034, + "rewards/rejected": -1.6513659954071045, + "step": 3651 + }, + { + "epoch": 0.42, + "learning_rate": 1.7635491045300244e-07, + "logits/chosen": -3.5356597900390625, + "logits/rejected": -3.3827714920043945, + "logps/chosen": -237.62335205078125, + "logps/rejected": -244.1085205078125, + "loss": 0.2377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11496227234601974, + "rewards/margins": 2.1504745483398438, + "rewards/rejected": -2.0355124473571777, + "step": 3652 + }, + { + "epoch": 0.42, + "learning_rate": 1.763197939833782e-07, + "logits/chosen": -3.4730801582336426, + "logits/rejected": -3.5231266021728516, + "logps/chosen": -379.6275634765625, + "logps/rejected": -360.52691650390625, + "loss": 0.2104, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2855432331562042, + "rewards/margins": 2.7163121700286865, + "rewards/rejected": -3.0018553733825684, + "step": 3653 + }, + { + "epoch": 0.42, + "learning_rate": 1.7628467751375395e-07, + "logits/chosen": -2.6805691719055176, + "logits/rejected": -2.606480121612549, + "logps/chosen": -377.70361328125, + "logps/rejected": -220.27452087402344, + "loss": 0.4317, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15545958280563354, + "rewards/margins": 1.2018458843231201, + "rewards/rejected": -1.0463862419128418, + "step": 3654 + }, + { + "epoch": 0.42, + "learning_rate": 1.7624956104412968e-07, + "logits/chosen": -2.796295166015625, + "logits/rejected": -2.802703619003296, + "logps/chosen": -323.9333801269531, + "logps/rejected": -396.0824279785156, + "loss": 0.5177, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30582720041275024, + "rewards/margins": 1.525369644165039, + "rewards/rejected": -1.219542384147644, + "step": 3655 + }, + { + "epoch": 0.42, + "learning_rate": 1.7621444457450543e-07, + "logits/chosen": -3.074664354324341, + "logits/rejected": -2.9063656330108643, + "logps/chosen": -134.7894287109375, + "logps/rejected": -207.12083435058594, + "loss": 0.3646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1659690886735916, + "rewards/margins": 3.3255927562713623, + "rewards/rejected": -3.4915618896484375, + "step": 3656 + }, + { + "epoch": 0.42, + "learning_rate": 1.7617932810488119e-07, + "logits/chosen": -2.788881540298462, + "logits/rejected": -2.715400218963623, + "logps/chosen": -319.77001953125, + "logps/rejected": -294.37225341796875, + "loss": 0.1504, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2999189496040344, + "rewards/margins": 3.4237289428710938, + "rewards/rejected": -3.123810052871704, + "step": 3657 + }, + { + "epoch": 0.42, + "learning_rate": 1.761442116352569e-07, + "logits/chosen": -3.4173128604888916, + "logits/rejected": -3.2563412189483643, + "logps/chosen": -211.1702880859375, + "logps/rejected": -242.68035888671875, + "loss": 0.7213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.023118890821933746, + "rewards/margins": 1.0918171405792236, + "rewards/rejected": -1.114936113357544, + "step": 3658 + }, + { + "epoch": 0.42, + "learning_rate": 1.761090951656327e-07, + "logits/chosen": -3.4876203536987305, + "logits/rejected": -3.292799949645996, + "logps/chosen": -254.85113525390625, + "logps/rejected": -185.74607849121094, + "loss": 0.3861, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2451949566602707, + "rewards/margins": 2.270878791809082, + "rewards/rejected": -2.516073703765869, + "step": 3659 + }, + { + "epoch": 0.42, + "learning_rate": 1.760739786960084e-07, + "logits/chosen": -3.8485403060913086, + "logits/rejected": -3.8267271518707275, + "logps/chosen": -155.20123291015625, + "logps/rejected": -210.4557342529297, + "loss": 0.2287, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6365233659744263, + "rewards/margins": 2.4134607315063477, + "rewards/rejected": -1.776937484741211, + "step": 3660 + }, + { + "epoch": 0.42, + "learning_rate": 1.7603886222638417e-07, + "logits/chosen": -3.7104101181030273, + "logits/rejected": -3.495621681213379, + "logps/chosen": -278.2177734375, + "logps/rejected": -146.71397399902344, + "loss": 0.4815, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005841255187988281, + "rewards/margins": 1.2475409507751465, + "rewards/rejected": -1.2416998147964478, + "step": 3661 + }, + { + "epoch": 0.42, + "learning_rate": 1.7600374575675993e-07, + "logits/chosen": -2.770716667175293, + "logits/rejected": -2.7188820838928223, + "logps/chosen": -270.78045654296875, + "logps/rejected": -239.47996520996094, + "loss": 0.241, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2629327178001404, + "rewards/margins": 2.3488526344299316, + "rewards/rejected": -2.0859200954437256, + "step": 3662 + }, + { + "epoch": 0.42, + "learning_rate": 1.7596862928713566e-07, + "logits/chosen": -2.9577372074127197, + "logits/rejected": -2.7594523429870605, + "logps/chosen": -410.2992858886719, + "logps/rejected": -333.0937194824219, + "loss": 0.7051, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0373762845993042, + "rewards/margins": 0.46418291330337524, + "rewards/rejected": -1.5015592575073242, + "step": 3663 + }, + { + "epoch": 0.42, + "learning_rate": 1.759335128175114e-07, + "logits/chosen": -3.327953577041626, + "logits/rejected": -3.3014254570007324, + "logps/chosen": -179.184814453125, + "logps/rejected": -190.2238006591797, + "loss": 0.7397, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30971142649650574, + "rewards/margins": 0.3783586323261261, + "rewards/rejected": -0.6880700588226318, + "step": 3664 + }, + { + "epoch": 0.42, + "learning_rate": 1.7589839634788714e-07, + "logits/chosen": -2.819211006164551, + "logits/rejected": -2.8049912452697754, + "logps/chosen": -401.12420654296875, + "logps/rejected": -256.31427001953125, + "loss": 0.2985, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17595121264457703, + "rewards/margins": 1.51735520362854, + "rewards/rejected": -1.3414039611816406, + "step": 3665 + }, + { + "epoch": 0.42, + "learning_rate": 1.758632798782629e-07, + "logits/chosen": -2.505849838256836, + "logits/rejected": -2.376697301864624, + "logps/chosen": -256.70904541015625, + "logps/rejected": -232.02247619628906, + "loss": 0.6854, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27026069164276123, + "rewards/margins": 1.2330822944641113, + "rewards/rejected": -1.5033429861068726, + "step": 3666 + }, + { + "epoch": 0.42, + "learning_rate": 1.7582816340863864e-07, + "logits/chosen": -2.8012218475341797, + "logits/rejected": -2.8539414405822754, + "logps/chosen": -197.89016723632812, + "logps/rejected": -280.71466064453125, + "loss": 0.1112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5192413330078125, + "rewards/margins": 2.657531499862671, + "rewards/rejected": -2.1382904052734375, + "step": 3667 + }, + { + "epoch": 0.42, + "learning_rate": 1.7579304693901437e-07, + "logits/chosen": -2.881971836090088, + "logits/rejected": -2.901447296142578, + "logps/chosen": -223.26048278808594, + "logps/rejected": -323.59002685546875, + "loss": 0.2808, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4198216497898102, + "rewards/margins": 2.851172685623169, + "rewards/rejected": -2.4313511848449707, + "step": 3668 + }, + { + "epoch": 0.42, + "learning_rate": 1.7575793046939013e-07, + "logits/chosen": -3.636566400527954, + "logits/rejected": -3.425074577331543, + "logps/chosen": -346.55755615234375, + "logps/rejected": -306.062744140625, + "loss": 0.338, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015161365270614624, + "rewards/margins": 1.249566912651062, + "rewards/rejected": -1.234405517578125, + "step": 3669 + }, + { + "epoch": 0.42, + "learning_rate": 1.757228139997659e-07, + "logits/chosen": -3.645534038543701, + "logits/rejected": -3.7407495975494385, + "logps/chosen": -292.91790771484375, + "logps/rejected": -273.178955078125, + "loss": 0.2386, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07463999092578888, + "rewards/margins": 2.6679911613464355, + "rewards/rejected": -2.593351125717163, + "step": 3670 + }, + { + "epoch": 0.42, + "learning_rate": 1.7568769753014163e-07, + "logits/chosen": -3.3033218383789062, + "logits/rejected": -3.3104379177093506, + "logps/chosen": -204.4661865234375, + "logps/rejected": -259.4085998535156, + "loss": 0.4702, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02536982297897339, + "rewards/margins": 1.9571795463562012, + "rewards/rejected": -1.9825491905212402, + "step": 3671 + }, + { + "epoch": 0.42, + "learning_rate": 1.756525810605174e-07, + "logits/chosen": -3.599915027618408, + "logits/rejected": -3.6784982681274414, + "logps/chosen": -181.92926025390625, + "logps/rejected": -224.1087188720703, + "loss": 0.3597, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4313427805900574, + "rewards/margins": 1.650991439819336, + "rewards/rejected": -1.2196487188339233, + "step": 3672 + }, + { + "epoch": 0.42, + "learning_rate": 1.7561746459089311e-07, + "logits/chosen": -2.8521246910095215, + "logits/rejected": -3.0531527996063232, + "logps/chosen": -332.9010009765625, + "logps/rejected": -254.57510375976562, + "loss": 0.1776, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5503280758857727, + "rewards/margins": 3.1421520709991455, + "rewards/rejected": -2.5918242931365967, + "step": 3673 + }, + { + "epoch": 0.42, + "learning_rate": 1.7558234812126887e-07, + "logits/chosen": -3.2007575035095215, + "logits/rejected": -2.4732701778411865, + "logps/chosen": -240.74737548828125, + "logps/rejected": -184.19097900390625, + "loss": 0.39, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4747461676597595, + "rewards/margins": 1.3680391311645508, + "rewards/rejected": -0.8932929039001465, + "step": 3674 + }, + { + "epoch": 0.42, + "learning_rate": 1.7554723165164462e-07, + "logits/chosen": -2.2716755867004395, + "logits/rejected": -2.4005074501037598, + "logps/chosen": -103.11361694335938, + "logps/rejected": -256.044677734375, + "loss": 0.4659, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2913406789302826, + "rewards/margins": 1.5435810089111328, + "rewards/rejected": -1.2522403001785278, + "step": 3675 + }, + { + "epoch": 0.42, + "learning_rate": 1.7551211518202035e-07, + "logits/chosen": -3.160127878189087, + "logits/rejected": -3.250457525253296, + "logps/chosen": -206.453125, + "logps/rejected": -212.25253295898438, + "loss": 0.2886, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5067204833030701, + "rewards/margins": 1.533459186553955, + "rewards/rejected": -1.0267386436462402, + "step": 3676 + }, + { + "epoch": 0.42, + "learning_rate": 1.754769987123961e-07, + "logits/chosen": -2.930300235748291, + "logits/rejected": -2.809786558151245, + "logps/chosen": -340.4488830566406, + "logps/rejected": -279.65545654296875, + "loss": 0.4697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20617930591106415, + "rewards/margins": 2.6692047119140625, + "rewards/rejected": -2.8753838539123535, + "step": 3677 + }, + { + "epoch": 0.42, + "learning_rate": 1.7544188224277186e-07, + "logits/chosen": -3.345031261444092, + "logits/rejected": -3.3347318172454834, + "logps/chosen": -224.3382568359375, + "logps/rejected": -147.240234375, + "loss": 0.4561, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18426811695098877, + "rewards/margins": 1.3964011669158936, + "rewards/rejected": -1.5806691646575928, + "step": 3678 + }, + { + "epoch": 0.42, + "learning_rate": 1.7540676577314758e-07, + "logits/chosen": -3.3903074264526367, + "logits/rejected": -3.34645414352417, + "logps/chosen": -248.82781982421875, + "logps/rejected": -150.66310119628906, + "loss": 0.4146, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3776225745677948, + "rewards/margins": 1.2259905338287354, + "rewards/rejected": -0.8483679890632629, + "step": 3679 + }, + { + "epoch": 0.42, + "learning_rate": 1.7537164930352334e-07, + "logits/chosen": -3.1243927478790283, + "logits/rejected": -3.3144679069519043, + "logps/chosen": -272.9576416015625, + "logps/rejected": -287.1666564941406, + "loss": 0.408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03647109866142273, + "rewards/margins": 2.2195956707000732, + "rewards/rejected": -2.2560667991638184, + "step": 3680 + }, + { + "epoch": 0.42, + "learning_rate": 1.7533653283389907e-07, + "logits/chosen": -3.866467237472534, + "logits/rejected": -3.907078981399536, + "logps/chosen": -265.8874206542969, + "logps/rejected": -274.9924011230469, + "loss": 0.4631, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1495412290096283, + "rewards/margins": 1.4447901248931885, + "rewards/rejected": -1.5943312644958496, + "step": 3681 + }, + { + "epoch": 0.42, + "learning_rate": 1.7530141636427485e-07, + "logits/chosen": -3.3129658699035645, + "logits/rejected": -3.078537702560425, + "logps/chosen": -425.0328063964844, + "logps/rejected": -313.2099609375, + "loss": 0.4543, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.009612083435058594, + "rewards/margins": 1.6689425706863403, + "rewards/rejected": -1.6593303680419922, + "step": 3682 + }, + { + "epoch": 0.42, + "learning_rate": 1.752662998946506e-07, + "logits/chosen": -2.9076428413391113, + "logits/rejected": -3.0406863689422607, + "logps/chosen": -247.17361450195312, + "logps/rejected": -192.90579223632812, + "loss": 0.6173, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3767296373844147, + "rewards/margins": 0.4925684928894043, + "rewards/rejected": -0.8692981600761414, + "step": 3683 + }, + { + "epoch": 0.42, + "learning_rate": 1.7523118342502633e-07, + "logits/chosen": -3.044571876525879, + "logits/rejected": -2.984180212020874, + "logps/chosen": -166.7109832763672, + "logps/rejected": -146.12913513183594, + "loss": 0.5838, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16488619148731232, + "rewards/margins": 0.4403136372566223, + "rewards/rejected": -0.6051998734474182, + "step": 3684 + }, + { + "epoch": 0.42, + "learning_rate": 1.7519606695540208e-07, + "logits/chosen": -2.985586166381836, + "logits/rejected": -3.2165706157684326, + "logps/chosen": -448.128173828125, + "logps/rejected": -288.26593017578125, + "loss": 0.3185, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32975998520851135, + "rewards/margins": 2.4367318153381348, + "rewards/rejected": -2.1069719791412354, + "step": 3685 + }, + { + "epoch": 0.42, + "learning_rate": 1.7516095048577784e-07, + "logits/chosen": -2.56539249420166, + "logits/rejected": -2.5182745456695557, + "logps/chosen": -253.15289306640625, + "logps/rejected": -249.2406005859375, + "loss": 0.4847, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21470847725868225, + "rewards/margins": 1.381178855895996, + "rewards/rejected": -1.1664702892303467, + "step": 3686 + }, + { + "epoch": 0.43, + "learning_rate": 1.7512583401615356e-07, + "logits/chosen": -2.7904419898986816, + "logits/rejected": -3.1729698181152344, + "logps/chosen": -422.5179443359375, + "logps/rejected": -338.39886474609375, + "loss": 0.6094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5157440900802612, + "rewards/margins": 0.5581536293029785, + "rewards/rejected": -1.0738977193832397, + "step": 3687 + }, + { + "epoch": 0.43, + "learning_rate": 1.7509071754652932e-07, + "logits/chosen": -2.3732004165649414, + "logits/rejected": -2.344940185546875, + "logps/chosen": -270.4952697753906, + "logps/rejected": -192.2425079345703, + "loss": 0.4994, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04739580303430557, + "rewards/margins": 1.4107962846755981, + "rewards/rejected": -1.458191990852356, + "step": 3688 + }, + { + "epoch": 0.43, + "learning_rate": 1.7505560107690504e-07, + "logits/chosen": -3.320840358734131, + "logits/rejected": -3.3140125274658203, + "logps/chosen": -362.8102722167969, + "logps/rejected": -204.6231689453125, + "loss": 0.1873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2925833761692047, + "rewards/margins": 2.486625909805298, + "rewards/rejected": -2.194042682647705, + "step": 3689 + }, + { + "epoch": 0.43, + "learning_rate": 1.750204846072808e-07, + "logits/chosen": -3.2383601665496826, + "logits/rejected": -2.9918212890625, + "logps/chosen": -397.9549255371094, + "logps/rejected": -283.3879699707031, + "loss": 0.3625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0777997225522995, + "rewards/margins": 1.394925832748413, + "rewards/rejected": -1.4727253913879395, + "step": 3690 + }, + { + "epoch": 0.43, + "learning_rate": 1.7498536813765655e-07, + "logits/chosen": -3.4846014976501465, + "logits/rejected": -3.4960670471191406, + "logps/chosen": -164.9415283203125, + "logps/rejected": -234.11642456054688, + "loss": 0.4807, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02652006968855858, + "rewards/margins": 0.8070441484451294, + "rewards/rejected": -0.7805240154266357, + "step": 3691 + }, + { + "epoch": 0.43, + "learning_rate": 1.7495025166803228e-07, + "logits/chosen": -2.424030065536499, + "logits/rejected": -2.503279447555542, + "logps/chosen": -264.1920166015625, + "logps/rejected": -212.9855499267578, + "loss": 0.7075, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.173688605427742, + "rewards/margins": 0.9786339998245239, + "rewards/rejected": -1.15232253074646, + "step": 3692 + }, + { + "epoch": 0.43, + "learning_rate": 1.7491513519840806e-07, + "logits/chosen": -3.946321964263916, + "logits/rejected": -3.6730384826660156, + "logps/chosen": -331.69842529296875, + "logps/rejected": -238.94546508789062, + "loss": 0.2948, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.157293900847435, + "rewards/margins": 1.9004795551300049, + "rewards/rejected": -2.0577735900878906, + "step": 3693 + }, + { + "epoch": 0.43, + "learning_rate": 1.7488001872878381e-07, + "logits/chosen": -2.9397292137145996, + "logits/rejected": -2.7671985626220703, + "logps/chosen": -381.0958251953125, + "logps/rejected": -293.4032897949219, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06413798779249191, + "rewards/margins": 2.4678945541381836, + "rewards/rejected": -2.4037563800811768, + "step": 3694 + }, + { + "epoch": 0.43, + "learning_rate": 1.7484490225915954e-07, + "logits/chosen": -3.505201816558838, + "logits/rejected": -3.641650915145874, + "logps/chosen": -241.50909423828125, + "logps/rejected": -302.693603515625, + "loss": 0.4064, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4585130214691162, + "rewards/margins": 1.4443204402923584, + "rewards/rejected": -0.9858075380325317, + "step": 3695 + }, + { + "epoch": 0.43, + "learning_rate": 1.748097857895353e-07, + "logits/chosen": -2.571855068206787, + "logits/rejected": -2.744417428970337, + "logps/chosen": -199.8043975830078, + "logps/rejected": -272.9269714355469, + "loss": 0.3304, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02228662371635437, + "rewards/margins": 2.4847335815429688, + "rewards/rejected": -2.462446928024292, + "step": 3696 + }, + { + "epoch": 0.43, + "learning_rate": 1.7477466931991102e-07, + "logits/chosen": -3.1367175579071045, + "logits/rejected": -2.911776065826416, + "logps/chosen": -268.60748291015625, + "logps/rejected": -283.1267395019531, + "loss": 0.2648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03149862214922905, + "rewards/margins": 2.236211061477661, + "rewards/rejected": -2.267709732055664, + "step": 3697 + }, + { + "epoch": 0.43, + "learning_rate": 1.7473955285028678e-07, + "logits/chosen": -3.427600860595703, + "logits/rejected": -3.278036594390869, + "logps/chosen": -461.87469482421875, + "logps/rejected": -329.0977478027344, + "loss": 0.5293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7517992258071899, + "rewards/margins": 0.8983291387557983, + "rewards/rejected": -1.6501283645629883, + "step": 3698 + }, + { + "epoch": 0.43, + "learning_rate": 1.7470443638066253e-07, + "logits/chosen": -2.5508201122283936, + "logits/rejected": -2.710571050643921, + "logps/chosen": -321.293701171875, + "logps/rejected": -209.51194763183594, + "loss": 0.2007, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.024450138211250305, + "rewards/margins": 2.032128095626831, + "rewards/rejected": -2.0076780319213867, + "step": 3699 + }, + { + "epoch": 0.43, + "learning_rate": 1.7466931991103826e-07, + "logits/chosen": -2.9145636558532715, + "logits/rejected": -2.9309887886047363, + "logps/chosen": -155.93807983398438, + "logps/rejected": -170.2395477294922, + "loss": 0.778, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5807395577430725, + "rewards/margins": 0.1392105221748352, + "rewards/rejected": -0.7199500799179077, + "step": 3700 + }, + { + "epoch": 0.43, + "learning_rate": 1.74634203441414e-07, + "logits/chosen": -3.4217257499694824, + "logits/rejected": -3.622093915939331, + "logps/chosen": -242.79603576660156, + "logps/rejected": -180.10264587402344, + "loss": 0.7115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5945636630058289, + "rewards/margins": 1.9756590127944946, + "rewards/rejected": -2.570222854614258, + "step": 3701 + }, + { + "epoch": 0.43, + "learning_rate": 1.7459908697178976e-07, + "logits/chosen": -2.9771952629089355, + "logits/rejected": -2.8347296714782715, + "logps/chosen": -163.63543701171875, + "logps/rejected": -278.3885803222656, + "loss": 0.342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31905388832092285, + "rewards/margins": 1.1981523036956787, + "rewards/rejected": -1.5172061920166016, + "step": 3702 + }, + { + "epoch": 0.43, + "learning_rate": 1.745639705021655e-07, + "logits/chosen": -3.5726640224456787, + "logits/rejected": -3.4684059619903564, + "logps/chosen": -225.43280029296875, + "logps/rejected": -300.43243408203125, + "loss": 0.2404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3851601779460907, + "rewards/margins": 1.9210405349731445, + "rewards/rejected": -2.3062007427215576, + "step": 3703 + }, + { + "epoch": 0.43, + "learning_rate": 1.7452885403254127e-07, + "logits/chosen": -2.531132221221924, + "logits/rejected": -2.923954486846924, + "logps/chosen": -167.4175262451172, + "logps/rejected": -178.97923278808594, + "loss": 0.4544, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7285107374191284, + "rewards/margins": 1.6871981620788574, + "rewards/rejected": -2.4157090187072754, + "step": 3704 + }, + { + "epoch": 0.43, + "learning_rate": 1.74493737562917e-07, + "logits/chosen": -3.2709741592407227, + "logits/rejected": -3.0662009716033936, + "logps/chosen": -217.2175750732422, + "logps/rejected": -257.073486328125, + "loss": 0.7644, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3162473142147064, + "rewards/margins": 1.2224907875061035, + "rewards/rejected": -1.5387380123138428, + "step": 3705 + }, + { + "epoch": 0.43, + "learning_rate": 1.7445862109329275e-07, + "logits/chosen": -2.809107780456543, + "logits/rejected": -2.913557291030884, + "logps/chosen": -216.73199462890625, + "logps/rejected": -183.3575439453125, + "loss": 0.4056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06992653757333755, + "rewards/margins": 1.582355260848999, + "rewards/rejected": -1.6522817611694336, + "step": 3706 + }, + { + "epoch": 0.43, + "learning_rate": 1.744235046236685e-07, + "logits/chosen": -2.3153061866760254, + "logits/rejected": -1.959198236465454, + "logps/chosen": -483.89569091796875, + "logps/rejected": -260.3747863769531, + "loss": 0.5165, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7013405561447144, + "rewards/margins": 1.2353366613388062, + "rewards/rejected": -1.9366772174835205, + "step": 3707 + }, + { + "epoch": 0.43, + "learning_rate": 1.7438838815404423e-07, + "logits/chosen": -3.6715540885925293, + "logits/rejected": -3.6815688610076904, + "logps/chosen": -284.1375732421875, + "logps/rejected": -285.5923767089844, + "loss": 0.1526, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2228599190711975, + "rewards/margins": 2.7908384799957275, + "rewards/rejected": -2.567978858947754, + "step": 3708 + }, + { + "epoch": 0.43, + "learning_rate": 1.7435327168442e-07, + "logits/chosen": -3.2557315826416016, + "logits/rejected": -3.175935983657837, + "logps/chosen": -232.49562072753906, + "logps/rejected": -274.1630859375, + "loss": 0.1356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15113241970539093, + "rewards/margins": 2.4042086601257324, + "rewards/rejected": -2.5553407669067383, + "step": 3709 + }, + { + "epoch": 0.43, + "learning_rate": 1.7431815521479574e-07, + "logits/chosen": -3.1184685230255127, + "logits/rejected": -2.7624216079711914, + "logps/chosen": -365.116455078125, + "logps/rejected": -250.8306884765625, + "loss": 0.4401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11741462349891663, + "rewards/margins": 1.4352836608886719, + "rewards/rejected": -1.5526982545852661, + "step": 3710 + }, + { + "epoch": 0.43, + "learning_rate": 1.7428303874517147e-07, + "logits/chosen": -3.1036338806152344, + "logits/rejected": -3.0900933742523193, + "logps/chosen": -265.8157653808594, + "logps/rejected": -313.23541259765625, + "loss": 0.7364, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5526188611984253, + "rewards/margins": 0.9299554824829102, + "rewards/rejected": -1.482574224472046, + "step": 3711 + }, + { + "epoch": 0.43, + "learning_rate": 1.7424792227554722e-07, + "logits/chosen": -3.0043256282806396, + "logits/rejected": -3.14552640914917, + "logps/chosen": -272.4706115722656, + "logps/rejected": -213.83880615234375, + "loss": 0.2668, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0703638419508934, + "rewards/margins": 2.5352463722229004, + "rewards/rejected": -2.4648826122283936, + "step": 3712 + }, + { + "epoch": 0.43, + "learning_rate": 1.7421280580592295e-07, + "logits/chosen": -3.430011510848999, + "logits/rejected": -3.286167621612549, + "logps/chosen": -153.2526397705078, + "logps/rejected": -192.38787841796875, + "loss": 0.352, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0560891330242157, + "rewards/margins": 2.607851266860962, + "rewards/rejected": -2.551762342453003, + "step": 3713 + }, + { + "epoch": 0.43, + "learning_rate": 1.741776893362987e-07, + "logits/chosen": -2.9374959468841553, + "logits/rejected": -2.735877513885498, + "logps/chosen": -270.79620361328125, + "logps/rejected": -233.01893615722656, + "loss": 0.3584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03279295191168785, + "rewards/margins": 1.639037847518921, + "rewards/rejected": -1.6062449216842651, + "step": 3714 + }, + { + "epoch": 0.43, + "learning_rate": 1.7414257286667449e-07, + "logits/chosen": -3.0995917320251465, + "logits/rejected": -3.0616824626922607, + "logps/chosen": -213.5299072265625, + "logps/rejected": -208.32095336914062, + "loss": 0.1337, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7068892121315002, + "rewards/margins": 3.0374233722686768, + "rewards/rejected": -2.3305342197418213, + "step": 3715 + }, + { + "epoch": 0.43, + "learning_rate": 1.741074563970502e-07, + "logits/chosen": -3.1602022647857666, + "logits/rejected": -3.3269128799438477, + "logps/chosen": -167.39398193359375, + "logps/rejected": -263.35919189453125, + "loss": 0.2527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6800152063369751, + "rewards/margins": 2.6010067462921143, + "rewards/rejected": -3.281022071838379, + "step": 3716 + }, + { + "epoch": 0.43, + "learning_rate": 1.7407233992742597e-07, + "logits/chosen": -3.234210968017578, + "logits/rejected": -3.3967504501342773, + "logps/chosen": -195.07281494140625, + "logps/rejected": -224.53060913085938, + "loss": 0.2893, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08705976605415344, + "rewards/margins": 2.4722471237182617, + "rewards/rejected": -2.3851871490478516, + "step": 3717 + }, + { + "epoch": 0.43, + "learning_rate": 1.740372234578017e-07, + "logits/chosen": -2.4172005653381348, + "logits/rejected": -2.2316784858703613, + "logps/chosen": -399.85931396484375, + "logps/rejected": -411.05389404296875, + "loss": 0.6556, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06230878829956055, + "rewards/margins": 0.8832308053970337, + "rewards/rejected": -0.9455395936965942, + "step": 3718 + }, + { + "epoch": 0.43, + "learning_rate": 1.7400210698817745e-07, + "logits/chosen": -3.3011815547943115, + "logits/rejected": -3.2946176528930664, + "logps/chosen": -135.64401245117188, + "logps/rejected": -176.7571258544922, + "loss": 0.2016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6940982937812805, + "rewards/margins": 2.8332295417785645, + "rewards/rejected": -2.1391313076019287, + "step": 3719 + }, + { + "epoch": 0.43, + "learning_rate": 1.739669905185532e-07, + "logits/chosen": -3.3727972507476807, + "logits/rejected": -3.685126304626465, + "logps/chosen": -250.3538360595703, + "logps/rejected": -226.63839721679688, + "loss": 0.1745, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14897069334983826, + "rewards/margins": 2.8145577907562256, + "rewards/rejected": -2.6655871868133545, + "step": 3720 + }, + { + "epoch": 0.43, + "learning_rate": 1.7393187404892893e-07, + "logits/chosen": -3.647068500518799, + "logits/rejected": -3.5583415031433105, + "logps/chosen": -202.49444580078125, + "logps/rejected": -162.31961059570312, + "loss": 1.1728, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8068774938583374, + "rewards/margins": 0.5751646757125854, + "rewards/rejected": -1.3820419311523438, + "step": 3721 + }, + { + "epoch": 0.43, + "learning_rate": 1.7389675757930468e-07, + "logits/chosen": -3.7852377891540527, + "logits/rejected": -3.5614068508148193, + "logps/chosen": -293.679443359375, + "logps/rejected": -189.707275390625, + "loss": 0.2695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23389622569084167, + "rewards/margins": 1.4923696517944336, + "rewards/rejected": -1.726265788078308, + "step": 3722 + }, + { + "epoch": 0.43, + "learning_rate": 1.7386164110968044e-07, + "logits/chosen": -4.119531631469727, + "logits/rejected": -3.844345808029175, + "logps/chosen": -168.73733520507812, + "logps/rejected": -120.65850830078125, + "loss": 0.3971, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5362793803215027, + "rewards/margins": 2.0037803649902344, + "rewards/rejected": -1.4675010442733765, + "step": 3723 + }, + { + "epoch": 0.43, + "learning_rate": 1.7382652464005616e-07, + "logits/chosen": -2.9961299896240234, + "logits/rejected": -3.2106330394744873, + "logps/chosen": -140.75152587890625, + "logps/rejected": -178.9505157470703, + "loss": 0.5323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24096640944480896, + "rewards/margins": 0.8276091814041138, + "rewards/rejected": -1.0685756206512451, + "step": 3724 + }, + { + "epoch": 0.43, + "learning_rate": 1.7379140817043192e-07, + "logits/chosen": -2.488757371902466, + "logits/rejected": -2.621731758117676, + "logps/chosen": -206.59608459472656, + "logps/rejected": -297.89263916015625, + "loss": 0.8511, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4363895058631897, + "rewards/margins": 0.003753870725631714, + "rewards/rejected": -0.440143346786499, + "step": 3725 + }, + { + "epoch": 0.43, + "learning_rate": 1.7375629170080765e-07, + "logits/chosen": -3.4577009677886963, + "logits/rejected": -3.3677000999450684, + "logps/chosen": -308.1956481933594, + "logps/rejected": -261.90234375, + "loss": 0.6174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24059553444385529, + "rewards/margins": 0.8099790215492249, + "rewards/rejected": -1.0505746603012085, + "step": 3726 + }, + { + "epoch": 0.43, + "learning_rate": 1.7372117523118343e-07, + "logits/chosen": -3.6408326625823975, + "logits/rejected": -3.6256370544433594, + "logps/chosen": -173.5767364501953, + "logps/rejected": -215.7896728515625, + "loss": 0.3642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6518775224685669, + "rewards/margins": 1.1302363872528076, + "rewards/rejected": -1.782113790512085, + "step": 3727 + }, + { + "epoch": 0.43, + "learning_rate": 1.7368605876155918e-07, + "logits/chosen": -2.5820603370666504, + "logits/rejected": -2.4769129753112793, + "logps/chosen": -270.1460876464844, + "logps/rejected": -233.9062957763672, + "loss": 0.5611, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4119492173194885, + "rewards/margins": 0.6932536363601685, + "rewards/rejected": -0.2813044488430023, + "step": 3728 + }, + { + "epoch": 0.43, + "learning_rate": 1.736509422919349e-07, + "logits/chosen": -3.1260123252868652, + "logits/rejected": -3.0632143020629883, + "logps/chosen": -311.21826171875, + "logps/rejected": -293.10626220703125, + "loss": 0.3418, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08253462612628937, + "rewards/margins": 1.4716479778289795, + "rewards/rejected": -1.389113426208496, + "step": 3729 + }, + { + "epoch": 0.43, + "learning_rate": 1.7361582582231066e-07, + "logits/chosen": -2.8321444988250732, + "logits/rejected": -2.815842390060425, + "logps/chosen": -397.676025390625, + "logps/rejected": -257.3731689453125, + "loss": 0.4558, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36626535654067993, + "rewards/margins": 1.32305908203125, + "rewards/rejected": -1.6893244981765747, + "step": 3730 + }, + { + "epoch": 0.43, + "learning_rate": 1.7358070935268641e-07, + "logits/chosen": -3.1440279483795166, + "logits/rejected": -2.856450080871582, + "logps/chosen": -201.34988403320312, + "logps/rejected": -140.28057861328125, + "loss": 0.4616, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07275040447711945, + "rewards/margins": 1.1361089944839478, + "rewards/rejected": -1.0633587837219238, + "step": 3731 + }, + { + "epoch": 0.43, + "learning_rate": 1.7354559288306214e-07, + "logits/chosen": -4.138597011566162, + "logits/rejected": -4.043882846832275, + "logps/chosen": -340.222900390625, + "logps/rejected": -277.70758056640625, + "loss": 0.3252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17927579581737518, + "rewards/margins": 1.4108481407165527, + "rewards/rejected": -1.5901238918304443, + "step": 3732 + }, + { + "epoch": 0.43, + "learning_rate": 1.735104764134379e-07, + "logits/chosen": -2.805663585662842, + "logits/rejected": -2.718316078186035, + "logps/chosen": -342.8249206542969, + "logps/rejected": -412.2599182128906, + "loss": 0.2897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16609282791614532, + "rewards/margins": 3.0698304176330566, + "rewards/rejected": -3.2359232902526855, + "step": 3733 + }, + { + "epoch": 0.43, + "learning_rate": 1.7347535994381362e-07, + "logits/chosen": -3.0370609760284424, + "logits/rejected": -3.0466582775115967, + "logps/chosen": -354.747802734375, + "logps/rejected": -263.7821350097656, + "loss": 0.5792, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8503535985946655, + "rewards/margins": 0.4751267433166504, + "rewards/rejected": -1.325480341911316, + "step": 3734 + }, + { + "epoch": 0.43, + "learning_rate": 1.7344024347418938e-07, + "logits/chosen": -2.662426710128784, + "logits/rejected": -2.6668052673339844, + "logps/chosen": -557.9041137695312, + "logps/rejected": -413.247802734375, + "loss": 0.3101, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.011926829814910889, + "rewards/margins": 2.16447114944458, + "rewards/rejected": -2.1525444984436035, + "step": 3735 + }, + { + "epoch": 0.43, + "learning_rate": 1.7340512700456513e-07, + "logits/chosen": -3.9934606552124023, + "logits/rejected": -3.525285482406616, + "logps/chosen": -261.7499694824219, + "logps/rejected": -169.47398376464844, + "loss": 0.4763, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.061528608202934265, + "rewards/margins": 1.1511785984039307, + "rewards/rejected": -1.08965003490448, + "step": 3736 + }, + { + "epoch": 0.43, + "learning_rate": 1.7337001053494086e-07, + "logits/chosen": -3.422666072845459, + "logits/rejected": -4.150254726409912, + "logps/chosen": -183.86012268066406, + "logps/rejected": -279.6000061035156, + "loss": 0.6761, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0509915351867676, + "rewards/margins": 0.8170249462127686, + "rewards/rejected": -1.8680164813995361, + "step": 3737 + }, + { + "epoch": 0.43, + "learning_rate": 1.7333489406531664e-07, + "logits/chosen": -3.2007670402526855, + "logits/rejected": -3.1191704273223877, + "logps/chosen": -96.04226684570312, + "logps/rejected": -133.2779541015625, + "loss": 0.6599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5465809106826782, + "rewards/margins": 0.23731614649295807, + "rewards/rejected": -0.7838970422744751, + "step": 3738 + }, + { + "epoch": 0.43, + "learning_rate": 1.732997775956924e-07, + "logits/chosen": -2.756735324859619, + "logits/rejected": -2.9342527389526367, + "logps/chosen": -307.6697998046875, + "logps/rejected": -215.6663360595703, + "loss": 0.3546, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11521779745817184, + "rewards/margins": 1.4246940612792969, + "rewards/rejected": -1.3094762563705444, + "step": 3739 + }, + { + "epoch": 0.43, + "learning_rate": 1.7326466112606812e-07, + "logits/chosen": -3.2743749618530273, + "logits/rejected": -3.680750608444214, + "logps/chosen": -333.0386962890625, + "logps/rejected": -244.85873413085938, + "loss": 0.605, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7138423919677734, + "rewards/margins": 1.7085479497909546, + "rewards/rejected": -2.4223904609680176, + "step": 3740 + }, + { + "epoch": 0.43, + "learning_rate": 1.7322954465644387e-07, + "logits/chosen": -4.102250576019287, + "logits/rejected": -4.1101460456848145, + "logps/chosen": -258.7186279296875, + "logps/rejected": -266.19683837890625, + "loss": 0.1982, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1363341212272644, + "rewards/margins": 3.0530388355255127, + "rewards/rejected": -2.9167046546936035, + "step": 3741 + }, + { + "epoch": 0.43, + "learning_rate": 1.731944281868196e-07, + "logits/chosen": -2.7496142387390137, + "logits/rejected": -2.7995071411132812, + "logps/chosen": -209.58712768554688, + "logps/rejected": -242.10256958007812, + "loss": 0.5387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.572131335735321, + "rewards/margins": 0.8373973965644836, + "rewards/rejected": -1.4095287322998047, + "step": 3742 + }, + { + "epoch": 0.43, + "learning_rate": 1.7315931171719536e-07, + "logits/chosen": -3.748307228088379, + "logits/rejected": -3.2961785793304443, + "logps/chosen": -376.5100402832031, + "logps/rejected": -241.3789825439453, + "loss": 0.1951, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09915222227573395, + "rewards/margins": 2.2838025093078613, + "rewards/rejected": -2.184650421142578, + "step": 3743 + }, + { + "epoch": 0.43, + "learning_rate": 1.731241952475711e-07, + "logits/chosen": -3.3801889419555664, + "logits/rejected": -3.3140628337860107, + "logps/chosen": -205.1801300048828, + "logps/rejected": -168.0063018798828, + "loss": 1.0723, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.024546504020691, + "rewards/margins": 0.4213202893733978, + "rewards/rejected": -1.4458668231964111, + "step": 3744 + }, + { + "epoch": 0.43, + "learning_rate": 1.7308907877794684e-07, + "logits/chosen": -3.621767520904541, + "logits/rejected": -3.583552837371826, + "logps/chosen": -223.68772888183594, + "logps/rejected": -161.5611114501953, + "loss": 0.3444, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0510859340429306, + "rewards/margins": 1.5853151082992554, + "rewards/rejected": -1.6364010572433472, + "step": 3745 + }, + { + "epoch": 0.43, + "learning_rate": 1.730539623083226e-07, + "logits/chosen": -3.3164844512939453, + "logits/rejected": -3.103489637374878, + "logps/chosen": -266.2097473144531, + "logps/rejected": -282.68487548828125, + "loss": 0.3769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4207172989845276, + "rewards/margins": 1.3649448156356812, + "rewards/rejected": -1.7856621742248535, + "step": 3746 + }, + { + "epoch": 0.43, + "learning_rate": 1.7301884583869837e-07, + "logits/chosen": -2.949655532836914, + "logits/rejected": -3.2077231407165527, + "logps/chosen": -271.07037353515625, + "logps/rejected": -225.57838439941406, + "loss": 0.3926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26247358322143555, + "rewards/margins": 1.007907509803772, + "rewards/rejected": -1.270380973815918, + "step": 3747 + }, + { + "epoch": 0.43, + "learning_rate": 1.7298372936907407e-07, + "logits/chosen": -2.7848761081695557, + "logits/rejected": -2.8310413360595703, + "logps/chosen": -120.28494262695312, + "logps/rejected": -199.54537963867188, + "loss": 0.3203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40132513642311096, + "rewards/margins": 1.4209840297698975, + "rewards/rejected": -1.8223092555999756, + "step": 3748 + }, + { + "epoch": 0.43, + "learning_rate": 1.7294861289944985e-07, + "logits/chosen": -3.4358983039855957, + "logits/rejected": -3.021350860595703, + "logps/chosen": -209.57484436035156, + "logps/rejected": -181.9486541748047, + "loss": 0.3157, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03155313432216644, + "rewards/margins": 1.6744012832641602, + "rewards/rejected": -1.7059543132781982, + "step": 3749 + }, + { + "epoch": 0.43, + "learning_rate": 1.7291349642982558e-07, + "logits/chosen": -3.9762468338012695, + "logits/rejected": -3.746121883392334, + "logps/chosen": -254.10276794433594, + "logps/rejected": -291.08160400390625, + "loss": 0.613, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9535631537437439, + "rewards/margins": 1.3152574300765991, + "rewards/rejected": -2.2688205242156982, + "step": 3750 + }, + { + "epoch": 0.43, + "learning_rate": 1.7287837996020133e-07, + "logits/chosen": -3.0338549613952637, + "logits/rejected": -3.205471992492676, + "logps/chosen": -245.88043212890625, + "logps/rejected": -345.44879150390625, + "loss": 0.2955, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6738663911819458, + "rewards/margins": 2.2599024772644043, + "rewards/rejected": -1.5860360860824585, + "step": 3751 + }, + { + "epoch": 0.43, + "learning_rate": 1.728432634905771e-07, + "logits/chosen": -3.031090021133423, + "logits/rejected": -3.026207447052002, + "logps/chosen": -182.75103759765625, + "logps/rejected": -260.35736083984375, + "loss": 0.4836, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3902100920677185, + "rewards/margins": 2.264827013015747, + "rewards/rejected": -2.6550369262695312, + "step": 3752 + }, + { + "epoch": 0.43, + "learning_rate": 1.7280814702095281e-07, + "logits/chosen": -3.0521602630615234, + "logits/rejected": -2.8582916259765625, + "logps/chosen": -260.48919677734375, + "logps/rejected": -175.30770874023438, + "loss": 0.3976, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0617675706744194, + "rewards/margins": 1.4435043334960938, + "rewards/rejected": -1.5052720308303833, + "step": 3753 + }, + { + "epoch": 0.43, + "learning_rate": 1.7277303055132857e-07, + "logits/chosen": -3.5169730186462402, + "logits/rejected": -3.267887592315674, + "logps/chosen": -179.4544677734375, + "logps/rejected": -171.11480712890625, + "loss": 0.5368, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24344421923160553, + "rewards/margins": 0.7576186656951904, + "rewards/rejected": -0.5141744613647461, + "step": 3754 + }, + { + "epoch": 0.43, + "learning_rate": 1.7273791408170432e-07, + "logits/chosen": -3.2177040576934814, + "logits/rejected": -3.2704086303710938, + "logps/chosen": -311.4418029785156, + "logps/rejected": -290.2576904296875, + "loss": 0.2421, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08006151020526886, + "rewards/margins": 2.496225357055664, + "rewards/rejected": -2.416163682937622, + "step": 3755 + }, + { + "epoch": 0.43, + "learning_rate": 1.7270279761208005e-07, + "logits/chosen": -2.321563482284546, + "logits/rejected": -2.388504981994629, + "logps/chosen": -403.6138916015625, + "logps/rejected": -243.88693237304688, + "loss": 0.2603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7375403642654419, + "rewards/margins": 1.9392553567886353, + "rewards/rejected": -1.2017149925231934, + "step": 3756 + }, + { + "epoch": 0.43, + "learning_rate": 1.726676811424558e-07, + "logits/chosen": -3.4286608695983887, + "logits/rejected": -3.783137798309326, + "logps/chosen": -210.37037658691406, + "logps/rejected": -304.6061706542969, + "loss": 0.5576, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49288368225097656, + "rewards/margins": 0.9996010065078735, + "rewards/rejected": -0.5067174434661865, + "step": 3757 + }, + { + "epoch": 0.43, + "learning_rate": 1.7263256467283153e-07, + "logits/chosen": -2.32926344871521, + "logits/rejected": -2.346259832382202, + "logps/chosen": -231.8414764404297, + "logps/rejected": -225.57867431640625, + "loss": 0.3891, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5577883720397949, + "rewards/margins": 1.288053035736084, + "rewards/rejected": -0.7302647829055786, + "step": 3758 + }, + { + "epoch": 0.43, + "learning_rate": 1.7259744820320728e-07, + "logits/chosen": -3.015768527984619, + "logits/rejected": -2.9672882556915283, + "logps/chosen": -216.74713134765625, + "logps/rejected": -222.57928466796875, + "loss": 0.3199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.046950459480285645, + "rewards/margins": 2.1382627487182617, + "rewards/rejected": -2.185213327407837, + "step": 3759 + }, + { + "epoch": 0.43, + "learning_rate": 1.7256233173358306e-07, + "logits/chosen": -2.8012619018554688, + "logits/rejected": -2.857393264770508, + "logps/chosen": -482.74237060546875, + "logps/rejected": -297.6199951171875, + "loss": 0.324, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3245088458061218, + "rewards/margins": 2.26320481300354, + "rewards/rejected": -1.938696026802063, + "step": 3760 + }, + { + "epoch": 0.43, + "learning_rate": 1.725272152639588e-07, + "logits/chosen": -3.3329856395721436, + "logits/rejected": -3.1899263858795166, + "logps/chosen": -255.1383819580078, + "logps/rejected": -296.7931213378906, + "loss": 0.2929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25162559747695923, + "rewards/margins": 3.2806930541992188, + "rewards/rejected": -3.532318592071533, + "step": 3761 + }, + { + "epoch": 0.43, + "learning_rate": 1.7249209879433455e-07, + "logits/chosen": -3.172010660171509, + "logits/rejected": -2.808654546737671, + "logps/chosen": -268.28656005859375, + "logps/rejected": -240.1605224609375, + "loss": 0.4486, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09425263106822968, + "rewards/margins": 1.5832979679107666, + "rewards/rejected": -1.4890453815460205, + "step": 3762 + }, + { + "epoch": 0.43, + "learning_rate": 1.7245698232471027e-07, + "logits/chosen": -3.5375232696533203, + "logits/rejected": -3.709341526031494, + "logps/chosen": -220.43634033203125, + "logps/rejected": -221.7480010986328, + "loss": 0.2565, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2044648975133896, + "rewards/margins": 1.8968849182128906, + "rewards/rejected": -1.6924200057983398, + "step": 3763 + }, + { + "epoch": 0.43, + "learning_rate": 1.7242186585508603e-07, + "logits/chosen": -3.24182391166687, + "logits/rejected": -3.2131035327911377, + "logps/chosen": -242.82884216308594, + "logps/rejected": -281.5714111328125, + "loss": 0.1565, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1911657750606537, + "rewards/margins": 3.1873292922973633, + "rewards/rejected": -2.9961633682250977, + "step": 3764 + }, + { + "epoch": 0.43, + "learning_rate": 1.7238674938546178e-07, + "logits/chosen": -2.767106056213379, + "logits/rejected": -2.4081058502197266, + "logps/chosen": -290.01239013671875, + "logps/rejected": -484.33319091796875, + "loss": 0.5825, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1530022919178009, + "rewards/margins": 0.7826467752456665, + "rewards/rejected": -0.935649037361145, + "step": 3765 + }, + { + "epoch": 0.43, + "learning_rate": 1.723516329158375e-07, + "logits/chosen": -2.8416781425476074, + "logits/rejected": -2.5857510566711426, + "logps/chosen": -386.83282470703125, + "logps/rejected": -217.19271850585938, + "loss": 0.5377, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2793276906013489, + "rewards/margins": 1.8353736400604248, + "rewards/rejected": -1.5560457706451416, + "step": 3766 + }, + { + "epoch": 0.43, + "learning_rate": 1.7231651644621326e-07, + "logits/chosen": -2.7575252056121826, + "logits/rejected": -2.584373712539673, + "logps/chosen": -279.595947265625, + "logps/rejected": -392.8921203613281, + "loss": 0.3786, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09211406856775284, + "rewards/margins": 3.1375341415405273, + "rewards/rejected": -3.045419931411743, + "step": 3767 + }, + { + "epoch": 0.43, + "learning_rate": 1.7228139997658902e-07, + "logits/chosen": -3.4872541427612305, + "logits/rejected": -3.5614116191864014, + "logps/chosen": -320.3623046875, + "logps/rejected": -273.0469055175781, + "loss": 0.6961, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12884853780269623, + "rewards/margins": 0.7124890089035034, + "rewards/rejected": -0.5836405158042908, + "step": 3768 + }, + { + "epoch": 0.43, + "learning_rate": 1.7224628350696474e-07, + "logits/chosen": -3.2136921882629395, + "logits/rejected": -2.9907727241516113, + "logps/chosen": -311.1197204589844, + "logps/rejected": -283.3961486816406, + "loss": 0.2554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01953045278787613, + "rewards/margins": 1.861210823059082, + "rewards/rejected": -1.8807411193847656, + "step": 3769 + }, + { + "epoch": 0.43, + "learning_rate": 1.722111670373405e-07, + "logits/chosen": -2.921227216720581, + "logits/rejected": -2.9075517654418945, + "logps/chosen": -140.94256591796875, + "logps/rejected": -171.49383544921875, + "loss": 0.3574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11262717843055725, + "rewards/margins": 1.8568111658096313, + "rewards/rejected": -1.9694383144378662, + "step": 3770 + }, + { + "epoch": 0.43, + "learning_rate": 1.7217605056771622e-07, + "logits/chosen": -3.271697759628296, + "logits/rejected": -3.0364036560058594, + "logps/chosen": -601.8358764648438, + "logps/rejected": -228.93771362304688, + "loss": 0.4102, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6326030492782593, + "rewards/margins": 2.4144575595855713, + "rewards/rejected": -3.047060489654541, + "step": 3771 + }, + { + "epoch": 0.43, + "learning_rate": 1.72140934098092e-07, + "logits/chosen": -3.837602376937866, + "logits/rejected": -3.569702625274658, + "logps/chosen": -294.4512634277344, + "logps/rejected": -239.8753204345703, + "loss": 0.3705, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32728689908981323, + "rewards/margins": 2.192551612854004, + "rewards/rejected": -1.8652647733688354, + "step": 3772 + }, + { + "epoch": 0.43, + "learning_rate": 1.7210581762846776e-07, + "logits/chosen": -2.7564144134521484, + "logits/rejected": -2.894111156463623, + "logps/chosen": -262.4818115234375, + "logps/rejected": -287.537353515625, + "loss": 0.8863, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8129072785377502, + "rewards/margins": 0.1512846052646637, + "rewards/rejected": -0.9641919136047363, + "step": 3773 + }, + { + "epoch": 0.44, + "learning_rate": 1.7207070115884349e-07, + "logits/chosen": -2.9237234592437744, + "logits/rejected": -3.1189920902252197, + "logps/chosen": -157.84848022460938, + "logps/rejected": -245.73715209960938, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2860395610332489, + "rewards/margins": 3.3897221088409424, + "rewards/rejected": -3.103682518005371, + "step": 3774 + }, + { + "epoch": 0.44, + "learning_rate": 1.7203558468921924e-07, + "logits/chosen": -2.9473421573638916, + "logits/rejected": -3.088089942932129, + "logps/chosen": -230.54855346679688, + "logps/rejected": -180.13418579101562, + "loss": 0.3523, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2866535782814026, + "rewards/margins": 1.3763785362243652, + "rewards/rejected": -1.6630322933197021, + "step": 3775 + }, + { + "epoch": 0.44, + "learning_rate": 1.72000468219595e-07, + "logits/chosen": -2.932097911834717, + "logits/rejected": -2.9439802169799805, + "logps/chosen": -413.0455627441406, + "logps/rejected": -314.57952880859375, + "loss": 0.2752, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2476312518119812, + "rewards/margins": 1.8542098999023438, + "rewards/rejected": -2.1018409729003906, + "step": 3776 + }, + { + "epoch": 0.44, + "learning_rate": 1.7196535174997072e-07, + "logits/chosen": -3.646982192993164, + "logits/rejected": -3.5575196743011475, + "logps/chosen": -363.8162536621094, + "logps/rejected": -284.8108215332031, + "loss": 0.5042, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15030869841575623, + "rewards/margins": 0.9669332504272461, + "rewards/rejected": -1.1172419786453247, + "step": 3777 + }, + { + "epoch": 0.44, + "learning_rate": 1.7193023528034648e-07, + "logits/chosen": -3.6191353797912598, + "logits/rejected": -3.0513086318969727, + "logps/chosen": -154.88742065429688, + "logps/rejected": -191.8935546875, + "loss": 0.4521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5188064575195312, + "rewards/margins": 2.0743510723114014, + "rewards/rejected": -2.5931572914123535, + "step": 3778 + }, + { + "epoch": 0.44, + "learning_rate": 1.718951188107222e-07, + "logits/chosen": -3.2513442039489746, + "logits/rejected": -3.0559425354003906, + "logps/chosen": -340.1258239746094, + "logps/rejected": -213.18914794921875, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23339098691940308, + "rewards/margins": 1.9659757614135742, + "rewards/rejected": -1.732584834098816, + "step": 3779 + }, + { + "epoch": 0.44, + "learning_rate": 1.7186000234109796e-07, + "logits/chosen": -3.9758970737457275, + "logits/rejected": -3.5776596069335938, + "logps/chosen": -285.97210693359375, + "logps/rejected": -251.82049560546875, + "loss": 0.1721, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39764294028282166, + "rewards/margins": 3.1666245460510254, + "rewards/rejected": -2.7689812183380127, + "step": 3780 + }, + { + "epoch": 0.44, + "learning_rate": 1.718248858714737e-07, + "logits/chosen": -3.3483729362487793, + "logits/rejected": -3.2296245098114014, + "logps/chosen": -150.41799926757812, + "logps/rejected": -184.947021484375, + "loss": 0.4254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3149542808532715, + "rewards/margins": 1.4023830890655518, + "rewards/rejected": -1.7173373699188232, + "step": 3781 + }, + { + "epoch": 0.44, + "learning_rate": 1.7178976940184944e-07, + "logits/chosen": -3.3945565223693848, + "logits/rejected": -3.072197914123535, + "logps/chosen": -397.41107177734375, + "logps/rejected": -263.0228271484375, + "loss": 0.2505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30891740322113037, + "rewards/margins": 1.6927427053451538, + "rewards/rejected": -2.001660108566284, + "step": 3782 + }, + { + "epoch": 0.44, + "learning_rate": 1.7175465293222522e-07, + "logits/chosen": -3.784128189086914, + "logits/rejected": -3.7780697345733643, + "logps/chosen": -211.1038818359375, + "logps/rejected": -199.1459197998047, + "loss": 0.2644, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38433822989463806, + "rewards/margins": 1.9595481157302856, + "rewards/rejected": -1.5752098560333252, + "step": 3783 + }, + { + "epoch": 0.44, + "learning_rate": 1.7171953646260097e-07, + "logits/chosen": -3.6336989402770996, + "logits/rejected": -3.193356990814209, + "logps/chosen": -223.29385375976562, + "logps/rejected": -184.8057861328125, + "loss": 0.2845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4022286534309387, + "rewards/margins": 1.5733940601348877, + "rewards/rejected": -1.9756226539611816, + "step": 3784 + }, + { + "epoch": 0.44, + "learning_rate": 1.716844199929767e-07, + "logits/chosen": -3.471452236175537, + "logits/rejected": -3.5129008293151855, + "logps/chosen": -206.58956909179688, + "logps/rejected": -193.74070739746094, + "loss": 0.4442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3440912961959839, + "rewards/margins": 1.327117681503296, + "rewards/rejected": -1.6712090969085693, + "step": 3785 + }, + { + "epoch": 0.44, + "learning_rate": 1.7164930352335245e-07, + "logits/chosen": -3.1886954307556152, + "logits/rejected": -3.4079842567443848, + "logps/chosen": -238.61163330078125, + "logps/rejected": -235.8498077392578, + "loss": 0.606, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34245002269744873, + "rewards/margins": 1.023464322090149, + "rewards/rejected": -1.365914225578308, + "step": 3786 + }, + { + "epoch": 0.44, + "learning_rate": 1.7161418705372818e-07, + "logits/chosen": -2.654240608215332, + "logits/rejected": -2.445498466491699, + "logps/chosen": -302.33251953125, + "logps/rejected": -149.2390594482422, + "loss": 0.7057, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33185315132141113, + "rewards/margins": 0.4227653741836548, + "rewards/rejected": -0.7546184062957764, + "step": 3787 + }, + { + "epoch": 0.44, + "learning_rate": 1.7157907058410393e-07, + "logits/chosen": -3.180511951446533, + "logits/rejected": -3.46157169342041, + "logps/chosen": -283.07147216796875, + "logps/rejected": -301.0265197753906, + "loss": 0.4253, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16283874213695526, + "rewards/margins": 2.060246467590332, + "rewards/rejected": -2.223085403442383, + "step": 3788 + }, + { + "epoch": 0.44, + "learning_rate": 1.715439541144797e-07, + "logits/chosen": -2.825237274169922, + "logits/rejected": -2.553175926208496, + "logps/chosen": -414.92095947265625, + "logps/rejected": -333.3368835449219, + "loss": 0.193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07892120629549026, + "rewards/margins": 2.303670883178711, + "rewards/rejected": -2.38259220123291, + "step": 3789 + }, + { + "epoch": 0.44, + "learning_rate": 1.7150883764485542e-07, + "logits/chosen": -3.8301186561584473, + "logits/rejected": -3.7145628929138184, + "logps/chosen": -262.1764831542969, + "logps/rejected": -131.44866943359375, + "loss": 0.3186, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22470420598983765, + "rewards/margins": 2.341282367706299, + "rewards/rejected": -2.1165781021118164, + "step": 3790 + }, + { + "epoch": 0.44, + "learning_rate": 1.7147372117523117e-07, + "logits/chosen": -3.1890060901641846, + "logits/rejected": -2.943732500076294, + "logps/chosen": -360.22442626953125, + "logps/rejected": -377.3102111816406, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3285140097141266, + "rewards/margins": 2.470351457595825, + "rewards/rejected": -2.1418375968933105, + "step": 3791 + }, + { + "epoch": 0.44, + "learning_rate": 1.7143860470560695e-07, + "logits/chosen": -2.6706409454345703, + "logits/rejected": -2.5234575271606445, + "logps/chosen": -349.8684997558594, + "logps/rejected": -310.84259033203125, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14751070737838745, + "rewards/margins": 0.8692659139633179, + "rewards/rejected": -1.01677668094635, + "step": 3792 + }, + { + "epoch": 0.44, + "learning_rate": 1.7140348823598265e-07, + "logits/chosen": -3.367725133895874, + "logits/rejected": -2.971841335296631, + "logps/chosen": -309.6128234863281, + "logps/rejected": -317.1749267578125, + "loss": 0.3931, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03316380828619003, + "rewards/margins": 1.6719704866409302, + "rewards/rejected": -1.638806700706482, + "step": 3793 + }, + { + "epoch": 0.44, + "learning_rate": 1.7136837176635843e-07, + "logits/chosen": -2.906613826751709, + "logits/rejected": -3.1899733543395996, + "logps/chosen": -395.36065673828125, + "logps/rejected": -236.57327270507812, + "loss": 0.5778, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1592264175415039, + "rewards/margins": 1.1905279159545898, + "rewards/rejected": -1.3497543334960938, + "step": 3794 + }, + { + "epoch": 0.44, + "learning_rate": 1.7133325529673416e-07, + "logits/chosen": -3.7073416709899902, + "logits/rejected": -3.016030788421631, + "logps/chosen": -264.8912353515625, + "logps/rejected": -239.44078063964844, + "loss": 0.436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1900431513786316, + "rewards/margins": 1.4997045993804932, + "rewards/rejected": -1.6897478103637695, + "step": 3795 + }, + { + "epoch": 0.44, + "learning_rate": 1.712981388271099e-07, + "logits/chosen": -4.088573932647705, + "logits/rejected": -4.187191963195801, + "logps/chosen": -167.92295837402344, + "logps/rejected": -159.87496948242188, + "loss": 0.2792, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.018006712198257446, + "rewards/margins": 1.915718674659729, + "rewards/rejected": -1.933725357055664, + "step": 3796 + }, + { + "epoch": 0.44, + "learning_rate": 1.7126302235748567e-07, + "logits/chosen": -3.145259380340576, + "logits/rejected": -3.128885269165039, + "logps/chosen": -339.15179443359375, + "logps/rejected": -443.16558837890625, + "loss": 0.1871, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36675554513931274, + "rewards/margins": 2.5216948986053467, + "rewards/rejected": -2.1549394130706787, + "step": 3797 + }, + { + "epoch": 0.44, + "learning_rate": 1.712279058878614e-07, + "logits/chosen": -3.3041372299194336, + "logits/rejected": -3.260498285293579, + "logps/chosen": -514.44287109375, + "logps/rejected": -352.6444091796875, + "loss": 0.1446, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.413865864276886, + "rewards/margins": 3.511106491088867, + "rewards/rejected": -3.097240686416626, + "step": 3798 + }, + { + "epoch": 0.44, + "learning_rate": 1.7119278941823715e-07, + "logits/chosen": -2.978891134262085, + "logits/rejected": -2.7410731315612793, + "logps/chosen": -175.6814727783203, + "logps/rejected": -185.3964080810547, + "loss": 0.5616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.276872456073761, + "rewards/margins": 0.5685970783233643, + "rewards/rejected": -0.84546959400177, + "step": 3799 + }, + { + "epoch": 0.44, + "learning_rate": 1.711576729486129e-07, + "logits/chosen": -2.6903491020202637, + "logits/rejected": -2.850292921066284, + "logps/chosen": -132.75621032714844, + "logps/rejected": -304.98046875, + "loss": 0.2971, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03744758665561676, + "rewards/margins": 2.0476841926574707, + "rewards/rejected": -2.0851316452026367, + "step": 3800 + }, + { + "epoch": 0.44, + "learning_rate": 1.7112255647898863e-07, + "logits/chosen": -3.384897470474243, + "logits/rejected": -3.6092803478240967, + "logps/chosen": -119.74797058105469, + "logps/rejected": -150.85470581054688, + "loss": 0.3868, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5893073081970215, + "rewards/margins": 2.45082426071167, + "rewards/rejected": -1.8615169525146484, + "step": 3801 + }, + { + "epoch": 0.44, + "learning_rate": 1.7108744000936438e-07, + "logits/chosen": -3.722026824951172, + "logits/rejected": -3.499962329864502, + "logps/chosen": -280.06085205078125, + "logps/rejected": -276.5624084472656, + "loss": 0.3727, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26659050583839417, + "rewards/margins": 1.6953930854797363, + "rewards/rejected": -1.9619836807250977, + "step": 3802 + }, + { + "epoch": 0.44, + "learning_rate": 1.710523235397401e-07, + "logits/chosen": -3.38175106048584, + "logits/rejected": -3.1147313117980957, + "logps/chosen": -286.04766845703125, + "logps/rejected": -287.004150390625, + "loss": 0.8426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4416065514087677, + "rewards/margins": 1.2828757762908936, + "rewards/rejected": -1.7244822978973389, + "step": 3803 + }, + { + "epoch": 0.44, + "learning_rate": 1.7101720707011586e-07, + "logits/chosen": -3.362394332885742, + "logits/rejected": -3.0913658142089844, + "logps/chosen": -104.05601501464844, + "logps/rejected": -131.61489868164062, + "loss": 0.5674, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7701177000999451, + "rewards/margins": 0.5198270082473755, + "rewards/rejected": -1.2899446487426758, + "step": 3804 + }, + { + "epoch": 0.44, + "learning_rate": 1.7098209060049164e-07, + "logits/chosen": -2.964688539505005, + "logits/rejected": -2.905113935470581, + "logps/chosen": -317.737548828125, + "logps/rejected": -255.02978515625, + "loss": 0.5529, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05295287072658539, + "rewards/margins": 1.0584172010421753, + "rewards/rejected": -1.1113699674606323, + "step": 3805 + }, + { + "epoch": 0.44, + "learning_rate": 1.7094697413086737e-07, + "logits/chosen": -2.352909803390503, + "logits/rejected": -2.485879421234131, + "logps/chosen": -349.4520263671875, + "logps/rejected": -396.7132263183594, + "loss": 0.2704, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5364240407943726, + "rewards/margins": 1.7808735370635986, + "rewards/rejected": -1.2444497346878052, + "step": 3806 + }, + { + "epoch": 0.44, + "learning_rate": 1.7091185766124313e-07, + "logits/chosen": -3.4424283504486084, + "logits/rejected": -3.2578954696655273, + "logps/chosen": -224.66932678222656, + "logps/rejected": -236.7154083251953, + "loss": 0.4627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3880527913570404, + "rewards/margins": 0.9170973896980286, + "rewards/rejected": -1.3051502704620361, + "step": 3807 + }, + { + "epoch": 0.44, + "learning_rate": 1.7087674119161885e-07, + "logits/chosen": -2.9239816665649414, + "logits/rejected": -2.7686431407928467, + "logps/chosen": -279.962890625, + "logps/rejected": -280.96533203125, + "loss": 0.2851, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7070999145507812, + "rewards/margins": 2.0233676433563232, + "rewards/rejected": -1.3162678480148315, + "step": 3808 + }, + { + "epoch": 0.44, + "learning_rate": 1.708416247219946e-07, + "logits/chosen": -2.91094708442688, + "logits/rejected": -2.834409236907959, + "logps/chosen": -206.5435333251953, + "logps/rejected": -348.33905029296875, + "loss": 0.5562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.100542813539505, + "rewards/margins": 0.9231792092323303, + "rewards/rejected": -1.0237220525741577, + "step": 3809 + }, + { + "epoch": 0.44, + "learning_rate": 1.7080650825237036e-07, + "logits/chosen": -3.069371461868286, + "logits/rejected": -3.2963435649871826, + "logps/chosen": -322.9842834472656, + "logps/rejected": -290.70294189453125, + "loss": 0.2629, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1866873800754547, + "rewards/margins": 2.3205254077911377, + "rewards/rejected": -2.133838176727295, + "step": 3810 + }, + { + "epoch": 0.44, + "learning_rate": 1.707713917827461e-07, + "logits/chosen": -3.197383165359497, + "logits/rejected": -3.3959131240844727, + "logps/chosen": -243.641845703125, + "logps/rejected": -312.5992431640625, + "loss": 0.2223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19247426092624664, + "rewards/margins": 2.029167890548706, + "rewards/rejected": -2.221642255783081, + "step": 3811 + }, + { + "epoch": 0.44, + "learning_rate": 1.7073627531312184e-07, + "logits/chosen": -3.0703818798065186, + "logits/rejected": -3.1225194931030273, + "logps/chosen": -169.8203887939453, + "logps/rejected": -267.2319030761719, + "loss": 0.417, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06669960170984268, + "rewards/margins": 2.3917927742004395, + "rewards/rejected": -2.3250932693481445, + "step": 3812 + }, + { + "epoch": 0.44, + "learning_rate": 1.707011588434976e-07, + "logits/chosen": -3.377410650253296, + "logits/rejected": -3.4691505432128906, + "logps/chosen": -440.074462890625, + "logps/rejected": -498.91485595703125, + "loss": 0.4498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12108853459358215, + "rewards/margins": 3.088165760040283, + "rewards/rejected": -3.209254503250122, + "step": 3813 + }, + { + "epoch": 0.44, + "learning_rate": 1.7066604237387332e-07, + "logits/chosen": -3.2385759353637695, + "logits/rejected": -3.221871852874756, + "logps/chosen": -396.2362060546875, + "logps/rejected": -305.79925537109375, + "loss": 0.6136, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09719810634851456, + "rewards/margins": 0.8486742973327637, + "rewards/rejected": -0.9458723068237305, + "step": 3814 + }, + { + "epoch": 0.44, + "learning_rate": 1.7063092590424908e-07, + "logits/chosen": -3.0420916080474854, + "logits/rejected": -3.44242525100708, + "logps/chosen": -207.71365356445312, + "logps/rejected": -215.11241149902344, + "loss": 0.2066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11452966928482056, + "rewards/margins": 2.885307550430298, + "rewards/rejected": -2.9998371601104736, + "step": 3815 + }, + { + "epoch": 0.44, + "learning_rate": 1.705958094346248e-07, + "logits/chosen": -2.3081603050231934, + "logits/rejected": -2.5343337059020996, + "logps/chosen": -187.44094848632812, + "logps/rejected": -200.67991638183594, + "loss": 0.8023, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5093955397605896, + "rewards/margins": 0.6127266883850098, + "rewards/rejected": -1.1221221685409546, + "step": 3816 + }, + { + "epoch": 0.44, + "learning_rate": 1.7056069296500058e-07, + "logits/chosen": -2.81065034866333, + "logits/rejected": -2.7185287475585938, + "logps/chosen": -166.5787811279297, + "logps/rejected": -156.59622192382812, + "loss": 0.4313, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4592604339122772, + "rewards/margins": 1.682938814163208, + "rewards/rejected": -1.2236783504486084, + "step": 3817 + }, + { + "epoch": 0.44, + "learning_rate": 1.7052557649537634e-07, + "logits/chosen": -3.3730549812316895, + "logits/rejected": -3.2173471450805664, + "logps/chosen": -272.0760192871094, + "logps/rejected": -369.3324279785156, + "loss": 0.3986, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3932245373725891, + "rewards/margins": 1.6551427841186523, + "rewards/rejected": -2.0483672618865967, + "step": 3818 + }, + { + "epoch": 0.44, + "learning_rate": 1.7049046002575207e-07, + "logits/chosen": -3.074230670928955, + "logits/rejected": -3.325981616973877, + "logps/chosen": -208.9912872314453, + "logps/rejected": -241.4286346435547, + "loss": 0.4788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05713619291782379, + "rewards/margins": 1.1786962747573853, + "rewards/rejected": -1.2358324527740479, + "step": 3819 + }, + { + "epoch": 0.44, + "learning_rate": 1.7045534355612782e-07, + "logits/chosen": -3.706920623779297, + "logits/rejected": -3.5391039848327637, + "logps/chosen": -191.14913940429688, + "logps/rejected": -220.66818237304688, + "loss": 0.3089, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10462673008441925, + "rewards/margins": 2.4020633697509766, + "rewards/rejected": -2.50669002532959, + "step": 3820 + }, + { + "epoch": 0.44, + "learning_rate": 1.7042022708650357e-07, + "logits/chosen": -2.6009182929992676, + "logits/rejected": -2.45725154876709, + "logps/chosen": -111.2066421508789, + "logps/rejected": -121.3702163696289, + "loss": 1.0407, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2581225633621216, + "rewards/margins": -0.01598186045885086, + "rewards/rejected": -1.2421408891677856, + "step": 3821 + }, + { + "epoch": 0.44, + "learning_rate": 1.703851106168793e-07, + "logits/chosen": -3.23268985748291, + "logits/rejected": -3.356743335723877, + "logps/chosen": -90.07105255126953, + "logps/rejected": -192.77630615234375, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.057198844850063324, + "rewards/margins": 1.8842376470565796, + "rewards/rejected": -1.8270388841629028, + "step": 3822 + }, + { + "epoch": 0.44, + "learning_rate": 1.7034999414725505e-07, + "logits/chosen": -2.639680862426758, + "logits/rejected": -2.80167555809021, + "logps/chosen": -200.57904052734375, + "logps/rejected": -316.2478332519531, + "loss": 0.3836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05684966593980789, + "rewards/margins": 2.7982258796691895, + "rewards/rejected": -2.7413763999938965, + "step": 3823 + }, + { + "epoch": 0.44, + "learning_rate": 1.7031487767763078e-07, + "logits/chosen": -3.003185510635376, + "logits/rejected": -3.0988316535949707, + "logps/chosen": -154.01751708984375, + "logps/rejected": -225.97911071777344, + "loss": 0.5552, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06929926574230194, + "rewards/margins": 1.1516435146331787, + "rewards/rejected": -1.2209429740905762, + "step": 3824 + }, + { + "epoch": 0.44, + "learning_rate": 1.7027976120800654e-07, + "logits/chosen": -3.283865213394165, + "logits/rejected": -3.0429017543792725, + "logps/chosen": -224.4901885986328, + "logps/rejected": -125.26390838623047, + "loss": 0.4067, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.236540287733078, + "rewards/margins": 1.3416614532470703, + "rewards/rejected": -1.5782018899917603, + "step": 3825 + }, + { + "epoch": 0.44, + "learning_rate": 1.7024464473838232e-07, + "logits/chosen": -3.0854709148406982, + "logits/rejected": -3.2770776748657227, + "logps/chosen": -180.8231964111328, + "logps/rejected": -200.98545837402344, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06526744365692139, + "rewards/margins": 2.7088186740875244, + "rewards/rejected": -2.7740859985351562, + "step": 3826 + }, + { + "epoch": 0.44, + "learning_rate": 1.7020952826875802e-07, + "logits/chosen": -2.8152170181274414, + "logits/rejected": -2.7437825202941895, + "logps/chosen": -214.0752716064453, + "logps/rejected": -276.72247314453125, + "loss": 0.2614, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2248414158821106, + "rewards/margins": 2.4227190017700195, + "rewards/rejected": -2.1978774070739746, + "step": 3827 + }, + { + "epoch": 0.44, + "learning_rate": 1.701744117991338e-07, + "logits/chosen": -2.9112558364868164, + "logits/rejected": -2.6681814193725586, + "logps/chosen": -363.1322937011719, + "logps/rejected": -233.21099853515625, + "loss": 0.4897, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42571067810058594, + "rewards/margins": 1.8968596458435059, + "rewards/rejected": -2.322570323944092, + "step": 3828 + }, + { + "epoch": 0.44, + "learning_rate": 1.7013929532950955e-07, + "logits/chosen": -3.256643295288086, + "logits/rejected": -2.9688034057617188, + "logps/chosen": -126.83967590332031, + "logps/rejected": -112.01837158203125, + "loss": 0.6386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3388954997062683, + "rewards/margins": 0.8048197627067566, + "rewards/rejected": -1.1437151432037354, + "step": 3829 + }, + { + "epoch": 0.44, + "learning_rate": 1.7010417885988528e-07, + "logits/chosen": -2.7688357830047607, + "logits/rejected": -2.922804355621338, + "logps/chosen": -228.9270477294922, + "logps/rejected": -168.8568115234375, + "loss": 0.3219, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5096137523651123, + "rewards/margins": 2.0172204971313477, + "rewards/rejected": -1.5076066255569458, + "step": 3830 + }, + { + "epoch": 0.44, + "learning_rate": 1.7006906239026103e-07, + "logits/chosen": -3.156186819076538, + "logits/rejected": -3.1783320903778076, + "logps/chosen": -167.80462646484375, + "logps/rejected": -199.39089965820312, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21322020888328552, + "rewards/margins": 1.2089229822158813, + "rewards/rejected": -1.4221431016921997, + "step": 3831 + }, + { + "epoch": 0.44, + "learning_rate": 1.7003394592063676e-07, + "logits/chosen": -3.0764353275299072, + "logits/rejected": -3.278012275695801, + "logps/chosen": -307.578369140625, + "logps/rejected": -280.45294189453125, + "loss": 0.1877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2797574996948242, + "rewards/margins": 3.3237645626068115, + "rewards/rejected": -3.0440073013305664, + "step": 3832 + }, + { + "epoch": 0.44, + "learning_rate": 1.6999882945101251e-07, + "logits/chosen": -3.2776849269866943, + "logits/rejected": -3.784109115600586, + "logps/chosen": -178.92681884765625, + "logps/rejected": -177.3022003173828, + "loss": 0.4214, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17100583016872406, + "rewards/margins": 1.5860283374786377, + "rewards/rejected": -1.415022373199463, + "step": 3833 + }, + { + "epoch": 0.44, + "learning_rate": 1.6996371298138827e-07, + "logits/chosen": -2.627293586730957, + "logits/rejected": -2.6379499435424805, + "logps/chosen": -228.23289489746094, + "logps/rejected": -223.85821533203125, + "loss": 0.2454, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19243809580802917, + "rewards/margins": 1.8751585483551025, + "rewards/rejected": -1.682720422744751, + "step": 3834 + }, + { + "epoch": 0.44, + "learning_rate": 1.69928596511764e-07, + "logits/chosen": -2.817966938018799, + "logits/rejected": -3.147529363632202, + "logps/chosen": -201.0763702392578, + "logps/rejected": -159.84628295898438, + "loss": 0.6622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06648050248622894, + "rewards/margins": 1.218529462814331, + "rewards/rejected": -1.2850098609924316, + "step": 3835 + }, + { + "epoch": 0.44, + "learning_rate": 1.6989348004213975e-07, + "logits/chosen": -2.8921432495117188, + "logits/rejected": -2.9727940559387207, + "logps/chosen": -406.6605529785156, + "logps/rejected": -414.479736328125, + "loss": 0.4338, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41246211528778076, + "rewards/margins": 1.2892175912857056, + "rewards/rejected": -0.87675541639328, + "step": 3836 + }, + { + "epoch": 0.44, + "learning_rate": 1.6985836357251553e-07, + "logits/chosen": -2.975193500518799, + "logits/rejected": -2.8656649589538574, + "logps/chosen": -373.5258483886719, + "logps/rejected": -229.15151977539062, + "loss": 0.4598, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2931094169616699, + "rewards/margins": 1.3585842847824097, + "rewards/rejected": -1.6516937017440796, + "step": 3837 + }, + { + "epoch": 0.44, + "learning_rate": 1.6982324710289123e-07, + "logits/chosen": -2.90350604057312, + "logits/rejected": -2.9982352256774902, + "logps/chosen": -224.6698760986328, + "logps/rejected": -192.93051147460938, + "loss": 0.3323, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3522271513938904, + "rewards/margins": 2.0066092014312744, + "rewards/rejected": -1.6543821096420288, + "step": 3838 + }, + { + "epoch": 0.44, + "learning_rate": 1.69788130633267e-07, + "logits/chosen": -3.630878210067749, + "logits/rejected": -3.901643753051758, + "logps/chosen": -105.46337890625, + "logps/rejected": -161.8000030517578, + "loss": 0.2042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4509018361568451, + "rewards/margins": 1.8903559446334839, + "rewards/rejected": -1.4394540786743164, + "step": 3839 + }, + { + "epoch": 0.44, + "learning_rate": 1.6975301416364274e-07, + "logits/chosen": -3.1081881523132324, + "logits/rejected": -3.775651454925537, + "logps/chosen": -216.453857421875, + "logps/rejected": -443.6360778808594, + "loss": 0.1411, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0730377659201622, + "rewards/margins": 3.4959614276885986, + "rewards/rejected": -3.4229235649108887, + "step": 3840 + }, + { + "epoch": 0.44, + "learning_rate": 1.697178976940185e-07, + "logits/chosen": -3.491589069366455, + "logits/rejected": -3.3765645027160645, + "logps/chosen": -180.77725219726562, + "logps/rejected": -236.9386749267578, + "loss": 0.3513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21217359602451324, + "rewards/margins": 1.8032169342041016, + "rewards/rejected": -2.015390396118164, + "step": 3841 + }, + { + "epoch": 0.44, + "learning_rate": 1.6968278122439425e-07, + "logits/chosen": -3.5496749877929688, + "logits/rejected": -3.6957125663757324, + "logps/chosen": -237.6498260498047, + "logps/rejected": -195.25393676757812, + "loss": 0.411, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.012890934944152832, + "rewards/margins": 1.1408519744873047, + "rewards/rejected": -1.1279609203338623, + "step": 3842 + }, + { + "epoch": 0.44, + "learning_rate": 1.6964766475476997e-07, + "logits/chosen": -3.059424877166748, + "logits/rejected": -3.005371570587158, + "logps/chosen": -257.50933837890625, + "logps/rejected": -247.61671447753906, + "loss": 0.2584, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2443234771490097, + "rewards/margins": 2.3234567642211914, + "rewards/rejected": -2.5677804946899414, + "step": 3843 + }, + { + "epoch": 0.44, + "learning_rate": 1.6961254828514573e-07, + "logits/chosen": -2.8302178382873535, + "logits/rejected": -2.791989326477051, + "logps/chosen": -394.8868408203125, + "logps/rejected": -264.3414306640625, + "loss": 0.4039, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48508021235466003, + "rewards/margins": 2.160410165786743, + "rewards/rejected": -1.6753300428390503, + "step": 3844 + }, + { + "epoch": 0.44, + "learning_rate": 1.6957743181552148e-07, + "logits/chosen": -2.563244342803955, + "logits/rejected": -2.798534393310547, + "logps/chosen": -514.3057861328125, + "logps/rejected": -412.5401611328125, + "loss": 0.3098, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3019489645957947, + "rewards/margins": 1.951155662536621, + "rewards/rejected": -1.6492067575454712, + "step": 3845 + }, + { + "epoch": 0.44, + "learning_rate": 1.695423153458972e-07, + "logits/chosen": -3.4920597076416016, + "logits/rejected": -3.453874349594116, + "logps/chosen": -240.20269775390625, + "logps/rejected": -175.48529052734375, + "loss": 0.439, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3508619964122772, + "rewards/margins": 1.9807889461517334, + "rewards/rejected": -1.6299270391464233, + "step": 3846 + }, + { + "epoch": 0.44, + "learning_rate": 1.6950719887627296e-07, + "logits/chosen": -3.157660961151123, + "logits/rejected": -3.0012764930725098, + "logps/chosen": -172.97145080566406, + "logps/rejected": -186.09869384765625, + "loss": 0.4586, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1452174186706543, + "rewards/margins": 1.2179901599884033, + "rewards/rejected": -1.0727728605270386, + "step": 3847 + }, + { + "epoch": 0.44, + "learning_rate": 1.694720824066487e-07, + "logits/chosen": -2.7757856845855713, + "logits/rejected": -2.6125760078430176, + "logps/chosen": -270.7652587890625, + "logps/rejected": -194.69570922851562, + "loss": 0.4561, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46264350414276123, + "rewards/margins": 0.8285222053527832, + "rewards/rejected": -1.2911657094955444, + "step": 3848 + }, + { + "epoch": 0.44, + "learning_rate": 1.6943696593702444e-07, + "logits/chosen": -2.099597692489624, + "logits/rejected": -2.2245240211486816, + "logps/chosen": -294.6419982910156, + "logps/rejected": -258.6245422363281, + "loss": 0.2524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17028126120567322, + "rewards/margins": 1.5689235925674438, + "rewards/rejected": -1.3986423015594482, + "step": 3849 + }, + { + "epoch": 0.44, + "learning_rate": 1.6940184946740022e-07, + "logits/chosen": -3.6705031394958496, + "logits/rejected": -3.3457181453704834, + "logps/chosen": -257.61151123046875, + "logps/rejected": -279.2810974121094, + "loss": 0.1477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.026253875344991684, + "rewards/margins": 3.354686737060547, + "rewards/rejected": -3.3809406757354736, + "step": 3850 + }, + { + "epoch": 0.44, + "learning_rate": 1.6936673299777595e-07, + "logits/chosen": -3.6294422149658203, + "logits/rejected": -3.568026542663574, + "logps/chosen": -229.396728515625, + "logps/rejected": -300.3705139160156, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2553942799568176, + "rewards/margins": 2.346906900405884, + "rewards/rejected": -2.6023013591766357, + "step": 3851 + }, + { + "epoch": 0.44, + "learning_rate": 1.693316165281517e-07, + "logits/chosen": -2.568222761154175, + "logits/rejected": -2.519191265106201, + "logps/chosen": -364.44183349609375, + "logps/rejected": -284.93359375, + "loss": 0.6937, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7436318397521973, + "rewards/margins": 0.12356360256671906, + "rewards/rejected": -0.8671954274177551, + "step": 3852 + }, + { + "epoch": 0.44, + "learning_rate": 1.6929650005852746e-07, + "logits/chosen": -2.6728463172912598, + "logits/rejected": -3.287393093109131, + "logps/chosen": -207.31398010253906, + "logps/rejected": -299.4951171875, + "loss": 0.2872, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11166258156299591, + "rewards/margins": 1.8056936264038086, + "rewards/rejected": -1.6940311193466187, + "step": 3853 + }, + { + "epoch": 0.44, + "learning_rate": 1.6926138358890319e-07, + "logits/chosen": -3.4130241870880127, + "logits/rejected": -3.267488956451416, + "logps/chosen": -164.45053100585938, + "logps/rejected": -152.47476196289062, + "loss": 0.4926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29558950662612915, + "rewards/margins": 1.0614149570465088, + "rewards/rejected": -1.3570042848587036, + "step": 3854 + }, + { + "epoch": 0.44, + "learning_rate": 1.6922626711927894e-07, + "logits/chosen": -2.845470428466797, + "logits/rejected": -2.8128538131713867, + "logps/chosen": -438.796630859375, + "logps/rejected": -431.63970947265625, + "loss": 0.2672, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06244960054755211, + "rewards/margins": 2.0924651622772217, + "rewards/rejected": -2.030015468597412, + "step": 3855 + }, + { + "epoch": 0.44, + "learning_rate": 1.6919115064965467e-07, + "logits/chosen": -3.7588729858398438, + "logits/rejected": -3.5088040828704834, + "logps/chosen": -326.962158203125, + "logps/rejected": -241.14700317382812, + "loss": 0.2859, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3627627491950989, + "rewards/margins": 1.9224705696105957, + "rewards/rejected": -1.5597076416015625, + "step": 3856 + }, + { + "epoch": 0.44, + "learning_rate": 1.6915603418003042e-07, + "logits/chosen": -4.22706413269043, + "logits/rejected": -3.7585716247558594, + "logps/chosen": -417.4139099121094, + "logps/rejected": -280.3055419921875, + "loss": 0.2542, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13051772117614746, + "rewards/margins": 1.832986831665039, + "rewards/rejected": -1.7024691104888916, + "step": 3857 + }, + { + "epoch": 0.44, + "learning_rate": 1.6912091771040617e-07, + "logits/chosen": -3.067542552947998, + "logits/rejected": -3.0553903579711914, + "logps/chosen": -269.66033935546875, + "logps/rejected": -316.33294677734375, + "loss": 0.6232, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5500885248184204, + "rewards/margins": 1.5749268531799316, + "rewards/rejected": -2.1250152587890625, + "step": 3858 + }, + { + "epoch": 0.44, + "learning_rate": 1.690858012407819e-07, + "logits/chosen": -2.6001038551330566, + "logits/rejected": -2.756808280944824, + "logps/chosen": -672.947998046875, + "logps/rejected": -285.09002685546875, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5256681442260742, + "rewards/margins": 1.3180114030838013, + "rewards/rejected": -0.7923431992530823, + "step": 3859 + }, + { + "epoch": 0.44, + "learning_rate": 1.6905068477115768e-07, + "logits/chosen": -3.079257011413574, + "logits/rejected": -3.0342655181884766, + "logps/chosen": -92.11864471435547, + "logps/rejected": -99.0540771484375, + "loss": 0.5902, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11408732086420059, + "rewards/margins": 0.44643402099609375, + "rewards/rejected": -0.5605213642120361, + "step": 3860 + }, + { + "epoch": 0.45, + "learning_rate": 1.6901556830153338e-07, + "logits/chosen": -2.578151226043701, + "logits/rejected": -2.549561023712158, + "logps/chosen": -250.04710388183594, + "logps/rejected": -312.9638366699219, + "loss": 0.1119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6282626390457153, + "rewards/margins": 3.3821964263916016, + "rewards/rejected": -2.753933906555176, + "step": 3861 + }, + { + "epoch": 0.45, + "learning_rate": 1.6898045183190916e-07, + "logits/chosen": -3.592589855194092, + "logits/rejected": -3.6364269256591797, + "logps/chosen": -145.2577667236328, + "logps/rejected": -207.04232788085938, + "loss": 0.2612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3064040243625641, + "rewards/margins": 1.6719707250595093, + "rewards/rejected": -1.978374719619751, + "step": 3862 + }, + { + "epoch": 0.45, + "learning_rate": 1.6894533536228492e-07, + "logits/chosen": -2.8985047340393066, + "logits/rejected": -2.9390482902526855, + "logps/chosen": -288.90386962890625, + "logps/rejected": -330.4521484375, + "loss": 0.7616, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4438968598842621, + "rewards/margins": 0.5341230630874634, + "rewards/rejected": -0.9780200123786926, + "step": 3863 + }, + { + "epoch": 0.45, + "learning_rate": 1.6891021889266065e-07, + "logits/chosen": -2.296065330505371, + "logits/rejected": -2.7112064361572266, + "logps/chosen": -158.19680786132812, + "logps/rejected": -139.00413513183594, + "loss": 0.5802, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5655057430267334, + "rewards/margins": 0.5407572984695435, + "rewards/rejected": -1.1062629222869873, + "step": 3864 + }, + { + "epoch": 0.45, + "learning_rate": 1.688751024230364e-07, + "logits/chosen": -2.9475653171539307, + "logits/rejected": -3.011859893798828, + "logps/chosen": -296.60809326171875, + "logps/rejected": -405.88623046875, + "loss": 0.1359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2793511152267456, + "rewards/margins": 3.616455078125, + "rewards/rejected": -3.895806312561035, + "step": 3865 + }, + { + "epoch": 0.45, + "learning_rate": 1.6883998595341215e-07, + "logits/chosen": -3.0542092323303223, + "logits/rejected": -3.1475327014923096, + "logps/chosen": -122.2643051147461, + "logps/rejected": -130.79351806640625, + "loss": 0.4921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11073818057775497, + "rewards/margins": 0.8513396978378296, + "rewards/rejected": -0.7406014800071716, + "step": 3866 + }, + { + "epoch": 0.45, + "learning_rate": 1.6880486948378788e-07, + "logits/chosen": -3.76084303855896, + "logits/rejected": -3.6724438667297363, + "logps/chosen": -266.74346923828125, + "logps/rejected": -307.8699645996094, + "loss": 0.3991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27069538831710815, + "rewards/margins": 1.9395934343338013, + "rewards/rejected": -2.2102887630462646, + "step": 3867 + }, + { + "epoch": 0.45, + "learning_rate": 1.6876975301416363e-07, + "logits/chosen": -3.8395590782165527, + "logits/rejected": -3.994643449783325, + "logps/chosen": -181.98068237304688, + "logps/rejected": -325.96002197265625, + "loss": 0.8344, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1679760068655014, + "rewards/margins": 0.7577130794525146, + "rewards/rejected": -0.589737057685852, + "step": 3868 + }, + { + "epoch": 0.45, + "learning_rate": 1.6873463654453936e-07, + "logits/chosen": -2.4880287647247314, + "logits/rejected": -2.7334797382354736, + "logps/chosen": -230.49827575683594, + "logps/rejected": -330.4295654296875, + "loss": 0.4923, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4239642024040222, + "rewards/margins": 1.0541129112243652, + "rewards/rejected": -1.4780771732330322, + "step": 3869 + }, + { + "epoch": 0.45, + "learning_rate": 1.6869952007491512e-07, + "logits/chosen": -3.1235508918762207, + "logits/rejected": -3.05199933052063, + "logps/chosen": -284.97320556640625, + "logps/rejected": -336.5337219238281, + "loss": 0.4886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15582121908664703, + "rewards/margins": 1.5183848142623901, + "rewards/rejected": -1.674206018447876, + "step": 3870 + }, + { + "epoch": 0.45, + "learning_rate": 1.686644036052909e-07, + "logits/chosen": -2.8333353996276855, + "logits/rejected": -2.749884605407715, + "logps/chosen": -173.58392333984375, + "logps/rejected": -189.92547607421875, + "loss": 0.4994, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.776189386844635, + "rewards/margins": 1.169601321220398, + "rewards/rejected": -1.9457906484603882, + "step": 3871 + }, + { + "epoch": 0.45, + "learning_rate": 1.686292871356666e-07, + "logits/chosen": -2.709059238433838, + "logits/rejected": -2.7070860862731934, + "logps/chosen": -246.8770751953125, + "logps/rejected": -321.349853515625, + "loss": 0.1913, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.616919994354248, + "rewards/margins": 2.304532766342163, + "rewards/rejected": -1.687612771987915, + "step": 3872 + }, + { + "epoch": 0.45, + "learning_rate": 1.6859417066604238e-07, + "logits/chosen": -3.0224175453186035, + "logits/rejected": -2.831676483154297, + "logps/chosen": -204.62814331054688, + "logps/rejected": -156.0582275390625, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12646663188934326, + "rewards/margins": 1.147215485572815, + "rewards/rejected": -1.0207488536834717, + "step": 3873 + }, + { + "epoch": 0.45, + "learning_rate": 1.6855905419641813e-07, + "logits/chosen": -3.748628854751587, + "logits/rejected": -4.007974147796631, + "logps/chosen": -160.3075408935547, + "logps/rejected": -169.16030883789062, + "loss": 0.295, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2671356797218323, + "rewards/margins": 2.5194919109344482, + "rewards/rejected": -2.2523562908172607, + "step": 3874 + }, + { + "epoch": 0.45, + "learning_rate": 1.6852393772679386e-07, + "logits/chosen": -3.393664598464966, + "logits/rejected": -3.315054178237915, + "logps/chosen": -265.0625915527344, + "logps/rejected": -269.63140869140625, + "loss": 0.3293, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18085436522960663, + "rewards/margins": 3.3327484130859375, + "rewards/rejected": -3.1518940925598145, + "step": 3875 + }, + { + "epoch": 0.45, + "learning_rate": 1.684888212571696e-07, + "logits/chosen": -2.6979892253875732, + "logits/rejected": -2.4428157806396484, + "logps/chosen": -415.4053955078125, + "logps/rejected": -311.0868225097656, + "loss": 0.2981, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17243170738220215, + "rewards/margins": 2.718362808227539, + "rewards/rejected": -2.545931100845337, + "step": 3876 + }, + { + "epoch": 0.45, + "learning_rate": 1.6845370478754534e-07, + "logits/chosen": -3.24533748626709, + "logits/rejected": -3.1194043159484863, + "logps/chosen": -291.28363037109375, + "logps/rejected": -500.8436279296875, + "loss": 0.6475, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15035519003868103, + "rewards/margins": 0.9167739152908325, + "rewards/rejected": -1.067129135131836, + "step": 3877 + }, + { + "epoch": 0.45, + "learning_rate": 1.684185883179211e-07, + "logits/chosen": -3.4611239433288574, + "logits/rejected": -3.0433244705200195, + "logps/chosen": -315.01666259765625, + "logps/rejected": -246.11155700683594, + "loss": 0.6005, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2760579586029053, + "rewards/margins": 0.6002941131591797, + "rewards/rejected": -0.8763521909713745, + "step": 3878 + }, + { + "epoch": 0.45, + "learning_rate": 1.6838347184829685e-07, + "logits/chosen": -2.6113336086273193, + "logits/rejected": -2.670722484588623, + "logps/chosen": -262.0046691894531, + "logps/rejected": -223.1996307373047, + "loss": 0.562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37164202332496643, + "rewards/margins": 0.8300586342811584, + "rewards/rejected": -1.2017008066177368, + "step": 3879 + }, + { + "epoch": 0.45, + "learning_rate": 1.6834835537867257e-07, + "logits/chosen": -3.342900276184082, + "logits/rejected": -3.673882484436035, + "logps/chosen": -169.36630249023438, + "logps/rejected": -151.51882934570312, + "loss": 0.4471, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19415529072284698, + "rewards/margins": 0.9486415386199951, + "rewards/rejected": -1.1427967548370361, + "step": 3880 + }, + { + "epoch": 0.45, + "learning_rate": 1.6831323890904833e-07, + "logits/chosen": -2.229797840118408, + "logits/rejected": -2.537360668182373, + "logps/chosen": -399.0658264160156, + "logps/rejected": -352.37896728515625, + "loss": 0.1668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3610866069793701, + "rewards/margins": 2.1801562309265137, + "rewards/rejected": -1.819069743156433, + "step": 3881 + }, + { + "epoch": 0.45, + "learning_rate": 1.682781224394241e-07, + "logits/chosen": -3.6444664001464844, + "logits/rejected": -3.378690242767334, + "logps/chosen": -347.6923828125, + "logps/rejected": -311.1700439453125, + "loss": 0.4543, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23901455104351044, + "rewards/margins": 1.8777564764022827, + "rewards/rejected": -2.1167712211608887, + "step": 3882 + }, + { + "epoch": 0.45, + "learning_rate": 1.682430059697998e-07, + "logits/chosen": -2.93082857131958, + "logits/rejected": -2.772521495819092, + "logps/chosen": -240.93133544921875, + "logps/rejected": -211.7318878173828, + "loss": 0.4091, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5353559851646423, + "rewards/margins": 1.1363091468811035, + "rewards/rejected": -1.6716651916503906, + "step": 3883 + }, + { + "epoch": 0.45, + "learning_rate": 1.682078895001756e-07, + "logits/chosen": -2.586240291595459, + "logits/rejected": -2.949476957321167, + "logps/chosen": -193.0887908935547, + "logps/rejected": -221.45816040039062, + "loss": 0.5653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1109359860420227, + "rewards/margins": 2.0748164653778076, + "rewards/rejected": -2.1857523918151855, + "step": 3884 + }, + { + "epoch": 0.45, + "learning_rate": 1.6817277303055132e-07, + "logits/chosen": -3.2292394638061523, + "logits/rejected": -3.2614524364471436, + "logps/chosen": -588.3258056640625, + "logps/rejected": -325.7349853515625, + "loss": 0.3944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2567040026187897, + "rewards/margins": 1.66847825050354, + "rewards/rejected": -1.9251822233200073, + "step": 3885 + }, + { + "epoch": 0.45, + "learning_rate": 1.6813765656092707e-07, + "logits/chosen": -3.5593667030334473, + "logits/rejected": -3.1227147579193115, + "logps/chosen": -325.40478515625, + "logps/rejected": -273.6921691894531, + "loss": 0.205, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10359519720077515, + "rewards/margins": 2.1111083030700684, + "rewards/rejected": -2.0075132846832275, + "step": 3886 + }, + { + "epoch": 0.45, + "learning_rate": 1.6810254009130283e-07, + "logits/chosen": -2.8436498641967773, + "logits/rejected": -3.0519158840179443, + "logps/chosen": -146.1928253173828, + "logps/rejected": -228.0513916015625, + "loss": 0.491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.721136212348938, + "rewards/margins": 1.5719950199127197, + "rewards/rejected": -2.2931313514709473, + "step": 3887 + }, + { + "epoch": 0.45, + "learning_rate": 1.6806742362167855e-07, + "logits/chosen": -3.2631518840789795, + "logits/rejected": -3.4329028129577637, + "logps/chosen": -312.8166198730469, + "logps/rejected": -350.3734130859375, + "loss": 0.2319, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.586063802242279, + "rewards/margins": 2.6200506687164307, + "rewards/rejected": -2.033986806869507, + "step": 3888 + }, + { + "epoch": 0.45, + "learning_rate": 1.680323071520543e-07, + "logits/chosen": -3.1916112899780273, + "logits/rejected": -3.44156551361084, + "logps/chosen": -384.52398681640625, + "logps/rejected": -255.17474365234375, + "loss": 0.6373, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17399150133132935, + "rewards/margins": 1.3684548139572144, + "rewards/rejected": -1.1944632530212402, + "step": 3889 + }, + { + "epoch": 0.45, + "learning_rate": 1.6799719068243006e-07, + "logits/chosen": -3.2748184204101562, + "logits/rejected": -3.3138275146484375, + "logps/chosen": -188.09335327148438, + "logps/rejected": -154.2055206298828, + "loss": 0.3981, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.017880067229270935, + "rewards/margins": 1.4722833633422852, + "rewards/rejected": -1.4544031620025635, + "step": 3890 + }, + { + "epoch": 0.45, + "learning_rate": 1.679620742128058e-07, + "logits/chosen": -3.231192111968994, + "logits/rejected": -3.3155622482299805, + "logps/chosen": -136.6150360107422, + "logps/rejected": -130.3331298828125, + "loss": 0.5179, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5456979870796204, + "rewards/margins": 0.5372752547264099, + "rewards/rejected": -1.0829732418060303, + "step": 3891 + }, + { + "epoch": 0.45, + "learning_rate": 1.6792695774318154e-07, + "logits/chosen": -3.229435920715332, + "logits/rejected": -3.3020009994506836, + "logps/chosen": -58.47271728515625, + "logps/rejected": -134.1791229248047, + "loss": 0.5248, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.023945041000843048, + "rewards/margins": 0.8842027187347412, + "rewards/rejected": -0.9081476926803589, + "step": 3892 + }, + { + "epoch": 0.45, + "learning_rate": 1.6789184127355727e-07, + "logits/chosen": -3.7879343032836914, + "logits/rejected": -3.8460826873779297, + "logps/chosen": -201.2899932861328, + "logps/rejected": -136.4763946533203, + "loss": 0.416, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08611539006233215, + "rewards/margins": 1.3985284566879272, + "rewards/rejected": -1.4846436977386475, + "step": 3893 + }, + { + "epoch": 0.45, + "learning_rate": 1.6785672480393305e-07, + "logits/chosen": -3.2066304683685303, + "logits/rejected": -2.967660903930664, + "logps/chosen": -335.3586120605469, + "logps/rejected": -251.68028259277344, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28362923860549927, + "rewards/margins": 1.8555078506469727, + "rewards/rejected": -1.5718786716461182, + "step": 3894 + }, + { + "epoch": 0.45, + "learning_rate": 1.678216083343088e-07, + "logits/chosen": -3.0254433155059814, + "logits/rejected": -2.952657699584961, + "logps/chosen": -310.55615234375, + "logps/rejected": -209.0472412109375, + "loss": 0.321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11981107294559479, + "rewards/margins": 1.2418019771575928, + "rewards/rejected": -1.1219907999038696, + "step": 3895 + }, + { + "epoch": 0.45, + "learning_rate": 1.6778649186468453e-07, + "logits/chosen": -3.44158673286438, + "logits/rejected": -3.160054922103882, + "logps/chosen": -183.380615234375, + "logps/rejected": -212.17001342773438, + "loss": 0.1761, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22255712747573853, + "rewards/margins": 2.851637363433838, + "rewards/rejected": -3.0741944313049316, + "step": 3896 + }, + { + "epoch": 0.45, + "learning_rate": 1.6775137539506028e-07, + "logits/chosen": -2.9350032806396484, + "logits/rejected": -3.064073085784912, + "logps/chosen": -348.854736328125, + "logps/rejected": -291.3142395019531, + "loss": 0.7842, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7101092338562012, + "rewards/margins": 0.6592597961425781, + "rewards/rejected": -1.3693690299987793, + "step": 3897 + }, + { + "epoch": 0.45, + "learning_rate": 1.6771625892543604e-07, + "logits/chosen": -3.0275933742523193, + "logits/rejected": -2.914254665374756, + "logps/chosen": -424.3076171875, + "logps/rejected": -496.337158203125, + "loss": 1.2405, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0633334070444107, + "rewards/margins": -0.43494293093681335, + "rewards/rejected": 0.49827635288238525, + "step": 3898 + }, + { + "epoch": 0.45, + "learning_rate": 1.6768114245581177e-07, + "logits/chosen": -2.8038079738616943, + "logits/rejected": -2.9028995037078857, + "logps/chosen": -177.37120056152344, + "logps/rejected": -194.94635009765625, + "loss": 0.3249, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14306241273880005, + "rewards/margins": 2.216038227081299, + "rewards/rejected": -2.359100341796875, + "step": 3899 + }, + { + "epoch": 0.45, + "learning_rate": 1.6764602598618752e-07, + "logits/chosen": -3.2047643661499023, + "logits/rejected": -3.1078362464904785, + "logps/chosen": -280.8101806640625, + "logps/rejected": -271.61334228515625, + "loss": 0.4999, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.056729406118392944, + "rewards/margins": 1.1859720945358276, + "rewards/rejected": -1.1292426586151123, + "step": 3900 + }, + { + "epoch": 0.45, + "learning_rate": 1.6761090951656325e-07, + "logits/chosen": -3.1043453216552734, + "logits/rejected": -3.3430185317993164, + "logps/chosen": -103.40493774414062, + "logps/rejected": -162.62950134277344, + "loss": 0.2771, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16949604451656342, + "rewards/margins": 2.740267515182495, + "rewards/rejected": -2.5707716941833496, + "step": 3901 + }, + { + "epoch": 0.45, + "learning_rate": 1.67575793046939e-07, + "logits/chosen": -2.845283031463623, + "logits/rejected": -2.888688325881958, + "logps/chosen": -381.3897705078125, + "logps/rejected": -272.5105895996094, + "loss": 0.3588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34148579835891724, + "rewards/margins": 1.7198735475540161, + "rewards/rejected": -2.061359167098999, + "step": 3902 + }, + { + "epoch": 0.45, + "learning_rate": 1.6754067657731475e-07, + "logits/chosen": -3.615793228149414, + "logits/rejected": -3.4860551357269287, + "logps/chosen": -252.29299926757812, + "logps/rejected": -320.9788818359375, + "loss": 0.3775, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19172416627407074, + "rewards/margins": 1.9543359279632568, + "rewards/rejected": -2.146059989929199, + "step": 3903 + }, + { + "epoch": 0.45, + "learning_rate": 1.6750556010769048e-07, + "logits/chosen": -2.899372100830078, + "logits/rejected": -2.77384614944458, + "logps/chosen": -220.14102172851562, + "logps/rejected": -256.169677734375, + "loss": 0.8327, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07914219796657562, + "rewards/margins": 0.9766309857368469, + "rewards/rejected": -0.8974887132644653, + "step": 3904 + }, + { + "epoch": 0.45, + "learning_rate": 1.6747044363806626e-07, + "logits/chosen": -4.083298206329346, + "logits/rejected": -3.6591484546661377, + "logps/chosen": -311.29107666015625, + "logps/rejected": -143.20938110351562, + "loss": 0.7459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8008547425270081, + "rewards/margins": 0.6739466190338135, + "rewards/rejected": -1.4748014211654663, + "step": 3905 + }, + { + "epoch": 0.45, + "learning_rate": 1.6743532716844196e-07, + "logits/chosen": -3.1859443187713623, + "logits/rejected": -3.48614239692688, + "logps/chosen": -294.55657958984375, + "logps/rejected": -483.15631103515625, + "loss": 0.7159, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4383498430252075, + "rewards/margins": 0.2848326563835144, + "rewards/rejected": -0.7231824398040771, + "step": 3906 + }, + { + "epoch": 0.45, + "learning_rate": 1.6740021069881774e-07, + "logits/chosen": -3.3163130283355713, + "logits/rejected": -3.3488364219665527, + "logps/chosen": -263.12335205078125, + "logps/rejected": -300.5025634765625, + "loss": 0.4234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7805500030517578, + "rewards/margins": 1.705501675605774, + "rewards/rejected": -2.486051559448242, + "step": 3907 + }, + { + "epoch": 0.45, + "learning_rate": 1.673650942291935e-07, + "logits/chosen": -2.5484869480133057, + "logits/rejected": -2.5810928344726562, + "logps/chosen": -219.32505798339844, + "logps/rejected": -256.1408996582031, + "loss": 0.3058, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08273406326770782, + "rewards/margins": 2.1331024169921875, + "rewards/rejected": -2.215836524963379, + "step": 3908 + }, + { + "epoch": 0.45, + "learning_rate": 1.6732997775956922e-07, + "logits/chosen": -3.905609607696533, + "logits/rejected": -3.7289156913757324, + "logps/chosen": -245.06663513183594, + "logps/rejected": -197.56292724609375, + "loss": 0.4261, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04866520315408707, + "rewards/margins": 1.2163619995117188, + "rewards/rejected": -1.1676968336105347, + "step": 3909 + }, + { + "epoch": 0.45, + "learning_rate": 1.6729486128994498e-07, + "logits/chosen": -4.019491672515869, + "logits/rejected": -3.8270466327667236, + "logps/chosen": -243.4026641845703, + "logps/rejected": -196.89645385742188, + "loss": 0.4203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.042392242699861526, + "rewards/margins": 1.3091219663619995, + "rewards/rejected": -1.3515143394470215, + "step": 3910 + }, + { + "epoch": 0.45, + "learning_rate": 1.6725974482032073e-07, + "logits/chosen": -2.494875431060791, + "logits/rejected": -2.664867639541626, + "logps/chosen": -690.2308349609375, + "logps/rejected": -518.0101318359375, + "loss": 0.6143, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18021927773952484, + "rewards/margins": 1.3277764320373535, + "rewards/rejected": -1.147557258605957, + "step": 3911 + }, + { + "epoch": 0.45, + "learning_rate": 1.6722462835069646e-07, + "logits/chosen": -2.5157837867736816, + "logits/rejected": -2.8236405849456787, + "logps/chosen": -202.26364135742188, + "logps/rejected": -242.93899536132812, + "loss": 0.405, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08699209243059158, + "rewards/margins": 1.2268575429916382, + "rewards/rejected": -1.3138495683670044, + "step": 3912 + }, + { + "epoch": 0.45, + "learning_rate": 1.6718951188107221e-07, + "logits/chosen": -3.0516393184661865, + "logits/rejected": -3.5533287525177, + "logps/chosen": -200.78036499023438, + "logps/rejected": -245.96046447753906, + "loss": 0.2833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24726897478103638, + "rewards/margins": 2.300065755844116, + "rewards/rejected": -2.547334909439087, + "step": 3913 + }, + { + "epoch": 0.45, + "learning_rate": 1.6715439541144794e-07, + "logits/chosen": -2.1686410903930664, + "logits/rejected": -2.3424363136291504, + "logps/chosen": -281.7262878417969, + "logps/rejected": -240.1151123046875, + "loss": 0.4651, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06388819217681885, + "rewards/margins": 1.3189105987548828, + "rewards/rejected": -1.382798671722412, + "step": 3914 + }, + { + "epoch": 0.45, + "learning_rate": 1.671192789418237e-07, + "logits/chosen": -3.195685863494873, + "logits/rejected": -2.894683837890625, + "logps/chosen": -277.0578918457031, + "logps/rejected": -192.98788452148438, + "loss": 0.5809, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5015321969985962, + "rewards/margins": 0.7417024374008179, + "rewards/rejected": -1.243234634399414, + "step": 3915 + }, + { + "epoch": 0.45, + "learning_rate": 1.6708416247219948e-07, + "logits/chosen": -2.842817783355713, + "logits/rejected": -2.7137913703918457, + "logps/chosen": -323.8084716796875, + "logps/rejected": -425.027587890625, + "loss": 0.2629, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0037068650126457214, + "rewards/margins": 2.248180389404297, + "rewards/rejected": -2.244473695755005, + "step": 3916 + }, + { + "epoch": 0.45, + "learning_rate": 1.6704904600257518e-07, + "logits/chosen": -2.7317750453948975, + "logits/rejected": -2.9263811111450195, + "logps/chosen": -387.1406555175781, + "logps/rejected": -326.827880859375, + "loss": 0.2967, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5465068817138672, + "rewards/margins": 1.5278393030166626, + "rewards/rejected": -0.9813324809074402, + "step": 3917 + }, + { + "epoch": 0.45, + "learning_rate": 1.6701392953295096e-07, + "logits/chosen": -3.2112770080566406, + "logits/rejected": -3.1896276473999023, + "logps/chosen": -203.7079620361328, + "logps/rejected": -225.31387329101562, + "loss": 0.4817, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6842728853225708, + "rewards/margins": 0.9469429850578308, + "rewards/rejected": -1.6312158107757568, + "step": 3918 + }, + { + "epoch": 0.45, + "learning_rate": 1.669788130633267e-07, + "logits/chosen": -3.175036907196045, + "logits/rejected": -3.582202196121216, + "logps/chosen": -174.6392822265625, + "logps/rejected": -273.2793273925781, + "loss": 0.2244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007361330091953278, + "rewards/margins": 3.4077954292297363, + "rewards/rejected": -3.4004340171813965, + "step": 3919 + }, + { + "epoch": 0.45, + "learning_rate": 1.6694369659370244e-07, + "logits/chosen": -2.888916015625, + "logits/rejected": -2.753185272216797, + "logps/chosen": -143.0266571044922, + "logps/rejected": -130.4777069091797, + "loss": 0.4428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18790292739868164, + "rewards/margins": 0.9244746565818787, + "rewards/rejected": -1.112377643585205, + "step": 3920 + }, + { + "epoch": 0.45, + "learning_rate": 1.669085801240782e-07, + "logits/chosen": -3.3482506275177, + "logits/rejected": -3.5022435188293457, + "logps/chosen": -181.4210205078125, + "logps/rejected": -242.97943115234375, + "loss": 0.2461, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1498022973537445, + "rewards/margins": 1.8597441911697388, + "rewards/rejected": -1.7099418640136719, + "step": 3921 + }, + { + "epoch": 0.45, + "learning_rate": 1.6687346365445392e-07, + "logits/chosen": -2.97554874420166, + "logits/rejected": -3.2131731510162354, + "logps/chosen": -319.1679992675781, + "logps/rejected": -334.4341125488281, + "loss": 0.6327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3946351110935211, + "rewards/margins": 1.6252377033233643, + "rewards/rejected": -2.0198729038238525, + "step": 3922 + }, + { + "epoch": 0.45, + "learning_rate": 1.6683834718482967e-07, + "logits/chosen": -2.7938454151153564, + "logits/rejected": -2.641062021255493, + "logps/chosen": -157.29443359375, + "logps/rejected": -192.3769989013672, + "loss": 0.5457, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17077118158340454, + "rewards/margins": 1.4029216766357422, + "rewards/rejected": -1.5736926794052124, + "step": 3923 + }, + { + "epoch": 0.45, + "learning_rate": 1.6680323071520543e-07, + "logits/chosen": -2.5717058181762695, + "logits/rejected": -2.475672483444214, + "logps/chosen": -217.38052368164062, + "logps/rejected": -259.3857116699219, + "loss": 0.149, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34016233682632446, + "rewards/margins": 2.8364720344543457, + "rewards/rejected": -2.496309995651245, + "step": 3924 + }, + { + "epoch": 0.45, + "learning_rate": 1.6676811424558115e-07, + "logits/chosen": -3.6335225105285645, + "logits/rejected": -4.148991584777832, + "logps/chosen": -205.16958618164062, + "logps/rejected": -219.7207794189453, + "loss": 0.5761, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6447200179100037, + "rewards/margins": 0.5442485809326172, + "rewards/rejected": -1.1889686584472656, + "step": 3925 + }, + { + "epoch": 0.45, + "learning_rate": 1.667329977759569e-07, + "logits/chosen": -2.8978145122528076, + "logits/rejected": -3.1460652351379395, + "logps/chosen": -190.1858367919922, + "logps/rejected": -209.722900390625, + "loss": 0.436, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42125946283340454, + "rewards/margins": 2.1215572357177734, + "rewards/rejected": -2.5428171157836914, + "step": 3926 + }, + { + "epoch": 0.45, + "learning_rate": 1.666978813063327e-07, + "logits/chosen": -2.9563565254211426, + "logits/rejected": -3.01013445854187, + "logps/chosen": -294.2347717285156, + "logps/rejected": -262.5138854980469, + "loss": 0.3021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2665388286113739, + "rewards/margins": 1.7277708053588867, + "rewards/rejected": -1.994309663772583, + "step": 3927 + }, + { + "epoch": 0.45, + "learning_rate": 1.6666276483670842e-07, + "logits/chosen": -2.7254207134246826, + "logits/rejected": -2.772742986679077, + "logps/chosen": -312.483154296875, + "logps/rejected": -288.9871826171875, + "loss": 0.3681, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15272855758666992, + "rewards/margins": 2.0317296981811523, + "rewards/rejected": -2.1844582557678223, + "step": 3928 + }, + { + "epoch": 0.45, + "learning_rate": 1.6662764836708417e-07, + "logits/chosen": -3.1562881469726562, + "logits/rejected": -2.9972753524780273, + "logps/chosen": -246.06527709960938, + "logps/rejected": -275.90057373046875, + "loss": 0.3011, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2586939334869385, + "rewards/margins": 1.503697156906128, + "rewards/rejected": -1.2450032234191895, + "step": 3929 + }, + { + "epoch": 0.45, + "learning_rate": 1.665925318974599e-07, + "logits/chosen": -2.771390914916992, + "logits/rejected": -2.81821346282959, + "logps/chosen": -344.42205810546875, + "logps/rejected": -197.8970947265625, + "loss": 0.3047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.540795087814331, + "rewards/margins": 1.5866599082946777, + "rewards/rejected": -1.0458648204803467, + "step": 3930 + }, + { + "epoch": 0.45, + "learning_rate": 1.6655741542783565e-07, + "logits/chosen": -2.9273390769958496, + "logits/rejected": -2.957430839538574, + "logps/chosen": -203.44021606445312, + "logps/rejected": -254.94305419921875, + "loss": 0.588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8859602212905884, + "rewards/margins": 0.831865668296814, + "rewards/rejected": -1.7178257703781128, + "step": 3931 + }, + { + "epoch": 0.45, + "learning_rate": 1.665222989582114e-07, + "logits/chosen": -3.131474494934082, + "logits/rejected": -3.11191463470459, + "logps/chosen": -226.9496612548828, + "logps/rejected": -330.57122802734375, + "loss": 0.4189, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3152734041213989, + "rewards/margins": 1.7751468420028687, + "rewards/rejected": -2.0904202461242676, + "step": 3932 + }, + { + "epoch": 0.45, + "learning_rate": 1.6648718248858713e-07, + "logits/chosen": -3.1802546977996826, + "logits/rejected": -3.330568790435791, + "logps/chosen": -196.8290252685547, + "logps/rejected": -195.74844360351562, + "loss": 0.369, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31033021211624146, + "rewards/margins": 1.2503724098205566, + "rewards/rejected": -1.5607025623321533, + "step": 3933 + }, + { + "epoch": 0.45, + "learning_rate": 1.6645206601896289e-07, + "logits/chosen": -3.4656312465667725, + "logits/rejected": -3.6490557193756104, + "logps/chosen": -51.048744201660156, + "logps/rejected": -150.63478088378906, + "loss": 0.2075, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5096320509910583, + "rewards/margins": 2.744006872177124, + "rewards/rejected": -2.234375, + "step": 3934 + }, + { + "epoch": 0.45, + "learning_rate": 1.6641694954933864e-07, + "logits/chosen": -3.1361091136932373, + "logits/rejected": -3.2533063888549805, + "logps/chosen": -229.19491577148438, + "logps/rejected": -198.26638793945312, + "loss": 0.5645, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7726982831954956, + "rewards/margins": 0.8672773838043213, + "rewards/rejected": -1.6399755477905273, + "step": 3935 + }, + { + "epoch": 0.45, + "learning_rate": 1.6638183307971437e-07, + "logits/chosen": -3.426068067550659, + "logits/rejected": -3.300900459289551, + "logps/chosen": -243.7493896484375, + "logps/rejected": -140.19775390625, + "loss": 0.9715, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14539562165737152, + "rewards/margins": 0.06436687707901001, + "rewards/rejected": -0.20976249873638153, + "step": 3936 + }, + { + "epoch": 0.45, + "learning_rate": 1.6634671661009012e-07, + "logits/chosen": -2.2964227199554443, + "logits/rejected": -2.6459438800811768, + "logps/chosen": -329.76141357421875, + "logps/rejected": -274.14044189453125, + "loss": 0.516, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08467893302440643, + "rewards/margins": 0.8177881836891174, + "rewards/rejected": -0.7331092357635498, + "step": 3937 + }, + { + "epoch": 0.45, + "learning_rate": 1.6631160014046585e-07, + "logits/chosen": -2.7670536041259766, + "logits/rejected": -2.8458662033081055, + "logps/chosen": -293.47601318359375, + "logps/rejected": -248.64146423339844, + "loss": 0.1357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4590204358100891, + "rewards/margins": 2.6637744903564453, + "rewards/rejected": -3.1227951049804688, + "step": 3938 + }, + { + "epoch": 0.45, + "learning_rate": 1.6627648367084163e-07, + "logits/chosen": -3.2104363441467285, + "logits/rejected": -3.38616943359375, + "logps/chosen": -193.8213653564453, + "logps/rejected": -208.594970703125, + "loss": 0.232, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.024422720074653625, + "rewards/margins": 2.3275668621063232, + "rewards/rejected": -2.351989507675171, + "step": 3939 + }, + { + "epoch": 0.45, + "learning_rate": 1.6624136720121738e-07, + "logits/chosen": -2.6745595932006836, + "logits/rejected": -2.9450459480285645, + "logps/chosen": -506.9779357910156, + "logps/rejected": -217.281494140625, + "loss": 0.2326, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2863805592060089, + "rewards/margins": 2.1853699684143066, + "rewards/rejected": -1.8989893198013306, + "step": 3940 + }, + { + "epoch": 0.45, + "learning_rate": 1.662062507315931e-07, + "logits/chosen": -3.8072128295898438, + "logits/rejected": -3.6781773567199707, + "logps/chosen": -271.0982971191406, + "logps/rejected": -268.2368469238281, + "loss": 0.4746, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.314240038394928, + "rewards/margins": 1.5845011472702026, + "rewards/rejected": -1.8987411260604858, + "step": 3941 + }, + { + "epoch": 0.45, + "learning_rate": 1.6617113426196886e-07, + "logits/chosen": -3.32926869392395, + "logits/rejected": -3.3597798347473145, + "logps/chosen": -393.288818359375, + "logps/rejected": -378.1005859375, + "loss": 0.1798, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06532678753137589, + "rewards/margins": 3.001054286956787, + "rewards/rejected": -2.935727596282959, + "step": 3942 + }, + { + "epoch": 0.45, + "learning_rate": 1.6613601779234462e-07, + "logits/chosen": -3.183940887451172, + "logits/rejected": -3.24076509475708, + "logps/chosen": -261.2845458984375, + "logps/rejected": -234.1049346923828, + "loss": 0.138, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1516067534685135, + "rewards/margins": 2.7614076137542725, + "rewards/rejected": -2.6098008155822754, + "step": 3943 + }, + { + "epoch": 0.45, + "learning_rate": 1.6610090132272034e-07, + "logits/chosen": -3.7316646575927734, + "logits/rejected": -3.6260194778442383, + "logps/chosen": -311.6120910644531, + "logps/rejected": -317.6443786621094, + "loss": 0.3553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18183842301368713, + "rewards/margins": 1.6836862564086914, + "rewards/rejected": -1.8655246496200562, + "step": 3944 + }, + { + "epoch": 0.45, + "learning_rate": 1.660657848530961e-07, + "logits/chosen": -2.9080231189727783, + "logits/rejected": -2.743009090423584, + "logps/chosen": -424.9231872558594, + "logps/rejected": -264.76416015625, + "loss": 0.3021, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24496810138225555, + "rewards/margins": 2.410780906677246, + "rewards/rejected": -2.1658129692077637, + "step": 3945 + }, + { + "epoch": 0.45, + "learning_rate": 1.6603066838347183e-07, + "logits/chosen": -3.3465728759765625, + "logits/rejected": -3.4729700088500977, + "logps/chosen": -207.16937255859375, + "logps/rejected": -276.08050537109375, + "loss": 0.183, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33600008487701416, + "rewards/margins": 2.6388473510742188, + "rewards/rejected": -2.302847385406494, + "step": 3946 + }, + { + "epoch": 0.46, + "learning_rate": 1.6599555191384758e-07, + "logits/chosen": -3.1818604469299316, + "logits/rejected": -3.1019415855407715, + "logps/chosen": -206.70872497558594, + "logps/rejected": -292.5819091796875, + "loss": 0.8129, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7445281147956848, + "rewards/margins": 0.5674898624420166, + "rewards/rejected": -1.3120179176330566, + "step": 3947 + }, + { + "epoch": 0.46, + "learning_rate": 1.6596043544422333e-07, + "logits/chosen": -3.1033477783203125, + "logits/rejected": -3.2285282611846924, + "logps/chosen": -287.02252197265625, + "logps/rejected": -478.1990966796875, + "loss": 0.2269, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09701438993215561, + "rewards/margins": 3.983635425567627, + "rewards/rejected": -3.8866209983825684, + "step": 3948 + }, + { + "epoch": 0.46, + "learning_rate": 1.6592531897459906e-07, + "logits/chosen": -3.4116649627685547, + "logits/rejected": -3.4333150386810303, + "logps/chosen": -275.67254638671875, + "logps/rejected": -274.20623779296875, + "loss": 0.2816, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5063928961753845, + "rewards/margins": 1.4183909893035889, + "rewards/rejected": -1.9247838258743286, + "step": 3949 + }, + { + "epoch": 0.46, + "learning_rate": 1.6589020250497484e-07, + "logits/chosen": -2.7551186084747314, + "logits/rejected": -2.8596112728118896, + "logps/chosen": -231.7318878173828, + "logps/rejected": -301.86419677734375, + "loss": 0.5625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2832072377204895, + "rewards/margins": 1.9950810670852661, + "rewards/rejected": -2.2782883644104004, + "step": 3950 + }, + { + "epoch": 0.46, + "learning_rate": 1.6585508603535054e-07, + "logits/chosen": -2.949077606201172, + "logits/rejected": -3.0316786766052246, + "logps/chosen": -139.98046875, + "logps/rejected": -231.80307006835938, + "loss": 0.2196, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21762631833553314, + "rewards/margins": 2.1158483028411865, + "rewards/rejected": -1.8982219696044922, + "step": 3951 + }, + { + "epoch": 0.46, + "learning_rate": 1.6581996956572632e-07, + "logits/chosen": -2.6996636390686035, + "logits/rejected": -2.743530750274658, + "logps/chosen": -224.20693969726562, + "logps/rejected": -224.3370361328125, + "loss": 0.3443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21939529478549957, + "rewards/margins": 2.306821823120117, + "rewards/rejected": -2.526216983795166, + "step": 3952 + }, + { + "epoch": 0.46, + "learning_rate": 1.6578485309610208e-07, + "logits/chosen": -2.6949586868286133, + "logits/rejected": -2.7072267532348633, + "logps/chosen": -450.14263916015625, + "logps/rejected": -334.2158508300781, + "loss": 0.2455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07065839320421219, + "rewards/margins": 2.2746214866638184, + "rewards/rejected": -2.3452799320220947, + "step": 3953 + }, + { + "epoch": 0.46, + "learning_rate": 1.657497366264778e-07, + "logits/chosen": -3.669567823410034, + "logits/rejected": -3.545783281326294, + "logps/chosen": -267.88055419921875, + "logps/rejected": -266.5813903808594, + "loss": 0.1155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16398391127586365, + "rewards/margins": 3.2232327461242676, + "rewards/rejected": -3.059248924255371, + "step": 3954 + }, + { + "epoch": 0.46, + "learning_rate": 1.6571462015685356e-07, + "logits/chosen": -4.099387168884277, + "logits/rejected": -3.9782285690307617, + "logps/chosen": -140.45372009277344, + "logps/rejected": -200.38003540039062, + "loss": 0.5413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6678177714347839, + "rewards/margins": 0.8936976194381714, + "rewards/rejected": -1.5615154504776, + "step": 3955 + }, + { + "epoch": 0.46, + "learning_rate": 1.656795036872293e-07, + "logits/chosen": -2.6751601696014404, + "logits/rejected": -2.9875009059906006, + "logps/chosen": -278.93646240234375, + "logps/rejected": -434.8519287109375, + "loss": 0.1888, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4723946750164032, + "rewards/margins": 3.45566987991333, + "rewards/rejected": -2.9832751750946045, + "step": 3956 + }, + { + "epoch": 0.46, + "learning_rate": 1.6564438721760504e-07, + "logits/chosen": -3.4537341594696045, + "logits/rejected": -3.411539077758789, + "logps/chosen": -205.83761596679688, + "logps/rejected": -227.76454162597656, + "loss": 0.2794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07443726062774658, + "rewards/margins": 1.5221247673034668, + "rewards/rejected": -1.596562147140503, + "step": 3957 + }, + { + "epoch": 0.46, + "learning_rate": 1.656092707479808e-07, + "logits/chosen": -3.77352237701416, + "logits/rejected": -3.8291430473327637, + "logps/chosen": -322.41375732421875, + "logps/rejected": -254.01405334472656, + "loss": 0.2852, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32932227849960327, + "rewards/margins": 1.9812369346618652, + "rewards/rejected": -1.6519148349761963, + "step": 3958 + }, + { + "epoch": 0.46, + "learning_rate": 1.6557415427835652e-07, + "logits/chosen": -3.1541318893432617, + "logits/rejected": -3.266061305999756, + "logps/chosen": -281.408935546875, + "logps/rejected": -266.9397888183594, + "loss": 0.4156, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02969842404127121, + "rewards/margins": 1.0571486949920654, + "rewards/rejected": -1.0274502038955688, + "step": 3959 + }, + { + "epoch": 0.46, + "learning_rate": 1.6553903780873227e-07, + "logits/chosen": -2.750767230987549, + "logits/rejected": -2.77349853515625, + "logps/chosen": -278.79571533203125, + "logps/rejected": -245.61578369140625, + "loss": 0.5922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7730966210365295, + "rewards/margins": 0.6253774166107178, + "rewards/rejected": -1.3984739780426025, + "step": 3960 + }, + { + "epoch": 0.46, + "learning_rate": 1.6550392133910805e-07, + "logits/chosen": -3.9635000228881836, + "logits/rejected": -3.691056966781616, + "logps/chosen": -311.1716003417969, + "logps/rejected": -346.4088134765625, + "loss": 0.4196, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7732243537902832, + "rewards/margins": 1.802823543548584, + "rewards/rejected": -2.576047897338867, + "step": 3961 + }, + { + "epoch": 0.46, + "learning_rate": 1.6546880486948378e-07, + "logits/chosen": -2.7927889823913574, + "logits/rejected": -2.7317700386047363, + "logps/chosen": -192.64907836914062, + "logps/rejected": -186.52369689941406, + "loss": 0.5924, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.018361926078796387, + "rewards/margins": 0.5788378715515137, + "rewards/rejected": -0.5971997976303101, + "step": 3962 + }, + { + "epoch": 0.46, + "learning_rate": 1.6543368839985954e-07, + "logits/chosen": -3.101942777633667, + "logits/rejected": -3.1254310607910156, + "logps/chosen": -197.59812927246094, + "logps/rejected": -168.5692901611328, + "loss": 0.2309, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.441495418548584, + "rewards/margins": 1.745103120803833, + "rewards/rejected": -1.3036075830459595, + "step": 3963 + }, + { + "epoch": 0.46, + "learning_rate": 1.653985719302353e-07, + "logits/chosen": -3.07716703414917, + "logits/rejected": -3.0328919887542725, + "logps/chosen": -348.2048645019531, + "logps/rejected": -369.5650329589844, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09208270907402039, + "rewards/margins": 1.2332953214645386, + "rewards/rejected": -1.3253778219223022, + "step": 3964 + }, + { + "epoch": 0.46, + "learning_rate": 1.6536345546061102e-07, + "logits/chosen": -2.4997291564941406, + "logits/rejected": -2.5756750106811523, + "logps/chosen": -211.7244110107422, + "logps/rejected": -258.54364013671875, + "loss": 0.7362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4678249657154083, + "rewards/margins": 1.054385781288147, + "rewards/rejected": -1.522210717201233, + "step": 3965 + }, + { + "epoch": 0.46, + "learning_rate": 1.6532833899098677e-07, + "logits/chosen": -3.635465145111084, + "logits/rejected": -3.5831503868103027, + "logps/chosen": -319.3228759765625, + "logps/rejected": -218.89492797851562, + "loss": 0.447, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.048495396971702576, + "rewards/margins": 0.8125790357589722, + "rewards/rejected": -0.8610744476318359, + "step": 3966 + }, + { + "epoch": 0.46, + "learning_rate": 1.652932225213625e-07, + "logits/chosen": -3.5552186965942383, + "logits/rejected": -3.435089349746704, + "logps/chosen": -111.84662628173828, + "logps/rejected": -101.10833740234375, + "loss": 0.6296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5400054454803467, + "rewards/margins": 0.3165719211101532, + "rewards/rejected": -0.8565773367881775, + "step": 3967 + }, + { + "epoch": 0.46, + "learning_rate": 1.6525810605173825e-07, + "logits/chosen": -3.2507565021514893, + "logits/rejected": -3.3229596614837646, + "logps/chosen": -208.93109130859375, + "logps/rejected": -178.7959747314453, + "loss": 0.2974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28170067071914673, + "rewards/margins": 1.6594507694244385, + "rewards/rejected": -1.94115149974823, + "step": 3968 + }, + { + "epoch": 0.46, + "learning_rate": 1.65222989582114e-07, + "logits/chosen": -2.9264767169952393, + "logits/rejected": -3.0075089931488037, + "logps/chosen": -259.5301513671875, + "logps/rejected": -296.82489013671875, + "loss": 0.2094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1802763193845749, + "rewards/margins": 2.2352750301361084, + "rewards/rejected": -2.0549988746643066, + "step": 3969 + }, + { + "epoch": 0.46, + "learning_rate": 1.6518787311248973e-07, + "logits/chosen": -3.341752529144287, + "logits/rejected": -3.175489902496338, + "logps/chosen": -286.57830810546875, + "logps/rejected": -220.0558319091797, + "loss": 0.3078, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1467384397983551, + "rewards/margins": 1.9107708930969238, + "rewards/rejected": -1.7640326023101807, + "step": 3970 + }, + { + "epoch": 0.46, + "learning_rate": 1.651527566428655e-07, + "logits/chosen": -2.144599199295044, + "logits/rejected": -2.144249677658081, + "logps/chosen": -337.7557678222656, + "logps/rejected": -230.0604705810547, + "loss": 0.3223, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09539368748664856, + "rewards/margins": 1.2287732362747192, + "rewards/rejected": -1.133379578590393, + "step": 3971 + }, + { + "epoch": 0.46, + "learning_rate": 1.6511764017324127e-07, + "logits/chosen": -3.231484889984131, + "logits/rejected": -3.3901374340057373, + "logps/chosen": -157.3760986328125, + "logps/rejected": -285.7887268066406, + "loss": 0.3465, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16599106788635254, + "rewards/margins": 2.4666965007781982, + "rewards/rejected": -2.3007051944732666, + "step": 3972 + }, + { + "epoch": 0.46, + "learning_rate": 1.65082523703617e-07, + "logits/chosen": -3.4712109565734863, + "logits/rejected": -3.370635509490967, + "logps/chosen": -304.71417236328125, + "logps/rejected": -256.9755859375, + "loss": 0.2937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16690418124198914, + "rewards/margins": 1.6194047927856445, + "rewards/rejected": -1.786309003829956, + "step": 3973 + }, + { + "epoch": 0.46, + "learning_rate": 1.6504740723399275e-07, + "logits/chosen": -3.4244792461395264, + "logits/rejected": -3.6659579277038574, + "logps/chosen": -214.91839599609375, + "logps/rejected": -210.9064178466797, + "loss": 0.1606, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8289140462875366, + "rewards/margins": 2.5794179439544678, + "rewards/rejected": -1.7505037784576416, + "step": 3974 + }, + { + "epoch": 0.46, + "learning_rate": 1.6501229076436848e-07, + "logits/chosen": -3.188371181488037, + "logits/rejected": -2.8804514408111572, + "logps/chosen": -191.48306274414062, + "logps/rejected": -278.719970703125, + "loss": 0.4044, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4965165853500366, + "rewards/margins": 2.8400635719299316, + "rewards/rejected": -3.336580276489258, + "step": 3975 + }, + { + "epoch": 0.46, + "learning_rate": 1.6497717429474423e-07, + "logits/chosen": -3.4767565727233887, + "logits/rejected": -3.351149082183838, + "logps/chosen": -307.1471252441406, + "logps/rejected": -216.97792053222656, + "loss": 0.4328, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40834617614746094, + "rewards/margins": 1.675047516822815, + "rewards/rejected": -1.266701340675354, + "step": 3976 + }, + { + "epoch": 0.46, + "learning_rate": 1.6494205782511998e-07, + "logits/chosen": -3.3121650218963623, + "logits/rejected": -2.923543930053711, + "logps/chosen": -239.4761962890625, + "logps/rejected": -174.41180419921875, + "loss": 0.6314, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20744732022285461, + "rewards/margins": 0.5713068246841431, + "rewards/rejected": -0.7787541151046753, + "step": 3977 + }, + { + "epoch": 0.46, + "learning_rate": 1.649069413554957e-07, + "logits/chosen": -3.0934929847717285, + "logits/rejected": -2.7812631130218506, + "logps/chosen": -181.4929656982422, + "logps/rejected": -164.94595336914062, + "loss": 0.3004, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8139621615409851, + "rewards/margins": 2.257063627243042, + "rewards/rejected": -1.443101406097412, + "step": 3978 + }, + { + "epoch": 0.46, + "learning_rate": 1.6487182488587146e-07, + "logits/chosen": -2.771048069000244, + "logits/rejected": -3.094297409057617, + "logps/chosen": -177.0226287841797, + "logps/rejected": -219.43560791015625, + "loss": 0.5734, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.057240039110183716, + "rewards/margins": 1.981013298034668, + "rewards/rejected": -1.923773169517517, + "step": 3979 + }, + { + "epoch": 0.46, + "learning_rate": 1.6483670841624722e-07, + "logits/chosen": -3.1800315380096436, + "logits/rejected": -3.2251572608947754, + "logps/chosen": -173.72579956054688, + "logps/rejected": -198.10275268554688, + "loss": 0.3235, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.266027569770813, + "rewards/margins": 1.5047733783721924, + "rewards/rejected": -1.770801067352295, + "step": 3980 + }, + { + "epoch": 0.46, + "learning_rate": 1.6480159194662295e-07, + "logits/chosen": -3.216588020324707, + "logits/rejected": -3.4371676445007324, + "logps/chosen": -217.40249633789062, + "logps/rejected": -344.0405578613281, + "loss": 0.108, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4530790448188782, + "rewards/margins": 3.0385255813598633, + "rewards/rejected": -2.585446834564209, + "step": 3981 + }, + { + "epoch": 0.46, + "learning_rate": 1.647664754769987e-07, + "logits/chosen": -2.758603572845459, + "logits/rejected": -2.7158920764923096, + "logps/chosen": -297.5655822753906, + "logps/rejected": -320.78558349609375, + "loss": 0.7365, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6531367897987366, + "rewards/margins": 0.6878114342689514, + "rewards/rejected": -1.3409483432769775, + "step": 3982 + }, + { + "epoch": 0.46, + "learning_rate": 1.6473135900737443e-07, + "logits/chosen": -4.092775344848633, + "logits/rejected": -3.9806699752807617, + "logps/chosen": -318.9555358886719, + "logps/rejected": -260.61962890625, + "loss": 0.7359, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.40656930208206177, + "rewards/margins": 0.9104679822921753, + "rewards/rejected": -1.3170373439788818, + "step": 3983 + }, + { + "epoch": 0.46, + "learning_rate": 1.646962425377502e-07, + "logits/chosen": -2.8695454597473145, + "logits/rejected": -2.8708181381225586, + "logps/chosen": -304.20294189453125, + "logps/rejected": -250.50601196289062, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0858922079205513, + "rewards/margins": 2.2498929500579834, + "rewards/rejected": -2.335785150527954, + "step": 3984 + }, + { + "epoch": 0.46, + "learning_rate": 1.6466112606812596e-07, + "logits/chosen": -3.3683032989501953, + "logits/rejected": -3.6590805053710938, + "logps/chosen": -173.3211669921875, + "logps/rejected": -184.30824279785156, + "loss": 0.2949, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17366328835487366, + "rewards/margins": 2.2997193336486816, + "rewards/rejected": -2.12605619430542, + "step": 3985 + }, + { + "epoch": 0.46, + "learning_rate": 1.646260095985017e-07, + "logits/chosen": -3.6463472843170166, + "logits/rejected": -3.603259563446045, + "logps/chosen": -230.11764526367188, + "logps/rejected": -266.04296875, + "loss": 0.6006, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46812936663627625, + "rewards/margins": 0.8776636123657227, + "rewards/rejected": -1.3457930088043213, + "step": 3986 + }, + { + "epoch": 0.46, + "learning_rate": 1.6459089312887744e-07, + "logits/chosen": -3.108025312423706, + "logits/rejected": -3.0504095554351807, + "logps/chosen": -181.22821044921875, + "logps/rejected": -254.70335388183594, + "loss": 0.6199, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6955282092094421, + "rewards/margins": 0.3873995542526245, + "rewards/rejected": -1.0829278230667114, + "step": 3987 + }, + { + "epoch": 0.46, + "learning_rate": 1.645557766592532e-07, + "logits/chosen": -3.343568801879883, + "logits/rejected": -2.9868083000183105, + "logps/chosen": -434.24591064453125, + "logps/rejected": -246.72369384765625, + "loss": 0.7949, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42785656452178955, + "rewards/margins": 1.0305135250091553, + "rewards/rejected": -1.4583702087402344, + "step": 3988 + }, + { + "epoch": 0.46, + "learning_rate": 1.6452066018962892e-07, + "logits/chosen": -3.6544039249420166, + "logits/rejected": -3.333561897277832, + "logps/chosen": -161.2752227783203, + "logps/rejected": -171.18716430664062, + "loss": 1.1869, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1563000679016113, + "rewards/margins": 0.6723873019218445, + "rewards/rejected": -1.8286874294281006, + "step": 3989 + }, + { + "epoch": 0.46, + "learning_rate": 1.6448554372000468e-07, + "logits/chosen": -3.1885628700256348, + "logits/rejected": -2.833173990249634, + "logps/chosen": -301.1690368652344, + "logps/rejected": -215.670654296875, + "loss": 0.2884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03793298453092575, + "rewards/margins": 2.3116183280944824, + "rewards/rejected": -2.349551200866699, + "step": 3990 + }, + { + "epoch": 0.46, + "learning_rate": 1.644504272503804e-07, + "logits/chosen": -2.951505661010742, + "logits/rejected": -3.2268502712249756, + "logps/chosen": -204.24673461914062, + "logps/rejected": -200.89932250976562, + "loss": 0.5415, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6377841830253601, + "rewards/margins": 1.9683786630630493, + "rewards/rejected": -2.6061627864837646, + "step": 3991 + }, + { + "epoch": 0.46, + "learning_rate": 1.6441531078075616e-07, + "logits/chosen": -2.80319881439209, + "logits/rejected": -2.6608431339263916, + "logps/chosen": -185.8607635498047, + "logps/rejected": -131.388671875, + "loss": 0.6989, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36704012751579285, + "rewards/margins": 0.2181515395641327, + "rewards/rejected": -0.5851916074752808, + "step": 3992 + }, + { + "epoch": 0.46, + "learning_rate": 1.643801943111319e-07, + "logits/chosen": -3.5328030586242676, + "logits/rejected": -3.614912509918213, + "logps/chosen": -220.2752227783203, + "logps/rejected": -266.94830322265625, + "loss": 0.2912, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13142554461956024, + "rewards/margins": 3.371856451034546, + "rewards/rejected": -3.2404305934906006, + "step": 3993 + }, + { + "epoch": 0.46, + "learning_rate": 1.6434507784150764e-07, + "logits/chosen": -3.4521870613098145, + "logits/rejected": -3.2855241298675537, + "logps/chosen": -257.023193359375, + "logps/rejected": -307.99029541015625, + "loss": 0.2527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6928210258483887, + "rewards/margins": 2.042175769805908, + "rewards/rejected": -2.734996795654297, + "step": 3994 + }, + { + "epoch": 0.46, + "learning_rate": 1.6430996137188342e-07, + "logits/chosen": -3.950422763824463, + "logits/rejected": -3.8057146072387695, + "logps/chosen": -236.10211181640625, + "logps/rejected": -177.81661987304688, + "loss": 0.1995, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5446525812149048, + "rewards/margins": 2.14439058303833, + "rewards/rejected": -1.5997378826141357, + "step": 3995 + }, + { + "epoch": 0.46, + "learning_rate": 1.6427484490225917e-07, + "logits/chosen": -2.7173614501953125, + "logits/rejected": -2.8052632808685303, + "logps/chosen": -337.5051574707031, + "logps/rejected": -261.2828369140625, + "loss": 0.3001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46268516778945923, + "rewards/margins": 1.6465518474578857, + "rewards/rejected": -1.1838668584823608, + "step": 3996 + }, + { + "epoch": 0.46, + "learning_rate": 1.642397284326349e-07, + "logits/chosen": -3.3771941661834717, + "logits/rejected": -3.3279247283935547, + "logps/chosen": -316.04620361328125, + "logps/rejected": -219.026611328125, + "loss": 0.6833, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4462580978870392, + "rewards/margins": 1.2310659885406494, + "rewards/rejected": -1.6773239374160767, + "step": 3997 + }, + { + "epoch": 0.46, + "learning_rate": 1.6420461196301066e-07, + "logits/chosen": -2.562575578689575, + "logits/rejected": -2.5766289234161377, + "logps/chosen": -279.7907409667969, + "logps/rejected": -180.93743896484375, + "loss": 0.4346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0762377679347992, + "rewards/margins": 0.733461856842041, + "rewards/rejected": -0.8096995949745178, + "step": 3998 + }, + { + "epoch": 0.46, + "learning_rate": 1.6416949549338638e-07, + "logits/chosen": -2.6048173904418945, + "logits/rejected": -2.518115997314453, + "logps/chosen": -362.49322509765625, + "logps/rejected": -299.4237060546875, + "loss": 0.3127, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29383695125579834, + "rewards/margins": 2.069251298904419, + "rewards/rejected": -2.3630881309509277, + "step": 3999 + }, + { + "epoch": 0.46, + "learning_rate": 1.6413437902376214e-07, + "logits/chosen": -2.8831188678741455, + "logits/rejected": -2.9819562435150146, + "logps/chosen": -194.85986328125, + "logps/rejected": -283.9088134765625, + "loss": 0.2861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10785654187202454, + "rewards/margins": 2.621767044067383, + "rewards/rejected": -2.729623556137085, + "step": 4000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -2.8433773517608643, + "eval_logits/rejected": -2.8090808391571045, + "eval_logps/chosen": -293.8716125488281, + "eval_logps/rejected": -236.62954711914062, + "eval_loss": 0.43787261843681335, + "eval_rewards/accuracies": 0.800000011920929, + "eval_rewards/chosen": 0.018334832042455673, + "eval_rewards/margins": 1.2470276355743408, + "eval_rewards/rejected": -1.2286927700042725, + "eval_runtime": 32.6016, + "eval_samples_per_second": 2.147, + "eval_steps_per_second": 1.074, + "step": 4000 + }, + { + "epoch": 0.46, + "learning_rate": 1.640992625541379e-07, + "logits/chosen": -3.097788095474243, + "logits/rejected": -3.0029335021972656, + "logps/chosen": -158.92428588867188, + "logps/rejected": -166.89224243164062, + "loss": 0.7172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6463521718978882, + "rewards/margins": 0.6549421548843384, + "rewards/rejected": -1.3012943267822266, + "step": 4001 + }, + { + "epoch": 0.46, + "learning_rate": 1.6406414608451362e-07, + "logits/chosen": -3.387054681777954, + "logits/rejected": -3.2726821899414062, + "logps/chosen": -165.92349243164062, + "logps/rejected": -280.93206787109375, + "loss": 0.4087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06472450494766235, + "rewards/margins": 2.3058180809020996, + "rewards/rejected": -2.370542526245117, + "step": 4002 + }, + { + "epoch": 0.46, + "learning_rate": 1.6402902961488937e-07, + "logits/chosen": -2.514674186706543, + "logits/rejected": -2.4830241203308105, + "logps/chosen": -321.7777404785156, + "logps/rejected": -171.14181518554688, + "loss": 0.4612, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3805154263973236, + "rewards/margins": 1.555202603340149, + "rewards/rejected": -1.174687147140503, + "step": 4003 + }, + { + "epoch": 0.46, + "learning_rate": 1.639939131452651e-07, + "logits/chosen": -3.5103750228881836, + "logits/rejected": -3.2903897762298584, + "logps/chosen": -212.14007568359375, + "logps/rejected": -239.06271362304688, + "loss": 0.8264, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5355724692344666, + "rewards/margins": 0.48404234647750854, + "rewards/rejected": -1.019614815711975, + "step": 4004 + }, + { + "epoch": 0.46, + "learning_rate": 1.6395879667564085e-07, + "logits/chosen": -3.399909496307373, + "logits/rejected": -3.4066264629364014, + "logps/chosen": -218.37957763671875, + "logps/rejected": -162.23890686035156, + "loss": 0.4699, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05159464478492737, + "rewards/margins": 1.2562288045883179, + "rewards/rejected": -1.3078234195709229, + "step": 4005 + }, + { + "epoch": 0.46, + "learning_rate": 1.6392368020601663e-07, + "logits/chosen": -3.539332389831543, + "logits/rejected": -3.36002779006958, + "logps/chosen": -244.35662841796875, + "logps/rejected": -318.65423583984375, + "loss": 0.3805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20495277643203735, + "rewards/margins": 3.080638885498047, + "rewards/rejected": -3.2855913639068604, + "step": 4006 + }, + { + "epoch": 0.46, + "learning_rate": 1.6388856373639236e-07, + "logits/chosen": -2.781803607940674, + "logits/rejected": -3.110149383544922, + "logps/chosen": -295.13690185546875, + "logps/rejected": -344.74249267578125, + "loss": 0.4069, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07358483970165253, + "rewards/margins": 2.223085880279541, + "rewards/rejected": -2.296671152114868, + "step": 4007 + }, + { + "epoch": 0.46, + "learning_rate": 1.6385344726676812e-07, + "logits/chosen": -3.62471342086792, + "logits/rejected": -3.3946077823638916, + "logps/chosen": -260.4302978515625, + "logps/rejected": -314.1594543457031, + "loss": 0.3844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.350037157535553, + "rewards/margins": 2.478698968887329, + "rewards/rejected": -2.8287360668182373, + "step": 4008 + }, + { + "epoch": 0.46, + "learning_rate": 1.6381833079714387e-07, + "logits/chosen": -3.5677404403686523, + "logits/rejected": -3.5073530673980713, + "logps/chosen": -243.21038818359375, + "logps/rejected": -238.63772583007812, + "loss": 0.3595, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5886500477790833, + "rewards/margins": 1.4157829284667969, + "rewards/rejected": -0.8271329402923584, + "step": 4009 + }, + { + "epoch": 0.46, + "learning_rate": 1.637832143275196e-07, + "logits/chosen": -3.1958200931549072, + "logits/rejected": -3.2570877075195312, + "logps/chosen": -390.84637451171875, + "logps/rejected": -211.732177734375, + "loss": 0.4516, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023188352584838867, + "rewards/margins": 0.9268955588340759, + "rewards/rejected": -0.9037072658538818, + "step": 4010 + }, + { + "epoch": 0.46, + "learning_rate": 1.6374809785789535e-07, + "logits/chosen": -2.602372884750366, + "logits/rejected": -2.593405246734619, + "logps/chosen": -410.18829345703125, + "logps/rejected": -237.3779296875, + "loss": 0.5197, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20458859205245972, + "rewards/margins": 0.612922191619873, + "rewards/rejected": -0.40833356976509094, + "step": 4011 + }, + { + "epoch": 0.46, + "learning_rate": 1.6371298138827108e-07, + "logits/chosen": -3.1995320320129395, + "logits/rejected": -2.9835469722747803, + "logps/chosen": -342.9864501953125, + "logps/rejected": -271.7644958496094, + "loss": 0.2711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02700500562787056, + "rewards/margins": 1.590925931930542, + "rewards/rejected": -1.6179308891296387, + "step": 4012 + }, + { + "epoch": 0.46, + "learning_rate": 1.6367786491864683e-07, + "logits/chosen": -2.8886349201202393, + "logits/rejected": -2.9970672130584717, + "logps/chosen": -189.3954620361328, + "logps/rejected": -183.78915405273438, + "loss": 0.4653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13795128464698792, + "rewards/margins": 1.4225164651870728, + "rewards/rejected": -1.5604678392410278, + "step": 4013 + }, + { + "epoch": 0.46, + "learning_rate": 1.6364274844902259e-07, + "logits/chosen": -2.8558554649353027, + "logits/rejected": -2.925271511077881, + "logps/chosen": -279.20123291015625, + "logps/rejected": -246.9000244140625, + "loss": 0.7978, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.030086562037467957, + "rewards/margins": 0.22714731097221375, + "rewards/rejected": -0.2572338581085205, + "step": 4014 + }, + { + "epoch": 0.46, + "learning_rate": 1.636076319793983e-07, + "logits/chosen": -2.9062204360961914, + "logits/rejected": -2.839794635772705, + "logps/chosen": -97.71676635742188, + "logps/rejected": -196.38473510742188, + "loss": 0.3137, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6069939136505127, + "rewards/margins": 1.4259965419769287, + "rewards/rejected": -2.0329904556274414, + "step": 4015 + }, + { + "epoch": 0.46, + "learning_rate": 1.6357251550977407e-07, + "logits/chosen": -4.005174160003662, + "logits/rejected": -3.9457809925079346, + "logps/chosen": -328.5836486816406, + "logps/rejected": -204.25997924804688, + "loss": 0.4288, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14630062878131866, + "rewards/margins": 1.3929662704467773, + "rewards/rejected": -1.246665596961975, + "step": 4016 + }, + { + "epoch": 0.46, + "learning_rate": 1.6353739904014985e-07, + "logits/chosen": -3.495882987976074, + "logits/rejected": -2.9803388118743896, + "logps/chosen": -348.90936279296875, + "logps/rejected": -303.5547180175781, + "loss": 0.4247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.420043021440506, + "rewards/margins": 1.2908498048782349, + "rewards/rejected": -1.710892677307129, + "step": 4017 + }, + { + "epoch": 0.46, + "learning_rate": 1.6350228257052557e-07, + "logits/chosen": -2.914503812789917, + "logits/rejected": -2.8801662921905518, + "logps/chosen": -309.33160400390625, + "logps/rejected": -240.04901123046875, + "loss": 0.3722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14050158858299255, + "rewards/margins": 1.5887449979782104, + "rewards/rejected": -1.448243498802185, + "step": 4018 + }, + { + "epoch": 0.46, + "learning_rate": 1.6346716610090133e-07, + "logits/chosen": -3.072740077972412, + "logits/rejected": -3.12315034866333, + "logps/chosen": -139.6576385498047, + "logps/rejected": -275.0268859863281, + "loss": 0.387, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15434737503528595, + "rewards/margins": 2.192469596862793, + "rewards/rejected": -2.3468170166015625, + "step": 4019 + }, + { + "epoch": 0.46, + "learning_rate": 1.6343204963127706e-07, + "logits/chosen": -3.4853806495666504, + "logits/rejected": -3.199787139892578, + "logps/chosen": -251.83929443359375, + "logps/rejected": -286.9573974609375, + "loss": 0.4732, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0514964684844017, + "rewards/margins": 0.9908259510993958, + "rewards/rejected": -0.9393295049667358, + "step": 4020 + }, + { + "epoch": 0.46, + "learning_rate": 1.633969331616528e-07, + "logits/chosen": -3.445040225982666, + "logits/rejected": -3.291023015975952, + "logps/chosen": -379.9154357910156, + "logps/rejected": -162.46421813964844, + "loss": 0.6259, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44096535444259644, + "rewards/margins": 0.8879681825637817, + "rewards/rejected": -1.3289337158203125, + "step": 4021 + }, + { + "epoch": 0.46, + "learning_rate": 1.6336181669202856e-07, + "logits/chosen": -3.3272764682769775, + "logits/rejected": -2.935596466064453, + "logps/chosen": -306.778076171875, + "logps/rejected": -216.4516143798828, + "loss": 0.3737, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16507583856582642, + "rewards/margins": 1.5624847412109375, + "rewards/rejected": -1.3974090814590454, + "step": 4022 + }, + { + "epoch": 0.46, + "learning_rate": 1.633267002224043e-07, + "logits/chosen": -2.799703359603882, + "logits/rejected": -2.9963207244873047, + "logps/chosen": -119.42378234863281, + "logps/rejected": -195.06724548339844, + "loss": 0.3615, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13284647464752197, + "rewards/margins": 1.8496222496032715, + "rewards/rejected": -1.7167757749557495, + "step": 4023 + }, + { + "epoch": 0.46, + "learning_rate": 1.6329158375278004e-07, + "logits/chosen": -3.6333224773406982, + "logits/rejected": -3.2635669708251953, + "logps/chosen": -206.45010375976562, + "logps/rejected": -101.29391479492188, + "loss": 0.5436, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3644915521144867, + "rewards/margins": 0.7492147088050842, + "rewards/rejected": -0.38472312688827515, + "step": 4024 + }, + { + "epoch": 0.46, + "learning_rate": 1.632564672831558e-07, + "logits/chosen": -3.3943490982055664, + "logits/rejected": -3.5797531604766846, + "logps/chosen": -266.91485595703125, + "logps/rejected": -222.17868041992188, + "loss": 0.8014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3407958447933197, + "rewards/margins": 1.2777647972106934, + "rewards/rejected": -1.618560552597046, + "step": 4025 + }, + { + "epoch": 0.46, + "learning_rate": 1.6322135081353153e-07, + "logits/chosen": -3.2120461463928223, + "logits/rejected": -3.018148183822632, + "logps/chosen": -142.23968505859375, + "logps/rejected": -211.62646484375, + "loss": 0.2178, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2758142352104187, + "rewards/margins": 2.663294792175293, + "rewards/rejected": -2.9391088485717773, + "step": 4026 + }, + { + "epoch": 0.46, + "learning_rate": 1.6318623434390728e-07, + "logits/chosen": -3.059521198272705, + "logits/rejected": -3.024779796600342, + "logps/chosen": -204.6798858642578, + "logps/rejected": -301.9896240234375, + "loss": 0.2352, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11559763550758362, + "rewards/margins": 1.718866229057312, + "rewards/rejected": -1.6032685041427612, + "step": 4027 + }, + { + "epoch": 0.46, + "learning_rate": 1.63151117874283e-07, + "logits/chosen": -3.172276735305786, + "logits/rejected": -3.2330238819122314, + "logps/chosen": -175.01214599609375, + "logps/rejected": -183.70687866210938, + "loss": 0.4463, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1732264757156372, + "rewards/margins": 1.4366583824157715, + "rewards/rejected": -1.2634317874908447, + "step": 4028 + }, + { + "epoch": 0.46, + "learning_rate": 1.631160014046588e-07, + "logits/chosen": -2.864838123321533, + "logits/rejected": -2.4638473987579346, + "logps/chosen": -286.1737976074219, + "logps/rejected": -310.2687072753906, + "loss": 0.2655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40263769030570984, + "rewards/margins": 1.4971652030944824, + "rewards/rejected": -1.0945274829864502, + "step": 4029 + }, + { + "epoch": 0.46, + "learning_rate": 1.6308088493503454e-07, + "logits/chosen": -3.555224895477295, + "logits/rejected": -3.609050750732422, + "logps/chosen": -176.36825561523438, + "logps/rejected": -229.11532592773438, + "loss": 0.3972, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44689565896987915, + "rewards/margins": 1.3451135158538818, + "rewards/rejected": -1.7920091152191162, + "step": 4030 + }, + { + "epoch": 0.46, + "learning_rate": 1.6304576846541027e-07, + "logits/chosen": -2.8344674110412598, + "logits/rejected": -2.485281229019165, + "logps/chosen": -230.05386352539062, + "logps/rejected": -336.6531982421875, + "loss": 0.2868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18868786096572876, + "rewards/margins": 2.620177745819092, + "rewards/rejected": -2.808865547180176, + "step": 4031 + }, + { + "epoch": 0.46, + "learning_rate": 1.6301065199578602e-07, + "logits/chosen": -3.367945671081543, + "logits/rejected": -3.166989326477051, + "logps/chosen": -299.2272033691406, + "logps/rejected": -223.93243408203125, + "loss": 0.4351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8628818988800049, + "rewards/margins": 1.200608491897583, + "rewards/rejected": -2.063490390777588, + "step": 4032 + }, + { + "epoch": 0.46, + "learning_rate": 1.6297553552616178e-07, + "logits/chosen": -3.068775177001953, + "logits/rejected": -2.9324536323547363, + "logps/chosen": -277.0895080566406, + "logps/rejected": -259.8199768066406, + "loss": 0.3232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01919812336564064, + "rewards/margins": 1.8387786149978638, + "rewards/rejected": -1.8579767942428589, + "step": 4033 + }, + { + "epoch": 0.47, + "learning_rate": 1.629404190565375e-07, + "logits/chosen": -2.5898752212524414, + "logits/rejected": -2.6114344596862793, + "logps/chosen": -318.0885925292969, + "logps/rejected": -276.208984375, + "loss": 0.6615, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0021690428256988525, + "rewards/margins": 1.2351105213165283, + "rewards/rejected": -1.2372796535491943, + "step": 4034 + }, + { + "epoch": 0.47, + "learning_rate": 1.6290530258691326e-07, + "logits/chosen": -3.412468910217285, + "logits/rejected": -3.5625195503234863, + "logps/chosen": -212.18800354003906, + "logps/rejected": -237.26254272460938, + "loss": 0.4349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5222256183624268, + "rewards/margins": 0.937780499458313, + "rewards/rejected": -1.4600062370300293, + "step": 4035 + }, + { + "epoch": 0.47, + "learning_rate": 1.6287018611728898e-07, + "logits/chosen": -3.0926148891448975, + "logits/rejected": -3.334456205368042, + "logps/chosen": -137.22344970703125, + "logps/rejected": -167.22621154785156, + "loss": 0.3686, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.319000780582428, + "rewards/margins": 1.8409645557403564, + "rewards/rejected": -1.5219638347625732, + "step": 4036 + }, + { + "epoch": 0.47, + "learning_rate": 1.6283506964766474e-07, + "logits/chosen": -3.4398720264434814, + "logits/rejected": -3.0812950134277344, + "logps/chosen": -289.9349060058594, + "logps/rejected": -189.63583374023438, + "loss": 0.2326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06960625946521759, + "rewards/margins": 1.6607328653335571, + "rewards/rejected": -1.7303392887115479, + "step": 4037 + }, + { + "epoch": 0.47, + "learning_rate": 1.627999531780405e-07, + "logits/chosen": -3.8933680057525635, + "logits/rejected": -3.85137939453125, + "logps/chosen": -408.55499267578125, + "logps/rejected": -299.9187316894531, + "loss": 0.513, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25546589493751526, + "rewards/margins": 1.386702537536621, + "rewards/rejected": -1.642168402671814, + "step": 4038 + }, + { + "epoch": 0.47, + "learning_rate": 1.6276483670841622e-07, + "logits/chosen": -3.684659481048584, + "logits/rejected": -3.539393901824951, + "logps/chosen": -248.04925537109375, + "logps/rejected": -232.35426330566406, + "loss": 0.3165, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06389844417572021, + "rewards/margins": 1.899062991142273, + "rewards/rejected": -1.8351644277572632, + "step": 4039 + }, + { + "epoch": 0.47, + "learning_rate": 1.62729720238792e-07, + "logits/chosen": -3.230335235595703, + "logits/rejected": -3.340547561645508, + "logps/chosen": -341.18359375, + "logps/rejected": -225.42108154296875, + "loss": 0.7633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.690831184387207, + "rewards/margins": 1.0312319993972778, + "rewards/rejected": -1.7220631837844849, + "step": 4040 + }, + { + "epoch": 0.47, + "learning_rate": 1.6269460376916775e-07, + "logits/chosen": -2.921609401702881, + "logits/rejected": -2.8276190757751465, + "logps/chosen": -269.6799621582031, + "logps/rejected": -253.01113891601562, + "loss": 0.3711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3569781184196472, + "rewards/margins": 0.9029462933540344, + "rewards/rejected": -1.2599244117736816, + "step": 4041 + }, + { + "epoch": 0.47, + "learning_rate": 1.6265948729954348e-07, + "logits/chosen": -2.478303909301758, + "logits/rejected": -2.3725249767303467, + "logps/chosen": -299.666748046875, + "logps/rejected": -212.21331787109375, + "loss": 0.2081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29345524311065674, + "rewards/margins": 2.0934982299804688, + "rewards/rejected": -2.386953353881836, + "step": 4042 + }, + { + "epoch": 0.47, + "learning_rate": 1.6262437082991924e-07, + "logits/chosen": -3.449610710144043, + "logits/rejected": -3.4573569297790527, + "logps/chosen": -383.6758728027344, + "logps/rejected": -286.091064453125, + "loss": 0.5381, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5726088285446167, + "rewards/margins": 1.9646403789520264, + "rewards/rejected": -2.5372490882873535, + "step": 4043 + }, + { + "epoch": 0.47, + "learning_rate": 1.6258925436029496e-07, + "logits/chosen": -3.3067967891693115, + "logits/rejected": -3.060866594314575, + "logps/chosen": -274.7522277832031, + "logps/rejected": -220.51214599609375, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07868346571922302, + "rewards/margins": 0.9113374352455139, + "rewards/rejected": -0.9900208711624146, + "step": 4044 + }, + { + "epoch": 0.47, + "learning_rate": 1.6255413789067072e-07, + "logits/chosen": -2.8935694694519043, + "logits/rejected": -2.994480609893799, + "logps/chosen": -194.4086456298828, + "logps/rejected": -315.971923828125, + "loss": 0.4751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5314410924911499, + "rewards/margins": 1.877898097038269, + "rewards/rejected": -2.409339189529419, + "step": 4045 + }, + { + "epoch": 0.47, + "learning_rate": 1.6251902142104647e-07, + "logits/chosen": -3.7800559997558594, + "logits/rejected": -3.804351806640625, + "logps/chosen": -106.2520980834961, + "logps/rejected": -224.29290771484375, + "loss": 0.323, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02741996943950653, + "rewards/margins": 1.7521636486053467, + "rewards/rejected": -1.7795836925506592, + "step": 4046 + }, + { + "epoch": 0.47, + "learning_rate": 1.624839049514222e-07, + "logits/chosen": -3.0124661922454834, + "logits/rejected": -3.0923705101013184, + "logps/chosen": -332.7197570800781, + "logps/rejected": -330.11163330078125, + "loss": 0.2786, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49955371022224426, + "rewards/margins": 3.0667972564697266, + "rewards/rejected": -3.5663509368896484, + "step": 4047 + }, + { + "epoch": 0.47, + "learning_rate": 1.6244878848179795e-07, + "logits/chosen": -3.09550142288208, + "logits/rejected": -3.2110066413879395, + "logps/chosen": -331.6051330566406, + "logps/rejected": -290.2023620605469, + "loss": 0.2116, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03305875509977341, + "rewards/margins": 1.997115135192871, + "rewards/rejected": -1.9640564918518066, + "step": 4048 + }, + { + "epoch": 0.47, + "learning_rate": 1.6241367201217368e-07, + "logits/chosen": -3.4432365894317627, + "logits/rejected": -3.538078784942627, + "logps/chosen": -357.5598449707031, + "logps/rejected": -222.01214599609375, + "loss": 0.2483, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3937512934207916, + "rewards/margins": 2.263476610183716, + "rewards/rejected": -2.6572279930114746, + "step": 4049 + }, + { + "epoch": 0.47, + "learning_rate": 1.6237855554254943e-07, + "logits/chosen": -2.975454092025757, + "logits/rejected": -2.8118350505828857, + "logps/chosen": -200.75128173828125, + "logps/rejected": -281.24603271484375, + "loss": 0.4147, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2514498233795166, + "rewards/margins": 1.8065667152404785, + "rewards/rejected": -1.555116891860962, + "step": 4050 + }, + { + "epoch": 0.47, + "learning_rate": 1.623434390729252e-07, + "logits/chosen": -3.272108793258667, + "logits/rejected": -2.8883962631225586, + "logps/chosen": -265.5798645019531, + "logps/rejected": -276.1944274902344, + "loss": 0.6888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42662376165390015, + "rewards/margins": 0.7067745327949524, + "rewards/rejected": -1.1333982944488525, + "step": 4051 + }, + { + "epoch": 0.47, + "learning_rate": 1.6230832260330094e-07, + "logits/chosen": -2.88247013092041, + "logits/rejected": -2.8326401710510254, + "logps/chosen": -166.92416381835938, + "logps/rejected": -201.14125061035156, + "loss": 0.974, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9785467982292175, + "rewards/margins": -0.33031877875328064, + "rewards/rejected": -0.6482280492782593, + "step": 4052 + }, + { + "epoch": 0.47, + "learning_rate": 1.622732061336767e-07, + "logits/chosen": -2.825571298599243, + "logits/rejected": -2.823254108428955, + "logps/chosen": -426.6119079589844, + "logps/rejected": -314.38775634765625, + "loss": 0.4751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4948697090148926, + "rewards/margins": 1.0519132614135742, + "rewards/rejected": -1.5467829704284668, + "step": 4053 + }, + { + "epoch": 0.47, + "learning_rate": 1.6223808966405245e-07, + "logits/chosen": -2.5725345611572266, + "logits/rejected": -2.6306893825531006, + "logps/chosen": -389.2899169921875, + "logps/rejected": -286.13812255859375, + "loss": 0.3991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11526241898536682, + "rewards/margins": 0.9045605659484863, + "rewards/rejected": -1.0198229551315308, + "step": 4054 + }, + { + "epoch": 0.47, + "learning_rate": 1.6220297319442818e-07, + "logits/chosen": -3.0268611907958984, + "logits/rejected": -3.354990005493164, + "logps/chosen": -323.03240966796875, + "logps/rejected": -193.38967895507812, + "loss": 0.4458, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4127781391143799, + "rewards/margins": 2.2681076526641846, + "rewards/rejected": -1.8553295135498047, + "step": 4055 + }, + { + "epoch": 0.47, + "learning_rate": 1.6216785672480393e-07, + "logits/chosen": -2.512786388397217, + "logits/rejected": -2.458662509918213, + "logps/chosen": -209.0631103515625, + "logps/rejected": -304.8893127441406, + "loss": 0.2517, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3590794503688812, + "rewards/margins": 2.5722949504852295, + "rewards/rejected": -2.9313745498657227, + "step": 4056 + }, + { + "epoch": 0.47, + "learning_rate": 1.6213274025517966e-07, + "logits/chosen": -2.889737844467163, + "logits/rejected": -2.872379779815674, + "logps/chosen": -337.4125061035156, + "logps/rejected": -279.84716796875, + "loss": 0.687, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0928000956773758, + "rewards/margins": 1.4457640647888184, + "rewards/rejected": -1.5385642051696777, + "step": 4057 + }, + { + "epoch": 0.47, + "learning_rate": 1.620976237855554e-07, + "logits/chosen": -2.735053062438965, + "logits/rejected": -2.6983072757720947, + "logps/chosen": -101.15076446533203, + "logps/rejected": -160.7322540283203, + "loss": 0.6409, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04400122910737991, + "rewards/margins": 1.1571223735809326, + "rewards/rejected": -1.201123595237732, + "step": 4058 + }, + { + "epoch": 0.47, + "learning_rate": 1.6206250731593116e-07, + "logits/chosen": -3.008755683898926, + "logits/rejected": -2.788393497467041, + "logps/chosen": -350.5355224609375, + "logps/rejected": -200.05691528320312, + "loss": 0.4711, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3124668598175049, + "rewards/margins": 1.648129940032959, + "rewards/rejected": -1.9605967998504639, + "step": 4059 + }, + { + "epoch": 0.47, + "learning_rate": 1.620273908463069e-07, + "logits/chosen": -2.4946846961975098, + "logits/rejected": -2.316445827484131, + "logps/chosen": -380.6108703613281, + "logps/rejected": -169.24020385742188, + "loss": 0.3064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1611902266740799, + "rewards/margins": 1.4213545322418213, + "rewards/rejected": -1.5825448036193848, + "step": 4060 + }, + { + "epoch": 0.47, + "learning_rate": 1.6199227437668265e-07, + "logits/chosen": -3.5908048152923584, + "logits/rejected": -3.109975576400757, + "logps/chosen": -227.29116821289062, + "logps/rejected": -201.41839599609375, + "loss": 0.3521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17357294261455536, + "rewards/margins": 1.1250276565551758, + "rewards/rejected": -1.2986005544662476, + "step": 4061 + }, + { + "epoch": 0.47, + "learning_rate": 1.6195715790705843e-07, + "logits/chosen": -3.623020648956299, + "logits/rejected": -3.668264627456665, + "logps/chosen": -299.7003173828125, + "logps/rejected": -366.5696716308594, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.737684428691864, + "rewards/margins": 3.5832314491271973, + "rewards/rejected": -2.8455467224121094, + "step": 4062 + }, + { + "epoch": 0.47, + "learning_rate": 1.6192204143743415e-07, + "logits/chosen": -3.3868188858032227, + "logits/rejected": -3.1987342834472656, + "logps/chosen": -241.6887969970703, + "logps/rejected": -179.24917602539062, + "loss": 0.2625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.029952004551887512, + "rewards/margins": 2.0369181632995605, + "rewards/rejected": -2.0668702125549316, + "step": 4063 + }, + { + "epoch": 0.47, + "learning_rate": 1.618869249678099e-07, + "logits/chosen": -2.64900541305542, + "logits/rejected": -2.7251334190368652, + "logps/chosen": -203.93685913085938, + "logps/rejected": -172.60650634765625, + "loss": 0.485, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13756267726421356, + "rewards/margins": 1.1108112335205078, + "rewards/rejected": -0.9732487201690674, + "step": 4064 + }, + { + "epoch": 0.47, + "learning_rate": 1.6185180849818563e-07, + "logits/chosen": -3.597430467605591, + "logits/rejected": -3.70204496383667, + "logps/chosen": -288.0597229003906, + "logps/rejected": -352.9414367675781, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3724621534347534, + "rewards/margins": 4.548531532287598, + "rewards/rejected": -4.176069736480713, + "step": 4065 + }, + { + "epoch": 0.47, + "learning_rate": 1.618166920285614e-07, + "logits/chosen": -3.8972740173339844, + "logits/rejected": -3.8633410930633545, + "logps/chosen": -284.5651550292969, + "logps/rejected": -345.77520751953125, + "loss": 0.2906, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5070737600326538, + "rewards/margins": 2.334310531616211, + "rewards/rejected": -1.8272366523742676, + "step": 4066 + }, + { + "epoch": 0.47, + "learning_rate": 1.6178157555893714e-07, + "logits/chosen": -2.9364469051361084, + "logits/rejected": -2.991001605987549, + "logps/chosen": -283.2635803222656, + "logps/rejected": -233.94052124023438, + "loss": 0.2644, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49186983704566956, + "rewards/margins": 2.050222396850586, + "rewards/rejected": -1.5583525896072388, + "step": 4067 + }, + { + "epoch": 0.47, + "learning_rate": 1.6174645908931287e-07, + "logits/chosen": -3.2506465911865234, + "logits/rejected": -3.0985665321350098, + "logps/chosen": -322.5287170410156, + "logps/rejected": -293.9366149902344, + "loss": 0.3962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3096959888935089, + "rewards/margins": 1.9976541996002197, + "rewards/rejected": -2.3073501586914062, + "step": 4068 + }, + { + "epoch": 0.47, + "learning_rate": 1.6171134261968862e-07, + "logits/chosen": -3.557079315185547, + "logits/rejected": -3.2780210971832275, + "logps/chosen": -335.9757080078125, + "logps/rejected": -220.77828979492188, + "loss": 0.3001, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05314311757683754, + "rewards/margins": 3.1939144134521484, + "rewards/rejected": -3.2470574378967285, + "step": 4069 + }, + { + "epoch": 0.47, + "learning_rate": 1.6167622615006438e-07, + "logits/chosen": -2.7959022521972656, + "logits/rejected": -2.9880192279815674, + "logps/chosen": -289.2069091796875, + "logps/rejected": -202.97283935546875, + "loss": 0.2452, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.018633365631103516, + "rewards/margins": 1.8230901956558228, + "rewards/rejected": -1.8044568300247192, + "step": 4070 + }, + { + "epoch": 0.47, + "learning_rate": 1.616411096804401e-07, + "logits/chosen": -2.9110167026519775, + "logits/rejected": -2.675739288330078, + "logps/chosen": -277.39910888671875, + "logps/rejected": -203.64476013183594, + "loss": 0.7284, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16778340935707092, + "rewards/margins": 1.4668214321136475, + "rewards/rejected": -1.6346049308776855, + "step": 4071 + }, + { + "epoch": 0.47, + "learning_rate": 1.6160599321081586e-07, + "logits/chosen": -3.600123882293701, + "logits/rejected": -3.3775577545166016, + "logps/chosen": -251.54457092285156, + "logps/rejected": -176.36343383789062, + "loss": 0.1712, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3572041690349579, + "rewards/margins": 2.502413034439087, + "rewards/rejected": -2.1452085971832275, + "step": 4072 + }, + { + "epoch": 0.47, + "learning_rate": 1.6157087674119159e-07, + "logits/chosen": -2.7266476154327393, + "logits/rejected": -2.6865971088409424, + "logps/chosen": -108.95465087890625, + "logps/rejected": -291.3143005371094, + "loss": 0.4454, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0013402849435806274, + "rewards/margins": 2.469937801361084, + "rewards/rejected": -2.468597412109375, + "step": 4073 + }, + { + "epoch": 0.47, + "learning_rate": 1.6153576027156737e-07, + "logits/chosen": -3.0622568130493164, + "logits/rejected": -3.217190742492676, + "logps/chosen": -185.26524353027344, + "logps/rejected": -263.9554138183594, + "loss": 0.3446, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.061963994055986404, + "rewards/margins": 3.2054402828216553, + "rewards/rejected": -3.267404079437256, + "step": 4074 + }, + { + "epoch": 0.47, + "learning_rate": 1.6150064380194312e-07, + "logits/chosen": -2.65216064453125, + "logits/rejected": -2.2063989639282227, + "logps/chosen": -252.00613403320312, + "logps/rejected": -191.04595947265625, + "loss": 0.3888, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3214179575443268, + "rewards/margins": 1.7928763628005981, + "rewards/rejected": -1.4714584350585938, + "step": 4075 + }, + { + "epoch": 0.47, + "learning_rate": 1.6146552733231885e-07, + "logits/chosen": -2.9533815383911133, + "logits/rejected": -2.2752525806427, + "logps/chosen": -442.67108154296875, + "logps/rejected": -295.9713134765625, + "loss": 0.3278, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13567256927490234, + "rewards/margins": 1.4763846397399902, + "rewards/rejected": -1.3407119512557983, + "step": 4076 + }, + { + "epoch": 0.47, + "learning_rate": 1.614304108626946e-07, + "logits/chosen": -3.2652177810668945, + "logits/rejected": -2.9810709953308105, + "logps/chosen": -226.9775848388672, + "logps/rejected": -292.7625732421875, + "loss": 0.175, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17412106692790985, + "rewards/margins": 2.4508068561553955, + "rewards/rejected": -2.2766857147216797, + "step": 4077 + }, + { + "epoch": 0.47, + "learning_rate": 1.6139529439307036e-07, + "logits/chosen": -3.5296740531921387, + "logits/rejected": -3.809813976287842, + "logps/chosen": -363.9915771484375, + "logps/rejected": -237.93887329101562, + "loss": 0.464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13368822634220123, + "rewards/margins": 1.0918710231781006, + "rewards/rejected": -1.2255592346191406, + "step": 4078 + }, + { + "epoch": 0.47, + "learning_rate": 1.6136017792344608e-07, + "logits/chosen": -3.5791029930114746, + "logits/rejected": -3.2449162006378174, + "logps/chosen": -372.3717956542969, + "logps/rejected": -319.799560546875, + "loss": 0.2589, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0651768147945404, + "rewards/margins": 2.55277156829834, + "rewards/rejected": -2.4875946044921875, + "step": 4079 + }, + { + "epoch": 0.47, + "learning_rate": 1.6132506145382184e-07, + "logits/chosen": -3.2799036502838135, + "logits/rejected": -3.2424380779266357, + "logps/chosen": -478.509033203125, + "logps/rejected": -283.6539306640625, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2687143385410309, + "rewards/margins": 0.6766458749771118, + "rewards/rejected": -0.9453601837158203, + "step": 4080 + }, + { + "epoch": 0.47, + "learning_rate": 1.6128994498419756e-07, + "logits/chosen": -3.247452735900879, + "logits/rejected": -3.2633485794067383, + "logps/chosen": -266.0584716796875, + "logps/rejected": -273.18243408203125, + "loss": 0.188, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3341996669769287, + "rewards/margins": 2.7335519790649414, + "rewards/rejected": -2.3993520736694336, + "step": 4081 + }, + { + "epoch": 0.47, + "learning_rate": 1.6125482851457332e-07, + "logits/chosen": -2.675589084625244, + "logits/rejected": -2.3968091011047363, + "logps/chosen": -280.9840393066406, + "logps/rejected": -211.6583709716797, + "loss": 0.3901, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4033336043357849, + "rewards/margins": 1.4458403587341309, + "rewards/rejected": -1.8491740226745605, + "step": 4082 + }, + { + "epoch": 0.47, + "learning_rate": 1.612197120449491e-07, + "logits/chosen": -3.692573070526123, + "logits/rejected": -3.7004177570343018, + "logps/chosen": -169.731689453125, + "logps/rejected": -210.74913024902344, + "loss": 0.4997, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5497320294380188, + "rewards/margins": 2.31565260887146, + "rewards/rejected": -2.865384578704834, + "step": 4083 + }, + { + "epoch": 0.47, + "learning_rate": 1.611845955753248e-07, + "logits/chosen": -2.482783317565918, + "logits/rejected": -2.4182682037353516, + "logps/chosen": -416.8567810058594, + "logps/rejected": -390.0892333984375, + "loss": 0.3689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4432322382926941, + "rewards/margins": 3.286282539367676, + "rewards/rejected": -3.7295150756835938, + "step": 4084 + }, + { + "epoch": 0.47, + "learning_rate": 1.6114947910570058e-07, + "logits/chosen": -2.671804904937744, + "logits/rejected": -2.845076560974121, + "logps/chosen": -445.1983947753906, + "logps/rejected": -451.55914306640625, + "loss": 0.3453, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11086338758468628, + "rewards/margins": 2.0655102729797363, + "rewards/rejected": -1.9546468257904053, + "step": 4085 + }, + { + "epoch": 0.47, + "learning_rate": 1.6111436263607633e-07, + "logits/chosen": -3.120803117752075, + "logits/rejected": -3.0286331176757812, + "logps/chosen": -210.17247009277344, + "logps/rejected": -160.3361358642578, + "loss": 0.2603, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1874322146177292, + "rewards/margins": 2.1447460651397705, + "rewards/rejected": -1.9573140144348145, + "step": 4086 + }, + { + "epoch": 0.47, + "learning_rate": 1.6107924616645206e-07, + "logits/chosen": -3.103238582611084, + "logits/rejected": -2.979245662689209, + "logps/chosen": -228.33238220214844, + "logps/rejected": -174.96453857421875, + "loss": 0.2908, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3574143946170807, + "rewards/margins": 1.6387567520141602, + "rewards/rejected": -1.9961711168289185, + "step": 4087 + }, + { + "epoch": 0.47, + "learning_rate": 1.6104412969682781e-07, + "logits/chosen": -2.671816825866699, + "logits/rejected": -2.587730884552002, + "logps/chosen": -402.6905517578125, + "logps/rejected": -281.73260498046875, + "loss": 0.2517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22346174716949463, + "rewards/margins": 1.6898080110549927, + "rewards/rejected": -1.9132696390151978, + "step": 4088 + }, + { + "epoch": 0.47, + "learning_rate": 1.6100901322720354e-07, + "logits/chosen": -4.099849224090576, + "logits/rejected": -3.711578845977783, + "logps/chosen": -255.94647216796875, + "logps/rejected": -243.58377075195312, + "loss": 0.2448, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08249664306640625, + "rewards/margins": 2.994204044342041, + "rewards/rejected": -3.0767006874084473, + "step": 4089 + }, + { + "epoch": 0.47, + "learning_rate": 1.609738967575793e-07, + "logits/chosen": -2.45304799079895, + "logits/rejected": -2.3688225746154785, + "logps/chosen": -416.39617919921875, + "logps/rejected": -447.3258972167969, + "loss": 0.4749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28735458850860596, + "rewards/margins": 1.0852411985397339, + "rewards/rejected": -1.3725957870483398, + "step": 4090 + }, + { + "epoch": 0.47, + "learning_rate": 1.6093878028795505e-07, + "logits/chosen": -3.4454197883605957, + "logits/rejected": -3.4913885593414307, + "logps/chosen": -263.5713806152344, + "logps/rejected": -252.53358459472656, + "loss": 0.1609, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10522471368312836, + "rewards/margins": 2.573647975921631, + "rewards/rejected": -2.4684231281280518, + "step": 4091 + }, + { + "epoch": 0.47, + "learning_rate": 1.6090366381833078e-07, + "logits/chosen": -3.3793463706970215, + "logits/rejected": -3.083359956741333, + "logps/chosen": -244.01907348632812, + "logps/rejected": -196.00689697265625, + "loss": 0.5957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3730229437351227, + "rewards/margins": 0.6069237589836121, + "rewards/rejected": -0.9799466729164124, + "step": 4092 + }, + { + "epoch": 0.47, + "learning_rate": 1.6086854734870653e-07, + "logits/chosen": -3.5456299781799316, + "logits/rejected": -3.5872933864593506, + "logps/chosen": -249.104248046875, + "logps/rejected": -154.18389892578125, + "loss": 0.3177, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22897014021873474, + "rewards/margins": 1.8333407640457153, + "rewards/rejected": -1.6043705940246582, + "step": 4093 + }, + { + "epoch": 0.47, + "learning_rate": 1.6083343087908226e-07, + "logits/chosen": -2.6931679248809814, + "logits/rejected": -2.9569320678710938, + "logps/chosen": -281.9060363769531, + "logps/rejected": -203.78485107421875, + "loss": 0.3851, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03623540699481964, + "rewards/margins": 1.7937355041503906, + "rewards/rejected": -1.7575000524520874, + "step": 4094 + }, + { + "epoch": 0.47, + "learning_rate": 1.60798314409458e-07, + "logits/chosen": -3.352503776550293, + "logits/rejected": -3.1573548316955566, + "logps/chosen": -264.0829772949219, + "logps/rejected": -190.2379150390625, + "loss": 0.6422, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30534741282463074, + "rewards/margins": 0.5134832859039307, + "rewards/rejected": -0.20813587307929993, + "step": 4095 + }, + { + "epoch": 0.47, + "learning_rate": 1.607631979398338e-07, + "logits/chosen": -3.5181140899658203, + "logits/rejected": -3.5778889656066895, + "logps/chosen": -214.92332458496094, + "logps/rejected": -171.6587677001953, + "loss": 0.6932, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4388342499732971, + "rewards/margins": 0.4311586916446686, + "rewards/rejected": -0.8699929714202881, + "step": 4096 + }, + { + "epoch": 0.47, + "learning_rate": 1.6072808147020952e-07, + "logits/chosen": -3.6982994079589844, + "logits/rejected": -3.240255832672119, + "logps/chosen": -334.9539794921875, + "logps/rejected": -226.12261962890625, + "loss": 0.2375, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05106744170188904, + "rewards/margins": 2.8452961444854736, + "rewards/rejected": -2.7942285537719727, + "step": 4097 + }, + { + "epoch": 0.47, + "learning_rate": 1.6069296500058527e-07, + "logits/chosen": -3.297924757003784, + "logits/rejected": -3.1629793643951416, + "logps/chosen": -153.66806030273438, + "logps/rejected": -145.23361206054688, + "loss": 0.3355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45878908038139343, + "rewards/margins": 1.1152031421661377, + "rewards/rejected": -1.5739922523498535, + "step": 4098 + }, + { + "epoch": 0.47, + "learning_rate": 1.6065784853096103e-07, + "logits/chosen": -2.524756908416748, + "logits/rejected": -2.5333447456359863, + "logps/chosen": -210.82962036132812, + "logps/rejected": -307.5135803222656, + "loss": 0.2457, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24274808168411255, + "rewards/margins": 3.9773449897766113, + "rewards/rejected": -4.220093250274658, + "step": 4099 + }, + { + "epoch": 0.47, + "learning_rate": 1.6062273206133675e-07, + "logits/chosen": -2.4171037673950195, + "logits/rejected": -2.5284361839294434, + "logps/chosen": -149.39317321777344, + "logps/rejected": -236.36993408203125, + "loss": 0.2511, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.008223028853535652, + "rewards/margins": 2.354922294616699, + "rewards/rejected": -2.3466992378234863, + "step": 4100 + }, + { + "epoch": 0.47, + "learning_rate": 1.605876155917125e-07, + "logits/chosen": -3.033402919769287, + "logits/rejected": -2.9349896907806396, + "logps/chosen": -256.25823974609375, + "logps/rejected": -318.106689453125, + "loss": 0.1769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6802597045898438, + "rewards/margins": 2.5853726863861084, + "rewards/rejected": -1.9051129817962646, + "step": 4101 + }, + { + "epoch": 0.47, + "learning_rate": 1.6055249912208824e-07, + "logits/chosen": -3.7346994876861572, + "logits/rejected": -3.3292951583862305, + "logps/chosen": -382.4684753417969, + "logps/rejected": -221.89723205566406, + "loss": 0.3642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.239411398768425, + "rewards/margins": 1.4270793199539185, + "rewards/rejected": -1.6664905548095703, + "step": 4102 + }, + { + "epoch": 0.47, + "learning_rate": 1.60517382652464e-07, + "logits/chosen": -3.4008290767669678, + "logits/rejected": -3.7134299278259277, + "logps/chosen": -116.62904357910156, + "logps/rejected": -173.69943237304688, + "loss": 0.4707, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005463123321533203, + "rewards/margins": 2.4211981296539307, + "rewards/rejected": -2.4157350063323975, + "step": 4103 + }, + { + "epoch": 0.47, + "learning_rate": 1.6048226618283974e-07, + "logits/chosen": -2.603105306625366, + "logits/rejected": -2.3759660720825195, + "logps/chosen": -182.88177490234375, + "logps/rejected": -273.3739013671875, + "loss": 0.187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014993328601121902, + "rewards/margins": 3.114842414855957, + "rewards/rejected": -3.129835605621338, + "step": 4104 + }, + { + "epoch": 0.47, + "learning_rate": 1.6044714971321547e-07, + "logits/chosen": -2.8716862201690674, + "logits/rejected": -2.983189582824707, + "logps/chosen": -280.8775634765625, + "logps/rejected": -383.16815185546875, + "loss": 0.7401, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5399225354194641, + "rewards/margins": 1.2310549020767212, + "rewards/rejected": -1.77097749710083, + "step": 4105 + }, + { + "epoch": 0.47, + "learning_rate": 1.6041203324359123e-07, + "logits/chosen": -3.60548734664917, + "logits/rejected": -3.5654544830322266, + "logps/chosen": -316.3787841796875, + "logps/rejected": -498.14739990234375, + "loss": 0.3206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43731316924095154, + "rewards/margins": 2.5786402225494385, + "rewards/rejected": -3.015953540802002, + "step": 4106 + }, + { + "epoch": 0.47, + "learning_rate": 1.60376916773967e-07, + "logits/chosen": -3.241730213165283, + "logits/rejected": -2.9741666316986084, + "logps/chosen": -314.63922119140625, + "logps/rejected": -257.5995178222656, + "loss": 0.3159, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3174893260002136, + "rewards/margins": 1.512473225593567, + "rewards/rejected": -1.8299624919891357, + "step": 4107 + }, + { + "epoch": 0.47, + "learning_rate": 1.6034180030434273e-07, + "logits/chosen": -3.2382211685180664, + "logits/rejected": -3.398634195327759, + "logps/chosen": -372.91314697265625, + "logps/rejected": -336.0079650878906, + "loss": 0.3088, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08390974998474121, + "rewards/margins": 2.9107468128204346, + "rewards/rejected": -2.8268370628356934, + "step": 4108 + }, + { + "epoch": 0.47, + "learning_rate": 1.6030668383471849e-07, + "logits/chosen": -3.787719964981079, + "logits/rejected": -3.631359577178955, + "logps/chosen": -194.42880249023438, + "logps/rejected": -205.24838256835938, + "loss": 0.5481, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25613611936569214, + "rewards/margins": 0.5395616888999939, + "rewards/rejected": -0.795697808265686, + "step": 4109 + }, + { + "epoch": 0.47, + "learning_rate": 1.6027156736509421e-07, + "logits/chosen": -2.9726340770721436, + "logits/rejected": -3.054720878601074, + "logps/chosen": -134.62191772460938, + "logps/rejected": -175.95867919921875, + "loss": 0.3708, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16966161131858826, + "rewards/margins": 1.9331179857254028, + "rewards/rejected": -2.1027796268463135, + "step": 4110 + }, + { + "epoch": 0.47, + "learning_rate": 1.6023645089546997e-07, + "logits/chosen": -2.5126664638519287, + "logits/rejected": -2.7312824726104736, + "logps/chosen": -323.19830322265625, + "logps/rejected": -249.21018981933594, + "loss": 0.4371, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.056439101696014404, + "rewards/margins": 1.408325433731079, + "rewards/rejected": -1.4647647142410278, + "step": 4111 + }, + { + "epoch": 0.47, + "learning_rate": 1.6020133442584572e-07, + "logits/chosen": -3.5030529499053955, + "logits/rejected": -3.299456834793091, + "logps/chosen": -305.9978942871094, + "logps/rejected": -175.88290405273438, + "loss": 0.5824, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02139461040496826, + "rewards/margins": 1.480688452720642, + "rewards/rejected": -1.4592939615249634, + "step": 4112 + }, + { + "epoch": 0.47, + "learning_rate": 1.6016621795622145e-07, + "logits/chosen": -3.2485289573669434, + "logits/rejected": -3.3496713638305664, + "logps/chosen": -208.0928955078125, + "logps/rejected": -373.43157958984375, + "loss": 0.3243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.041187748312950134, + "rewards/margins": 1.6766462326049805, + "rewards/rejected": -1.7178339958190918, + "step": 4113 + }, + { + "epoch": 0.47, + "learning_rate": 1.601311014865972e-07, + "logits/chosen": -2.4665327072143555, + "logits/rejected": -2.4878625869750977, + "logps/chosen": -285.83404541015625, + "logps/rejected": -289.4356689453125, + "loss": 0.6585, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.498344361782074, + "rewards/margins": 0.5160529017448425, + "rewards/rejected": -1.014397144317627, + "step": 4114 + }, + { + "epoch": 0.47, + "learning_rate": 1.6009598501697296e-07, + "logits/chosen": -3.664306879043579, + "logits/rejected": -3.5734822750091553, + "logps/chosen": -291.3409729003906, + "logps/rejected": -260.845458984375, + "loss": 0.7286, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.601740837097168, + "rewards/margins": 1.6911418437957764, + "rewards/rejected": -2.2928826808929443, + "step": 4115 + }, + { + "epoch": 0.47, + "learning_rate": 1.6006086854734868e-07, + "logits/chosen": -3.783968448638916, + "logits/rejected": -3.341583490371704, + "logps/chosen": -398.2119445800781, + "logps/rejected": -251.64190673828125, + "loss": 0.7566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5985844135284424, + "rewards/margins": 1.6096915006637573, + "rewards/rejected": -2.20827579498291, + "step": 4116 + }, + { + "epoch": 0.47, + "learning_rate": 1.6002575207772446e-07, + "logits/chosen": -3.352285146713257, + "logits/rejected": -3.6951515674591064, + "logps/chosen": -281.4228515625, + "logps/rejected": -219.48483276367188, + "loss": 0.4767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3193380534648895, + "rewards/margins": 1.5731438398361206, + "rewards/rejected": -1.8924819231033325, + "step": 4117 + }, + { + "epoch": 0.47, + "learning_rate": 1.5999063560810017e-07, + "logits/chosen": -3.0560989379882812, + "logits/rejected": -2.8971729278564453, + "logps/chosen": -240.57003784179688, + "logps/rejected": -314.08380126953125, + "loss": 0.5187, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1691247820854187, + "rewards/margins": 1.8293938636779785, + "rewards/rejected": -1.9985185861587524, + "step": 4118 + }, + { + "epoch": 0.47, + "learning_rate": 1.5995551913847595e-07, + "logits/chosen": -3.1037588119506836, + "logits/rejected": -2.705632448196411, + "logps/chosen": -402.21563720703125, + "logps/rejected": -251.33465576171875, + "loss": 0.3586, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07259905338287354, + "rewards/margins": 1.6625372171401978, + "rewards/rejected": -1.7351361513137817, + "step": 4119 + }, + { + "epoch": 0.47, + "learning_rate": 1.599204026688517e-07, + "logits/chosen": -2.960047483444214, + "logits/rejected": -2.8574070930480957, + "logps/chosen": -308.9866943359375, + "logps/rejected": -355.22259521484375, + "loss": 0.4589, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06520962715148926, + "rewards/margins": 1.3070883750915527, + "rewards/rejected": -1.2418787479400635, + "step": 4120 + }, + { + "epoch": 0.48, + "learning_rate": 1.5988528619922743e-07, + "logits/chosen": -3.3280909061431885, + "logits/rejected": -2.9432976245880127, + "logps/chosen": -602.5799560546875, + "logps/rejected": -329.9744567871094, + "loss": 0.1425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3884144723415375, + "rewards/margins": 2.5448899269104004, + "rewards/rejected": -2.9333043098449707, + "step": 4121 + }, + { + "epoch": 0.48, + "learning_rate": 1.5985016972960318e-07, + "logits/chosen": -2.320772409439087, + "logits/rejected": -2.4083003997802734, + "logps/chosen": -193.960693359375, + "logps/rejected": -225.5411834716797, + "loss": 0.4921, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014483049511909485, + "rewards/margins": 2.1187217235565186, + "rewards/rejected": -2.133204936981201, + "step": 4122 + }, + { + "epoch": 0.48, + "learning_rate": 1.5981505325997893e-07, + "logits/chosen": -2.848829746246338, + "logits/rejected": -2.967329978942871, + "logps/chosen": -286.685546875, + "logps/rejected": -406.13458251953125, + "loss": 0.4769, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6589688062667847, + "rewards/margins": 3.739100933074951, + "rewards/rejected": -4.398069858551025, + "step": 4123 + }, + { + "epoch": 0.48, + "learning_rate": 1.5977993679035466e-07, + "logits/chosen": -2.493645668029785, + "logits/rejected": -2.925797939300537, + "logps/chosen": -167.31866455078125, + "logps/rejected": -182.39801025390625, + "loss": 0.3424, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24861693382263184, + "rewards/margins": 2.0535411834716797, + "rewards/rejected": -1.8049243688583374, + "step": 4124 + }, + { + "epoch": 0.48, + "learning_rate": 1.5974482032073042e-07, + "logits/chosen": -3.050299644470215, + "logits/rejected": -2.539034843444824, + "logps/chosen": -304.99285888671875, + "logps/rejected": -212.04092407226562, + "loss": 0.5065, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12635576725006104, + "rewards/margins": 0.7090057134628296, + "rewards/rejected": -0.5826498866081238, + "step": 4125 + }, + { + "epoch": 0.48, + "learning_rate": 1.5970970385110614e-07, + "logits/chosen": -2.83903431892395, + "logits/rejected": -3.001051902770996, + "logps/chosen": -286.70294189453125, + "logps/rejected": -242.54562377929688, + "loss": 0.3713, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5777262449264526, + "rewards/margins": 1.608403205871582, + "rewards/rejected": -1.0306769609451294, + "step": 4126 + }, + { + "epoch": 0.48, + "learning_rate": 1.596745873814819e-07, + "logits/chosen": -3.3446526527404785, + "logits/rejected": -2.9687626361846924, + "logps/chosen": -317.37518310546875, + "logps/rejected": -303.0245361328125, + "loss": 0.4259, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22034157812595367, + "rewards/margins": 1.0035682916641235, + "rewards/rejected": -0.783226728439331, + "step": 4127 + }, + { + "epoch": 0.48, + "learning_rate": 1.5963947091185768e-07, + "logits/chosen": -2.7587926387786865, + "logits/rejected": -2.4652693271636963, + "logps/chosen": -372.4881286621094, + "logps/rejected": -443.6074523925781, + "loss": 0.3648, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8866021633148193, + "rewards/margins": 1.984591007232666, + "rewards/rejected": -1.0979889631271362, + "step": 4128 + }, + { + "epoch": 0.48, + "learning_rate": 1.5960435444223338e-07, + "logits/chosen": -3.317525863647461, + "logits/rejected": -3.098708391189575, + "logps/chosen": -150.4154052734375, + "logps/rejected": -237.29885864257812, + "loss": 0.4032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16487836837768555, + "rewards/margins": 1.7032475471496582, + "rewards/rejected": -1.8681259155273438, + "step": 4129 + }, + { + "epoch": 0.48, + "learning_rate": 1.5956923797260916e-07, + "logits/chosen": -3.9482340812683105, + "logits/rejected": -3.6534006595611572, + "logps/chosen": -188.116455078125, + "logps/rejected": -210.06655883789062, + "loss": 0.5729, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29493212699890137, + "rewards/margins": 0.8616693019866943, + "rewards/rejected": -1.1566014289855957, + "step": 4130 + }, + { + "epoch": 0.48, + "learning_rate": 1.595341215029849e-07, + "logits/chosen": -3.325497627258301, + "logits/rejected": -3.370974063873291, + "logps/chosen": -221.25601196289062, + "logps/rejected": -348.0758361816406, + "loss": 0.2662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.058750346302986145, + "rewards/margins": 2.4744784832000732, + "rewards/rejected": -2.533228874206543, + "step": 4131 + }, + { + "epoch": 0.48, + "learning_rate": 1.5949900503336064e-07, + "logits/chosen": -2.582530975341797, + "logits/rejected": -2.632814884185791, + "logps/chosen": -352.6424255371094, + "logps/rejected": -379.5065002441406, + "loss": 0.3629, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.013519465923309326, + "rewards/margins": 1.4324030876159668, + "rewards/rejected": -1.4188838005065918, + "step": 4132 + }, + { + "epoch": 0.48, + "learning_rate": 1.594638885637364e-07, + "logits/chosen": -3.787912368774414, + "logits/rejected": -3.5043094158172607, + "logps/chosen": -302.6733703613281, + "logps/rejected": -200.42913818359375, + "loss": 0.2626, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.058378592133522034, + "rewards/margins": 1.7218562364578247, + "rewards/rejected": -1.7802350521087646, + "step": 4133 + }, + { + "epoch": 0.48, + "learning_rate": 1.5942877209411212e-07, + "logits/chosen": -3.1063055992126465, + "logits/rejected": -3.1047165393829346, + "logps/chosen": -197.04356384277344, + "logps/rejected": -265.9906311035156, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7949743270874023, + "rewards/margins": 2.5004003047943115, + "rewards/rejected": -1.7054260969161987, + "step": 4134 + }, + { + "epoch": 0.48, + "learning_rate": 1.5939365562448788e-07, + "logits/chosen": -3.0185294151306152, + "logits/rejected": -2.831360101699829, + "logps/chosen": -169.4357147216797, + "logps/rejected": -115.54486083984375, + "loss": 0.7575, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4770050346851349, + "rewards/margins": -0.014374271035194397, + "rewards/rejected": -0.4626307487487793, + "step": 4135 + }, + { + "epoch": 0.48, + "learning_rate": 1.5935853915486363e-07, + "logits/chosen": -3.0436177253723145, + "logits/rejected": -2.4734368324279785, + "logps/chosen": -361.73236083984375, + "logps/rejected": -186.24891662597656, + "loss": 0.4325, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3524041175842285, + "rewards/margins": 1.2923951148986816, + "rewards/rejected": -0.9399910569190979, + "step": 4136 + }, + { + "epoch": 0.48, + "learning_rate": 1.5932342268523936e-07, + "logits/chosen": -3.2745347023010254, + "logits/rejected": -2.905576705932617, + "logps/chosen": -260.3238830566406, + "logps/rejected": -167.7847900390625, + "loss": 0.8012, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13937358558177948, + "rewards/margins": 0.9905533194541931, + "rewards/rejected": -0.851179838180542, + "step": 4137 + }, + { + "epoch": 0.48, + "learning_rate": 1.592883062156151e-07, + "logits/chosen": -2.9699742794036865, + "logits/rejected": -2.882106065750122, + "logps/chosen": -359.4383544921875, + "logps/rejected": -257.7856140136719, + "loss": 0.5024, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3787563741207123, + "rewards/margins": 0.6818729639053345, + "rewards/rejected": -1.0606292486190796, + "step": 4138 + }, + { + "epoch": 0.48, + "learning_rate": 1.592531897459909e-07, + "logits/chosen": -2.567288875579834, + "logits/rejected": -2.5036728382110596, + "logps/chosen": -226.12098693847656, + "logps/rejected": -226.9105224609375, + "loss": 0.1854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006817393004894257, + "rewards/margins": 2.6141862869262695, + "rewards/rejected": -2.6210036277770996, + "step": 4139 + }, + { + "epoch": 0.48, + "learning_rate": 1.592180732763666e-07, + "logits/chosen": -3.2374706268310547, + "logits/rejected": -3.300962448120117, + "logps/chosen": -130.572998046875, + "logps/rejected": -254.7808837890625, + "loss": 0.3892, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5896345376968384, + "rewards/margins": 1.9184602499008179, + "rewards/rejected": -2.5080947875976562, + "step": 4140 + }, + { + "epoch": 0.48, + "learning_rate": 1.5918295680674237e-07, + "logits/chosen": -3.3762576580047607, + "logits/rejected": -3.0574512481689453, + "logps/chosen": -347.9324645996094, + "logps/rejected": -264.5272216796875, + "loss": 0.5426, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21936669945716858, + "rewards/margins": 1.2588984966278076, + "rewards/rejected": -1.4782652854919434, + "step": 4141 + }, + { + "epoch": 0.48, + "learning_rate": 1.591478403371181e-07, + "logits/chosen": -2.7622249126434326, + "logits/rejected": -3.0797410011291504, + "logps/chosen": -132.38650512695312, + "logps/rejected": -354.5783996582031, + "loss": 0.1921, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18097130954265594, + "rewards/margins": 2.9941561222076416, + "rewards/rejected": -2.813184976577759, + "step": 4142 + }, + { + "epoch": 0.48, + "learning_rate": 1.5911272386749385e-07, + "logits/chosen": -3.6517276763916016, + "logits/rejected": -3.1498935222625732, + "logps/chosen": -277.0365295410156, + "logps/rejected": -302.8590393066406, + "loss": 0.4745, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1383552849292755, + "rewards/margins": 1.5517141819000244, + "rewards/rejected": -1.690069556236267, + "step": 4143 + }, + { + "epoch": 0.48, + "learning_rate": 1.590776073978696e-07, + "logits/chosen": -2.9874038696289062, + "logits/rejected": -2.884500026702881, + "logps/chosen": -249.96299743652344, + "logps/rejected": -307.10614013671875, + "loss": 0.2875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23816834390163422, + "rewards/margins": 1.740641474723816, + "rewards/rejected": -1.9788098335266113, + "step": 4144 + }, + { + "epoch": 0.48, + "learning_rate": 1.5904249092824533e-07, + "logits/chosen": -2.375847339630127, + "logits/rejected": -2.6630895137786865, + "logps/chosen": -315.50909423828125, + "logps/rejected": -195.85581970214844, + "loss": 0.503, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13839535415172577, + "rewards/margins": 0.7343431711196899, + "rewards/rejected": -0.8727384805679321, + "step": 4145 + }, + { + "epoch": 0.48, + "learning_rate": 1.590073744586211e-07, + "logits/chosen": -3.063681125640869, + "logits/rejected": -3.191713571548462, + "logps/chosen": -280.7762451171875, + "logps/rejected": -333.161865234375, + "loss": 0.0931, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.795738160610199, + "rewards/margins": 4.155856132507324, + "rewards/rejected": -3.3601179122924805, + "step": 4146 + }, + { + "epoch": 0.48, + "learning_rate": 1.5897225798899682e-07, + "logits/chosen": -3.421813726425171, + "logits/rejected": -3.0845956802368164, + "logps/chosen": -227.12229919433594, + "logps/rejected": -194.84848022460938, + "loss": 0.4495, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.004799067974090576, + "rewards/margins": 1.5598645210266113, + "rewards/rejected": -1.5550655126571655, + "step": 4147 + }, + { + "epoch": 0.48, + "learning_rate": 1.5893714151937257e-07, + "logits/chosen": -2.4708926677703857, + "logits/rejected": -2.5168709754943848, + "logps/chosen": -157.74334716796875, + "logps/rejected": -294.7514953613281, + "loss": 0.4194, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2014731466770172, + "rewards/margins": 1.6973552703857422, + "rewards/rejected": -1.4958820343017578, + "step": 4148 + }, + { + "epoch": 0.48, + "learning_rate": 1.5890202504974832e-07, + "logits/chosen": -3.1073105335235596, + "logits/rejected": -3.1245694160461426, + "logps/chosen": -197.14981079101562, + "logps/rejected": -161.67953491210938, + "loss": 0.2424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05173312872648239, + "rewards/margins": 1.8211636543273926, + "rewards/rejected": -1.7694306373596191, + "step": 4149 + }, + { + "epoch": 0.48, + "learning_rate": 1.5886690858012405e-07, + "logits/chosen": -3.5924038887023926, + "logits/rejected": -3.124065637588501, + "logps/chosen": -187.65786743164062, + "logps/rejected": -172.00625610351562, + "loss": 0.5185, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18045081198215485, + "rewards/margins": 1.0323083400726318, + "rewards/rejected": -1.2127591371536255, + "step": 4150 + }, + { + "epoch": 0.48, + "learning_rate": 1.5883179211049983e-07, + "logits/chosen": -2.971348762512207, + "logits/rejected": -3.1058194637298584, + "logps/chosen": -206.31903076171875, + "logps/rejected": -194.45326232910156, + "loss": 0.4181, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012494999915361404, + "rewards/margins": 0.9520470499992371, + "rewards/rejected": -0.964542031288147, + "step": 4151 + }, + { + "epoch": 0.48, + "learning_rate": 1.5879667564087558e-07, + "logits/chosen": -3.608729839324951, + "logits/rejected": -3.43735671043396, + "logps/chosen": -139.9273223876953, + "logps/rejected": -224.5488739013672, + "loss": 0.251, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.494653582572937, + "rewards/margins": 1.9678739309310913, + "rewards/rejected": -1.4732205867767334, + "step": 4152 + }, + { + "epoch": 0.48, + "learning_rate": 1.587615591712513e-07, + "logits/chosen": -3.377650260925293, + "logits/rejected": -3.406541347503662, + "logps/chosen": -109.91259002685547, + "logps/rejected": -203.92257690429688, + "loss": 0.3117, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22900398075580597, + "rewards/margins": 1.6668479442596436, + "rewards/rejected": -1.895851969718933, + "step": 4153 + }, + { + "epoch": 0.48, + "learning_rate": 1.5872644270162707e-07, + "logits/chosen": -2.764587879180908, + "logits/rejected": -2.773922920227051, + "logps/chosen": -306.7787780761719, + "logps/rejected": -230.67587280273438, + "loss": 0.7821, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6683786511421204, + "rewards/margins": 0.7920563220977783, + "rewards/rejected": -1.460434913635254, + "step": 4154 + }, + { + "epoch": 0.48, + "learning_rate": 1.586913262320028e-07, + "logits/chosen": -3.078274965286255, + "logits/rejected": -3.2264177799224854, + "logps/chosen": -264.54168701171875, + "logps/rejected": -236.73025512695312, + "loss": 0.186, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4314913749694824, + "rewards/margins": 2.771956443786621, + "rewards/rejected": -2.3404650688171387, + "step": 4155 + }, + { + "epoch": 0.48, + "learning_rate": 1.5865620976237855e-07, + "logits/chosen": -4.101260185241699, + "logits/rejected": -3.8840391635894775, + "logps/chosen": -232.98101806640625, + "logps/rejected": -216.6641845703125, + "loss": 0.4229, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5030287504196167, + "rewards/margins": 0.859880805015564, + "rewards/rejected": -1.3629095554351807, + "step": 4156 + }, + { + "epoch": 0.48, + "learning_rate": 1.586210932927543e-07, + "logits/chosen": -2.9791393280029297, + "logits/rejected": -2.668295383453369, + "logps/chosen": -266.3150634765625, + "logps/rejected": -311.8343505859375, + "loss": 0.7597, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03563022240996361, + "rewards/margins": 1.127402663230896, + "rewards/rejected": -1.16303288936615, + "step": 4157 + }, + { + "epoch": 0.48, + "learning_rate": 1.5858597682313003e-07, + "logits/chosen": -3.125279188156128, + "logits/rejected": -3.154484510421753, + "logps/chosen": -334.1747741699219, + "logps/rejected": -210.64703369140625, + "loss": 0.2998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4042523205280304, + "rewards/margins": 1.8817850351333618, + "rewards/rejected": -2.2860374450683594, + "step": 4158 + }, + { + "epoch": 0.48, + "learning_rate": 1.5855086035350578e-07, + "logits/chosen": -3.1841514110565186, + "logits/rejected": -3.193410873413086, + "logps/chosen": -318.4084167480469, + "logps/rejected": -224.02064514160156, + "loss": 0.3649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1609807014465332, + "rewards/margins": 1.169592022895813, + "rewards/rejected": -1.3305727243423462, + "step": 4159 + }, + { + "epoch": 0.48, + "learning_rate": 1.5851574388388154e-07, + "logits/chosen": -3.733480930328369, + "logits/rejected": -3.541449546813965, + "logps/chosen": -201.5786895751953, + "logps/rejected": -289.6544189453125, + "loss": 0.327, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4985743463039398, + "rewards/margins": 2.755082607269287, + "rewards/rejected": -3.2536568641662598, + "step": 4160 + }, + { + "epoch": 0.48, + "learning_rate": 1.5848062741425726e-07, + "logits/chosen": -3.407705783843994, + "logits/rejected": -3.589371919631958, + "logps/chosen": -188.40481567382812, + "logps/rejected": -474.9868469238281, + "loss": 0.358, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2394881695508957, + "rewards/margins": 2.6502137184143066, + "rewards/rejected": -2.4107258319854736, + "step": 4161 + }, + { + "epoch": 0.48, + "learning_rate": 1.5844551094463304e-07, + "logits/chosen": -3.035860538482666, + "logits/rejected": -2.973696708679199, + "logps/chosen": -395.4267578125, + "logps/rejected": -490.580810546875, + "loss": 0.2666, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2520816922187805, + "rewards/margins": 2.0813684463500977, + "rewards/rejected": -2.3334503173828125, + "step": 4162 + }, + { + "epoch": 0.48, + "learning_rate": 1.5841039447500874e-07, + "logits/chosen": -3.364637851715088, + "logits/rejected": -3.331660032272339, + "logps/chosen": -204.3849334716797, + "logps/rejected": -230.2228240966797, + "loss": 0.4245, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4740438759326935, + "rewards/margins": 1.7664529085159302, + "rewards/rejected": -2.2404966354370117, + "step": 4163 + }, + { + "epoch": 0.48, + "learning_rate": 1.5837527800538453e-07, + "logits/chosen": -3.995433807373047, + "logits/rejected": -3.518481731414795, + "logps/chosen": -248.86956787109375, + "logps/rejected": -194.94406127929688, + "loss": 0.3877, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22125881910324097, + "rewards/margins": 1.5231809616088867, + "rewards/rejected": -1.7444398403167725, + "step": 4164 + }, + { + "epoch": 0.48, + "learning_rate": 1.5834016153576028e-07, + "logits/chosen": -2.671725034713745, + "logits/rejected": -2.711016893386841, + "logps/chosen": -304.2961120605469, + "logps/rejected": -266.02398681640625, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1302916407585144, + "rewards/margins": 0.7803993225097656, + "rewards/rejected": -0.9106910228729248, + "step": 4165 + }, + { + "epoch": 0.48, + "learning_rate": 1.58305045066136e-07, + "logits/chosen": -3.256270408630371, + "logits/rejected": -3.1990599632263184, + "logps/chosen": -240.757568359375, + "logps/rejected": -277.7171325683594, + "loss": 0.2958, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1540175974369049, + "rewards/margins": 1.7463639974594116, + "rewards/rejected": -1.9003815650939941, + "step": 4166 + }, + { + "epoch": 0.48, + "learning_rate": 1.5826992859651176e-07, + "logits/chosen": -3.061842679977417, + "logits/rejected": -3.119476795196533, + "logps/chosen": -129.54177856445312, + "logps/rejected": -194.00811767578125, + "loss": 0.5123, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.028519943356513977, + "rewards/margins": 1.5739359855651855, + "rewards/rejected": -1.5454161167144775, + "step": 4167 + }, + { + "epoch": 0.48, + "learning_rate": 1.5823481212688751e-07, + "logits/chosen": -3.072685480117798, + "logits/rejected": -3.0845108032226562, + "logps/chosen": -157.47801208496094, + "logps/rejected": -165.497314453125, + "loss": 0.5806, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4884871244430542, + "rewards/margins": 0.7290863990783691, + "rewards/rejected": -1.217573642730713, + "step": 4168 + }, + { + "epoch": 0.48, + "learning_rate": 1.5819969565726324e-07, + "logits/chosen": -3.3165225982666016, + "logits/rejected": -3.055427074432373, + "logps/chosen": -380.9521789550781, + "logps/rejected": -222.57113647460938, + "loss": 0.5083, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5689890384674072, + "rewards/margins": 1.333040714263916, + "rewards/rejected": -1.9020297527313232, + "step": 4169 + }, + { + "epoch": 0.48, + "learning_rate": 1.58164579187639e-07, + "logits/chosen": -3.436197280883789, + "logits/rejected": -3.4974095821380615, + "logps/chosen": -211.1973419189453, + "logps/rejected": -230.60227966308594, + "loss": 0.374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26635798811912537, + "rewards/margins": 1.5973553657531738, + "rewards/rejected": -1.8637133836746216, + "step": 4170 + }, + { + "epoch": 0.48, + "learning_rate": 1.5812946271801472e-07, + "logits/chosen": -2.789534568786621, + "logits/rejected": -2.844447135925293, + "logps/chosen": -194.02261352539062, + "logps/rejected": -214.2145538330078, + "loss": 0.3057, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11948111653327942, + "rewards/margins": 1.8482369184494019, + "rewards/rejected": -1.7287559509277344, + "step": 4171 + }, + { + "epoch": 0.48, + "learning_rate": 1.5809434624839048e-07, + "logits/chosen": -3.237412929534912, + "logits/rejected": -3.3599817752838135, + "logps/chosen": -123.2769546508789, + "logps/rejected": -217.07310485839844, + "loss": 0.4538, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23284097015857697, + "rewards/margins": 1.2071439027786255, + "rewards/rejected": -0.9743030071258545, + "step": 4172 + }, + { + "epoch": 0.48, + "learning_rate": 1.5805922977876626e-07, + "logits/chosen": -2.720076560974121, + "logits/rejected": -2.866028308868408, + "logps/chosen": -278.5878601074219, + "logps/rejected": -380.5396728515625, + "loss": 0.4431, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20925235748291016, + "rewards/margins": 0.8312242031097412, + "rewards/rejected": -1.0404765605926514, + "step": 4173 + }, + { + "epoch": 0.48, + "learning_rate": 1.5802411330914196e-07, + "logits/chosen": -2.9968390464782715, + "logits/rejected": -2.8339056968688965, + "logps/chosen": -191.98626708984375, + "logps/rejected": -225.85873413085938, + "loss": 0.5175, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33503589034080505, + "rewards/margins": 0.9331789612770081, + "rewards/rejected": -1.2682148218154907, + "step": 4174 + }, + { + "epoch": 0.48, + "learning_rate": 1.5798899683951774e-07, + "logits/chosen": -3.2908854484558105, + "logits/rejected": -3.1715598106384277, + "logps/chosen": -224.91612243652344, + "logps/rejected": -214.47206115722656, + "loss": 0.4118, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005865946412086487, + "rewards/margins": 2.2581474781036377, + "rewards/rejected": -2.2640132904052734, + "step": 4175 + }, + { + "epoch": 0.48, + "learning_rate": 1.579538803698935e-07, + "logits/chosen": -2.492210865020752, + "logits/rejected": -2.613584041595459, + "logps/chosen": -315.2838134765625, + "logps/rejected": -326.8943786621094, + "loss": 0.8461, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7833446860313416, + "rewards/margins": 0.2049870491027832, + "rewards/rejected": -0.98833167552948, + "step": 4176 + }, + { + "epoch": 0.48, + "learning_rate": 1.5791876390026922e-07, + "logits/chosen": -3.206112861633301, + "logits/rejected": -3.4229862689971924, + "logps/chosen": -169.55886840820312, + "logps/rejected": -188.4399871826172, + "loss": 0.3536, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29534098505973816, + "rewards/margins": 2.3685572147369385, + "rewards/rejected": -2.073216199874878, + "step": 4177 + }, + { + "epoch": 0.48, + "learning_rate": 1.5788364743064497e-07, + "logits/chosen": -3.458667755126953, + "logits/rejected": -3.0830135345458984, + "logps/chosen": -111.20146179199219, + "logps/rejected": -108.19001770019531, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4490020275115967, + "rewards/margins": 0.9449406266212463, + "rewards/rejected": -1.3939425945281982, + "step": 4178 + }, + { + "epoch": 0.48, + "learning_rate": 1.578485309610207e-07, + "logits/chosen": -3.9089512825012207, + "logits/rejected": -3.9385523796081543, + "logps/chosen": -166.7404327392578, + "logps/rejected": -170.56027221679688, + "loss": 0.1603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07523494213819504, + "rewards/margins": 2.6744654178619385, + "rewards/rejected": -2.5992302894592285, + "step": 4179 + }, + { + "epoch": 0.48, + "learning_rate": 1.5781341449139645e-07, + "logits/chosen": -3.507452964782715, + "logits/rejected": -3.558997392654419, + "logps/chosen": -254.794189453125, + "logps/rejected": -274.3380126953125, + "loss": 0.1371, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2245100736618042, + "rewards/margins": 3.1066999435424805, + "rewards/rejected": -2.882189989089966, + "step": 4180 + }, + { + "epoch": 0.48, + "learning_rate": 1.577782980217722e-07, + "logits/chosen": -3.2125909328460693, + "logits/rejected": -3.280500650405884, + "logps/chosen": -188.98243713378906, + "logps/rejected": -212.83255004882812, + "loss": 0.3226, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15744900703430176, + "rewards/margins": 2.0082292556762695, + "rewards/rejected": -1.8507802486419678, + "step": 4181 + }, + { + "epoch": 0.48, + "learning_rate": 1.5774318155214794e-07, + "logits/chosen": -2.36220645904541, + "logits/rejected": -2.720154285430908, + "logps/chosen": -212.7046356201172, + "logps/rejected": -242.90753173828125, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18542778491973877, + "rewards/margins": 2.3256349563598633, + "rewards/rejected": -2.140207290649414, + "step": 4182 + }, + { + "epoch": 0.48, + "learning_rate": 1.577080650825237e-07, + "logits/chosen": -2.5293242931365967, + "logits/rejected": -2.6789298057556152, + "logps/chosen": -131.03353881835938, + "logps/rejected": -193.84263610839844, + "loss": 0.365, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31866520643234253, + "rewards/margins": 1.1562637090682983, + "rewards/rejected": -0.8375985026359558, + "step": 4183 + }, + { + "epoch": 0.48, + "learning_rate": 1.5767294861289947e-07, + "logits/chosen": -2.7968947887420654, + "logits/rejected": -2.501553535461426, + "logps/chosen": -210.76560974121094, + "logps/rejected": -813.0324096679688, + "loss": 0.243, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01450406014919281, + "rewards/margins": 3.042210578918457, + "rewards/rejected": -3.0277066230773926, + "step": 4184 + }, + { + "epoch": 0.48, + "learning_rate": 1.576378321432752e-07, + "logits/chosen": -2.9570178985595703, + "logits/rejected": -3.3056976795196533, + "logps/chosen": -355.7897033691406, + "logps/rejected": -413.2002258300781, + "loss": 0.1839, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26672905683517456, + "rewards/margins": 3.47041654586792, + "rewards/rejected": -3.2036876678466797, + "step": 4185 + }, + { + "epoch": 0.48, + "learning_rate": 1.5760271567365095e-07, + "logits/chosen": -3.0302488803863525, + "logits/rejected": -2.9772324562072754, + "logps/chosen": -326.79864501953125, + "logps/rejected": -260.8951416015625, + "loss": 0.3762, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2330956757068634, + "rewards/margins": 1.180436134338379, + "rewards/rejected": -1.4135316610336304, + "step": 4186 + }, + { + "epoch": 0.48, + "learning_rate": 1.5756759920402668e-07, + "logits/chosen": -3.4028916358947754, + "logits/rejected": -3.542877674102783, + "logps/chosen": -123.42025756835938, + "logps/rejected": -216.08273315429688, + "loss": 0.388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11894366145133972, + "rewards/margins": 1.5408967733383179, + "rewards/rejected": -1.65984046459198, + "step": 4187 + }, + { + "epoch": 0.48, + "learning_rate": 1.5753248273440243e-07, + "logits/chosen": -3.4626107215881348, + "logits/rejected": -3.3268773555755615, + "logps/chosen": -191.43275451660156, + "logps/rejected": -182.3699493408203, + "loss": 0.4003, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1424209475517273, + "rewards/margins": 1.4959142208099365, + "rewards/rejected": -1.353493332862854, + "step": 4188 + }, + { + "epoch": 0.48, + "learning_rate": 1.5749736626477819e-07, + "logits/chosen": -2.8813586235046387, + "logits/rejected": -2.739018440246582, + "logps/chosen": -288.30352783203125, + "logps/rejected": -290.74310302734375, + "loss": 0.5272, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2124200463294983, + "rewards/margins": 1.4833290576934814, + "rewards/rejected": -1.695749282836914, + "step": 4189 + }, + { + "epoch": 0.48, + "learning_rate": 1.5746224979515391e-07, + "logits/chosen": -3.558523654937744, + "logits/rejected": -3.307265281677246, + "logps/chosen": -165.4458465576172, + "logps/rejected": -186.44650268554688, + "loss": 0.3248, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08462736010551453, + "rewards/margins": 1.5774463415145874, + "rewards/rejected": -1.492818832397461, + "step": 4190 + }, + { + "epoch": 0.48, + "learning_rate": 1.5742713332552967e-07, + "logits/chosen": -3.555126428604126, + "logits/rejected": -3.742110252380371, + "logps/chosen": -199.5130615234375, + "logps/rejected": -258.3553466796875, + "loss": 0.5752, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4030669927597046, + "rewards/margins": 0.862259030342102, + "rewards/rejected": -1.2653260231018066, + "step": 4191 + }, + { + "epoch": 0.48, + "learning_rate": 1.573920168559054e-07, + "logits/chosen": -3.0914368629455566, + "logits/rejected": -3.417205572128296, + "logps/chosen": -289.152587890625, + "logps/rejected": -362.5640869140625, + "loss": 0.4299, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6245719790458679, + "rewards/margins": 1.49722421169281, + "rewards/rejected": -2.121796131134033, + "step": 4192 + }, + { + "epoch": 0.48, + "learning_rate": 1.5735690038628115e-07, + "logits/chosen": -2.8086109161376953, + "logits/rejected": -3.094818115234375, + "logps/chosen": -154.9805908203125, + "logps/rejected": -219.42086791992188, + "loss": 0.3281, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7028059959411621, + "rewards/margins": 2.508117198944092, + "rewards/rejected": -1.8053112030029297, + "step": 4193 + }, + { + "epoch": 0.48, + "learning_rate": 1.573217839166569e-07, + "logits/chosen": -3.255518913269043, + "logits/rejected": -3.0628411769866943, + "logps/chosen": -431.36981201171875, + "logps/rejected": -319.5850524902344, + "loss": 0.413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7073135375976562, + "rewards/margins": 1.132576584815979, + "rewards/rejected": -1.8398902416229248, + "step": 4194 + }, + { + "epoch": 0.48, + "learning_rate": 1.5728666744703263e-07, + "logits/chosen": -3.179767608642578, + "logits/rejected": -2.9064841270446777, + "logps/chosen": -230.9851837158203, + "logps/rejected": -145.33529663085938, + "loss": 1.3289, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3804271221160889, + "rewards/margins": 0.5264627933502197, + "rewards/rejected": -1.9068900346755981, + "step": 4195 + }, + { + "epoch": 0.48, + "learning_rate": 1.572515509774084e-07, + "logits/chosen": -3.006401300430298, + "logits/rejected": -3.1342170238494873, + "logps/chosen": -96.12712860107422, + "logps/rejected": -139.85797119140625, + "loss": 0.6567, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6084538102149963, + "rewards/margins": 1.7247674465179443, + "rewards/rejected": -2.333221197128296, + "step": 4196 + }, + { + "epoch": 0.48, + "learning_rate": 1.5721643450778416e-07, + "logits/chosen": -3.9080986976623535, + "logits/rejected": -3.6181583404541016, + "logps/chosen": -158.03302001953125, + "logps/rejected": -166.3197479248047, + "loss": 0.4082, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6031032800674438, + "rewards/margins": 1.2207880020141602, + "rewards/rejected": -1.8238911628723145, + "step": 4197 + }, + { + "epoch": 0.48, + "learning_rate": 1.571813180381599e-07, + "logits/chosen": -3.410707950592041, + "logits/rejected": -3.3439064025878906, + "logps/chosen": -236.82763671875, + "logps/rejected": -151.862060546875, + "loss": 0.2345, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19808907806873322, + "rewards/margins": 2.4813783168792725, + "rewards/rejected": -2.2832894325256348, + "step": 4198 + }, + { + "epoch": 0.48, + "learning_rate": 1.5714620156853565e-07, + "logits/chosen": -2.9063782691955566, + "logits/rejected": -3.174765110015869, + "logps/chosen": -214.12179565429688, + "logps/rejected": -355.12481689453125, + "loss": 0.3639, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11183272302150726, + "rewards/margins": 2.061330795288086, + "rewards/rejected": -2.1731631755828857, + "step": 4199 + }, + { + "epoch": 0.48, + "learning_rate": 1.5711108509891137e-07, + "logits/chosen": -2.501115322113037, + "logits/rejected": -2.9578757286071777, + "logps/chosen": -199.41876220703125, + "logps/rejected": -266.54217529296875, + "loss": 0.241, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04883028566837311, + "rewards/margins": 2.872514247894287, + "rewards/rejected": -2.8236842155456543, + "step": 4200 + }, + { + "epoch": 0.48, + "learning_rate": 1.5707596862928713e-07, + "logits/chosen": -3.7064433097839355, + "logits/rejected": -3.3666961193084717, + "logps/chosen": -447.16461181640625, + "logps/rejected": -335.2342529296875, + "loss": 0.3885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0724840983748436, + "rewards/margins": 2.597301959991455, + "rewards/rejected": -2.669785737991333, + "step": 4201 + }, + { + "epoch": 0.48, + "learning_rate": 1.5704085215966288e-07, + "logits/chosen": -3.25488018989563, + "logits/rejected": -3.167314291000366, + "logps/chosen": -306.48760986328125, + "logps/rejected": -220.80726623535156, + "loss": 0.3166, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2944314479827881, + "rewards/margins": 1.742204189300537, + "rewards/rejected": -1.4477726221084595, + "step": 4202 + }, + { + "epoch": 0.48, + "learning_rate": 1.570057356900386e-07, + "logits/chosen": -2.9310877323150635, + "logits/rejected": -3.0971431732177734, + "logps/chosen": -223.73374938964844, + "logps/rejected": -331.232421875, + "loss": 0.5306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.023408517241477966, + "rewards/margins": 1.2062346935272217, + "rewards/rejected": -1.2296431064605713, + "step": 4203 + }, + { + "epoch": 0.48, + "learning_rate": 1.5697061922041436e-07, + "logits/chosen": -3.45886492729187, + "logits/rejected": -3.359304189682007, + "logps/chosen": -211.30303955078125, + "logps/rejected": -229.62911987304688, + "loss": 0.3257, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12690195441246033, + "rewards/margins": 1.5767028331756592, + "rewards/rejected": -1.449800729751587, + "step": 4204 + }, + { + "epoch": 0.48, + "learning_rate": 1.5693550275079012e-07, + "logits/chosen": -2.709913492202759, + "logits/rejected": -2.496840000152588, + "logps/chosen": -311.2287902832031, + "logps/rejected": -232.8653564453125, + "loss": 0.5268, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.058904170989990234, + "rewards/margins": 0.9304572343826294, + "rewards/rejected": -0.9893614053726196, + "step": 4205 + }, + { + "epoch": 0.48, + "learning_rate": 1.5690038628116584e-07, + "logits/chosen": -2.466404914855957, + "logits/rejected": -2.4961793422698975, + "logps/chosen": -350.48828125, + "logps/rejected": -313.3509521484375, + "loss": 0.4498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5067083239555359, + "rewards/margins": 1.4223328828811646, + "rewards/rejected": -1.9290411472320557, + "step": 4206 + }, + { + "epoch": 0.48, + "learning_rate": 1.5686526981154162e-07, + "logits/chosen": -2.6970109939575195, + "logits/rejected": -2.7534914016723633, + "logps/chosen": -293.1230163574219, + "logps/rejected": -260.654296875, + "loss": 0.7455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7609258890151978, + "rewards/margins": 2.2377963066101074, + "rewards/rejected": -2.9987223148345947, + "step": 4207 + }, + { + "epoch": 0.49, + "learning_rate": 1.5683015334191732e-07, + "logits/chosen": -3.0272185802459717, + "logits/rejected": -3.288188934326172, + "logps/chosen": -286.25213623046875, + "logps/rejected": -256.9794616699219, + "loss": 0.1881, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20232467353343964, + "rewards/margins": 2.6620264053344727, + "rewards/rejected": -2.4597017765045166, + "step": 4208 + }, + { + "epoch": 0.49, + "learning_rate": 1.567950368722931e-07, + "logits/chosen": -2.322103500366211, + "logits/rejected": -2.1082992553710938, + "logps/chosen": -245.9691925048828, + "logps/rejected": -201.75152587890625, + "loss": 0.5244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06403293460607529, + "rewards/margins": 0.6996648907661438, + "rewards/rejected": -0.7636978626251221, + "step": 4209 + }, + { + "epoch": 0.49, + "learning_rate": 1.5675992040266886e-07, + "logits/chosen": -2.707446575164795, + "logits/rejected": -2.630547523498535, + "logps/chosen": -443.3980407714844, + "logps/rejected": -383.24945068359375, + "loss": 0.4298, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7467562556266785, + "rewards/margins": 0.8330624103546143, + "rewards/rejected": -0.08630618453025818, + "step": 4210 + }, + { + "epoch": 0.49, + "learning_rate": 1.5672480393304459e-07, + "logits/chosen": -3.244673252105713, + "logits/rejected": -2.7307262420654297, + "logps/chosen": -303.138671875, + "logps/rejected": -168.0112762451172, + "loss": 0.2167, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37710079550743103, + "rewards/margins": 2.1766321659088135, + "rewards/rejected": -1.7995314598083496, + "step": 4211 + }, + { + "epoch": 0.49, + "learning_rate": 1.5668968746342034e-07, + "logits/chosen": -2.636540412902832, + "logits/rejected": -2.874347686767578, + "logps/chosen": -429.65777587890625, + "logps/rejected": -254.24232482910156, + "loss": 0.6646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12156897783279419, + "rewards/margins": 0.8506889343261719, + "rewards/rejected": -0.9722579717636108, + "step": 4212 + }, + { + "epoch": 0.49, + "learning_rate": 1.566545709937961e-07, + "logits/chosen": -3.42804217338562, + "logits/rejected": -3.604773998260498, + "logps/chosen": -142.44583129882812, + "logps/rejected": -176.04896545410156, + "loss": 0.5046, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18585234880447388, + "rewards/margins": 1.3978816270828247, + "rewards/rejected": -1.212029218673706, + "step": 4213 + }, + { + "epoch": 0.49, + "learning_rate": 1.5661945452417182e-07, + "logits/chosen": -3.365210771560669, + "logits/rejected": -3.182372570037842, + "logps/chosen": -299.1852722167969, + "logps/rejected": -220.96194458007812, + "loss": 0.5082, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4923288822174072, + "rewards/margins": 0.9294145107269287, + "rewards/rejected": -1.421743392944336, + "step": 4214 + }, + { + "epoch": 0.49, + "learning_rate": 1.5658433805454757e-07, + "logits/chosen": -2.574521780014038, + "logits/rejected": -2.6343436241149902, + "logps/chosen": -559.0587158203125, + "logps/rejected": -380.2601623535156, + "loss": 0.5195, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2671915292739868, + "rewards/margins": 0.817820131778717, + "rewards/rejected": -0.5506286025047302, + "step": 4215 + }, + { + "epoch": 0.49, + "learning_rate": 1.565492215849233e-07, + "logits/chosen": -3.251523971557617, + "logits/rejected": -3.603330612182617, + "logps/chosen": -231.7414093017578, + "logps/rejected": -213.80653381347656, + "loss": 0.1922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16383808851242065, + "rewards/margins": 1.8391492366790771, + "rewards/rejected": -2.0029873847961426, + "step": 4216 + }, + { + "epoch": 0.49, + "learning_rate": 1.5651410511529906e-07, + "logits/chosen": -3.026132106781006, + "logits/rejected": -2.8859915733337402, + "logps/chosen": -144.35787963867188, + "logps/rejected": -156.07640075683594, + "loss": 0.605, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47343769669532776, + "rewards/margins": 1.0828468799591064, + "rewards/rejected": -1.5562846660614014, + "step": 4217 + }, + { + "epoch": 0.49, + "learning_rate": 1.5647898864567484e-07, + "logits/chosen": -3.53056263923645, + "logits/rejected": -3.6490373611450195, + "logps/chosen": -154.6624298095703, + "logps/rejected": -172.98361206054688, + "loss": 0.4218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7092382907867432, + "rewards/margins": 1.244614601135254, + "rewards/rejected": -1.953852891921997, + "step": 4218 + }, + { + "epoch": 0.49, + "learning_rate": 1.5644387217605056e-07, + "logits/chosen": -3.7218546867370605, + "logits/rejected": -3.6570980548858643, + "logps/chosen": -344.14385986328125, + "logps/rejected": -243.33856201171875, + "loss": 0.2545, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2171463519334793, + "rewards/margins": 2.0274670124053955, + "rewards/rejected": -1.8103206157684326, + "step": 4219 + }, + { + "epoch": 0.49, + "learning_rate": 1.5640875570642632e-07, + "logits/chosen": -3.8679966926574707, + "logits/rejected": -3.387474536895752, + "logps/chosen": -242.87124633789062, + "logps/rejected": -183.67166137695312, + "loss": 0.6652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8209249377250671, + "rewards/margins": 0.864315390586853, + "rewards/rejected": -1.6852402687072754, + "step": 4220 + }, + { + "epoch": 0.49, + "learning_rate": 1.5637363923680207e-07, + "logits/chosen": -3.067713737487793, + "logits/rejected": -2.9026548862457275, + "logps/chosen": -159.93829345703125, + "logps/rejected": -169.57919311523438, + "loss": 0.6137, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6820780038833618, + "rewards/margins": 0.3177064061164856, + "rewards/rejected": -0.9997844099998474, + "step": 4221 + }, + { + "epoch": 0.49, + "learning_rate": 1.563385227671778e-07, + "logits/chosen": -3.250931978225708, + "logits/rejected": -2.9872829914093018, + "logps/chosen": -277.15509033203125, + "logps/rejected": -322.9438781738281, + "loss": 0.6291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14678996801376343, + "rewards/margins": 1.599380373954773, + "rewards/rejected": -1.7461705207824707, + "step": 4222 + }, + { + "epoch": 0.49, + "learning_rate": 1.5630340629755355e-07, + "logits/chosen": -2.6062636375427246, + "logits/rejected": -2.682535409927368, + "logps/chosen": -197.87490844726562, + "logps/rejected": -384.34246826171875, + "loss": 0.3425, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17498986423015594, + "rewards/margins": 2.6025538444519043, + "rewards/rejected": -2.4275639057159424, + "step": 4223 + }, + { + "epoch": 0.49, + "learning_rate": 1.5626828982792928e-07, + "logits/chosen": -2.9401419162750244, + "logits/rejected": -2.899386405944824, + "logps/chosen": -420.2550354003906, + "logps/rejected": -417.41424560546875, + "loss": 0.2129, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16950300335884094, + "rewards/margins": 2.0824363231658936, + "rewards/rejected": -1.9129332304000854, + "step": 4224 + }, + { + "epoch": 0.49, + "learning_rate": 1.5623317335830503e-07, + "logits/chosen": -2.8833069801330566, + "logits/rejected": -3.197333335876465, + "logps/chosen": -271.4185791015625, + "logps/rejected": -166.21835327148438, + "loss": 0.4951, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09382620453834534, + "rewards/margins": 1.207953929901123, + "rewards/rejected": -1.3017802238464355, + "step": 4225 + }, + { + "epoch": 0.49, + "learning_rate": 1.561980568886808e-07, + "logits/chosen": -2.993009090423584, + "logits/rejected": -2.9859085083007812, + "logps/chosen": -377.30682373046875, + "logps/rejected": -305.17755126953125, + "loss": 0.4752, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14905662834644318, + "rewards/margins": 1.534812569618225, + "rewards/rejected": -1.6838691234588623, + "step": 4226 + }, + { + "epoch": 0.49, + "learning_rate": 1.5616294041905652e-07, + "logits/chosen": -3.4217991828918457, + "logits/rejected": -3.350095510482788, + "logps/chosen": -172.26104736328125, + "logps/rejected": -200.93963623046875, + "loss": 0.2791, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4336477518081665, + "rewards/margins": 1.4851210117340088, + "rewards/rejected": -1.0514732599258423, + "step": 4227 + }, + { + "epoch": 0.49, + "learning_rate": 1.5612782394943227e-07, + "logits/chosen": -3.4234859943389893, + "logits/rejected": -3.6478309631347656, + "logps/chosen": -355.0272521972656, + "logps/rejected": -292.01788330078125, + "loss": 0.615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7134689688682556, + "rewards/margins": 1.1417183876037598, + "rewards/rejected": -1.8551874160766602, + "step": 4228 + }, + { + "epoch": 0.49, + "learning_rate": 1.5609270747980805e-07, + "logits/chosen": -3.3391733169555664, + "logits/rejected": -3.241987705230713, + "logps/chosen": -240.895263671875, + "logps/rejected": -240.29330444335938, + "loss": 0.4052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2518627345561981, + "rewards/margins": 0.9093825221061707, + "rewards/rejected": -1.1612452268600464, + "step": 4229 + }, + { + "epoch": 0.49, + "learning_rate": 1.5605759101018378e-07, + "logits/chosen": -3.732632637023926, + "logits/rejected": -3.5530200004577637, + "logps/chosen": -268.4189758300781, + "logps/rejected": -181.23297119140625, + "loss": 0.4128, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46662265062332153, + "rewards/margins": 2.0227432250976562, + "rewards/rejected": -2.489366054534912, + "step": 4230 + }, + { + "epoch": 0.49, + "learning_rate": 1.5602247454055953e-07, + "logits/chosen": -2.5775036811828613, + "logits/rejected": -2.691265106201172, + "logps/chosen": -398.1290283203125, + "logps/rejected": -354.6471252441406, + "loss": 0.4892, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16386035084724426, + "rewards/margins": 0.906028687953949, + "rewards/rejected": -0.7421683669090271, + "step": 4231 + }, + { + "epoch": 0.49, + "learning_rate": 1.5598735807093526e-07, + "logits/chosen": -3.540642738342285, + "logits/rejected": -3.715785026550293, + "logps/chosen": -176.8072509765625, + "logps/rejected": -174.71444702148438, + "loss": 0.3515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5884497761726379, + "rewards/margins": 2.1111292839050293, + "rewards/rejected": -2.6995792388916016, + "step": 4232 + }, + { + "epoch": 0.49, + "learning_rate": 1.55952241601311e-07, + "logits/chosen": -3.19585919380188, + "logits/rejected": -3.176499605178833, + "logps/chosen": -318.3712158203125, + "logps/rejected": -414.4798583984375, + "loss": 0.5015, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2189367413520813, + "rewards/margins": 2.98225998878479, + "rewards/rejected": -3.2011966705322266, + "step": 4233 + }, + { + "epoch": 0.49, + "learning_rate": 1.5591712513168677e-07, + "logits/chosen": -3.364260673522949, + "logits/rejected": -3.1206307411193848, + "logps/chosen": -189.42726135253906, + "logps/rejected": -154.1235809326172, + "loss": 0.3679, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18897829949855804, + "rewards/margins": 1.0639965534210205, + "rewards/rejected": -1.2529747486114502, + "step": 4234 + }, + { + "epoch": 0.49, + "learning_rate": 1.558820086620625e-07, + "logits/chosen": -3.2002058029174805, + "logits/rejected": -2.9947011470794678, + "logps/chosen": -411.40997314453125, + "logps/rejected": -274.5831604003906, + "loss": 0.2626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4559458792209625, + "rewards/margins": 3.0677223205566406, + "rewards/rejected": -3.5236682891845703, + "step": 4235 + }, + { + "epoch": 0.49, + "learning_rate": 1.5584689219243825e-07, + "logits/chosen": -3.203077554702759, + "logits/rejected": -3.1748199462890625, + "logps/chosen": -182.33050537109375, + "logps/rejected": -198.01498413085938, + "loss": 0.3675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48663392663002014, + "rewards/margins": 1.2618520259857178, + "rewards/rejected": -0.7752181887626648, + "step": 4236 + }, + { + "epoch": 0.49, + "learning_rate": 1.5581177572281397e-07, + "logits/chosen": -3.6511764526367188, + "logits/rejected": -3.474350690841675, + "logps/chosen": -256.4237060546875, + "logps/rejected": -304.7333068847656, + "loss": 0.461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2401297390460968, + "rewards/margins": 1.5414609909057617, + "rewards/rejected": -1.7815905809402466, + "step": 4237 + }, + { + "epoch": 0.49, + "learning_rate": 1.5577665925318973e-07, + "logits/chosen": -3.169029712677002, + "logits/rejected": -3.134168863296509, + "logps/chosen": -276.2920837402344, + "logps/rejected": -174.845703125, + "loss": 0.2662, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7510141730308533, + "rewards/margins": 2.3454251289367676, + "rewards/rejected": -1.594411015510559, + "step": 4238 + }, + { + "epoch": 0.49, + "learning_rate": 1.5574154278356548e-07, + "logits/chosen": -3.0075979232788086, + "logits/rejected": -3.0985002517700195, + "logps/chosen": -139.8168487548828, + "logps/rejected": -218.863525390625, + "loss": 0.3281, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4088268280029297, + "rewards/margins": 1.648471474647522, + "rewards/rejected": -2.057298421859741, + "step": 4239 + }, + { + "epoch": 0.49, + "learning_rate": 1.557064263139412e-07, + "logits/chosen": -3.7345809936523438, + "logits/rejected": -3.436410427093506, + "logps/chosen": -186.59226989746094, + "logps/rejected": -169.72384643554688, + "loss": 0.4119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03154118359088898, + "rewards/margins": 1.0449697971343994, + "rewards/rejected": -1.076511025428772, + "step": 4240 + }, + { + "epoch": 0.49, + "learning_rate": 1.55671309844317e-07, + "logits/chosen": -3.218388080596924, + "logits/rejected": -3.270371198654175, + "logps/chosen": -304.01513671875, + "logps/rejected": -222.3992462158203, + "loss": 0.2911, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29795411229133606, + "rewards/margins": 1.6727856397628784, + "rewards/rejected": -1.3748315572738647, + "step": 4241 + }, + { + "epoch": 0.49, + "learning_rate": 1.5563619337469274e-07, + "logits/chosen": -3.5634844303131104, + "logits/rejected": -3.6227400302886963, + "logps/chosen": -270.78521728515625, + "logps/rejected": -431.0711364746094, + "loss": 0.2283, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.845138430595398, + "rewards/margins": 3.4672791957855225, + "rewards/rejected": -2.622140884399414, + "step": 4242 + }, + { + "epoch": 0.49, + "learning_rate": 1.5560107690506847e-07, + "logits/chosen": -3.5754261016845703, + "logits/rejected": -3.3849782943725586, + "logps/chosen": -458.4036560058594, + "logps/rejected": -357.0942077636719, + "loss": 0.0865, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8364957571029663, + "rewards/margins": 3.2704739570617676, + "rewards/rejected": -2.4339780807495117, + "step": 4243 + }, + { + "epoch": 0.49, + "learning_rate": 1.5556596043544422e-07, + "logits/chosen": -3.0007286071777344, + "logits/rejected": -3.0706701278686523, + "logps/chosen": -220.73667907714844, + "logps/rejected": -197.514404296875, + "loss": 0.4157, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3154059946537018, + "rewards/margins": 1.0341111421585083, + "rewards/rejected": -0.7187052369117737, + "step": 4244 + }, + { + "epoch": 0.49, + "learning_rate": 1.5553084396581995e-07, + "logits/chosen": -3.0850510597229004, + "logits/rejected": -3.114885091781616, + "logps/chosen": -313.07269287109375, + "logps/rejected": -372.5010986328125, + "loss": 0.6345, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5051722526550293, + "rewards/margins": 0.9299687743186951, + "rewards/rejected": -1.4351409673690796, + "step": 4245 + }, + { + "epoch": 0.49, + "learning_rate": 1.554957274961957e-07, + "logits/chosen": -3.0106728076934814, + "logits/rejected": -2.982508897781372, + "logps/chosen": -285.2098083496094, + "logps/rejected": -291.25421142578125, + "loss": 0.225, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39871105551719666, + "rewards/margins": 2.478614330291748, + "rewards/rejected": -2.0799033641815186, + "step": 4246 + }, + { + "epoch": 0.49, + "learning_rate": 1.5546061102657146e-07, + "logits/chosen": -2.9142961502075195, + "logits/rejected": -2.661449670791626, + "logps/chosen": -473.8479309082031, + "logps/rejected": -259.98773193359375, + "loss": 0.3156, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23213748633861542, + "rewards/margins": 1.9801653623580933, + "rewards/rejected": -1.7480278015136719, + "step": 4247 + }, + { + "epoch": 0.49, + "learning_rate": 1.554254945569472e-07, + "logits/chosen": -3.015434980392456, + "logits/rejected": -2.4764256477355957, + "logps/chosen": -394.7430419921875, + "logps/rejected": -302.72711181640625, + "loss": 0.3018, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25437530875205994, + "rewards/margins": 2.9203319549560547, + "rewards/rejected": -2.665956497192383, + "step": 4248 + }, + { + "epoch": 0.49, + "learning_rate": 1.5539037808732294e-07, + "logits/chosen": -3.2985901832580566, + "logits/rejected": -3.5631816387176514, + "logps/chosen": -313.31622314453125, + "logps/rejected": -276.43011474609375, + "loss": 0.6255, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12556932866573334, + "rewards/margins": 1.2440885305404663, + "rewards/rejected": -1.3696579933166504, + "step": 4249 + }, + { + "epoch": 0.49, + "learning_rate": 1.553552616176987e-07, + "logits/chosen": -2.2757298946380615, + "logits/rejected": -2.2371432781219482, + "logps/chosen": -396.76214599609375, + "logps/rejected": -314.0649719238281, + "loss": 0.4346, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11369448155164719, + "rewards/margins": 1.3494445085525513, + "rewards/rejected": -1.4631389379501343, + "step": 4250 + }, + { + "epoch": 0.49, + "learning_rate": 1.5532014514807442e-07, + "logits/chosen": -3.2242932319641113, + "logits/rejected": -3.4841742515563965, + "logps/chosen": -177.48495483398438, + "logps/rejected": -169.81857299804688, + "loss": 0.507, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5749505758285522, + "rewards/margins": 1.4896697998046875, + "rewards/rejected": -2.06462025642395, + "step": 4251 + }, + { + "epoch": 0.49, + "learning_rate": 1.552850286784502e-07, + "logits/chosen": -2.6729962825775146, + "logits/rejected": -2.699465274810791, + "logps/chosen": -265.9630432128906, + "logps/rejected": -268.8183288574219, + "loss": 0.4266, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07760922610759735, + "rewards/margins": 2.1892967224121094, + "rewards/rejected": -2.2669060230255127, + "step": 4252 + }, + { + "epoch": 0.49, + "learning_rate": 1.5524991220882593e-07, + "logits/chosen": -3.195957660675049, + "logits/rejected": -3.351248264312744, + "logps/chosen": -269.3422546386719, + "logps/rejected": -295.34222412109375, + "loss": 0.1674, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27985018491744995, + "rewards/margins": 2.2151968479156494, + "rewards/rejected": -1.9353466033935547, + "step": 4253 + }, + { + "epoch": 0.49, + "learning_rate": 1.5521479573920168e-07, + "logits/chosen": -2.72761869430542, + "logits/rejected": -2.7936758995056152, + "logps/chosen": -170.69908142089844, + "logps/rejected": -299.31597900390625, + "loss": 0.4097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44829198718070984, + "rewards/margins": 2.8198447227478027, + "rewards/rejected": -3.268136739730835, + "step": 4254 + }, + { + "epoch": 0.49, + "learning_rate": 1.5517967926957744e-07, + "logits/chosen": -3.437605381011963, + "logits/rejected": -3.6807217597961426, + "logps/chosen": -250.77328491210938, + "logps/rejected": -272.0954284667969, + "loss": 0.5243, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38885074853897095, + "rewards/margins": 1.0766921043395996, + "rewards/rejected": -1.4655427932739258, + "step": 4255 + }, + { + "epoch": 0.49, + "learning_rate": 1.5514456279995317e-07, + "logits/chosen": -3.5275001525878906, + "logits/rejected": -3.3601009845733643, + "logps/chosen": -513.3865356445312, + "logps/rejected": -251.1932830810547, + "loss": 0.2872, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10246928036212921, + "rewards/margins": 1.4779465198516846, + "rewards/rejected": -1.3754773139953613, + "step": 4256 + }, + { + "epoch": 0.49, + "learning_rate": 1.5510944633032892e-07, + "logits/chosen": -3.6029481887817383, + "logits/rejected": -3.7882893085479736, + "logps/chosen": -192.38475036621094, + "logps/rejected": -277.5639343261719, + "loss": 0.3912, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3484286069869995, + "rewards/margins": 1.4554061889648438, + "rewards/rejected": -1.8038346767425537, + "step": 4257 + }, + { + "epoch": 0.49, + "learning_rate": 1.5507432986070467e-07, + "logits/chosen": -2.7120699882507324, + "logits/rejected": -2.444258689880371, + "logps/chosen": -300.184326171875, + "logps/rejected": -321.5472106933594, + "loss": 0.286, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23452740907669067, + "rewards/margins": 1.508455514907837, + "rewards/rejected": -1.273928165435791, + "step": 4258 + }, + { + "epoch": 0.49, + "learning_rate": 1.550392133910804e-07, + "logits/chosen": -3.350231647491455, + "logits/rejected": -3.3053174018859863, + "logps/chosen": -189.99618530273438, + "logps/rejected": -314.37884521484375, + "loss": 0.2471, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02450786530971527, + "rewards/margins": 3.134544849395752, + "rewards/rejected": -3.159052610397339, + "step": 4259 + }, + { + "epoch": 0.49, + "learning_rate": 1.5500409692145615e-07, + "logits/chosen": -2.9293618202209473, + "logits/rejected": -3.2945683002471924, + "logps/chosen": -324.18304443359375, + "logps/rejected": -179.22003173828125, + "loss": 0.3123, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24158847332000732, + "rewards/margins": 2.1195404529571533, + "rewards/rejected": -1.877951979637146, + "step": 4260 + }, + { + "epoch": 0.49, + "learning_rate": 1.5496898045183188e-07, + "logits/chosen": -2.4225058555603027, + "logits/rejected": -2.681544303894043, + "logps/chosen": -364.311279296875, + "logps/rejected": -360.9361572265625, + "loss": 0.5676, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7741743326187134, + "rewards/margins": 0.9486208558082581, + "rewards/rejected": -1.7227951288223267, + "step": 4261 + }, + { + "epoch": 0.49, + "learning_rate": 1.5493386398220764e-07, + "logits/chosen": -3.041181802749634, + "logits/rejected": -3.151578903198242, + "logps/chosen": -365.9093017578125, + "logps/rejected": -291.69696044921875, + "loss": 0.3964, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07847069948911667, + "rewards/margins": 1.0078822374343872, + "rewards/rejected": -1.0863529443740845, + "step": 4262 + }, + { + "epoch": 0.49, + "learning_rate": 1.5489874751258342e-07, + "logits/chosen": -4.153163909912109, + "logits/rejected": -3.8480470180511475, + "logps/chosen": -333.59814453125, + "logps/rejected": -252.2152099609375, + "loss": 0.1713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5049145817756653, + "rewards/margins": 2.8392794132232666, + "rewards/rejected": -2.334364891052246, + "step": 4263 + }, + { + "epoch": 0.49, + "learning_rate": 1.5486363104295914e-07, + "logits/chosen": -3.313730478286743, + "logits/rejected": -3.125652313232422, + "logps/chosen": -312.0753173828125, + "logps/rejected": -302.1621398925781, + "loss": 0.2836, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.186031311750412, + "rewards/margins": 2.134692668914795, + "rewards/rejected": -1.9486613273620605, + "step": 4264 + }, + { + "epoch": 0.49, + "learning_rate": 1.548285145733349e-07, + "logits/chosen": -3.338886022567749, + "logits/rejected": -3.135590076446533, + "logps/chosen": -227.12118530273438, + "logps/rejected": -209.9407958984375, + "loss": 0.2044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07148130238056183, + "rewards/margins": 2.3658154010772705, + "rewards/rejected": -2.2943339347839355, + "step": 4265 + }, + { + "epoch": 0.49, + "learning_rate": 1.5479339810371065e-07, + "logits/chosen": -3.3191421031951904, + "logits/rejected": -3.0274291038513184, + "logps/chosen": -284.0174255371094, + "logps/rejected": -103.72421264648438, + "loss": 0.7024, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06072060763835907, + "rewards/margins": 0.7407540082931519, + "rewards/rejected": -0.6800334453582764, + "step": 4266 + }, + { + "epoch": 0.49, + "learning_rate": 1.5475828163408638e-07, + "logits/chosen": -3.2885265350341797, + "logits/rejected": -3.047283411026001, + "logps/chosen": -310.8927917480469, + "logps/rejected": -223.30227661132812, + "loss": 0.6973, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5787513256072998, + "rewards/margins": 0.13170669972896576, + "rewards/rejected": -0.7104580998420715, + "step": 4267 + }, + { + "epoch": 0.49, + "learning_rate": 1.5472316516446213e-07, + "logits/chosen": -3.228416919708252, + "logits/rejected": -3.5127434730529785, + "logps/chosen": -190.70462036132812, + "logps/rejected": -199.34390258789062, + "loss": 0.3552, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11502022296190262, + "rewards/margins": 2.401282548904419, + "rewards/rejected": -2.5163025856018066, + "step": 4268 + }, + { + "epoch": 0.49, + "learning_rate": 1.5468804869483786e-07, + "logits/chosen": -2.7805447578430176, + "logits/rejected": -2.699923038482666, + "logps/chosen": -310.7448425292969, + "logps/rejected": -249.8104248046875, + "loss": 0.4198, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.020518869161605835, + "rewards/margins": 1.5944247245788574, + "rewards/rejected": -1.5739057064056396, + "step": 4269 + }, + { + "epoch": 0.49, + "learning_rate": 1.5465293222521361e-07, + "logits/chosen": -2.729285955429077, + "logits/rejected": -2.938908576965332, + "logps/chosen": -197.95718383789062, + "logps/rejected": -278.8391418457031, + "loss": 0.2232, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23390258848667145, + "rewards/margins": 3.3272204399108887, + "rewards/rejected": -3.093317985534668, + "step": 4270 + }, + { + "epoch": 0.49, + "learning_rate": 1.5461781575558937e-07, + "logits/chosen": -2.3629744052886963, + "logits/rejected": -2.420750141143799, + "logps/chosen": -253.925537109375, + "logps/rejected": -243.19322204589844, + "loss": 0.339, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09295044839382172, + "rewards/margins": 1.498010277748108, + "rewards/rejected": -1.405059814453125, + "step": 4271 + }, + { + "epoch": 0.49, + "learning_rate": 1.545826992859651e-07, + "logits/chosen": -2.986513137817383, + "logits/rejected": -2.91434383392334, + "logps/chosen": -244.45603942871094, + "logps/rejected": -154.60813903808594, + "loss": 0.4283, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36696621775627136, + "rewards/margins": 1.496220588684082, + "rewards/rejected": -1.1292543411254883, + "step": 4272 + }, + { + "epoch": 0.49, + "learning_rate": 1.5454758281634085e-07, + "logits/chosen": -3.0584897994995117, + "logits/rejected": -3.1681861877441406, + "logps/chosen": -297.70306396484375, + "logps/rejected": -277.62982177734375, + "loss": 0.3067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26723724603652954, + "rewards/margins": 2.0633718967437744, + "rewards/rejected": -2.3306093215942383, + "step": 4273 + }, + { + "epoch": 0.49, + "learning_rate": 1.5451246634671663e-07, + "logits/chosen": -2.5133297443389893, + "logits/rejected": -2.6567559242248535, + "logps/chosen": -272.3305358886719, + "logps/rejected": -176.26513671875, + "loss": 0.4541, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3723936676979065, + "rewards/margins": 1.9132310152053833, + "rewards/rejected": -1.5408375263214111, + "step": 4274 + }, + { + "epoch": 0.49, + "learning_rate": 1.5447734987709236e-07, + "logits/chosen": -3.7197265625, + "logits/rejected": -3.4434762001037598, + "logps/chosen": -340.06939697265625, + "logps/rejected": -338.12518310546875, + "loss": 0.7822, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37703490257263184, + "rewards/margins": 1.6792926788330078, + "rewards/rejected": -2.0563273429870605, + "step": 4275 + }, + { + "epoch": 0.49, + "learning_rate": 1.544422334074681e-07, + "logits/chosen": -3.471404552459717, + "logits/rejected": -3.4031448364257812, + "logps/chosen": -211.17233276367188, + "logps/rejected": -255.6654052734375, + "loss": 0.2198, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3411221206188202, + "rewards/margins": 2.6552062034606934, + "rewards/rejected": -2.314084053039551, + "step": 4276 + }, + { + "epoch": 0.49, + "learning_rate": 1.5440711693784384e-07, + "logits/chosen": -4.029463768005371, + "logits/rejected": -3.7630763053894043, + "logps/chosen": -203.9584197998047, + "logps/rejected": -164.60647583007812, + "loss": 0.7409, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31074589490890503, + "rewards/margins": 1.895453691482544, + "rewards/rejected": -2.2061996459960938, + "step": 4277 + }, + { + "epoch": 0.49, + "learning_rate": 1.543720004682196e-07, + "logits/chosen": -3.709306478500366, + "logits/rejected": -3.6453418731689453, + "logps/chosen": -163.29310607910156, + "logps/rejected": -247.66061401367188, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14385786652565002, + "rewards/margins": 1.9157966375350952, + "rewards/rejected": -2.059654712677002, + "step": 4278 + }, + { + "epoch": 0.49, + "learning_rate": 1.5433688399859535e-07, + "logits/chosen": -2.556473970413208, + "logits/rejected": -2.5628554821014404, + "logps/chosen": -311.23046875, + "logps/rejected": -297.63092041015625, + "loss": 0.2219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24030542373657227, + "rewards/margins": 2.8378944396972656, + "rewards/rejected": -3.078199863433838, + "step": 4279 + }, + { + "epoch": 0.49, + "learning_rate": 1.5430176752897107e-07, + "logits/chosen": -3.118302345275879, + "logits/rejected": -3.322199821472168, + "logps/chosen": -428.4708251953125, + "logps/rejected": -363.459716796875, + "loss": 0.3394, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08563782274723053, + "rewards/margins": 2.6950995922088623, + "rewards/rejected": -2.609461784362793, + "step": 4280 + }, + { + "epoch": 0.49, + "learning_rate": 1.5426665105934683e-07, + "logits/chosen": -2.991495370864868, + "logits/rejected": -3.2710366249084473, + "logps/chosen": -187.23611450195312, + "logps/rejected": -214.78778076171875, + "loss": 0.3038, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23089809715747833, + "rewards/margins": 1.5562303066253662, + "rewards/rejected": -1.7871284484863281, + "step": 4281 + }, + { + "epoch": 0.49, + "learning_rate": 1.5423153458972258e-07, + "logits/chosen": -3.504746913909912, + "logits/rejected": -3.508291006088257, + "logps/chosen": -140.92364501953125, + "logps/rejected": -181.0018310546875, + "loss": 0.3544, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08114970475435257, + "rewards/margins": 1.6918983459472656, + "rewards/rejected": -1.6107486486434937, + "step": 4282 + }, + { + "epoch": 0.49, + "learning_rate": 1.541964181200983e-07, + "logits/chosen": -2.5548577308654785, + "logits/rejected": -2.3360445499420166, + "logps/chosen": -436.4823913574219, + "logps/rejected": -294.167724609375, + "loss": 0.2517, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.707363486289978, + "rewards/margins": 1.8429533243179321, + "rewards/rejected": -1.135589838027954, + "step": 4283 + }, + { + "epoch": 0.49, + "learning_rate": 1.5416130165047406e-07, + "logits/chosen": -3.411285877227783, + "logits/rejected": -3.415295124053955, + "logps/chosen": -207.49159240722656, + "logps/rejected": -346.6177673339844, + "loss": 0.6683, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4316660165786743, + "rewards/margins": 1.3793939352035522, + "rewards/rejected": -1.811059832572937, + "step": 4284 + }, + { + "epoch": 0.49, + "learning_rate": 1.541261851808498e-07, + "logits/chosen": -3.730743646621704, + "logits/rejected": -3.716920852661133, + "logps/chosen": -370.18170166015625, + "logps/rejected": -274.3343200683594, + "loss": 0.2513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4774865508079529, + "rewards/margins": 2.0788557529449463, + "rewards/rejected": -2.556342124938965, + "step": 4285 + }, + { + "epoch": 0.49, + "learning_rate": 1.5409106871122557e-07, + "logits/chosen": -3.2651567459106445, + "logits/rejected": -3.290393829345703, + "logps/chosen": -223.17340087890625, + "logps/rejected": -306.7315979003906, + "loss": 0.3742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.016801893711090088, + "rewards/margins": 1.3391062021255493, + "rewards/rejected": -1.3559080362319946, + "step": 4286 + }, + { + "epoch": 0.49, + "learning_rate": 1.5405595224160132e-07, + "logits/chosen": -2.5970003604888916, + "logits/rejected": -2.786433458328247, + "logps/chosen": -164.9182586669922, + "logps/rejected": -221.38204956054688, + "loss": 0.4937, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24331718683242798, + "rewards/margins": 1.5659312009811401, + "rewards/rejected": -1.8092483282089233, + "step": 4287 + }, + { + "epoch": 0.49, + "learning_rate": 1.5402083577197705e-07, + "logits/chosen": -2.9223663806915283, + "logits/rejected": -3.1509416103363037, + "logps/chosen": -227.314208984375, + "logps/rejected": -195.67503356933594, + "loss": 0.2758, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0181247740983963, + "rewards/margins": 3.06099796295166, + "rewards/rejected": -3.0428733825683594, + "step": 4288 + }, + { + "epoch": 0.49, + "learning_rate": 1.539857193023528e-07, + "logits/chosen": -2.556455612182617, + "logits/rejected": -2.560861587524414, + "logps/chosen": -344.978271484375, + "logps/rejected": -364.9228515625, + "loss": 0.1787, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15989960730075836, + "rewards/margins": 2.260153293609619, + "rewards/rejected": -2.1002538204193115, + "step": 4289 + }, + { + "epoch": 0.49, + "learning_rate": 1.5395060283272853e-07, + "logits/chosen": -3.4339513778686523, + "logits/rejected": -3.381817579269409, + "logps/chosen": -194.7606201171875, + "logps/rejected": -184.34353637695312, + "loss": 0.388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3357822299003601, + "rewards/margins": 1.3374630212783813, + "rewards/rejected": -1.6732453107833862, + "step": 4290 + }, + { + "epoch": 0.49, + "learning_rate": 1.5391548636310429e-07, + "logits/chosen": -3.5005691051483154, + "logits/rejected": -3.5526740550994873, + "logps/chosen": -449.18450927734375, + "logps/rejected": -315.04766845703125, + "loss": 0.1748, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6397150754928589, + "rewards/margins": 2.375354766845703, + "rewards/rejected": -1.7356396913528442, + "step": 4291 + }, + { + "epoch": 0.49, + "learning_rate": 1.5388036989348004e-07, + "logits/chosen": -3.597813606262207, + "logits/rejected": -3.6424758434295654, + "logps/chosen": -327.48828125, + "logps/rejected": -206.46420288085938, + "loss": 0.4587, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035460442304611206, + "rewards/margins": 1.8111014366149902, + "rewards/rejected": -1.7756409645080566, + "step": 4292 + }, + { + "epoch": 0.49, + "learning_rate": 1.5384525342385577e-07, + "logits/chosen": -2.477278709411621, + "logits/rejected": -2.5465385913848877, + "logps/chosen": -347.1973876953125, + "logps/rejected": -219.20623779296875, + "loss": 0.3007, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2723860442638397, + "rewards/margins": 2.3205997943878174, + "rewards/rejected": -2.0482139587402344, + "step": 4293 + }, + { + "epoch": 0.5, + "learning_rate": 1.5381013695423152e-07, + "logits/chosen": -3.118471622467041, + "logits/rejected": -3.2404327392578125, + "logps/chosen": -240.72564697265625, + "logps/rejected": -210.66355895996094, + "loss": 0.5995, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5552998781204224, + "rewards/margins": 1.0253130197525024, + "rewards/rejected": -1.5806130170822144, + "step": 4294 + }, + { + "epoch": 0.5, + "learning_rate": 1.5377502048460727e-07, + "logits/chosen": -2.632335662841797, + "logits/rejected": -2.719329833984375, + "logps/chosen": -306.66845703125, + "logps/rejected": -174.6659698486328, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07939299196004868, + "rewards/margins": 1.0492253303527832, + "rewards/rejected": -0.9698323607444763, + "step": 4295 + }, + { + "epoch": 0.5, + "learning_rate": 1.53739904014983e-07, + "logits/chosen": -3.3876118659973145, + "logits/rejected": -3.1232876777648926, + "logps/chosen": -147.7434539794922, + "logps/rejected": -283.1214599609375, + "loss": 0.4113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23189568519592285, + "rewards/margins": 1.7073016166687012, + "rewards/rejected": -1.9391974210739136, + "step": 4296 + }, + { + "epoch": 0.5, + "learning_rate": 1.5370478754535878e-07, + "logits/chosen": -2.832066535949707, + "logits/rejected": -2.8992743492126465, + "logps/chosen": -345.2873229980469, + "logps/rejected": -313.4912109375, + "loss": 0.3301, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4334893524646759, + "rewards/margins": 3.3997535705566406, + "rewards/rejected": -2.966264247894287, + "step": 4297 + }, + { + "epoch": 0.5, + "learning_rate": 1.536696710757345e-07, + "logits/chosen": -3.63503360748291, + "logits/rejected": -3.5896596908569336, + "logps/chosen": -240.21676635742188, + "logps/rejected": -371.07830810546875, + "loss": 0.136, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08506511151790619, + "rewards/margins": 2.8511862754821777, + "rewards/rejected": -2.7661213874816895, + "step": 4298 + }, + { + "epoch": 0.5, + "learning_rate": 1.5363455460611026e-07, + "logits/chosen": -2.9974560737609863, + "logits/rejected": -2.824087142944336, + "logps/chosen": -486.75323486328125, + "logps/rejected": -281.33984375, + "loss": 0.2544, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18109309673309326, + "rewards/margins": 1.912896752357483, + "rewards/rejected": -1.7318035364151, + "step": 4299 + }, + { + "epoch": 0.5, + "learning_rate": 1.5359943813648602e-07, + "logits/chosen": -3.6033737659454346, + "logits/rejected": -3.24165678024292, + "logps/chosen": -298.1138916015625, + "logps/rejected": -315.6715087890625, + "loss": 0.2376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1615678071975708, + "rewards/margins": 1.861816167831421, + "rewards/rejected": -2.0233840942382812, + "step": 4300 + }, + { + "epoch": 0.5, + "learning_rate": 1.5356432166686174e-07, + "logits/chosen": -3.476006031036377, + "logits/rejected": -3.3934459686279297, + "logps/chosen": -332.7486572265625, + "logps/rejected": -355.778564453125, + "loss": 0.1822, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1129886731505394, + "rewards/margins": 2.5368080139160156, + "rewards/rejected": -2.4238193035125732, + "step": 4301 + }, + { + "epoch": 0.5, + "learning_rate": 1.535292051972375e-07, + "logits/chosen": -3.1930768489837646, + "logits/rejected": -2.8808445930480957, + "logps/chosen": -356.93890380859375, + "logps/rejected": -379.7453308105469, + "loss": 0.2241, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3100447952747345, + "rewards/margins": 2.159363031387329, + "rewards/rejected": -1.849318265914917, + "step": 4302 + }, + { + "epoch": 0.5, + "learning_rate": 1.5349408872761325e-07, + "logits/chosen": -3.519726276397705, + "logits/rejected": -3.567011833190918, + "logps/chosen": -223.61976623535156, + "logps/rejected": -180.75791931152344, + "loss": 0.9607, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0219066143035889, + "rewards/margins": 0.00025978684425354004, + "rewards/rejected": -1.0221664905548096, + "step": 4303 + }, + { + "epoch": 0.5, + "learning_rate": 1.5345897225798898e-07, + "logits/chosen": -2.596865177154541, + "logits/rejected": -2.8274571895599365, + "logps/chosen": -218.16639709472656, + "logps/rejected": -275.51434326171875, + "loss": 0.3386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1814868152141571, + "rewards/margins": 2.738020896911621, + "rewards/rejected": -2.9195075035095215, + "step": 4304 + }, + { + "epoch": 0.5, + "learning_rate": 1.5342385578836473e-07, + "logits/chosen": -3.5933730602264404, + "logits/rejected": -3.670039653778076, + "logps/chosen": -141.3319091796875, + "logps/rejected": -134.354248046875, + "loss": 0.3227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26410186290740967, + "rewards/margins": 1.3488707542419434, + "rewards/rejected": -1.0847687721252441, + "step": 4305 + }, + { + "epoch": 0.5, + "learning_rate": 1.5338873931874046e-07, + "logits/chosen": -3.088113784790039, + "logits/rejected": -3.3945322036743164, + "logps/chosen": -308.6744079589844, + "logps/rejected": -274.1036376953125, + "loss": 0.2753, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05853720009326935, + "rewards/margins": 1.9904723167419434, + "rewards/rejected": -1.9319349527359009, + "step": 4306 + }, + { + "epoch": 0.5, + "learning_rate": 1.5335362284911621e-07, + "logits/chosen": -2.7588510513305664, + "logits/rejected": -2.8285720348358154, + "logps/chosen": -318.97021484375, + "logps/rejected": -283.6190490722656, + "loss": 0.5562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08488290011882782, + "rewards/margins": 1.5864958763122559, + "rewards/rejected": -1.6713788509368896, + "step": 4307 + }, + { + "epoch": 0.5, + "learning_rate": 1.53318506379492e-07, + "logits/chosen": -2.623995065689087, + "logits/rejected": -2.3765671253204346, + "logps/chosen": -147.02392578125, + "logps/rejected": -204.38284301757812, + "loss": 0.635, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36885184049606323, + "rewards/margins": 1.0702910423278809, + "rewards/rejected": -1.4391427040100098, + "step": 4308 + }, + { + "epoch": 0.5, + "learning_rate": 1.5328338990986772e-07, + "logits/chosen": -3.4304025173187256, + "logits/rejected": -2.8442471027374268, + "logps/chosen": -546.1985473632812, + "logps/rejected": -297.4148864746094, + "loss": 0.4996, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05596911907196045, + "rewards/margins": 1.4554903507232666, + "rewards/rejected": -1.5114593505859375, + "step": 4309 + }, + { + "epoch": 0.5, + "learning_rate": 1.5324827344024348e-07, + "logits/chosen": -3.1929404735565186, + "logits/rejected": -3.457733631134033, + "logps/chosen": -171.1824951171875, + "logps/rejected": -212.07928466796875, + "loss": 0.5239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5457698106765747, + "rewards/margins": 1.8894262313842773, + "rewards/rejected": -2.4351961612701416, + "step": 4310 + }, + { + "epoch": 0.5, + "learning_rate": 1.5321315697061923e-07, + "logits/chosen": -3.421989917755127, + "logits/rejected": -3.1232411861419678, + "logps/chosen": -157.9214324951172, + "logps/rejected": -227.15835571289062, + "loss": 0.2859, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05309556424617767, + "rewards/margins": 1.7201213836669922, + "rewards/rejected": -1.6670256853103638, + "step": 4311 + }, + { + "epoch": 0.5, + "learning_rate": 1.5317804050099496e-07, + "logits/chosen": -3.571376323699951, + "logits/rejected": -3.7931506633758545, + "logps/chosen": -215.34329223632812, + "logps/rejected": -249.10101318359375, + "loss": 0.2702, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.007148772478103638, + "rewards/margins": 2.550438165664673, + "rewards/rejected": -2.557586908340454, + "step": 4312 + }, + { + "epoch": 0.5, + "learning_rate": 1.531429240313707e-07, + "logits/chosen": -2.622481346130371, + "logits/rejected": -3.104994773864746, + "logps/chosen": -332.6460266113281, + "logps/rejected": -287.17071533203125, + "loss": 0.3391, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38218504190444946, + "rewards/margins": 3.495710849761963, + "rewards/rejected": -3.113525867462158, + "step": 4313 + }, + { + "epoch": 0.5, + "learning_rate": 1.5310780756174644e-07, + "logits/chosen": -2.9433114528656006, + "logits/rejected": -3.0924386978149414, + "logps/chosen": -253.6914825439453, + "logps/rejected": -263.57550048828125, + "loss": 0.6465, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5032229423522949, + "rewards/margins": 1.1186459064483643, + "rewards/rejected": -1.6218688488006592, + "step": 4314 + }, + { + "epoch": 0.5, + "learning_rate": 1.530726910921222e-07, + "logits/chosen": -3.1789400577545166, + "logits/rejected": -3.2611005306243896, + "logps/chosen": -283.39642333984375, + "logps/rejected": -310.5025634765625, + "loss": 0.723, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.042109109461307526, + "rewards/margins": 1.278292179107666, + "rewards/rejected": -1.3204011917114258, + "step": 4315 + }, + { + "epoch": 0.5, + "learning_rate": 1.5303757462249795e-07, + "logits/chosen": -3.6112489700317383, + "logits/rejected": -3.9560658931732178, + "logps/chosen": -143.69947814941406, + "logps/rejected": -203.34048461914062, + "loss": 0.1541, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30643394589424133, + "rewards/margins": 2.8478317260742188, + "rewards/rejected": -2.541398048400879, + "step": 4316 + }, + { + "epoch": 0.5, + "learning_rate": 1.5300245815287367e-07, + "logits/chosen": -3.8152012825012207, + "logits/rejected": -3.7606964111328125, + "logps/chosen": -184.01206970214844, + "logps/rejected": -283.0292053222656, + "loss": 0.4219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1609235256910324, + "rewards/margins": 1.2734662294387817, + "rewards/rejected": -1.4343898296356201, + "step": 4317 + }, + { + "epoch": 0.5, + "learning_rate": 1.5296734168324943e-07, + "logits/chosen": -2.952589988708496, + "logits/rejected": -3.237651824951172, + "logps/chosen": -145.03277587890625, + "logps/rejected": -275.06793212890625, + "loss": 0.4529, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17373663187026978, + "rewards/margins": 1.7595665454864502, + "rewards/rejected": -1.5858299732208252, + "step": 4318 + }, + { + "epoch": 0.5, + "learning_rate": 1.529322252136252e-07, + "logits/chosen": -3.3865129947662354, + "logits/rejected": -2.983956813812256, + "logps/chosen": -267.1505126953125, + "logps/rejected": -288.906494140625, + "loss": 0.2509, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41714102029800415, + "rewards/margins": 1.9098066091537476, + "rewards/rejected": -2.3269476890563965, + "step": 4319 + }, + { + "epoch": 0.5, + "learning_rate": 1.5289710874400094e-07, + "logits/chosen": -2.7838242053985596, + "logits/rejected": -2.710378646850586, + "logps/chosen": -312.75457763671875, + "logps/rejected": -226.76536560058594, + "loss": 0.3814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38128241896629333, + "rewards/margins": 1.5275380611419678, + "rewards/rejected": -1.908820629119873, + "step": 4320 + }, + { + "epoch": 0.5, + "learning_rate": 1.528619922743767e-07, + "logits/chosen": -3.054166793823242, + "logits/rejected": -3.326118230819702, + "logps/chosen": -221.9783935546875, + "logps/rejected": -264.2236328125, + "loss": 0.2025, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2801345884799957, + "rewards/margins": 2.543579339981079, + "rewards/rejected": -2.2634449005126953, + "step": 4321 + }, + { + "epoch": 0.5, + "learning_rate": 1.5282687580475242e-07, + "logits/chosen": -3.012678623199463, + "logits/rejected": -2.8826327323913574, + "logps/chosen": -158.38796997070312, + "logps/rejected": -128.5760040283203, + "loss": 0.4726, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17204123735427856, + "rewards/margins": 0.9547150135040283, + "rewards/rejected": -1.126756191253662, + "step": 4322 + }, + { + "epoch": 0.5, + "learning_rate": 1.5279175933512817e-07, + "logits/chosen": -2.7960379123687744, + "logits/rejected": -2.6179189682006836, + "logps/chosen": -263.7035827636719, + "logps/rejected": -166.1133270263672, + "loss": 0.4477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14601784944534302, + "rewards/margins": 1.0945788621902466, + "rewards/rejected": -1.2405967712402344, + "step": 4323 + }, + { + "epoch": 0.5, + "learning_rate": 1.5275664286550392e-07, + "logits/chosen": -3.5331737995147705, + "logits/rejected": -3.1034798622131348, + "logps/chosen": -258.3570251464844, + "logps/rejected": -240.05247497558594, + "loss": 0.4052, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11056377738714218, + "rewards/margins": 1.030532956123352, + "rewards/rejected": -0.9199692010879517, + "step": 4324 + }, + { + "epoch": 0.5, + "learning_rate": 1.5272152639587965e-07, + "logits/chosen": -3.2509894371032715, + "logits/rejected": -3.2348217964172363, + "logps/chosen": -153.45252990722656, + "logps/rejected": -186.507568359375, + "loss": 0.3041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18447771668434143, + "rewards/margins": 1.9643664360046387, + "rewards/rejected": -2.1488442420959473, + "step": 4325 + }, + { + "epoch": 0.5, + "learning_rate": 1.526864099262554e-07, + "logits/chosen": -3.0422749519348145, + "logits/rejected": -2.887605667114258, + "logps/chosen": -350.14227294921875, + "logps/rejected": -392.3241882324219, + "loss": 0.2988, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4190361499786377, + "rewards/margins": 1.4657340049743652, + "rewards/rejected": -1.046697735786438, + "step": 4326 + }, + { + "epoch": 0.5, + "learning_rate": 1.5265129345663116e-07, + "logits/chosen": -3.6773428916931152, + "logits/rejected": -3.4764792919158936, + "logps/chosen": -216.73037719726562, + "logps/rejected": -243.27769470214844, + "loss": 0.7284, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2985934615135193, + "rewards/margins": 0.9518216848373413, + "rewards/rejected": -1.2504152059555054, + "step": 4327 + }, + { + "epoch": 0.5, + "learning_rate": 1.526161769870069e-07, + "logits/chosen": -2.914721965789795, + "logits/rejected": -2.5376343727111816, + "logps/chosen": -276.6370544433594, + "logps/rejected": -336.686279296875, + "loss": 0.465, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09736742824316025, + "rewards/margins": 1.4225717782974243, + "rewards/rejected": -1.5199393033981323, + "step": 4328 + }, + { + "epoch": 0.5, + "learning_rate": 1.5258106051738264e-07, + "logits/chosen": -3.0304510593414307, + "logits/rejected": -2.9962449073791504, + "logps/chosen": -171.1322479248047, + "logps/rejected": -238.05624389648438, + "loss": 0.2156, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010245583951473236, + "rewards/margins": 3.0334830284118652, + "rewards/rejected": -3.023237466812134, + "step": 4329 + }, + { + "epoch": 0.5, + "learning_rate": 1.5254594404775837e-07, + "logits/chosen": -2.7044529914855957, + "logits/rejected": -2.591447591781616, + "logps/chosen": -297.8039245605469, + "logps/rejected": -254.88629150390625, + "loss": 0.5128, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2347683161497116, + "rewards/margins": 1.2874822616577148, + "rewards/rejected": -1.5222506523132324, + "step": 4330 + }, + { + "epoch": 0.5, + "learning_rate": 1.5251082757813415e-07, + "logits/chosen": -3.270585060119629, + "logits/rejected": -3.217857599258423, + "logps/chosen": -245.9214324951172, + "logps/rejected": -277.43768310546875, + "loss": 0.328, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22295339405536652, + "rewards/margins": 1.4489483833312988, + "rewards/rejected": -1.2259950637817383, + "step": 4331 + }, + { + "epoch": 0.5, + "learning_rate": 1.524757111085099e-07, + "logits/chosen": -2.6677193641662598, + "logits/rejected": -2.567304849624634, + "logps/chosen": -245.222412109375, + "logps/rejected": -300.80816650390625, + "loss": 0.5077, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2571207284927368, + "rewards/margins": 1.7925922870635986, + "rewards/rejected": -2.049712896347046, + "step": 4332 + }, + { + "epoch": 0.5, + "learning_rate": 1.5244059463888563e-07, + "logits/chosen": -2.9411396980285645, + "logits/rejected": -3.1203718185424805, + "logps/chosen": -243.03067016601562, + "logps/rejected": -242.32382202148438, + "loss": 0.5715, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36854732036590576, + "rewards/margins": 1.3076350688934326, + "rewards/rejected": -1.6761822700500488, + "step": 4333 + }, + { + "epoch": 0.5, + "learning_rate": 1.5240547816926138e-07, + "logits/chosen": -3.131685495376587, + "logits/rejected": -3.1436080932617188, + "logps/chosen": -210.40829467773438, + "logps/rejected": -194.9435577392578, + "loss": 0.3698, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26902303099632263, + "rewards/margins": 1.715477466583252, + "rewards/rejected": -1.446454405784607, + "step": 4334 + }, + { + "epoch": 0.5, + "learning_rate": 1.523703616996371e-07, + "logits/chosen": -2.9350483417510986, + "logits/rejected": -3.36898136138916, + "logps/chosen": -350.0063171386719, + "logps/rejected": -315.55999755859375, + "loss": 0.4473, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021467216312885284, + "rewards/margins": 1.919000506401062, + "rewards/rejected": -1.8975332975387573, + "step": 4335 + }, + { + "epoch": 0.5, + "learning_rate": 1.5233524523001286e-07, + "logits/chosen": -3.468585968017578, + "logits/rejected": -3.4708735942840576, + "logps/chosen": -259.3088073730469, + "logps/rejected": -275.88983154296875, + "loss": 0.2177, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21335579454898834, + "rewards/margins": 3.158236503601074, + "rewards/rejected": -2.944880723953247, + "step": 4336 + }, + { + "epoch": 0.5, + "learning_rate": 1.5230012876038862e-07, + "logits/chosen": -3.5894057750701904, + "logits/rejected": -4.07175350189209, + "logps/chosen": -258.2807312011719, + "logps/rejected": -377.88433837890625, + "loss": 0.1754, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.020924709737300873, + "rewards/margins": 2.864034414291382, + "rewards/rejected": -2.8849592208862305, + "step": 4337 + }, + { + "epoch": 0.5, + "learning_rate": 1.5226501229076435e-07, + "logits/chosen": -2.7482848167419434, + "logits/rejected": -2.8134498596191406, + "logps/chosen": -220.245849609375, + "logps/rejected": -244.49044799804688, + "loss": 0.5872, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9350711703300476, + "rewards/margins": 0.6833840608596802, + "rewards/rejected": -1.618455171585083, + "step": 4338 + }, + { + "epoch": 0.5, + "learning_rate": 1.522298958211401e-07, + "logits/chosen": -3.1900813579559326, + "logits/rejected": -2.983151912689209, + "logps/chosen": -287.6822204589844, + "logps/rejected": -232.12005615234375, + "loss": 0.4724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.595203161239624, + "rewards/margins": 1.861502766609192, + "rewards/rejected": -2.4567060470581055, + "step": 4339 + }, + { + "epoch": 0.5, + "learning_rate": 1.5219477935151588e-07, + "logits/chosen": -3.405463933944702, + "logits/rejected": -3.8356313705444336, + "logps/chosen": -241.0276641845703, + "logps/rejected": -344.3280334472656, + "loss": 0.2981, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.740489661693573, + "rewards/margins": 2.790050983428955, + "rewards/rejected": -2.0495612621307373, + "step": 4340 + }, + { + "epoch": 0.5, + "learning_rate": 1.5215966288189158e-07, + "logits/chosen": -2.251642942428589, + "logits/rejected": -2.215527296066284, + "logps/chosen": -241.87911987304688, + "logps/rejected": -272.67401123046875, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14238114655017853, + "rewards/margins": 2.0987648963928223, + "rewards/rejected": -2.2411460876464844, + "step": 4341 + }, + { + "epoch": 0.5, + "learning_rate": 1.5212454641226736e-07, + "logits/chosen": -2.880380392074585, + "logits/rejected": -3.083404064178467, + "logps/chosen": -285.37188720703125, + "logps/rejected": -216.2775115966797, + "loss": 0.4331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20240969955921173, + "rewards/margins": 1.6976351737976074, + "rewards/rejected": -1.9000449180603027, + "step": 4342 + }, + { + "epoch": 0.5, + "learning_rate": 1.520894299426431e-07, + "logits/chosen": -2.954178810119629, + "logits/rejected": -2.8907527923583984, + "logps/chosen": -460.04095458984375, + "logps/rejected": -347.473388671875, + "loss": 0.8078, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3514825105667114, + "rewards/margins": 0.2154478132724762, + "rewards/rejected": -1.5669302940368652, + "step": 4343 + }, + { + "epoch": 0.5, + "learning_rate": 1.5205431347301884e-07, + "logits/chosen": -2.8842124938964844, + "logits/rejected": -2.9241912364959717, + "logps/chosen": -422.3749694824219, + "logps/rejected": -245.2810516357422, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8633109331130981, + "rewards/margins": 0.9062386751174927, + "rewards/rejected": -1.7695496082305908, + "step": 4344 + }, + { + "epoch": 0.5, + "learning_rate": 1.520191970033946e-07, + "logits/chosen": -3.1952526569366455, + "logits/rejected": -3.0497982501983643, + "logps/chosen": -247.4343719482422, + "logps/rejected": -217.57260131835938, + "loss": 0.4565, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8429983854293823, + "rewards/margins": 1.032486915588379, + "rewards/rejected": -1.8754854202270508, + "step": 4345 + }, + { + "epoch": 0.5, + "learning_rate": 1.5198408053377032e-07, + "logits/chosen": -3.1732306480407715, + "logits/rejected": -2.5270819664001465, + "logps/chosen": -187.05599975585938, + "logps/rejected": -195.52894592285156, + "loss": 0.9775, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.39661121368408203, + "rewards/margins": -0.39676156640052795, + "rewards/rejected": 0.00015035271644592285, + "step": 4346 + }, + { + "epoch": 0.5, + "learning_rate": 1.5194896406414608e-07, + "logits/chosen": -2.3318662643432617, + "logits/rejected": -2.538515329360962, + "logps/chosen": -413.87493896484375, + "logps/rejected": -386.77423095703125, + "loss": 0.3905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.218570739030838, + "rewards/margins": 1.9617739915847778, + "rewards/rejected": -2.180344581604004, + "step": 4347 + }, + { + "epoch": 0.5, + "learning_rate": 1.5191384759452183e-07, + "logits/chosen": -3.0957298278808594, + "logits/rejected": -3.1337296962738037, + "logps/chosen": -204.4673309326172, + "logps/rejected": -226.9080810546875, + "loss": 0.5102, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5207543969154358, + "rewards/margins": 1.1098259687423706, + "rewards/rejected": -1.6305804252624512, + "step": 4348 + }, + { + "epoch": 0.5, + "learning_rate": 1.5187873112489756e-07, + "logits/chosen": -3.4197230339050293, + "logits/rejected": -3.3105177879333496, + "logps/chosen": -175.8695068359375, + "logps/rejected": -191.9119110107422, + "loss": 0.4738, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4821392297744751, + "rewards/margins": 1.9957146644592285, + "rewards/rejected": -2.477854013442993, + "step": 4349 + }, + { + "epoch": 0.5, + "learning_rate": 1.518436146552733e-07, + "logits/chosen": -3.420919895172119, + "logits/rejected": -3.5084307193756104, + "logps/chosen": -402.57720947265625, + "logps/rejected": -259.1300048828125, + "loss": 0.2908, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11197924613952637, + "rewards/margins": 2.2586443424224854, + "rewards/rejected": -2.3706235885620117, + "step": 4350 + }, + { + "epoch": 0.5, + "learning_rate": 1.5180849818564904e-07, + "logits/chosen": -2.614835023880005, + "logits/rejected": -2.646704912185669, + "logps/chosen": -364.3536376953125, + "logps/rejected": -387.14105224609375, + "loss": 0.3531, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.031184419989585876, + "rewards/margins": 1.9985307455062866, + "rewards/rejected": -1.9673463106155396, + "step": 4351 + }, + { + "epoch": 0.5, + "learning_rate": 1.517733817160248e-07, + "logits/chosen": -3.0071463584899902, + "logits/rejected": -2.820805072784424, + "logps/chosen": -487.60791015625, + "logps/rejected": -267.1512451171875, + "loss": 0.4815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1285768747329712, + "rewards/margins": 1.3028665781021118, + "rewards/rejected": -1.431443452835083, + "step": 4352 + }, + { + "epoch": 0.5, + "learning_rate": 1.5173826524640057e-07, + "logits/chosen": -2.438716173171997, + "logits/rejected": -2.2845852375030518, + "logps/chosen": -333.16571044921875, + "logps/rejected": -378.7651672363281, + "loss": 0.4016, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021643638610839844, + "rewards/margins": 2.5265846252441406, + "rewards/rejected": -2.504940986633301, + "step": 4353 + }, + { + "epoch": 0.5, + "learning_rate": 1.517031487767763e-07, + "logits/chosen": -2.881808280944824, + "logits/rejected": -2.944437026977539, + "logps/chosen": -266.9672546386719, + "logps/rejected": -256.4344177246094, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4486692547798157, + "rewards/margins": 2.8912971019744873, + "rewards/rejected": -2.4426279067993164, + "step": 4354 + }, + { + "epoch": 0.5, + "learning_rate": 1.5166803230715206e-07, + "logits/chosen": -2.6030170917510986, + "logits/rejected": -2.6742677688598633, + "logps/chosen": -344.979736328125, + "logps/rejected": -435.9056701660156, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1490846574306488, + "rewards/margins": 1.406320571899414, + "rewards/rejected": -1.2572360038757324, + "step": 4355 + }, + { + "epoch": 0.5, + "learning_rate": 1.516329158375278e-07, + "logits/chosen": -3.1855664253234863, + "logits/rejected": -2.9454476833343506, + "logps/chosen": -236.99270629882812, + "logps/rejected": -241.7470245361328, + "loss": 0.2038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23349729180335999, + "rewards/margins": 1.9345985651016235, + "rewards/rejected": -1.7011014223098755, + "step": 4356 + }, + { + "epoch": 0.5, + "learning_rate": 1.5159779936790354e-07, + "logits/chosen": -3.0357120037078857, + "logits/rejected": -3.1227214336395264, + "logps/chosen": -132.56565856933594, + "logps/rejected": -178.5340576171875, + "loss": 0.5921, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5090728998184204, + "rewards/margins": 0.8391250967979431, + "rewards/rejected": -1.3481979370117188, + "step": 4357 + }, + { + "epoch": 0.5, + "learning_rate": 1.515626828982793e-07, + "logits/chosen": -2.733494758605957, + "logits/rejected": -2.66791033744812, + "logps/chosen": -274.7735595703125, + "logps/rejected": -298.78363037109375, + "loss": 0.2522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1007273867726326, + "rewards/margins": 1.97842538356781, + "rewards/rejected": -1.8776981830596924, + "step": 4358 + }, + { + "epoch": 0.5, + "learning_rate": 1.5152756642865502e-07, + "logits/chosen": -3.2907021045684814, + "logits/rejected": -3.1837379932403564, + "logps/chosen": -131.64645385742188, + "logps/rejected": -244.5100860595703, + "loss": 0.2827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09286855161190033, + "rewards/margins": 1.8562700748443604, + "rewards/rejected": -1.9491386413574219, + "step": 4359 + }, + { + "epoch": 0.5, + "learning_rate": 1.5149244995903077e-07, + "logits/chosen": -3.263777732849121, + "logits/rejected": -3.0694971084594727, + "logps/chosen": -238.61192321777344, + "logps/rejected": -164.87884521484375, + "loss": 0.6013, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23120154440402985, + "rewards/margins": 0.8004652261734009, + "rewards/rejected": -1.0316667556762695, + "step": 4360 + }, + { + "epoch": 0.5, + "learning_rate": 1.5145733348940653e-07, + "logits/chosen": -2.444157123565674, + "logits/rejected": -2.5378336906433105, + "logps/chosen": -201.5517120361328, + "logps/rejected": -304.38702392578125, + "loss": 0.2951, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35066503286361694, + "rewards/margins": 2.5386009216308594, + "rewards/rejected": -2.1879358291625977, + "step": 4361 + }, + { + "epoch": 0.5, + "learning_rate": 1.5142221701978225e-07, + "logits/chosen": -2.8233203887939453, + "logits/rejected": -2.3043649196624756, + "logps/chosen": -244.50408935546875, + "logps/rejected": -249.8031005859375, + "loss": 0.8239, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23058897256851196, + "rewards/margins": 0.511406421661377, + "rewards/rejected": -0.7419954538345337, + "step": 4362 + }, + { + "epoch": 0.5, + "learning_rate": 1.51387100550158e-07, + "logits/chosen": -2.7693850994110107, + "logits/rejected": -2.70414400100708, + "logps/chosen": -248.0126953125, + "logps/rejected": -299.70587158203125, + "loss": 0.2925, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12779343128204346, + "rewards/margins": 1.714442253112793, + "rewards/rejected": -1.586648941040039, + "step": 4363 + }, + { + "epoch": 0.5, + "learning_rate": 1.513519840805338e-07, + "logits/chosen": -2.958725690841675, + "logits/rejected": -2.4526329040527344, + "logps/chosen": -279.5599670410156, + "logps/rejected": -218.04818725585938, + "loss": 0.4381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46563881635665894, + "rewards/margins": 1.102436900138855, + "rewards/rejected": -1.5680756568908691, + "step": 4364 + }, + { + "epoch": 0.5, + "learning_rate": 1.5131686761090951e-07, + "logits/chosen": -4.099704742431641, + "logits/rejected": -3.954615354537964, + "logps/chosen": -390.666259765625, + "logps/rejected": -342.74322509765625, + "loss": 0.2235, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15247058868408203, + "rewards/margins": 2.370198965072632, + "rewards/rejected": -2.217728614807129, + "step": 4365 + }, + { + "epoch": 0.5, + "learning_rate": 1.5128175114128527e-07, + "logits/chosen": -3.5607848167419434, + "logits/rejected": -3.17744779586792, + "logps/chosen": -209.37461853027344, + "logps/rejected": -261.11126708984375, + "loss": 0.746, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43594038486480713, + "rewards/margins": 0.322238951921463, + "rewards/rejected": -0.7581793665885925, + "step": 4366 + }, + { + "epoch": 0.5, + "learning_rate": 1.51246634671661e-07, + "logits/chosen": -3.3152482509613037, + "logits/rejected": -3.2359282970428467, + "logps/chosen": -424.8460388183594, + "logps/rejected": -372.7846374511719, + "loss": 0.3487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08495499193668365, + "rewards/margins": 2.606382369995117, + "rewards/rejected": -2.6913375854492188, + "step": 4367 + }, + { + "epoch": 0.5, + "learning_rate": 1.5121151820203675e-07, + "logits/chosen": -3.177506446838379, + "logits/rejected": -3.040463924407959, + "logps/chosen": -370.3385925292969, + "logps/rejected": -255.0132293701172, + "loss": 0.7806, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.143892303109169, + "rewards/margins": 0.7528940439224243, + "rewards/rejected": -0.8967862725257874, + "step": 4368 + }, + { + "epoch": 0.5, + "learning_rate": 1.511764017324125e-07, + "logits/chosen": -2.9256277084350586, + "logits/rejected": -2.79984450340271, + "logps/chosen": -167.287353515625, + "logps/rejected": -144.92601013183594, + "loss": 0.1846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27300214767456055, + "rewards/margins": 2.126824378967285, + "rewards/rejected": -2.3998262882232666, + "step": 4369 + }, + { + "epoch": 0.5, + "learning_rate": 1.5114128526278823e-07, + "logits/chosen": -3.4003474712371826, + "logits/rejected": -3.333348035812378, + "logps/chosen": -208.35316467285156, + "logps/rejected": -236.85565185546875, + "loss": 0.3747, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07052738964557648, + "rewards/margins": 1.8659526109695435, + "rewards/rejected": -1.936479926109314, + "step": 4370 + }, + { + "epoch": 0.5, + "learning_rate": 1.5110616879316399e-07, + "logits/chosen": -3.586909055709839, + "logits/rejected": -3.4962854385375977, + "logps/chosen": -311.3375549316406, + "logps/rejected": -300.5721435546875, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27034351229667664, + "rewards/margins": 2.939575672149658, + "rewards/rejected": -3.2099194526672363, + "step": 4371 + }, + { + "epoch": 0.5, + "learning_rate": 1.5107105232353974e-07, + "logits/chosen": -3.6852259635925293, + "logits/rejected": -3.8589088916778564, + "logps/chosen": -217.17784118652344, + "logps/rejected": -229.31895446777344, + "loss": 0.3724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5467852354049683, + "rewards/margins": 2.5945963859558105, + "rewards/rejected": -3.1413817405700684, + "step": 4372 + }, + { + "epoch": 0.5, + "learning_rate": 1.5103593585391547e-07, + "logits/chosen": -3.1161463260650635, + "logits/rejected": -3.361480474472046, + "logps/chosen": -212.10824584960938, + "logps/rejected": -285.73980712890625, + "loss": 0.4869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25610148906707764, + "rewards/margins": 1.057396411895752, + "rewards/rejected": -1.3134979009628296, + "step": 4373 + }, + { + "epoch": 0.5, + "learning_rate": 1.5100081938429125e-07, + "logits/chosen": -2.880406379699707, + "logits/rejected": -2.973616361618042, + "logps/chosen": -232.09947204589844, + "logps/rejected": -239.45208740234375, + "loss": 0.1625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39500609040260315, + "rewards/margins": 2.0360450744628906, + "rewards/rejected": -1.641039252281189, + "step": 4374 + }, + { + "epoch": 0.5, + "learning_rate": 1.5096570291466695e-07, + "logits/chosen": -3.364504337310791, + "logits/rejected": -2.9983022212982178, + "logps/chosen": -332.7726135253906, + "logps/rejected": -205.62091064453125, + "loss": 0.7572, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37113887071609497, + "rewards/margins": 1.4453901052474976, + "rewards/rejected": -1.8165290355682373, + "step": 4375 + }, + { + "epoch": 0.5, + "learning_rate": 1.5093058644504273e-07, + "logits/chosen": -3.013300895690918, + "logits/rejected": -3.1054279804229736, + "logps/chosen": -268.9093322753906, + "logps/rejected": -321.533935546875, + "loss": 0.7096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.377937376499176, + "rewards/margins": 1.3101989030838013, + "rewards/rejected": -1.688136339187622, + "step": 4376 + }, + { + "epoch": 0.5, + "learning_rate": 1.5089546997541848e-07, + "logits/chosen": -3.387188673019409, + "logits/rejected": -3.7440662384033203, + "logps/chosen": -219.77719116210938, + "logps/rejected": -246.3402557373047, + "loss": 0.5868, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4854031503200531, + "rewards/margins": 1.3160405158996582, + "rewards/rejected": -1.8014435768127441, + "step": 4377 + }, + { + "epoch": 0.5, + "learning_rate": 1.508603535057942e-07, + "logits/chosen": -2.4300150871276855, + "logits/rejected": -2.282163143157959, + "logps/chosen": -206.68984985351562, + "logps/rejected": -337.49383544921875, + "loss": 0.5565, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01374092698097229, + "rewards/margins": 1.552530288696289, + "rewards/rejected": -1.5387895107269287, + "step": 4378 + }, + { + "epoch": 0.5, + "learning_rate": 1.5082523703616996e-07, + "logits/chosen": -3.096707820892334, + "logits/rejected": -3.152357816696167, + "logps/chosen": -221.82876586914062, + "logps/rejected": -132.79945373535156, + "loss": 0.4855, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05497577786445618, + "rewards/margins": 1.5173349380493164, + "rewards/rejected": -1.4623591899871826, + "step": 4379 + }, + { + "epoch": 0.5, + "learning_rate": 1.507901205665457e-07, + "logits/chosen": -3.2750179767608643, + "logits/rejected": -3.3553595542907715, + "logps/chosen": -245.12274169921875, + "logps/rejected": -223.70758056640625, + "loss": 0.5897, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1376083791255951, + "rewards/margins": 0.8249518871307373, + "rewards/rejected": -0.9625602960586548, + "step": 4380 + }, + { + "epoch": 0.51, + "learning_rate": 1.5075500409692144e-07, + "logits/chosen": -2.7313644886016846, + "logits/rejected": -2.7743587493896484, + "logps/chosen": -540.2749633789062, + "logps/rejected": -460.87469482421875, + "loss": 0.3841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19075126945972443, + "rewards/margins": 2.1861088275909424, + "rewards/rejected": -2.3768601417541504, + "step": 4381 + }, + { + "epoch": 0.51, + "learning_rate": 1.507198876272972e-07, + "logits/chosen": -2.812267303466797, + "logits/rejected": -2.6020193099975586, + "logps/chosen": -294.84356689453125, + "logps/rejected": -347.6492919921875, + "loss": 0.5273, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005048975348472595, + "rewards/margins": 1.069933295249939, + "rewards/rejected": -1.0749822854995728, + "step": 4382 + }, + { + "epoch": 0.51, + "learning_rate": 1.5068477115767293e-07, + "logits/chosen": -2.158843755722046, + "logits/rejected": -1.9106693267822266, + "logps/chosen": -303.96807861328125, + "logps/rejected": -395.33856201171875, + "loss": 0.7692, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22693565487861633, + "rewards/margins": 1.0839990377426147, + "rewards/rejected": -0.8570634126663208, + "step": 4383 + }, + { + "epoch": 0.51, + "learning_rate": 1.5064965468804868e-07, + "logits/chosen": -3.5930843353271484, + "logits/rejected": -3.3900411128997803, + "logps/chosen": -287.32232666015625, + "logps/rejected": -253.5941925048828, + "loss": 0.1538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6565792560577393, + "rewards/margins": 3.5085532665252686, + "rewards/rejected": -2.8519740104675293, + "step": 4384 + }, + { + "epoch": 0.51, + "learning_rate": 1.5061453821842446e-07, + "logits/chosen": -3.605456829071045, + "logits/rejected": -3.0852580070495605, + "logps/chosen": -230.35739135742188, + "logps/rejected": -126.30648803710938, + "loss": 0.6576, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41916847229003906, + "rewards/margins": 0.625045120716095, + "rewards/rejected": -1.0442136526107788, + "step": 4385 + }, + { + "epoch": 0.51, + "learning_rate": 1.5057942174880016e-07, + "logits/chosen": -3.6225457191467285, + "logits/rejected": -3.457719087600708, + "logps/chosen": -388.4093017578125, + "logps/rejected": -215.0829620361328, + "loss": 0.4125, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18484365940093994, + "rewards/margins": 1.8490149974822998, + "rewards/rejected": -1.6641713380813599, + "step": 4386 + }, + { + "epoch": 0.51, + "learning_rate": 1.5054430527917594e-07, + "logits/chosen": -2.8146331310272217, + "logits/rejected": -3.00343656539917, + "logps/chosen": -351.9966735839844, + "logps/rejected": -319.5005187988281, + "loss": 0.4834, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8603094220161438, + "rewards/margins": 0.8501147031784058, + "rewards/rejected": -1.7104240655899048, + "step": 4387 + }, + { + "epoch": 0.51, + "learning_rate": 1.5050918880955167e-07, + "logits/chosen": -3.8175971508026123, + "logits/rejected": -3.799285411834717, + "logps/chosen": -248.61203002929688, + "logps/rejected": -258.4638366699219, + "loss": 0.4995, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4062659442424774, + "rewards/margins": 0.9454419016838074, + "rewards/rejected": -1.3517078161239624, + "step": 4388 + }, + { + "epoch": 0.51, + "learning_rate": 1.5047407233992742e-07, + "logits/chosen": -3.2852892875671387, + "logits/rejected": -3.452378034591675, + "logps/chosen": -140.73739624023438, + "logps/rejected": -175.56150817871094, + "loss": 0.4708, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22385556995868683, + "rewards/margins": 2.776371955871582, + "rewards/rejected": -3.000227689743042, + "step": 4389 + }, + { + "epoch": 0.51, + "learning_rate": 1.5043895587030318e-07, + "logits/chosen": -3.8097360134124756, + "logits/rejected": -3.8893585205078125, + "logps/chosen": -186.71109008789062, + "logps/rejected": -225.67286682128906, + "loss": 0.418, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3583275079727173, + "rewards/margins": 1.45175039768219, + "rewards/rejected": -1.8100779056549072, + "step": 4390 + }, + { + "epoch": 0.51, + "learning_rate": 1.504038394006789e-07, + "logits/chosen": -2.8321709632873535, + "logits/rejected": -2.5557775497436523, + "logps/chosen": -262.4767150878906, + "logps/rejected": -282.2918395996094, + "loss": 0.852, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13637547194957733, + "rewards/margins": 0.42150604724884033, + "rewards/rejected": -0.5578815340995789, + "step": 4391 + }, + { + "epoch": 0.51, + "learning_rate": 1.5036872293105466e-07, + "logits/chosen": -3.213109254837036, + "logits/rejected": -3.2657155990600586, + "logps/chosen": -248.5978546142578, + "logps/rejected": -132.88436889648438, + "loss": 0.3951, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.183022603392601, + "rewards/margins": 1.1139625310897827, + "rewards/rejected": -0.9309399724006653, + "step": 4392 + }, + { + "epoch": 0.51, + "learning_rate": 1.503336064614304e-07, + "logits/chosen": -3.4584922790527344, + "logits/rejected": -3.479621171951294, + "logps/chosen": -322.09735107421875, + "logps/rejected": -275.2425537109375, + "loss": 0.3798, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16465619206428528, + "rewards/margins": 1.8723613023757935, + "rewards/rejected": -2.037017583847046, + "step": 4393 + }, + { + "epoch": 0.51, + "learning_rate": 1.5029848999180614e-07, + "logits/chosen": -3.163559675216675, + "logits/rejected": -2.973947048187256, + "logps/chosen": -361.8778991699219, + "logps/rejected": -230.2376708984375, + "loss": 0.5491, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.486987441778183, + "rewards/margins": 1.4815021753311157, + "rewards/rejected": -1.968489646911621, + "step": 4394 + }, + { + "epoch": 0.51, + "learning_rate": 1.502633735221819e-07, + "logits/chosen": -3.8698008060455322, + "logits/rejected": -3.915782928466797, + "logps/chosen": -191.20315551757812, + "logps/rejected": -270.44012451171875, + "loss": 0.4353, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.322562575340271, + "rewards/margins": 1.7500865459442139, + "rewards/rejected": -1.4275238513946533, + "step": 4395 + }, + { + "epoch": 0.51, + "learning_rate": 1.5022825705255762e-07, + "logits/chosen": -2.4990415573120117, + "logits/rejected": -2.477761745452881, + "logps/chosen": -316.3808288574219, + "logps/rejected": -283.4256591796875, + "loss": 0.5598, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8832269906997681, + "rewards/margins": 1.0751512050628662, + "rewards/rejected": -1.9583783149719238, + "step": 4396 + }, + { + "epoch": 0.51, + "learning_rate": 1.5019314058293337e-07, + "logits/chosen": -2.932175636291504, + "logits/rejected": -2.7984063625335693, + "logps/chosen": -375.29150390625, + "logps/rejected": -345.2872314453125, + "loss": 0.5167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12368771433830261, + "rewards/margins": 0.7360104322433472, + "rewards/rejected": -0.8596981763839722, + "step": 4397 + }, + { + "epoch": 0.51, + "learning_rate": 1.5015802411330915e-07, + "logits/chosen": -2.7629036903381348, + "logits/rejected": -2.5199787616729736, + "logps/chosen": -215.64041137695312, + "logps/rejected": -272.7957763671875, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8380064368247986, + "rewards/margins": 3.336116313934326, + "rewards/rejected": -2.498109817504883, + "step": 4398 + }, + { + "epoch": 0.51, + "learning_rate": 1.5012290764368488e-07, + "logits/chosen": -2.8755970001220703, + "logits/rejected": -2.979569911956787, + "logps/chosen": -180.9604949951172, + "logps/rejected": -205.2681884765625, + "loss": 0.3172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17452417314052582, + "rewards/margins": 2.5646045207977295, + "rewards/rejected": -2.739128589630127, + "step": 4399 + }, + { + "epoch": 0.51, + "learning_rate": 1.5008779117406064e-07, + "logits/chosen": -3.3309595584869385, + "logits/rejected": -3.5195393562316895, + "logps/chosen": -155.01641845703125, + "logps/rejected": -241.80455017089844, + "loss": 0.3502, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2577670216560364, + "rewards/margins": 2.0736489295959473, + "rewards/rejected": -1.8158820867538452, + "step": 4400 + }, + { + "epoch": 0.51, + "learning_rate": 1.500526747044364e-07, + "logits/chosen": -3.2131705284118652, + "logits/rejected": -3.1328415870666504, + "logps/chosen": -315.64520263671875, + "logps/rejected": -254.4636993408203, + "loss": 0.2353, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0831565409898758, + "rewards/margins": 3.3797080516815186, + "rewards/rejected": -3.296551465988159, + "step": 4401 + }, + { + "epoch": 0.51, + "learning_rate": 1.5001755823481212e-07, + "logits/chosen": -3.269278049468994, + "logits/rejected": -2.965360403060913, + "logps/chosen": -362.98297119140625, + "logps/rejected": -238.10060119628906, + "loss": 0.4696, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24326296150684357, + "rewards/margins": 2.0962040424346924, + "rewards/rejected": -1.8529409170150757, + "step": 4402 + }, + { + "epoch": 0.51, + "learning_rate": 1.4998244176518787e-07, + "logits/chosen": -3.8166439533233643, + "logits/rejected": -3.814970016479492, + "logps/chosen": -232.79920959472656, + "logps/rejected": -221.6922607421875, + "loss": 0.4514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46532317996025085, + "rewards/margins": 2.42971134185791, + "rewards/rejected": -2.8950347900390625, + "step": 4403 + }, + { + "epoch": 0.51, + "learning_rate": 1.4994732529556362e-07, + "logits/chosen": -3.784109115600586, + "logits/rejected": -3.1705081462860107, + "logps/chosen": -424.74676513671875, + "logps/rejected": -292.21197509765625, + "loss": 0.7107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10034163296222687, + "rewards/margins": 1.4442434310913086, + "rewards/rejected": -1.5445849895477295, + "step": 4404 + }, + { + "epoch": 0.51, + "learning_rate": 1.4991220882593935e-07, + "logits/chosen": -2.2929201126098633, + "logits/rejected": -2.325254440307617, + "logps/chosen": -525.851806640625, + "logps/rejected": -365.5827941894531, + "loss": 0.3614, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17604303359985352, + "rewards/margins": 2.818106174468994, + "rewards/rejected": -2.6420631408691406, + "step": 4405 + }, + { + "epoch": 0.51, + "learning_rate": 1.498770923563151e-07, + "logits/chosen": -3.649132251739502, + "logits/rejected": -3.6692564487457275, + "logps/chosen": -472.29913330078125, + "logps/rejected": -265.0448913574219, + "loss": 0.4561, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8902941942214966, + "rewards/margins": 1.2425613403320312, + "rewards/rejected": -2.1328556537628174, + "step": 4406 + }, + { + "epoch": 0.51, + "learning_rate": 1.4984197588669086e-07, + "logits/chosen": -4.007996082305908, + "logits/rejected": -4.043598175048828, + "logps/chosen": -218.08935546875, + "logps/rejected": -215.3282470703125, + "loss": 0.4032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3583040237426758, + "rewards/margins": 1.7505686283111572, + "rewards/rejected": -2.108872413635254, + "step": 4407 + }, + { + "epoch": 0.51, + "learning_rate": 1.498068594170666e-07, + "logits/chosen": -3.574333906173706, + "logits/rejected": -3.482656240463257, + "logps/chosen": -289.4785461425781, + "logps/rejected": -249.06854248046875, + "loss": 0.4187, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30419886112213135, + "rewards/margins": 2.53704833984375, + "rewards/rejected": -2.841247320175171, + "step": 4408 + }, + { + "epoch": 0.51, + "learning_rate": 1.4977174294744234e-07, + "logits/chosen": -2.5965232849121094, + "logits/rejected": -2.196737766265869, + "logps/chosen": -340.62823486328125, + "logps/rejected": -303.58209228515625, + "loss": 0.3998, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35948652029037476, + "rewards/margins": 0.894048273563385, + "rewards/rejected": -0.5345617532730103, + "step": 4409 + }, + { + "epoch": 0.51, + "learning_rate": 1.497366264778181e-07, + "logits/chosen": -3.259554386138916, + "logits/rejected": -3.1776890754699707, + "logps/chosen": -169.858154296875, + "logps/rejected": -280.4656677246094, + "loss": 0.5325, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08583186566829681, + "rewards/margins": 1.023802399635315, + "rewards/rejected": -1.109634280204773, + "step": 4410 + }, + { + "epoch": 0.51, + "learning_rate": 1.4970151000819385e-07, + "logits/chosen": -3.028939723968506, + "logits/rejected": -3.0365071296691895, + "logps/chosen": -354.32659912109375, + "logps/rejected": -307.4539794921875, + "loss": 0.3942, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018413707613945007, + "rewards/margins": 2.7685017585754395, + "rewards/rejected": -2.7500882148742676, + "step": 4411 + }, + { + "epoch": 0.51, + "learning_rate": 1.4966639353856958e-07, + "logits/chosen": -3.134254217147827, + "logits/rejected": -2.996080160140991, + "logps/chosen": -313.74981689453125, + "logps/rejected": -473.1878662109375, + "loss": 0.625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43938106298446655, + "rewards/margins": 1.235554814338684, + "rewards/rejected": -1.6749359369277954, + "step": 4412 + }, + { + "epoch": 0.51, + "learning_rate": 1.4963127706894533e-07, + "logits/chosen": -2.737462282180786, + "logits/rejected": -2.7807319164276123, + "logps/chosen": -222.94297790527344, + "logps/rejected": -209.45831298828125, + "loss": 0.4198, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2705109715461731, + "rewards/margins": 0.9310286641120911, + "rewards/rejected": -0.660517692565918, + "step": 4413 + }, + { + "epoch": 0.51, + "learning_rate": 1.4959616059932106e-07, + "logits/chosen": -3.1707406044006348, + "logits/rejected": -3.258849620819092, + "logps/chosen": -340.6451110839844, + "logps/rejected": -407.4120788574219, + "loss": 0.6579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.045621439814567566, + "rewards/margins": 0.5628175139427185, + "rewards/rejected": -0.6084389686584473, + "step": 4414 + }, + { + "epoch": 0.51, + "learning_rate": 1.4956104412969684e-07, + "logits/chosen": -2.5895838737487793, + "logits/rejected": -2.456406593322754, + "logps/chosen": -334.995361328125, + "logps/rejected": -233.80030822753906, + "loss": 0.3257, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03696444630622864, + "rewards/margins": 1.3640143871307373, + "rewards/rejected": -1.3270500898361206, + "step": 4415 + }, + { + "epoch": 0.51, + "learning_rate": 1.4952592766007256e-07, + "logits/chosen": -3.2398808002471924, + "logits/rejected": -2.8593647480010986, + "logps/chosen": -207.0592041015625, + "logps/rejected": -282.97509765625, + "loss": 0.1948, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039132606238126755, + "rewards/margins": 3.950399875640869, + "rewards/rejected": -3.989531993865967, + "step": 4416 + }, + { + "epoch": 0.51, + "learning_rate": 1.4949081119044832e-07, + "logits/chosen": -3.310863971710205, + "logits/rejected": -2.8487966060638428, + "logps/chosen": -237.89346313476562, + "logps/rejected": -289.961669921875, + "loss": 0.3426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44219154119491577, + "rewards/margins": 2.1357641220092773, + "rewards/rejected": -2.577955722808838, + "step": 4417 + }, + { + "epoch": 0.51, + "learning_rate": 1.4945569472082405e-07, + "logits/chosen": -2.9670209884643555, + "logits/rejected": -2.81146502494812, + "logps/chosen": -334.0722961425781, + "logps/rejected": -236.03326416015625, + "loss": 0.4056, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12095295637845993, + "rewards/margins": 1.6039036512374878, + "rewards/rejected": -1.4829509258270264, + "step": 4418 + }, + { + "epoch": 0.51, + "learning_rate": 1.4942057825119983e-07, + "logits/chosen": -2.8091940879821777, + "logits/rejected": -3.009946346282959, + "logps/chosen": -359.671142578125, + "logps/rejected": -229.96270751953125, + "loss": 0.2424, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13167595863342285, + "rewards/margins": 2.332341194152832, + "rewards/rejected": -2.464016914367676, + "step": 4419 + }, + { + "epoch": 0.51, + "learning_rate": 1.4938546178157555e-07, + "logits/chosen": -3.322906494140625, + "logits/rejected": -3.467156171798706, + "logps/chosen": -132.37025451660156, + "logps/rejected": -163.06884765625, + "loss": 0.3691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1954316794872284, + "rewards/margins": 1.3765482902526855, + "rewards/rejected": -1.1811165809631348, + "step": 4420 + }, + { + "epoch": 0.51, + "learning_rate": 1.493503453119513e-07, + "logits/chosen": -2.797746181488037, + "logits/rejected": -2.850905418395996, + "logps/chosen": -339.642578125, + "logps/rejected": -344.678955078125, + "loss": 0.3704, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2056894302368164, + "rewards/margins": 1.4266995191574097, + "rewards/rejected": -1.2210102081298828, + "step": 4421 + }, + { + "epoch": 0.51, + "learning_rate": 1.4931522884232703e-07, + "logits/chosen": -2.9927191734313965, + "logits/rejected": -2.8240818977355957, + "logps/chosen": -232.27395629882812, + "logps/rejected": -258.9929504394531, + "loss": 0.5626, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5572441816329956, + "rewards/margins": 0.8226922154426575, + "rewards/rejected": -0.26544803380966187, + "step": 4422 + }, + { + "epoch": 0.51, + "learning_rate": 1.492801123727028e-07, + "logits/chosen": -2.2440619468688965, + "logits/rejected": -2.090486764907837, + "logps/chosen": -279.66473388671875, + "logps/rejected": -223.8531494140625, + "loss": 0.6709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.029772847890853882, + "rewards/margins": 0.7437382936477661, + "rewards/rejected": -0.7735111117362976, + "step": 4423 + }, + { + "epoch": 0.51, + "learning_rate": 1.4924499590307854e-07, + "logits/chosen": -3.514378786087036, + "logits/rejected": -3.6415669918060303, + "logps/chosen": -191.73605346679688, + "logps/rejected": -198.2625274658203, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4174414575099945, + "rewards/margins": 2.6252870559692383, + "rewards/rejected": -3.0427284240722656, + "step": 4424 + }, + { + "epoch": 0.51, + "learning_rate": 1.492098794334543e-07, + "logits/chosen": -2.643167018890381, + "logits/rejected": -2.650768995285034, + "logps/chosen": -149.72711181640625, + "logps/rejected": -278.10491943359375, + "loss": 0.4056, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1489008516073227, + "rewards/margins": 2.0988264083862305, + "rewards/rejected": -1.949925422668457, + "step": 4425 + }, + { + "epoch": 0.51, + "learning_rate": 1.4917476296383002e-07, + "logits/chosen": -3.0202174186706543, + "logits/rejected": -3.0639896392822266, + "logps/chosen": -258.9016418457031, + "logps/rejected": -243.50985717773438, + "loss": 0.3458, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22540387511253357, + "rewards/margins": 1.4120639562606812, + "rewards/rejected": -1.637467861175537, + "step": 4426 + }, + { + "epoch": 0.51, + "learning_rate": 1.4913964649420578e-07, + "logits/chosen": -3.5829193592071533, + "logits/rejected": -3.4704067707061768, + "logps/chosen": -144.79978942871094, + "logps/rejected": -148.05810546875, + "loss": 0.9794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8509852290153503, + "rewards/margins": 0.4472019672393799, + "rewards/rejected": -1.298187017440796, + "step": 4427 + }, + { + "epoch": 0.51, + "learning_rate": 1.4910453002458153e-07, + "logits/chosen": -3.1583967208862305, + "logits/rejected": -3.1966285705566406, + "logps/chosen": -203.6033477783203, + "logps/rejected": -310.8119812011719, + "loss": 0.1494, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39956507086753845, + "rewards/margins": 3.8972878456115723, + "rewards/rejected": -3.497722625732422, + "step": 4428 + }, + { + "epoch": 0.51, + "learning_rate": 1.4906941355495726e-07, + "logits/chosen": -3.2788944244384766, + "logits/rejected": -3.121509552001953, + "logps/chosen": -205.30548095703125, + "logps/rejected": -277.1590881347656, + "loss": 0.559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1136661097407341, + "rewards/margins": 1.0267040729522705, + "rewards/rejected": -1.14037024974823, + "step": 4429 + }, + { + "epoch": 0.51, + "learning_rate": 1.49034297085333e-07, + "logits/chosen": -2.9656434059143066, + "logits/rejected": -2.999607801437378, + "logps/chosen": -332.5215148925781, + "logps/rejected": -239.9374542236328, + "loss": 0.3896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4691407084465027, + "rewards/margins": 1.0405354499816895, + "rewards/rejected": -0.571394681930542, + "step": 4430 + }, + { + "epoch": 0.51, + "learning_rate": 1.4899918061570874e-07, + "logits/chosen": -3.8042986392974854, + "logits/rejected": -3.655273675918579, + "logps/chosen": -148.4359130859375, + "logps/rejected": -181.92172241210938, + "loss": 0.4728, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.149905726313591, + "rewards/margins": 0.7844531536102295, + "rewards/rejected": -0.6345474720001221, + "step": 4431 + }, + { + "epoch": 0.51, + "learning_rate": 1.4896406414608452e-07, + "logits/chosen": -3.5535364151000977, + "logits/rejected": -3.6351075172424316, + "logps/chosen": -227.19775390625, + "logps/rejected": -273.96795654296875, + "loss": 0.2142, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1983420848846436, + "rewards/margins": 2.7118406295776367, + "rewards/rejected": -1.5134987831115723, + "step": 4432 + }, + { + "epoch": 0.51, + "learning_rate": 1.4892894767646025e-07, + "logits/chosen": -3.7133255004882812, + "logits/rejected": -4.147195816040039, + "logps/chosen": -135.9350128173828, + "logps/rejected": -306.511474609375, + "loss": 0.4503, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6263597011566162, + "rewards/margins": 2.3074469566345215, + "rewards/rejected": -2.9338066577911377, + "step": 4433 + }, + { + "epoch": 0.51, + "learning_rate": 1.48893831206836e-07, + "logits/chosen": -2.8249757289886475, + "logits/rejected": -2.8673744201660156, + "logps/chosen": -263.5940246582031, + "logps/rejected": -208.982666015625, + "loss": 0.5149, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1985962837934494, + "rewards/margins": 1.2444778680801392, + "rewards/rejected": -1.0458815097808838, + "step": 4434 + }, + { + "epoch": 0.51, + "learning_rate": 1.4885871473721173e-07, + "logits/chosen": -2.9143614768981934, + "logits/rejected": -2.78123140335083, + "logps/chosen": -186.65493774414062, + "logps/rejected": -163.7853240966797, + "loss": 0.2555, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09070964902639389, + "rewards/margins": 1.5750693082809448, + "rewards/rejected": -1.4843597412109375, + "step": 4435 + }, + { + "epoch": 0.51, + "learning_rate": 1.488235982675875e-07, + "logits/chosen": -3.3565430641174316, + "logits/rejected": -3.221827745437622, + "logps/chosen": -219.25714111328125, + "logps/rejected": -365.2272033691406, + "loss": 0.5166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5754996538162231, + "rewards/margins": 4.24521017074585, + "rewards/rejected": -4.820710182189941, + "step": 4436 + }, + { + "epoch": 0.51, + "learning_rate": 1.4878848179796324e-07, + "logits/chosen": -2.369292974472046, + "logits/rejected": -2.383864164352417, + "logps/chosen": -231.35989379882812, + "logps/rejected": -190.14627075195312, + "loss": 0.1785, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4834654927253723, + "rewards/margins": 2.323647975921631, + "rewards/rejected": -1.8401823043823242, + "step": 4437 + }, + { + "epoch": 0.51, + "learning_rate": 1.48753365328339e-07, + "logits/chosen": -2.5285165309906006, + "logits/rejected": -2.7143735885620117, + "logps/chosen": -338.71844482421875, + "logps/rejected": -288.13250732421875, + "loss": 0.5167, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16128399968147278, + "rewards/margins": 1.4340345859527588, + "rewards/rejected": -1.2727504968643188, + "step": 4438 + }, + { + "epoch": 0.51, + "learning_rate": 1.4871824885871472e-07, + "logits/chosen": -3.5200295448303223, + "logits/rejected": -3.30025053024292, + "logps/chosen": -187.4435577392578, + "logps/rejected": -214.3146514892578, + "loss": 0.1836, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06196771562099457, + "rewards/margins": 2.5855486392974854, + "rewards/rejected": -2.52358078956604, + "step": 4439 + }, + { + "epoch": 0.51, + "learning_rate": 1.4868313238909047e-07, + "logits/chosen": -2.809601306915283, + "logits/rejected": -2.520714044570923, + "logps/chosen": -244.7889404296875, + "logps/rejected": -239.95440673828125, + "loss": 0.3179, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2035580575466156, + "rewards/margins": 1.294797658920288, + "rewards/rejected": -1.498355746269226, + "step": 4440 + }, + { + "epoch": 0.51, + "learning_rate": 1.4864801591946623e-07, + "logits/chosen": -3.6114373207092285, + "logits/rejected": -3.5216073989868164, + "logps/chosen": -219.03671264648438, + "logps/rejected": -265.8146667480469, + "loss": 0.5364, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6353132724761963, + "rewards/margins": 0.6936151385307312, + "rewards/rejected": -1.3289283514022827, + "step": 4441 + }, + { + "epoch": 0.51, + "learning_rate": 1.4861289944984198e-07, + "logits/chosen": -3.6401960849761963, + "logits/rejected": -3.621309995651245, + "logps/chosen": -278.2627868652344, + "logps/rejected": -198.7195281982422, + "loss": 0.5203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21571093797683716, + "rewards/margins": 1.6665693521499634, + "rewards/rejected": -1.8822803497314453, + "step": 4442 + }, + { + "epoch": 0.51, + "learning_rate": 1.485777829802177e-07, + "logits/chosen": -2.9097719192504883, + "logits/rejected": -2.8784117698669434, + "logps/chosen": -183.85911560058594, + "logps/rejected": -159.04727172851562, + "loss": 0.6063, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05276265740394592, + "rewards/margins": 0.836016058921814, + "rewards/rejected": -0.8887786865234375, + "step": 4443 + }, + { + "epoch": 0.51, + "learning_rate": 1.4854266651059346e-07, + "logits/chosen": -2.7692065238952637, + "logits/rejected": -2.798689842224121, + "logps/chosen": -208.16793823242188, + "logps/rejected": -201.27383422851562, + "loss": 0.4149, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03829929232597351, + "rewards/margins": 0.8330237865447998, + "rewards/rejected": -0.7947244644165039, + "step": 4444 + }, + { + "epoch": 0.51, + "learning_rate": 1.4850755004096921e-07, + "logits/chosen": -2.9386284351348877, + "logits/rejected": -3.0295722484588623, + "logps/chosen": -322.6251525878906, + "logps/rejected": -317.8377380371094, + "loss": 0.2728, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09657379984855652, + "rewards/margins": 1.689000129699707, + "rewards/rejected": -1.785573959350586, + "step": 4445 + }, + { + "epoch": 0.51, + "learning_rate": 1.4847243357134494e-07, + "logits/chosen": -2.327707290649414, + "logits/rejected": -2.397747039794922, + "logps/chosen": -198.6548614501953, + "logps/rejected": -342.28570556640625, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4047154188156128, + "rewards/margins": 4.239616394042969, + "rewards/rejected": -3.8349013328552246, + "step": 4446 + }, + { + "epoch": 0.51, + "learning_rate": 1.484373171017207e-07, + "logits/chosen": -3.198585033416748, + "logits/rejected": -2.9963932037353516, + "logps/chosen": -290.19500732421875, + "logps/rejected": -237.94703674316406, + "loss": 0.2749, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17651121318340302, + "rewards/margins": 2.323784589767456, + "rewards/rejected": -2.500295639038086, + "step": 4447 + }, + { + "epoch": 0.51, + "learning_rate": 1.4840220063209645e-07, + "logits/chosen": -2.4866113662719727, + "logits/rejected": -2.4639556407928467, + "logps/chosen": -422.8250732421875, + "logps/rejected": -441.9895324707031, + "loss": 0.24, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4765273928642273, + "rewards/margins": 2.7117953300476074, + "rewards/rejected": -2.2352683544158936, + "step": 4448 + }, + { + "epoch": 0.51, + "learning_rate": 1.483670841624722e-07, + "logits/chosen": -2.889845609664917, + "logits/rejected": -3.0506131649017334, + "logps/chosen": -238.18173217773438, + "logps/rejected": -256.6129150390625, + "loss": 0.4197, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0840979516506195, + "rewards/margins": 1.2450153827667236, + "rewards/rejected": -1.1609172821044922, + "step": 4449 + }, + { + "epoch": 0.51, + "learning_rate": 1.4833196769284793e-07, + "logits/chosen": -2.688924789428711, + "logits/rejected": -2.687879800796509, + "logps/chosen": -293.0924072265625, + "logps/rejected": -254.48776245117188, + "loss": 0.5309, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37013405561447144, + "rewards/margins": 1.4146028757095337, + "rewards/rejected": -1.7847368717193604, + "step": 4450 + }, + { + "epoch": 0.51, + "learning_rate": 1.4829685122322368e-07, + "logits/chosen": -2.7830095291137695, + "logits/rejected": -2.76973295211792, + "logps/chosen": -492.3653564453125, + "logps/rejected": -217.7748260498047, + "loss": 0.3886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9104022979736328, + "rewards/margins": 0.9988981485366821, + "rewards/rejected": -1.909300446510315, + "step": 4451 + }, + { + "epoch": 0.51, + "learning_rate": 1.4826173475359944e-07, + "logits/chosen": -2.7749366760253906, + "logits/rejected": -2.8776772022247314, + "logps/chosen": -254.55068969726562, + "logps/rejected": -160.39788818359375, + "loss": 0.4646, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.548332154750824, + "rewards/margins": 1.146446704864502, + "rewards/rejected": -1.6947788000106812, + "step": 4452 + }, + { + "epoch": 0.51, + "learning_rate": 1.482266182839752e-07, + "logits/chosen": -2.840045690536499, + "logits/rejected": -2.918057918548584, + "logps/chosen": -194.0352020263672, + "logps/rejected": -235.64617919921875, + "loss": 0.2706, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32584789395332336, + "rewards/margins": 2.1029727458953857, + "rewards/rejected": -1.7771248817443848, + "step": 4453 + }, + { + "epoch": 0.51, + "learning_rate": 1.4819150181435092e-07, + "logits/chosen": -3.4630203247070312, + "logits/rejected": -3.376509189605713, + "logps/chosen": -425.3464050292969, + "logps/rejected": -304.2779235839844, + "loss": 0.39, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6722319722175598, + "rewards/margins": 1.1780495643615723, + "rewards/rejected": -1.8502817153930664, + "step": 4454 + }, + { + "epoch": 0.51, + "learning_rate": 1.4815638534472667e-07, + "logits/chosen": -2.607647657394409, + "logits/rejected": -2.6655056476593018, + "logps/chosen": -325.3609924316406, + "logps/rejected": -286.13543701171875, + "loss": 0.2558, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08824195712804794, + "rewards/margins": 2.7558321952819824, + "rewards/rejected": -2.6675901412963867, + "step": 4455 + }, + { + "epoch": 0.51, + "learning_rate": 1.4812126887510243e-07, + "logits/chosen": -3.135873794555664, + "logits/rejected": -3.3056201934814453, + "logps/chosen": -245.12570190429688, + "logps/rejected": -264.8995361328125, + "loss": 0.6259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38996973633766174, + "rewards/margins": 2.7337241172790527, + "rewards/rejected": -3.1236939430236816, + "step": 4456 + }, + { + "epoch": 0.51, + "learning_rate": 1.4808615240547815e-07, + "logits/chosen": -3.240366220474243, + "logits/rejected": -3.0487334728240967, + "logps/chosen": -342.52850341796875, + "logps/rejected": -277.62335205078125, + "loss": 0.3061, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6486632823944092, + "rewards/margins": 2.538386106491089, + "rewards/rejected": -1.8897227048873901, + "step": 4457 + }, + { + "epoch": 0.51, + "learning_rate": 1.480510359358539e-07, + "logits/chosen": -3.3613197803497314, + "logits/rejected": -3.5905370712280273, + "logps/chosen": -475.66656494140625, + "logps/rejected": -372.38623046875, + "loss": 0.2048, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0389859601855278, + "rewards/margins": 2.8403737545013428, + "rewards/rejected": -2.8013877868652344, + "step": 4458 + }, + { + "epoch": 0.51, + "learning_rate": 1.4801591946622966e-07, + "logits/chosen": -3.6479711532592773, + "logits/rejected": -3.561401128768921, + "logps/chosen": -169.01539611816406, + "logps/rejected": -173.09194946289062, + "loss": 0.5249, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009226039052009583, + "rewards/margins": 1.0817134380340576, + "rewards/rejected": -1.0724873542785645, + "step": 4459 + }, + { + "epoch": 0.51, + "learning_rate": 1.4798080299660542e-07, + "logits/chosen": -2.7888941764831543, + "logits/rejected": -2.960435152053833, + "logps/chosen": -328.793212890625, + "logps/rejected": -252.36912536621094, + "loss": 0.1952, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2533559203147888, + "rewards/margins": 2.528257131576538, + "rewards/rejected": -2.2749011516571045, + "step": 4460 + }, + { + "epoch": 0.51, + "learning_rate": 1.4794568652698114e-07, + "logits/chosen": -2.904768466949463, + "logits/rejected": -2.7378928661346436, + "logps/chosen": -543.7408447265625, + "logps/rejected": -370.409423828125, + "loss": 0.2415, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3002813458442688, + "rewards/margins": 2.6945126056671143, + "rewards/rejected": -2.3942313194274902, + "step": 4461 + }, + { + "epoch": 0.51, + "learning_rate": 1.479105700573569e-07, + "logits/chosen": -2.733698606491089, + "logits/rejected": -3.0256755352020264, + "logps/chosen": -280.9068908691406, + "logps/rejected": -342.5966796875, + "loss": 0.3081, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24758386611938477, + "rewards/margins": 2.0729103088378906, + "rewards/rejected": -2.3204944133758545, + "step": 4462 + }, + { + "epoch": 0.51, + "learning_rate": 1.4787545358773262e-07, + "logits/chosen": -2.533634662628174, + "logits/rejected": -2.7438161373138428, + "logps/chosen": -432.3868713378906, + "logps/rejected": -353.08721923828125, + "loss": 0.1247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3993610739707947, + "rewards/margins": 2.7170588970184326, + "rewards/rejected": -2.317697763442993, + "step": 4463 + }, + { + "epoch": 0.51, + "learning_rate": 1.478403371181084e-07, + "logits/chosen": -3.056331157684326, + "logits/rejected": -2.744035005569458, + "logps/chosen": -317.1224670410156, + "logps/rejected": -314.74591064453125, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13479551672935486, + "rewards/margins": 1.9855000972747803, + "rewards/rejected": -1.8507044315338135, + "step": 4464 + }, + { + "epoch": 0.51, + "learning_rate": 1.4780522064848413e-07, + "logits/chosen": -3.764188528060913, + "logits/rejected": -3.520798921585083, + "logps/chosen": -320.7261962890625, + "logps/rejected": -232.54324340820312, + "loss": 0.5701, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7982425093650818, + "rewards/margins": 1.206192970275879, + "rewards/rejected": -2.0044355392456055, + "step": 4465 + }, + { + "epoch": 0.51, + "learning_rate": 1.4777010417885989e-07, + "logits/chosen": -2.884833335876465, + "logits/rejected": -2.7404165267944336, + "logps/chosen": -539.05810546875, + "logps/rejected": -169.5865478515625, + "loss": 0.3276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5535966157913208, + "rewards/margins": 1.5286110639572144, + "rewards/rejected": -2.082207679748535, + "step": 4466 + }, + { + "epoch": 0.51, + "learning_rate": 1.4773498770923561e-07, + "logits/chosen": -3.553219795227051, + "logits/rejected": -3.366637706756592, + "logps/chosen": -201.02578735351562, + "logps/rejected": -165.07138061523438, + "loss": 0.3506, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42999449372291565, + "rewards/margins": 1.5295319557189941, + "rewards/rejected": -1.959526538848877, + "step": 4467 + }, + { + "epoch": 0.52, + "learning_rate": 1.4769987123961137e-07, + "logits/chosen": -2.861527442932129, + "logits/rejected": -3.08376407623291, + "logps/chosen": -224.36859130859375, + "logps/rejected": -289.195068359375, + "loss": 0.1926, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7883998155593872, + "rewards/margins": 3.3427419662475586, + "rewards/rejected": -2.554342269897461, + "step": 4468 + }, + { + "epoch": 0.52, + "learning_rate": 1.4766475476998712e-07, + "logits/chosen": -2.6032204627990723, + "logits/rejected": -2.862846851348877, + "logps/chosen": -350.8592529296875, + "logps/rejected": -235.1274871826172, + "loss": 0.2162, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14778056740760803, + "rewards/margins": 2.2069203853607178, + "rewards/rejected": -2.0591397285461426, + "step": 4469 + }, + { + "epoch": 0.52, + "learning_rate": 1.4762963830036288e-07, + "logits/chosen": -2.4603216648101807, + "logits/rejected": -2.3557114601135254, + "logps/chosen": -333.974609375, + "logps/rejected": -322.43310546875, + "loss": 0.3373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2209373563528061, + "rewards/margins": 1.2770605087280273, + "rewards/rejected": -1.497997760772705, + "step": 4470 + }, + { + "epoch": 0.52, + "learning_rate": 1.475945218307386e-07, + "logits/chosen": -3.36956787109375, + "logits/rejected": -3.2256016731262207, + "logps/chosen": -203.27102661132812, + "logps/rejected": -193.315673828125, + "loss": 0.24, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49033570289611816, + "rewards/margins": 1.9732637405395508, + "rewards/rejected": -1.4829280376434326, + "step": 4471 + }, + { + "epoch": 0.52, + "learning_rate": 1.4755940536111436e-07, + "logits/chosen": -2.995180606842041, + "logits/rejected": -2.9362497329711914, + "logps/chosen": -256.50396728515625, + "logps/rejected": -248.5731201171875, + "loss": 0.6174, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3274630904197693, + "rewards/margins": 1.0640506744384766, + "rewards/rejected": -0.736587643623352, + "step": 4472 + }, + { + "epoch": 0.52, + "learning_rate": 1.475242888914901e-07, + "logits/chosen": -3.0070202350616455, + "logits/rejected": -3.336422920227051, + "logps/chosen": -244.78758239746094, + "logps/rejected": -317.3347473144531, + "loss": 0.2423, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14586450159549713, + "rewards/margins": 4.2756876945495605, + "rewards/rejected": -4.421552658081055, + "step": 4473 + }, + { + "epoch": 0.52, + "learning_rate": 1.4748917242186584e-07, + "logits/chosen": -3.2747576236724854, + "logits/rejected": -2.99515962600708, + "logps/chosen": -261.779052734375, + "logps/rejected": -231.59133911132812, + "loss": 0.2772, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14123916625976562, + "rewards/margins": 1.8015648126602173, + "rewards/rejected": -1.660325527191162, + "step": 4474 + }, + { + "epoch": 0.52, + "learning_rate": 1.474540559522416e-07, + "logits/chosen": -2.3643627166748047, + "logits/rejected": -2.6696486473083496, + "logps/chosen": -315.5561828613281, + "logps/rejected": -239.1826629638672, + "loss": 0.4177, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03897427022457123, + "rewards/margins": 1.5130833387374878, + "rewards/rejected": -1.4741090536117554, + "step": 4475 + }, + { + "epoch": 0.52, + "learning_rate": 1.4741893948261735e-07, + "logits/chosen": -2.7469329833984375, + "logits/rejected": -2.8960094451904297, + "logps/chosen": -370.9471435546875, + "logps/rejected": -243.80166625976562, + "loss": 0.4825, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17920604348182678, + "rewards/margins": 0.9487786889076233, + "rewards/rejected": -0.7695726752281189, + "step": 4476 + }, + { + "epoch": 0.52, + "learning_rate": 1.473838230129931e-07, + "logits/chosen": -3.068401336669922, + "logits/rejected": -2.94319486618042, + "logps/chosen": -274.8902587890625, + "logps/rejected": -235.03656005859375, + "loss": 0.5592, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06936344504356384, + "rewards/margins": 0.7675840854644775, + "rewards/rejected": -0.6982207298278809, + "step": 4477 + }, + { + "epoch": 0.52, + "learning_rate": 1.4734870654336883e-07, + "logits/chosen": -3.350950241088867, + "logits/rejected": -3.365607500076294, + "logps/chosen": -321.41217041015625, + "logps/rejected": -183.76742553710938, + "loss": 0.3296, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.673633873462677, + "rewards/margins": 2.1617050170898438, + "rewards/rejected": -1.4880712032318115, + "step": 4478 + }, + { + "epoch": 0.52, + "learning_rate": 1.4731359007374458e-07, + "logits/chosen": -3.223801851272583, + "logits/rejected": -3.50046443939209, + "logps/chosen": -257.3007507324219, + "logps/rejected": -277.92059326171875, + "loss": 0.4401, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5318021178245544, + "rewards/margins": 1.0931155681610107, + "rewards/rejected": -1.6249176263809204, + "step": 4479 + }, + { + "epoch": 0.52, + "learning_rate": 1.472784736041203e-07, + "logits/chosen": -3.196894407272339, + "logits/rejected": -2.894228935241699, + "logps/chosen": -349.4918518066406, + "logps/rejected": -169.04286193847656, + "loss": 0.2934, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3423619866371155, + "rewards/margins": 1.5952322483062744, + "rewards/rejected": -1.2528700828552246, + "step": 4480 + }, + { + "epoch": 0.52, + "learning_rate": 1.472433571344961e-07, + "logits/chosen": -3.222590923309326, + "logits/rejected": -3.051413059234619, + "logps/chosen": -309.04803466796875, + "logps/rejected": -256.827880859375, + "loss": 0.2576, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.254152774810791, + "rewards/margins": 2.37849760055542, + "rewards/rejected": -2.124344825744629, + "step": 4481 + }, + { + "epoch": 0.52, + "learning_rate": 1.4720824066487182e-07, + "logits/chosen": -3.0856611728668213, + "logits/rejected": -3.1687586307525635, + "logps/chosen": -255.06033325195312, + "logps/rejected": -169.47666931152344, + "loss": 0.4179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26923736929893494, + "rewards/margins": 1.7413792610168457, + "rewards/rejected": -2.0106165409088135, + "step": 4482 + }, + { + "epoch": 0.52, + "learning_rate": 1.4717312419524757e-07, + "logits/chosen": -3.219921588897705, + "logits/rejected": -3.323270320892334, + "logps/chosen": -158.9097442626953, + "logps/rejected": -189.7758331298828, + "loss": 0.5271, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4286966025829315, + "rewards/margins": 1.1500000953674316, + "rewards/rejected": -1.578696608543396, + "step": 4483 + }, + { + "epoch": 0.52, + "learning_rate": 1.471380077256233e-07, + "logits/chosen": -3.181910276412964, + "logits/rejected": -3.078341484069824, + "logps/chosen": -166.9610137939453, + "logps/rejected": -241.27218627929688, + "loss": 0.3852, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12652593851089478, + "rewards/margins": 1.823262095451355, + "rewards/rejected": -1.6967360973358154, + "step": 4484 + }, + { + "epoch": 0.52, + "learning_rate": 1.4710289125599905e-07, + "logits/chosen": -3.8667984008789062, + "logits/rejected": -3.7569265365600586, + "logps/chosen": -224.504638671875, + "logps/rejected": -233.3239288330078, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2830770015716553, + "rewards/margins": 0.5985735058784485, + "rewards/rejected": -0.8816505670547485, + "step": 4485 + }, + { + "epoch": 0.52, + "learning_rate": 1.470677747863748e-07, + "logits/chosen": -1.9860210418701172, + "logits/rejected": -2.179534435272217, + "logps/chosen": -453.6348876953125, + "logps/rejected": -432.6441650390625, + "loss": 0.867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30538517236709595, + "rewards/margins": 0.22654691338539124, + "rewards/rejected": -0.5319320559501648, + "step": 4486 + }, + { + "epoch": 0.52, + "learning_rate": 1.4703265831675056e-07, + "logits/chosen": -3.313593864440918, + "logits/rejected": -3.3290834426879883, + "logps/chosen": -224.21533203125, + "logps/rejected": -281.1157531738281, + "loss": 0.976, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6812168955802917, + "rewards/margins": 0.7177561521530151, + "rewards/rejected": -1.398972988128662, + "step": 4487 + }, + { + "epoch": 0.52, + "learning_rate": 1.4699754184712629e-07, + "logits/chosen": -2.922367572784424, + "logits/rejected": -2.682837963104248, + "logps/chosen": -359.6688232421875, + "logps/rejected": -466.0274658203125, + "loss": 0.4352, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06858719885349274, + "rewards/margins": 1.5292675495147705, + "rewards/rejected": -1.5978548526763916, + "step": 4488 + }, + { + "epoch": 0.52, + "learning_rate": 1.4696242537750204e-07, + "logits/chosen": -2.973273754119873, + "logits/rejected": -3.23939847946167, + "logps/chosen": -268.39715576171875, + "logps/rejected": -292.90350341796875, + "loss": 0.1751, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8730762600898743, + "rewards/margins": 2.6055731773376465, + "rewards/rejected": -1.732496976852417, + "step": 4489 + }, + { + "epoch": 0.52, + "learning_rate": 1.469273089078778e-07, + "logits/chosen": -3.0905542373657227, + "logits/rejected": -3.041985511779785, + "logps/chosen": -500.0098876953125, + "logps/rejected": -401.12091064453125, + "loss": 0.2019, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3779023289680481, + "rewards/margins": 2.4129090309143066, + "rewards/rejected": -2.790811538696289, + "step": 4490 + }, + { + "epoch": 0.52, + "learning_rate": 1.4689219243825352e-07, + "logits/chosen": -2.878777027130127, + "logits/rejected": -2.6340315341949463, + "logps/chosen": -281.171142578125, + "logps/rejected": -233.99771118164062, + "loss": 0.5341, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4875459671020508, + "rewards/margins": 1.1651856899261475, + "rewards/rejected": -1.6527316570281982, + "step": 4491 + }, + { + "epoch": 0.52, + "learning_rate": 1.4685707596862928e-07, + "logits/chosen": -3.6824026107788086, + "logits/rejected": -3.3325459957122803, + "logps/chosen": -158.31658935546875, + "logps/rejected": -146.34243774414062, + "loss": 0.4291, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02467847615480423, + "rewards/margins": 1.7604386806488037, + "rewards/rejected": -1.73576021194458, + "step": 4492 + }, + { + "epoch": 0.52, + "learning_rate": 1.4682195949900503e-07, + "logits/chosen": -2.7841150760650635, + "logits/rejected": -2.8871712684631348, + "logps/chosen": -319.2080078125, + "logps/rejected": -314.50579833984375, + "loss": 0.3568, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.242509663105011, + "rewards/margins": 2.324477195739746, + "rewards/rejected": -2.08196759223938, + "step": 4493 + }, + { + "epoch": 0.52, + "learning_rate": 1.4678684302938078e-07, + "logits/chosen": -2.989781141281128, + "logits/rejected": -2.7853798866271973, + "logps/chosen": -118.13092803955078, + "logps/rejected": -172.00779724121094, + "loss": 0.4128, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.006840720772743225, + "rewards/margins": 1.9979379177093506, + "rewards/rejected": -2.0047786235809326, + "step": 4494 + }, + { + "epoch": 0.52, + "learning_rate": 1.467517265597565e-07, + "logits/chosen": -2.8594789505004883, + "logits/rejected": -3.0644407272338867, + "logps/chosen": -244.71946716308594, + "logps/rejected": -257.74169921875, + "loss": 0.2751, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.018032744526863098, + "rewards/margins": 2.1033644676208496, + "rewards/rejected": -2.085331439971924, + "step": 4495 + }, + { + "epoch": 0.52, + "learning_rate": 1.4671661009013226e-07, + "logits/chosen": -3.654344320297241, + "logits/rejected": -3.5573959350585938, + "logps/chosen": -241.46047973632812, + "logps/rejected": -173.82058715820312, + "loss": 0.4471, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3898181915283203, + "rewards/margins": 0.94444739818573, + "rewards/rejected": -1.3342655897140503, + "step": 4496 + }, + { + "epoch": 0.52, + "learning_rate": 1.4668149362050802e-07, + "logits/chosen": -3.2653326988220215, + "logits/rejected": -3.362544059753418, + "logps/chosen": -220.3084716796875, + "logps/rejected": -197.75144958496094, + "loss": 0.6626, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7692713737487793, + "rewards/margins": 1.0185774564743042, + "rewards/rejected": -1.7878488302230835, + "step": 4497 + }, + { + "epoch": 0.52, + "learning_rate": 1.4664637715088377e-07, + "logits/chosen": -3.9611096382141113, + "logits/rejected": -3.4598300457000732, + "logps/chosen": -260.696533203125, + "logps/rejected": -180.54263305664062, + "loss": 0.2311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22186364233493805, + "rewards/margins": 2.259312152862549, + "rewards/rejected": -2.4811758995056152, + "step": 4498 + }, + { + "epoch": 0.52, + "learning_rate": 1.466112606812595e-07, + "logits/chosen": -3.0132431983947754, + "logits/rejected": -3.0410525798797607, + "logps/chosen": -287.25958251953125, + "logps/rejected": -430.99932861328125, + "loss": 0.2157, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29073768854141235, + "rewards/margins": 2.485402822494507, + "rewards/rejected": -2.19466495513916, + "step": 4499 + }, + { + "epoch": 0.52, + "learning_rate": 1.4657614421163525e-07, + "logits/chosen": -3.0384936332702637, + "logits/rejected": -3.4473626613616943, + "logps/chosen": -258.6608581542969, + "logps/rejected": -310.5686340332031, + "loss": 0.38, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3637028932571411, + "rewards/margins": 2.8127434253692627, + "rewards/rejected": -3.1764464378356934, + "step": 4500 + }, + { + "epoch": 0.52, + "learning_rate": 1.46541027742011e-07, + "logits/chosen": -3.47510027885437, + "logits/rejected": -3.4909675121307373, + "logps/chosen": -291.4259033203125, + "logps/rejected": -274.2096862792969, + "loss": 0.6776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40233972668647766, + "rewards/margins": 0.4615073800086975, + "rewards/rejected": -0.863847017288208, + "step": 4501 + }, + { + "epoch": 0.52, + "learning_rate": 1.4650591127238673e-07, + "logits/chosen": -3.1235296726226807, + "logits/rejected": -2.704055070877075, + "logps/chosen": -274.13031005859375, + "logps/rejected": -280.25384521484375, + "loss": 0.3552, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.045905157923698425, + "rewards/margins": 1.592152714729309, + "rewards/rejected": -1.5462477207183838, + "step": 4502 + }, + { + "epoch": 0.52, + "learning_rate": 1.464707948027625e-07, + "logits/chosen": -2.946523666381836, + "logits/rejected": -3.1511316299438477, + "logps/chosen": -247.1290740966797, + "logps/rejected": -217.93673706054688, + "loss": 0.3364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17224368453025818, + "rewards/margins": 1.4275144338607788, + "rewards/rejected": -1.5997581481933594, + "step": 4503 + }, + { + "epoch": 0.52, + "learning_rate": 1.4643567833313824e-07, + "logits/chosen": -2.623812675476074, + "logits/rejected": -2.9154510498046875, + "logps/chosen": -335.95513916015625, + "logps/rejected": -165.47157287597656, + "loss": 0.2971, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49656814336776733, + "rewards/margins": 1.5633776187896729, + "rewards/rejected": -1.0668094158172607, + "step": 4504 + }, + { + "epoch": 0.52, + "learning_rate": 1.46400561863514e-07, + "logits/chosen": -3.0457658767700195, + "logits/rejected": -3.1613523960113525, + "logps/chosen": -369.1893615722656, + "logps/rejected": -249.90890502929688, + "loss": 0.5557, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08801822364330292, + "rewards/margins": 0.9505049586296082, + "rewards/rejected": -0.862486720085144, + "step": 4505 + }, + { + "epoch": 0.52, + "learning_rate": 1.4636544539388972e-07, + "logits/chosen": -3.5869054794311523, + "logits/rejected": -3.865297794342041, + "logps/chosen": -144.99798583984375, + "logps/rejected": -191.77459716796875, + "loss": 0.1866, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3276800811290741, + "rewards/margins": 3.2599399089813232, + "rewards/rejected": -2.9322595596313477, + "step": 4506 + }, + { + "epoch": 0.52, + "learning_rate": 1.4633032892426548e-07, + "logits/chosen": -3.2022783756256104, + "logits/rejected": -3.572930335998535, + "logps/chosen": -188.64453125, + "logps/rejected": -230.6282501220703, + "loss": 0.152, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5223579406738281, + "rewards/margins": 2.955181360244751, + "rewards/rejected": -2.432823419570923, + "step": 4507 + }, + { + "epoch": 0.52, + "learning_rate": 1.462952124546412e-07, + "logits/chosen": -3.485935688018799, + "logits/rejected": -3.160102128982544, + "logps/chosen": -363.5455322265625, + "logps/rejected": -287.32525634765625, + "loss": 0.282, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.571636974811554, + "rewards/margins": 1.8148936033248901, + "rewards/rejected": -1.2432568073272705, + "step": 4508 + }, + { + "epoch": 0.52, + "learning_rate": 1.4626009598501698e-07, + "logits/chosen": -2.9633209705352783, + "logits/rejected": -2.9464824199676514, + "logps/chosen": -273.2046813964844, + "logps/rejected": -204.05027770996094, + "loss": 0.3751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0629628598690033, + "rewards/margins": 1.535428762435913, + "rewards/rejected": -1.5983915328979492, + "step": 4509 + }, + { + "epoch": 0.52, + "learning_rate": 1.462249795153927e-07, + "logits/chosen": -3.1914849281311035, + "logits/rejected": -2.9607882499694824, + "logps/chosen": -296.96063232421875, + "logps/rejected": -334.2647705078125, + "loss": 0.3228, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2850176692008972, + "rewards/margins": 3.0599935054779053, + "rewards/rejected": -2.7749757766723633, + "step": 4510 + }, + { + "epoch": 0.52, + "learning_rate": 1.4618986304576847e-07, + "logits/chosen": -2.4930849075317383, + "logits/rejected": -2.5018396377563477, + "logps/chosen": -242.8787078857422, + "logps/rejected": -265.0387878417969, + "loss": 0.6049, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32550880312919617, + "rewards/margins": 0.7893013954162598, + "rewards/rejected": -1.1148102283477783, + "step": 4511 + }, + { + "epoch": 0.52, + "learning_rate": 1.461547465761442e-07, + "logits/chosen": -2.8281402587890625, + "logits/rejected": -2.964817762374878, + "logps/chosen": -197.91180419921875, + "logps/rejected": -158.55967712402344, + "loss": 1.0826, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7154462933540344, + "rewards/margins": 0.2448439598083496, + "rewards/rejected": -0.9602901935577393, + "step": 4512 + }, + { + "epoch": 0.52, + "learning_rate": 1.4611963010651995e-07, + "logits/chosen": -2.135673761367798, + "logits/rejected": -2.313724994659424, + "logps/chosen": -292.45501708984375, + "logps/rejected": -268.8369140625, + "loss": 0.4157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09686976671218872, + "rewards/margins": 1.5096077919006348, + "rewards/rejected": -1.6064776182174683, + "step": 4513 + }, + { + "epoch": 0.52, + "learning_rate": 1.460845136368957e-07, + "logits/chosen": -3.309363842010498, + "logits/rejected": -3.078136920928955, + "logps/chosen": -387.93377685546875, + "logps/rejected": -275.83111572265625, + "loss": 0.3709, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3221232295036316, + "rewards/margins": 1.6704798936843872, + "rewards/rejected": -1.3483566045761108, + "step": 4514 + }, + { + "epoch": 0.52, + "learning_rate": 1.4604939716727145e-07, + "logits/chosen": -2.7592804431915283, + "logits/rejected": -2.7590973377227783, + "logps/chosen": -225.7200469970703, + "logps/rejected": -182.85997009277344, + "loss": 0.5739, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7386026382446289, + "rewards/margins": 0.6514223217964172, + "rewards/rejected": -1.3900249004364014, + "step": 4515 + }, + { + "epoch": 0.52, + "learning_rate": 1.4601428069764718e-07, + "logits/chosen": -3.7803540229797363, + "logits/rejected": -3.643894910812378, + "logps/chosen": -183.2703094482422, + "logps/rejected": -222.65489196777344, + "loss": 0.4361, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16703426837921143, + "rewards/margins": 1.3481113910675049, + "rewards/rejected": -1.1810771226882935, + "step": 4516 + }, + { + "epoch": 0.52, + "learning_rate": 1.4597916422802294e-07, + "logits/chosen": -3.38935923576355, + "logits/rejected": -3.526336669921875, + "logps/chosen": -240.83563232421875, + "logps/rejected": -247.01268005371094, + "loss": 0.3558, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2620702087879181, + "rewards/margins": 1.4285826683044434, + "rewards/rejected": -1.1665124893188477, + "step": 4517 + }, + { + "epoch": 0.52, + "learning_rate": 1.459440477583987e-07, + "logits/chosen": -2.5997231006622314, + "logits/rejected": -2.676978588104248, + "logps/chosen": -381.77862548828125, + "logps/rejected": -304.8307800292969, + "loss": 0.2589, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6638932228088379, + "rewards/margins": 2.397510051727295, + "rewards/rejected": -1.7336170673370361, + "step": 4518 + }, + { + "epoch": 0.52, + "learning_rate": 1.4590893128877442e-07, + "logits/chosen": -3.4240500926971436, + "logits/rejected": -3.1350722312927246, + "logps/chosen": -394.4221496582031, + "logps/rejected": -185.21954345703125, + "loss": 0.5203, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15591832995414734, + "rewards/margins": 1.2731294631958008, + "rewards/rejected": -1.117211103439331, + "step": 4519 + }, + { + "epoch": 0.52, + "learning_rate": 1.4587381481915017e-07, + "logits/chosen": -3.7600042819976807, + "logits/rejected": -3.584879159927368, + "logps/chosen": -234.7393798828125, + "logps/rejected": -302.9960632324219, + "loss": 0.3496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6580209732055664, + "rewards/margins": 2.0400314331054688, + "rewards/rejected": -2.698052406311035, + "step": 4520 + }, + { + "epoch": 0.52, + "learning_rate": 1.4583869834952593e-07, + "logits/chosen": -2.6463563442230225, + "logits/rejected": -2.847277879714966, + "logps/chosen": -376.23944091796875, + "logps/rejected": -266.58392333984375, + "loss": 0.1383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8506337404251099, + "rewards/margins": 2.9745781421661377, + "rewards/rejected": -2.1239442825317383, + "step": 4521 + }, + { + "epoch": 0.52, + "learning_rate": 1.4580358187990168e-07, + "logits/chosen": -3.6484646797180176, + "logits/rejected": -3.3016843795776367, + "logps/chosen": -438.1533203125, + "logps/rejected": -358.65374755859375, + "loss": 0.089, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6232179403305054, + "rewards/margins": 3.3576555252075195, + "rewards/rejected": -2.7344377040863037, + "step": 4522 + }, + { + "epoch": 0.52, + "learning_rate": 1.457684654102774e-07, + "logits/chosen": -3.1892309188842773, + "logits/rejected": -3.211682081222534, + "logps/chosen": -295.88555908203125, + "logps/rejected": -279.38946533203125, + "loss": 0.4213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28768986463546753, + "rewards/margins": 1.305981993675232, + "rewards/rejected": -1.5936717987060547, + "step": 4523 + }, + { + "epoch": 0.52, + "learning_rate": 1.4573334894065316e-07, + "logits/chosen": -2.8580026626586914, + "logits/rejected": -3.077594518661499, + "logps/chosen": -224.65945434570312, + "logps/rejected": -124.761474609375, + "loss": 0.6244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.017807617783546448, + "rewards/margins": 0.7208440899848938, + "rewards/rejected": -0.738651692867279, + "step": 4524 + }, + { + "epoch": 0.52, + "learning_rate": 1.456982324710289e-07, + "logits/chosen": -3.5685439109802246, + "logits/rejected": -3.3410568237304688, + "logps/chosen": -409.82635498046875, + "logps/rejected": -305.61322021484375, + "loss": 0.2554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4066388010978699, + "rewards/margins": 1.5497736930847168, + "rewards/rejected": -1.1431347131729126, + "step": 4525 + }, + { + "epoch": 0.52, + "learning_rate": 1.4566311600140467e-07, + "logits/chosen": -3.1090521812438965, + "logits/rejected": -3.2673966884613037, + "logps/chosen": -220.19656372070312, + "logps/rejected": -196.0310516357422, + "loss": 0.6751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5848056077957153, + "rewards/margins": 1.1758410930633545, + "rewards/rejected": -1.7606467008590698, + "step": 4526 + }, + { + "epoch": 0.52, + "learning_rate": 1.456279995317804e-07, + "logits/chosen": -3.439317464828491, + "logits/rejected": -3.5200862884521484, + "logps/chosen": -448.24847412109375, + "logps/rejected": -566.74658203125, + "loss": 0.317, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21919900178909302, + "rewards/margins": 1.9185432195663452, + "rewards/rejected": -1.699344277381897, + "step": 4527 + }, + { + "epoch": 0.52, + "learning_rate": 1.4559288306215615e-07, + "logits/chosen": -3.617074966430664, + "logits/rejected": -3.689352512359619, + "logps/chosen": -145.47332763671875, + "logps/rejected": -119.52456665039062, + "loss": 0.3715, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25587666034698486, + "rewards/margins": 1.603288173675537, + "rewards/rejected": -1.3474115133285522, + "step": 4528 + }, + { + "epoch": 0.52, + "learning_rate": 1.4555776659253188e-07, + "logits/chosen": -3.689966917037964, + "logits/rejected": -4.051967620849609, + "logps/chosen": -85.65380096435547, + "logps/rejected": -129.17864990234375, + "loss": 0.547, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2323906421661377, + "rewards/margins": 1.0100207328796387, + "rewards/rejected": -1.242411494255066, + "step": 4529 + }, + { + "epoch": 0.52, + "learning_rate": 1.4552265012290763e-07, + "logits/chosen": -3.255544662475586, + "logits/rejected": -2.908787250518799, + "logps/chosen": -314.61328125, + "logps/rejected": -244.54473876953125, + "loss": 0.4696, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22261370718479156, + "rewards/margins": 2.0103931427001953, + "rewards/rejected": -1.7877795696258545, + "step": 4530 + }, + { + "epoch": 0.52, + "learning_rate": 1.4548753365328338e-07, + "logits/chosen": -3.3215649127960205, + "logits/rejected": -2.872685432434082, + "logps/chosen": -328.25787353515625, + "logps/rejected": -197.47457885742188, + "loss": 0.3597, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.298471063375473, + "rewards/margins": 1.280191421508789, + "rewards/rejected": -0.9817203283309937, + "step": 4531 + }, + { + "epoch": 0.52, + "learning_rate": 1.4545241718365914e-07, + "logits/chosen": -2.810771942138672, + "logits/rejected": -2.8076107501983643, + "logps/chosen": -189.19036865234375, + "logps/rejected": -187.88670349121094, + "loss": 0.7244, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7424294352531433, + "rewards/margins": 0.777534544467926, + "rewards/rejected": -1.5199639797210693, + "step": 4532 + }, + { + "epoch": 0.52, + "learning_rate": 1.4541730071403487e-07, + "logits/chosen": -3.2213761806488037, + "logits/rejected": -3.234488010406494, + "logps/chosen": -157.89962768554688, + "logps/rejected": -157.41455078125, + "loss": 0.2002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07875081896781921, + "rewards/margins": 1.9244698286056519, + "rewards/rejected": -2.003220558166504, + "step": 4533 + }, + { + "epoch": 0.52, + "learning_rate": 1.4538218424441062e-07, + "logits/chosen": -3.7212352752685547, + "logits/rejected": -3.7019686698913574, + "logps/chosen": -154.13929748535156, + "logps/rejected": -192.4016876220703, + "loss": 0.3152, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2624971866607666, + "rewards/margins": 3.044203281402588, + "rewards/rejected": -3.3067007064819336, + "step": 4534 + }, + { + "epoch": 0.52, + "learning_rate": 1.4534706777478637e-07, + "logits/chosen": -3.3896772861480713, + "logits/rejected": -3.346013069152832, + "logps/chosen": -232.71644592285156, + "logps/rejected": -201.4299774169922, + "loss": 0.2177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11186444759368896, + "rewards/margins": 1.7746059894561768, + "rewards/rejected": -1.8864705562591553, + "step": 4535 + }, + { + "epoch": 0.52, + "learning_rate": 1.453119513051621e-07, + "logits/chosen": -3.2645771503448486, + "logits/rejected": -3.2260966300964355, + "logps/chosen": -170.376220703125, + "logps/rejected": -221.6679229736328, + "loss": 0.2341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18128237128257751, + "rewards/margins": 2.3960623741149902, + "rewards/rejected": -2.5773448944091797, + "step": 4536 + }, + { + "epoch": 0.52, + "learning_rate": 1.4527683483553785e-07, + "logits/chosen": -3.1881306171417236, + "logits/rejected": -3.1731247901916504, + "logps/chosen": -212.89830017089844, + "logps/rejected": -194.3070068359375, + "loss": 0.4836, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24336591362953186, + "rewards/margins": 1.2229398488998413, + "rewards/rejected": -1.4663057327270508, + "step": 4537 + }, + { + "epoch": 0.52, + "learning_rate": 1.452417183659136e-07, + "logits/chosen": -3.5487096309661865, + "logits/rejected": -3.2463550567626953, + "logps/chosen": -270.8427734375, + "logps/rejected": -182.46804809570312, + "loss": 0.3041, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03521338105201721, + "rewards/margins": 1.657698154449463, + "rewards/rejected": -1.6224846839904785, + "step": 4538 + }, + { + "epoch": 0.52, + "learning_rate": 1.4520660189628936e-07, + "logits/chosen": -3.2083234786987305, + "logits/rejected": -3.591989517211914, + "logps/chosen": -116.93147277832031, + "logps/rejected": -272.55462646484375, + "loss": 0.578, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.035640276968479156, + "rewards/margins": 1.617920160293579, + "rewards/rejected": -1.6535604000091553, + "step": 4539 + }, + { + "epoch": 0.52, + "learning_rate": 1.451714854266651e-07, + "logits/chosen": -3.0910043716430664, + "logits/rejected": -2.9869306087493896, + "logps/chosen": -203.59634399414062, + "logps/rejected": -205.57298278808594, + "loss": 0.2441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08964620530605316, + "rewards/margins": 1.8415952920913696, + "rewards/rejected": -1.7519490718841553, + "step": 4540 + }, + { + "epoch": 0.52, + "learning_rate": 1.4513636895704084e-07, + "logits/chosen": -3.8237862586975098, + "logits/rejected": -3.81353497505188, + "logps/chosen": -175.53274536132812, + "logps/rejected": -164.8250274658203, + "loss": 0.4683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08865071833133698, + "rewards/margins": 1.1709126234054565, + "rewards/rejected": -1.2595632076263428, + "step": 4541 + }, + { + "epoch": 0.52, + "learning_rate": 1.451012524874166e-07, + "logits/chosen": -3.5613174438476562, + "logits/rejected": -3.688434362411499, + "logps/chosen": -271.90313720703125, + "logps/rejected": -139.69740295410156, + "loss": 0.2487, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.682332456111908, + "rewards/margins": 2.541240930557251, + "rewards/rejected": -1.8589084148406982, + "step": 4542 + }, + { + "epoch": 0.52, + "learning_rate": 1.4506613601779235e-07, + "logits/chosen": -3.673847198486328, + "logits/rejected": -3.4159200191497803, + "logps/chosen": -213.395751953125, + "logps/rejected": -208.93601989746094, + "loss": 0.677, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38980770111083984, + "rewards/margins": 0.6813926100730896, + "rewards/rejected": -1.0712002515792847, + "step": 4543 + }, + { + "epoch": 0.52, + "learning_rate": 1.4503101954816808e-07, + "logits/chosen": -2.6312665939331055, + "logits/rejected": -2.779625654220581, + "logps/chosen": -284.84619140625, + "logps/rejected": -395.837890625, + "loss": 0.3948, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31688806414604187, + "rewards/margins": 1.2289832830429077, + "rewards/rejected": -0.9120951890945435, + "step": 4544 + }, + { + "epoch": 0.52, + "learning_rate": 1.4499590307854383e-07, + "logits/chosen": -3.376478672027588, + "logits/rejected": -3.656719923019409, + "logps/chosen": -119.56896209716797, + "logps/rejected": -184.76536560058594, + "loss": 0.4139, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2122991383075714, + "rewards/margins": 1.583847165107727, + "rewards/rejected": -1.796146273612976, + "step": 4545 + }, + { + "epoch": 0.52, + "learning_rate": 1.4496078660891959e-07, + "logits/chosen": -3.5364222526550293, + "logits/rejected": -3.3374640941619873, + "logps/chosen": -111.90615844726562, + "logps/rejected": -133.36312866210938, + "loss": 0.3972, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13662220537662506, + "rewards/margins": 1.194567322731018, + "rewards/rejected": -1.3311896324157715, + "step": 4546 + }, + { + "epoch": 0.52, + "learning_rate": 1.4492567013929531e-07, + "logits/chosen": -2.3638458251953125, + "logits/rejected": -2.5904104709625244, + "logps/chosen": -262.13629150390625, + "logps/rejected": -282.65753173828125, + "loss": 0.407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04864295572042465, + "rewards/margins": 1.4064637422561646, + "rewards/rejected": -1.4551066160202026, + "step": 4547 + }, + { + "epoch": 0.52, + "learning_rate": 1.4489055366967107e-07, + "logits/chosen": -3.233232021331787, + "logits/rejected": -3.1042211055755615, + "logps/chosen": -270.427734375, + "logps/rejected": -164.15811157226562, + "loss": 0.5005, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.098228819668293, + "rewards/margins": 1.1732428073883057, + "rewards/rejected": -1.2714717388153076, + "step": 4548 + }, + { + "epoch": 0.52, + "learning_rate": 1.4485543720004682e-07, + "logits/chosen": -3.498145341873169, + "logits/rejected": -3.727795124053955, + "logps/chosen": -275.287353515625, + "logps/rejected": -202.54971313476562, + "loss": 0.6072, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8178858160972595, + "rewards/margins": 0.9077301621437073, + "rewards/rejected": -1.7256159782409668, + "step": 4549 + }, + { + "epoch": 0.52, + "learning_rate": 1.4482032073042258e-07, + "logits/chosen": -2.4420251846313477, + "logits/rejected": -2.5405540466308594, + "logps/chosen": -374.2143859863281, + "logps/rejected": -330.7633361816406, + "loss": 0.4153, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1848951131105423, + "rewards/margins": 2.0091989040374756, + "rewards/rejected": -1.8243037462234497, + "step": 4550 + }, + { + "epoch": 0.52, + "learning_rate": 1.447852042607983e-07, + "logits/chosen": -2.64052677154541, + "logits/rejected": -2.637890338897705, + "logps/chosen": -272.3152160644531, + "logps/rejected": -247.793701171875, + "loss": 0.3195, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38582882285118103, + "rewards/margins": 1.779038906097412, + "rewards/rejected": -1.3932100534439087, + "step": 4551 + }, + { + "epoch": 0.52, + "learning_rate": 1.4475008779117406e-07, + "logits/chosen": -4.040778160095215, + "logits/rejected": -3.296001434326172, + "logps/chosen": -294.9337158203125, + "logps/rejected": -199.19500732421875, + "loss": 0.2015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8824362754821777, + "rewards/margins": 2.226883888244629, + "rewards/rejected": -3.1093201637268066, + "step": 4552 + }, + { + "epoch": 0.52, + "learning_rate": 1.4471497132154978e-07, + "logits/chosen": -3.4529309272766113, + "logits/rejected": -3.3314085006713867, + "logps/chosen": -216.83767700195312, + "logps/rejected": -162.25607299804688, + "loss": 0.401, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0001845136284828186, + "rewards/margins": 1.6669538021087646, + "rewards/rejected": -1.6667691469192505, + "step": 4553 + }, + { + "epoch": 0.52, + "learning_rate": 1.4467985485192556e-07, + "logits/chosen": -2.461735963821411, + "logits/rejected": -2.4717156887054443, + "logps/chosen": -160.6231231689453, + "logps/rejected": -191.66448974609375, + "loss": 0.3535, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35553234815597534, + "rewards/margins": 2.0131242275238037, + "rewards/rejected": -2.3686563968658447, + "step": 4554 + }, + { + "epoch": 0.53, + "learning_rate": 1.446447383823013e-07, + "logits/chosen": -3.109954833984375, + "logits/rejected": -3.0926620960235596, + "logps/chosen": -305.19732666015625, + "logps/rejected": -229.22430419921875, + "loss": 0.2968, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07442638278007507, + "rewards/margins": 1.4922597408294678, + "rewards/rejected": -1.4178333282470703, + "step": 4555 + }, + { + "epoch": 0.53, + "learning_rate": 1.4460962191267705e-07, + "logits/chosen": -2.54498291015625, + "logits/rejected": -2.602360486984253, + "logps/chosen": -474.4638671875, + "logps/rejected": -347.944091796875, + "loss": 0.2012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2334541380405426, + "rewards/margins": 2.7988123893737793, + "rewards/rejected": -3.032266616821289, + "step": 4556 + }, + { + "epoch": 0.53, + "learning_rate": 1.4457450544305277e-07, + "logits/chosen": -2.531528949737549, + "logits/rejected": -2.7545177936553955, + "logps/chosen": -372.0047607421875, + "logps/rejected": -424.03729248046875, + "loss": 0.6357, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.085598424077034, + "rewards/margins": 0.9958369731903076, + "rewards/rejected": -1.0814354419708252, + "step": 4557 + }, + { + "epoch": 0.53, + "learning_rate": 1.4453938897342853e-07, + "logits/chosen": -2.9523301124572754, + "logits/rejected": -2.9954721927642822, + "logps/chosen": -139.28836059570312, + "logps/rejected": -154.98193359375, + "loss": 0.5023, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21690887212753296, + "rewards/margins": 1.1869704723358154, + "rewards/rejected": -1.4038792848587036, + "step": 4558 + }, + { + "epoch": 0.53, + "learning_rate": 1.4450427250380428e-07, + "logits/chosen": -3.3322062492370605, + "logits/rejected": -3.3333044052124023, + "logps/chosen": -196.14761352539062, + "logps/rejected": -198.09396362304688, + "loss": 0.5792, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45698341727256775, + "rewards/margins": 1.8107103109359741, + "rewards/rejected": -2.267693519592285, + "step": 4559 + }, + { + "epoch": 0.53, + "learning_rate": 1.4446915603418003e-07, + "logits/chosen": -3.0984978675842285, + "logits/rejected": -2.8150222301483154, + "logps/chosen": -257.37353515625, + "logps/rejected": -262.20587158203125, + "loss": 0.095, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7484826445579529, + "rewards/margins": 3.245074510574341, + "rewards/rejected": -2.496591806411743, + "step": 4560 + }, + { + "epoch": 0.53, + "learning_rate": 1.4443403956455576e-07, + "logits/chosen": -3.2418785095214844, + "logits/rejected": -3.5650794506073, + "logps/chosen": -180.28765869140625, + "logps/rejected": -196.23568725585938, + "loss": 0.3169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01657930016517639, + "rewards/margins": 1.5889216661453247, + "rewards/rejected": -1.6055009365081787, + "step": 4561 + }, + { + "epoch": 0.53, + "learning_rate": 1.4439892309493152e-07, + "logits/chosen": -3.2095441818237305, + "logits/rejected": -3.250314950942993, + "logps/chosen": -100.3193359375, + "logps/rejected": -209.22415161132812, + "loss": 0.1695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36452147364616394, + "rewards/margins": 2.124643564224243, + "rewards/rejected": -1.7601221799850464, + "step": 4562 + }, + { + "epoch": 0.53, + "learning_rate": 1.4436380662530727e-07, + "logits/chosen": -3.7570927143096924, + "logits/rejected": -3.773345470428467, + "logps/chosen": -350.0870361328125, + "logps/rejected": -317.1756896972656, + "loss": 0.4121, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02452746033668518, + "rewards/margins": 1.4482901096343994, + "rewards/rejected": -1.4237627983093262, + "step": 4563 + }, + { + "epoch": 0.53, + "learning_rate": 1.44328690155683e-07, + "logits/chosen": -2.7558796405792236, + "logits/rejected": -2.824437141418457, + "logps/chosen": -253.21527099609375, + "logps/rejected": -287.856201171875, + "loss": 0.2744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02722858637571335, + "rewards/margins": 3.4359841346740723, + "rewards/rejected": -3.463212728500366, + "step": 4564 + }, + { + "epoch": 0.53, + "learning_rate": 1.4429357368605875e-07, + "logits/chosen": -3.440643310546875, + "logits/rejected": -3.2163705825805664, + "logps/chosen": -397.6063232421875, + "logps/rejected": -314.3331298828125, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6726202964782715, + "rewards/margins": 3.234943151473999, + "rewards/rejected": -2.5623230934143066, + "step": 4565 + }, + { + "epoch": 0.53, + "learning_rate": 1.442584572164345e-07, + "logits/chosen": -2.8255410194396973, + "logits/rejected": -2.8346610069274902, + "logps/chosen": -173.08009338378906, + "logps/rejected": -190.33807373046875, + "loss": 0.4175, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1201213076710701, + "rewards/margins": 1.3137295246124268, + "rewards/rejected": -1.193608045578003, + "step": 4566 + }, + { + "epoch": 0.53, + "learning_rate": 1.4422334074681026e-07, + "logits/chosen": -3.2480435371398926, + "logits/rejected": -3.2287280559539795, + "logps/chosen": -302.8789978027344, + "logps/rejected": -283.169189453125, + "loss": 0.4636, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3232758641242981, + "rewards/margins": 1.1039314270019531, + "rewards/rejected": -1.4272072315216064, + "step": 4567 + }, + { + "epoch": 0.53, + "learning_rate": 1.4418822427718599e-07, + "logits/chosen": -3.321974277496338, + "logits/rejected": -3.1685423851013184, + "logps/chosen": -373.6947021484375, + "logps/rejected": -312.84967041015625, + "loss": 0.2023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4350305199623108, + "rewards/margins": 2.217876672744751, + "rewards/rejected": -1.7828460931777954, + "step": 4568 + }, + { + "epoch": 0.53, + "learning_rate": 1.4415310780756174e-07, + "logits/chosen": -3.5175695419311523, + "logits/rejected": -3.632200241088867, + "logps/chosen": -313.5005798339844, + "logps/rejected": -355.1815185546875, + "loss": 0.2374, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09643249213695526, + "rewards/margins": 2.1383137702941895, + "rewards/rejected": -2.0418810844421387, + "step": 4569 + }, + { + "epoch": 0.53, + "learning_rate": 1.4411799133793747e-07, + "logits/chosen": -3.549882411956787, + "logits/rejected": -3.3712549209594727, + "logps/chosen": -154.60055541992188, + "logps/rejected": -270.7834777832031, + "loss": 0.6021, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4910079538822174, + "rewards/margins": 0.645799994468689, + "rewards/rejected": -1.136807918548584, + "step": 4570 + }, + { + "epoch": 0.53, + "learning_rate": 1.4408287486831325e-07, + "logits/chosen": -3.262845039367676, + "logits/rejected": -3.05125093460083, + "logps/chosen": -215.6374053955078, + "logps/rejected": -191.34481811523438, + "loss": 0.3327, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45906418561935425, + "rewards/margins": 1.6038028001785278, + "rewards/rejected": -2.0628671646118164, + "step": 4571 + }, + { + "epoch": 0.53, + "learning_rate": 1.4404775839868897e-07, + "logits/chosen": -2.6186609268188477, + "logits/rejected": -2.422074317932129, + "logps/chosen": -258.3731689453125, + "logps/rejected": -185.49208068847656, + "loss": 0.9902, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0658153295516968, + "rewards/margins": 0.2729703187942505, + "rewards/rejected": -1.3387857675552368, + "step": 4572 + }, + { + "epoch": 0.53, + "learning_rate": 1.4401264192906473e-07, + "logits/chosen": -3.162992000579834, + "logits/rejected": -3.011418342590332, + "logps/chosen": -177.0764923095703, + "logps/rejected": -239.6697998046875, + "loss": 0.847, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23454564809799194, + "rewards/margins": 0.42406052350997925, + "rewards/rejected": -0.1895148605108261, + "step": 4573 + }, + { + "epoch": 0.53, + "learning_rate": 1.4397752545944046e-07, + "logits/chosen": -3.091902732849121, + "logits/rejected": -3.0524702072143555, + "logps/chosen": -313.5158386230469, + "logps/rejected": -329.92767333984375, + "loss": 0.1631, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08120383322238922, + "rewards/margins": 2.048280715942383, + "rewards/rejected": -1.9670766592025757, + "step": 4574 + }, + { + "epoch": 0.53, + "learning_rate": 1.439424089898162e-07, + "logits/chosen": -3.274196147918701, + "logits/rejected": -2.5410547256469727, + "logps/chosen": -182.36268615722656, + "logps/rejected": -168.9410858154297, + "loss": 0.5924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14333055913448334, + "rewards/margins": 1.574354648590088, + "rewards/rejected": -1.7176851034164429, + "step": 4575 + }, + { + "epoch": 0.53, + "learning_rate": 1.4390729252019196e-07, + "logits/chosen": -2.6212213039398193, + "logits/rejected": -2.731314182281494, + "logps/chosen": -518.8555297851562, + "logps/rejected": -434.15521240234375, + "loss": 0.5506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8988786935806274, + "rewards/margins": 1.0927289724349976, + "rewards/rejected": -1.991607666015625, + "step": 4576 + }, + { + "epoch": 0.53, + "learning_rate": 1.4387217605056772e-07, + "logits/chosen": -3.500058174133301, + "logits/rejected": -3.516181230545044, + "logps/chosen": -116.54816436767578, + "logps/rejected": -145.6199188232422, + "loss": 0.286, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023239217698574066, + "rewards/margins": 1.7005071640014648, + "rewards/rejected": -1.6772680282592773, + "step": 4577 + }, + { + "epoch": 0.53, + "learning_rate": 1.4383705958094344e-07, + "logits/chosen": -2.840970993041992, + "logits/rejected": -2.6827032566070557, + "logps/chosen": -136.5832977294922, + "logps/rejected": -180.3238067626953, + "loss": 0.2752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38811808824539185, + "rewards/margins": 1.4643056392669678, + "rewards/rejected": -1.8524236679077148, + "step": 4578 + }, + { + "epoch": 0.53, + "learning_rate": 1.438019431113192e-07, + "logits/chosen": -3.246171474456787, + "logits/rejected": -3.0562143325805664, + "logps/chosen": -273.30572509765625, + "logps/rejected": -195.18002319335938, + "loss": 0.3046, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06066463142633438, + "rewards/margins": 1.9875521659851074, + "rewards/rejected": -2.0482168197631836, + "step": 4579 + }, + { + "epoch": 0.53, + "learning_rate": 1.4376682664169495e-07, + "logits/chosen": -2.9094042778015137, + "logits/rejected": -2.979661703109741, + "logps/chosen": -236.27906799316406, + "logps/rejected": -248.86660766601562, + "loss": 0.6234, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13385868072509766, + "rewards/margins": 1.3448107242584229, + "rewards/rejected": -1.4786694049835205, + "step": 4580 + }, + { + "epoch": 0.53, + "learning_rate": 1.4373171017207068e-07, + "logits/chosen": -3.8400301933288574, + "logits/rejected": -3.7674267292022705, + "logps/chosen": -358.4031982421875, + "logps/rejected": -254.9463653564453, + "loss": 0.7624, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8132634162902832, + "rewards/margins": 0.7580273151397705, + "rewards/rejected": -1.5712908506393433, + "step": 4581 + }, + { + "epoch": 0.53, + "learning_rate": 1.4369659370244643e-07, + "logits/chosen": -2.625728130340576, + "logits/rejected": -2.787111282348633, + "logps/chosen": -252.4321746826172, + "logps/rejected": -229.53628540039062, + "loss": 0.5251, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31606990098953247, + "rewards/margins": 1.0829651355743408, + "rewards/rejected": -1.399035096168518, + "step": 4582 + }, + { + "epoch": 0.53, + "learning_rate": 1.436614772328222e-07, + "logits/chosen": -2.91627836227417, + "logits/rejected": -2.7716126441955566, + "logps/chosen": -336.0606689453125, + "logps/rejected": -376.5575866699219, + "loss": 0.5416, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7730026245117188, + "rewards/margins": 1.593205213546753, + "rewards/rejected": -2.3662075996398926, + "step": 4583 + }, + { + "epoch": 0.53, + "learning_rate": 1.4362636076319794e-07, + "logits/chosen": -3.792706251144409, + "logits/rejected": -3.4713611602783203, + "logps/chosen": -439.90863037109375, + "logps/rejected": -489.5748291015625, + "loss": 0.2277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05380548909306526, + "rewards/margins": 3.1306755542755127, + "rewards/rejected": -3.184481143951416, + "step": 4584 + }, + { + "epoch": 0.53, + "learning_rate": 1.4359124429357367e-07, + "logits/chosen": -3.6243393421173096, + "logits/rejected": -3.4488272666931152, + "logps/chosen": -234.18824768066406, + "logps/rejected": -195.2311248779297, + "loss": 0.382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2825080454349518, + "rewards/margins": 1.0459206104278564, + "rewards/rejected": -1.3284286260604858, + "step": 4585 + }, + { + "epoch": 0.53, + "learning_rate": 1.4355612782394942e-07, + "logits/chosen": -3.2642922401428223, + "logits/rejected": -3.262443780899048, + "logps/chosen": -342.9779052734375, + "logps/rejected": -301.19854736328125, + "loss": 0.3956, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21773774921894073, + "rewards/margins": 1.4630885124206543, + "rewards/rejected": -1.6808264255523682, + "step": 4586 + }, + { + "epoch": 0.53, + "learning_rate": 1.4352101135432518e-07, + "logits/chosen": -3.0994157791137695, + "logits/rejected": -2.7915172576904297, + "logps/chosen": -433.4837646484375, + "logps/rejected": -281.1668701171875, + "loss": 0.2921, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5110697746276855, + "rewards/margins": 1.85420823097229, + "rewards/rejected": -1.3431384563446045, + "step": 4587 + }, + { + "epoch": 0.53, + "learning_rate": 1.4348589488470093e-07, + "logits/chosen": -2.4662575721740723, + "logits/rejected": -2.4495491981506348, + "logps/chosen": -297.8823547363281, + "logps/rejected": -392.26434326171875, + "loss": 0.0707, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6297708749771118, + "rewards/margins": 3.1127095222473145, + "rewards/rejected": -2.482938528060913, + "step": 4588 + }, + { + "epoch": 0.53, + "learning_rate": 1.4345077841507666e-07, + "logits/chosen": -3.109022617340088, + "logits/rejected": -2.682875156402588, + "logps/chosen": -261.17578125, + "logps/rejected": -200.8707275390625, + "loss": 0.4731, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16605547070503235, + "rewards/margins": 1.8451519012451172, + "rewards/rejected": -2.011207342147827, + "step": 4589 + }, + { + "epoch": 0.53, + "learning_rate": 1.434156619454524e-07, + "logits/chosen": -3.6277854442596436, + "logits/rejected": -3.4555397033691406, + "logps/chosen": -251.6851043701172, + "logps/rejected": -263.46002197265625, + "loss": 0.3772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.504522979259491, + "rewards/margins": 1.3603379726409912, + "rewards/rejected": -1.864861011505127, + "step": 4590 + }, + { + "epoch": 0.53, + "learning_rate": 1.4338054547582817e-07, + "logits/chosen": -4.043933868408203, + "logits/rejected": -3.6875576972961426, + "logps/chosen": -299.49560546875, + "logps/rejected": -191.56935119628906, + "loss": 0.6094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3058225214481354, + "rewards/margins": 0.9476535320281982, + "rewards/rejected": -1.2534761428833008, + "step": 4591 + }, + { + "epoch": 0.53, + "learning_rate": 1.433454290062039e-07, + "logits/chosen": -3.690720558166504, + "logits/rejected": -3.22398042678833, + "logps/chosen": -191.22264099121094, + "logps/rejected": -139.98681640625, + "loss": 0.473, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10526669770479202, + "rewards/margins": 1.5428450107574463, + "rewards/rejected": -1.4375783205032349, + "step": 4592 + }, + { + "epoch": 0.53, + "learning_rate": 1.4331031253657965e-07, + "logits/chosen": -2.88511323928833, + "logits/rejected": -3.0347137451171875, + "logps/chosen": -109.47860717773438, + "logps/rejected": -131.4563446044922, + "loss": 0.3728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36035746335983276, + "rewards/margins": 1.3290297985076904, + "rewards/rejected": -1.689387321472168, + "step": 4593 + }, + { + "epoch": 0.53, + "learning_rate": 1.432751960669554e-07, + "logits/chosen": -2.8215506076812744, + "logits/rejected": -2.836158514022827, + "logps/chosen": -338.61968994140625, + "logps/rejected": -487.50244140625, + "loss": 0.345, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3590484857559204, + "rewards/margins": 2.185894012451172, + "rewards/rejected": -1.826845407485962, + "step": 4594 + }, + { + "epoch": 0.53, + "learning_rate": 1.4324007959733115e-07, + "logits/chosen": -2.957360029220581, + "logits/rejected": -2.9505960941314697, + "logps/chosen": -284.9811706542969, + "logps/rejected": -312.8219299316406, + "loss": 0.2696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11642752587795258, + "rewards/margins": 1.5495460033416748, + "rewards/rejected": -1.4331185817718506, + "step": 4595 + }, + { + "epoch": 0.53, + "learning_rate": 1.4320496312770688e-07, + "logits/chosen": -3.1995177268981934, + "logits/rejected": -3.34700345993042, + "logps/chosen": -114.85945892333984, + "logps/rejected": -203.09185791015625, + "loss": 0.244, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.165593221783638, + "rewards/margins": 2.6160504817962646, + "rewards/rejected": -2.4504568576812744, + "step": 4596 + }, + { + "epoch": 0.53, + "learning_rate": 1.4316984665808264e-07, + "logits/chosen": -2.668905258178711, + "logits/rejected": -2.4016284942626953, + "logps/chosen": -210.7563018798828, + "logps/rejected": -264.86126708984375, + "loss": 0.363, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4332781434059143, + "rewards/margins": 1.7063796520233154, + "rewards/rejected": -2.139657735824585, + "step": 4597 + }, + { + "epoch": 0.53, + "learning_rate": 1.4313473018845836e-07, + "logits/chosen": -3.028956413269043, + "logits/rejected": -3.2863101959228516, + "logps/chosen": -400.9277038574219, + "logps/rejected": -171.89358520507812, + "loss": 0.6773, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1865827590227127, + "rewards/margins": 1.0037997961044312, + "rewards/rejected": -1.1903825998306274, + "step": 4598 + }, + { + "epoch": 0.53, + "learning_rate": 1.4309961371883414e-07, + "logits/chosen": -3.1716747283935547, + "logits/rejected": -3.069523811340332, + "logps/chosen": -265.24066162109375, + "logps/rejected": -233.5712127685547, + "loss": 0.3198, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09300629049539566, + "rewards/margins": 2.3481945991516113, + "rewards/rejected": -2.25518798828125, + "step": 4599 + }, + { + "epoch": 0.53, + "learning_rate": 1.4306449724920987e-07, + "logits/chosen": -3.2458336353302, + "logits/rejected": -3.231771945953369, + "logps/chosen": -184.26431274414062, + "logps/rejected": -215.38148498535156, + "loss": 0.2037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11146067082881927, + "rewards/margins": 2.8260674476623535, + "rewards/rejected": -2.714606761932373, + "step": 4600 + }, + { + "epoch": 0.53, + "learning_rate": 1.4302938077958562e-07, + "logits/chosen": -3.5882365703582764, + "logits/rejected": -3.5013482570648193, + "logps/chosen": -207.10707092285156, + "logps/rejected": -252.5864715576172, + "loss": 0.2717, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11974400281906128, + "rewards/margins": 1.7795013189315796, + "rewards/rejected": -1.659757137298584, + "step": 4601 + }, + { + "epoch": 0.53, + "learning_rate": 1.4299426430996135e-07, + "logits/chosen": -3.222297191619873, + "logits/rejected": -3.1981773376464844, + "logps/chosen": -175.72015380859375, + "logps/rejected": -137.79771423339844, + "loss": 0.7187, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4052913784980774, + "rewards/margins": 0.5377423763275146, + "rewards/rejected": -0.943033754825592, + "step": 4602 + }, + { + "epoch": 0.53, + "learning_rate": 1.429591478403371e-07, + "logits/chosen": -3.5260543823242188, + "logits/rejected": -3.3549399375915527, + "logps/chosen": -399.3951416015625, + "logps/rejected": -342.8174133300781, + "loss": 0.4493, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30068516731262207, + "rewards/margins": 0.9108918905258179, + "rewards/rejected": -1.21157705783844, + "step": 4603 + }, + { + "epoch": 0.53, + "learning_rate": 1.4292403137071286e-07, + "logits/chosen": -3.377521276473999, + "logits/rejected": -3.3984177112579346, + "logps/chosen": -287.64056396484375, + "logps/rejected": -319.78948974609375, + "loss": 0.3835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.026783868670463562, + "rewards/margins": 1.526104211807251, + "rewards/rejected": -1.552888035774231, + "step": 4604 + }, + { + "epoch": 0.53, + "learning_rate": 1.4288891490108861e-07, + "logits/chosen": -2.391757011413574, + "logits/rejected": -2.209441661834717, + "logps/chosen": -65.5755386352539, + "logps/rejected": -131.6504669189453, + "loss": 0.3651, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11203740537166595, + "rewards/margins": 0.9377835988998413, + "rewards/rejected": -0.8257461786270142, + "step": 4605 + }, + { + "epoch": 0.53, + "learning_rate": 1.4285379843146434e-07, + "logits/chosen": -3.7533528804779053, + "logits/rejected": -4.035670280456543, + "logps/chosen": -314.9356994628906, + "logps/rejected": -307.5156555175781, + "loss": 0.3608, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2547226548194885, + "rewards/margins": 1.7278378009796143, + "rewards/rejected": -1.982560396194458, + "step": 4606 + }, + { + "epoch": 0.53, + "learning_rate": 1.428186819618401e-07, + "logits/chosen": -4.130621910095215, + "logits/rejected": -3.7439889907836914, + "logps/chosen": -272.05084228515625, + "logps/rejected": -244.03213500976562, + "loss": 0.3218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.021763205528259277, + "rewards/margins": 2.253709077835083, + "rewards/rejected": -2.2754721641540527, + "step": 4607 + }, + { + "epoch": 0.53, + "learning_rate": 1.4278356549221585e-07, + "logits/chosen": -2.8559913635253906, + "logits/rejected": -3.159079074859619, + "logps/chosen": -226.8187255859375, + "logps/rejected": -211.24705505371094, + "loss": 0.4086, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.054330699145793915, + "rewards/margins": 0.9295872449874878, + "rewards/rejected": -0.8752565383911133, + "step": 4608 + }, + { + "epoch": 0.53, + "learning_rate": 1.4274844902259158e-07, + "logits/chosen": -2.7460527420043945, + "logits/rejected": -2.7483699321746826, + "logps/chosen": -335.0164489746094, + "logps/rejected": -196.95849609375, + "loss": 0.4616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5476331114768982, + "rewards/margins": 1.0728462934494019, + "rewards/rejected": -1.6204794645309448, + "step": 4609 + }, + { + "epoch": 0.53, + "learning_rate": 1.4271333255296733e-07, + "logits/chosen": -3.693007469177246, + "logits/rejected": -3.4511523246765137, + "logps/chosen": -166.8677978515625, + "logps/rejected": -145.77413940429688, + "loss": 0.4142, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.025151312351226807, + "rewards/margins": 1.8659749031066895, + "rewards/rejected": -1.8911261558532715, + "step": 4610 + }, + { + "epoch": 0.53, + "learning_rate": 1.4267821608334308e-07, + "logits/chosen": -2.1685128211975098, + "logits/rejected": -2.5386693477630615, + "logps/chosen": -185.9340057373047, + "logps/rejected": -172.3708953857422, + "loss": 0.6493, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36689960956573486, + "rewards/margins": 0.34803205728530884, + "rewards/rejected": -0.7149317264556885, + "step": 4611 + }, + { + "epoch": 0.53, + "learning_rate": 1.4264309961371884e-07, + "logits/chosen": -2.4270427227020264, + "logits/rejected": -2.670897960662842, + "logps/chosen": -163.20852661132812, + "logps/rejected": -197.865234375, + "loss": 0.2082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2065826654434204, + "rewards/margins": 1.8135071992874146, + "rewards/rejected": -2.020089864730835, + "step": 4612 + }, + { + "epoch": 0.53, + "learning_rate": 1.4260798314409457e-07, + "logits/chosen": -3.1261074542999268, + "logits/rejected": -3.315230369567871, + "logps/chosen": -208.6081085205078, + "logps/rejected": -170.54977416992188, + "loss": 0.3615, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31942033767700195, + "rewards/margins": 1.8757269382476807, + "rewards/rejected": -2.1951472759246826, + "step": 4613 + }, + { + "epoch": 0.53, + "learning_rate": 1.4257286667447032e-07, + "logits/chosen": -4.088421821594238, + "logits/rejected": -3.9290683269500732, + "logps/chosen": -199.40283203125, + "logps/rejected": -179.57009887695312, + "loss": 0.48, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18187138438224792, + "rewards/margins": 0.900641679763794, + "rewards/rejected": -1.0825130939483643, + "step": 4614 + }, + { + "epoch": 0.53, + "learning_rate": 1.4253775020484607e-07, + "logits/chosen": -2.104647636413574, + "logits/rejected": -2.253892421722412, + "logps/chosen": -294.1556091308594, + "logps/rejected": -257.3692321777344, + "loss": 0.2571, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3456215560436249, + "rewards/margins": 2.419269323348999, + "rewards/rejected": -2.0736477375030518, + "step": 4615 + }, + { + "epoch": 0.53, + "learning_rate": 1.4250263373522183e-07, + "logits/chosen": -3.243697166442871, + "logits/rejected": -3.0998575687408447, + "logps/chosen": -312.909423828125, + "logps/rejected": -350.56201171875, + "loss": 0.2439, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21444807946681976, + "rewards/margins": 2.4560890197753906, + "rewards/rejected": -2.241641044616699, + "step": 4616 + }, + { + "epoch": 0.53, + "learning_rate": 1.4246751726559755e-07, + "logits/chosen": -2.850947141647339, + "logits/rejected": -3.095313787460327, + "logps/chosen": -201.74822998046875, + "logps/rejected": -220.62673950195312, + "loss": 0.2127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014567950740456581, + "rewards/margins": 2.471315622329712, + "rewards/rejected": -2.4858834743499756, + "step": 4617 + }, + { + "epoch": 0.53, + "learning_rate": 1.424324007959733e-07, + "logits/chosen": -2.916757583618164, + "logits/rejected": -3.2335076332092285, + "logps/chosen": -254.91783142089844, + "logps/rejected": -292.67095947265625, + "loss": 0.2285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15013496577739716, + "rewards/margins": 2.0953125953674316, + "rewards/rejected": -2.2454476356506348, + "step": 4618 + }, + { + "epoch": 0.53, + "learning_rate": 1.4239728432634904e-07, + "logits/chosen": -3.587071180343628, + "logits/rejected": -3.352964401245117, + "logps/chosen": -162.939208984375, + "logps/rejected": -234.41358947753906, + "loss": 0.4668, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12905755639076233, + "rewards/margins": 1.4858365058898926, + "rewards/rejected": -1.6148940324783325, + "step": 4619 + }, + { + "epoch": 0.53, + "learning_rate": 1.423621678567248e-07, + "logits/chosen": -3.0941572189331055, + "logits/rejected": -3.323101282119751, + "logps/chosen": -304.724609375, + "logps/rejected": -304.7298583984375, + "loss": 0.8569, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23589180409908295, + "rewards/margins": 1.6888132095336914, + "rewards/rejected": -1.924704909324646, + "step": 4620 + }, + { + "epoch": 0.53, + "learning_rate": 1.4232705138710054e-07, + "logits/chosen": -3.1968417167663574, + "logits/rejected": -3.2664029598236084, + "logps/chosen": -250.17294311523438, + "logps/rejected": -257.6095275878906, + "loss": 0.4558, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31729021668434143, + "rewards/margins": 0.8636941313743591, + "rewards/rejected": -1.1809842586517334, + "step": 4621 + }, + { + "epoch": 0.53, + "learning_rate": 1.422919349174763e-07, + "logits/chosen": -2.7375307083129883, + "logits/rejected": -2.786038398742676, + "logps/chosen": -202.6448211669922, + "logps/rejected": -323.6896057128906, + "loss": 0.2909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6523049473762512, + "rewards/margins": 1.5118284225463867, + "rewards/rejected": -0.8595235347747803, + "step": 4622 + }, + { + "epoch": 0.53, + "learning_rate": 1.4225681844785202e-07, + "logits/chosen": -2.834411144256592, + "logits/rejected": -2.951763153076172, + "logps/chosen": -204.31112670898438, + "logps/rejected": -191.3324737548828, + "loss": 0.676, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3486764132976532, + "rewards/margins": 0.4741789400577545, + "rewards/rejected": -0.8228553533554077, + "step": 4623 + }, + { + "epoch": 0.53, + "learning_rate": 1.4222170197822778e-07, + "logits/chosen": -3.4489760398864746, + "logits/rejected": -3.5248024463653564, + "logps/chosen": -195.9695587158203, + "logps/rejected": -247.38052368164062, + "loss": 0.3433, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3724173605442047, + "rewards/margins": 1.3800979852676392, + "rewards/rejected": -1.0076805353164673, + "step": 4624 + }, + { + "epoch": 0.53, + "learning_rate": 1.4218658550860353e-07, + "logits/chosen": -3.6186683177948, + "logits/rejected": -3.325660467147827, + "logps/chosen": -222.59072875976562, + "logps/rejected": -133.26699829101562, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6611917018890381, + "rewards/margins": 0.5636796951293945, + "rewards/rejected": -1.2248713970184326, + "step": 4625 + }, + { + "epoch": 0.53, + "learning_rate": 1.4215146903897926e-07, + "logits/chosen": -3.1923749446868896, + "logits/rejected": -3.6259548664093018, + "logps/chosen": -220.82205200195312, + "logps/rejected": -204.48068237304688, + "loss": 0.6344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2766581177711487, + "rewards/margins": 0.924058735370636, + "rewards/rejected": -1.2007167339324951, + "step": 4626 + }, + { + "epoch": 0.53, + "learning_rate": 1.42116352569355e-07, + "logits/chosen": -2.1793856620788574, + "logits/rejected": -2.2384564876556396, + "logps/chosen": -143.64585876464844, + "logps/rejected": -208.00254821777344, + "loss": 0.6703, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5607753396034241, + "rewards/margins": 1.1841896772384644, + "rewards/rejected": -1.7449650764465332, + "step": 4627 + }, + { + "epoch": 0.53, + "learning_rate": 1.4208123609973077e-07, + "logits/chosen": -3.1614902019500732, + "logits/rejected": -3.391058921813965, + "logps/chosen": -219.74337768554688, + "logps/rejected": -257.330810546875, + "loss": 0.2489, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05867205560207367, + "rewards/margins": 1.9495136737823486, + "rewards/rejected": -2.008185863494873, + "step": 4628 + }, + { + "epoch": 0.53, + "learning_rate": 1.4204611963010652e-07, + "logits/chosen": -3.5170469284057617, + "logits/rejected": -3.4494740962982178, + "logps/chosen": -145.37135314941406, + "logps/rejected": -229.75003051757812, + "loss": 0.3401, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1338469535112381, + "rewards/margins": 1.5853161811828613, + "rewards/rejected": -1.4514694213867188, + "step": 4629 + }, + { + "epoch": 0.53, + "learning_rate": 1.4201100316048225e-07, + "logits/chosen": -3.2329773902893066, + "logits/rejected": -3.286543369293213, + "logps/chosen": -181.6484375, + "logps/rejected": -189.28201293945312, + "loss": 0.4321, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.346297025680542, + "rewards/margins": 1.6266545057296753, + "rewards/rejected": -1.9729516506195068, + "step": 4630 + }, + { + "epoch": 0.53, + "learning_rate": 1.41975886690858e-07, + "logits/chosen": -3.439216136932373, + "logits/rejected": -3.405200481414795, + "logps/chosen": -211.86582946777344, + "logps/rejected": -310.40289306640625, + "loss": 0.3858, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23384466767311096, + "rewards/margins": 1.7470457553863525, + "rewards/rejected": -1.5132009983062744, + "step": 4631 + }, + { + "epoch": 0.53, + "learning_rate": 1.4194077022123376e-07, + "logits/chosen": -4.04298210144043, + "logits/rejected": -3.778897762298584, + "logps/chosen": -132.88592529296875, + "logps/rejected": -156.7513885498047, + "loss": 0.4545, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24449820816516876, + "rewards/margins": 0.9277173280715942, + "rewards/rejected": -1.1722155809402466, + "step": 4632 + }, + { + "epoch": 0.53, + "learning_rate": 1.419056537516095e-07, + "logits/chosen": -3.053957462310791, + "logits/rejected": -3.1718177795410156, + "logps/chosen": -169.68153381347656, + "logps/rejected": -204.7326202392578, + "loss": 0.3637, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.052358999848365784, + "rewards/margins": 1.3198165893554688, + "rewards/rejected": -1.2674574851989746, + "step": 4633 + }, + { + "epoch": 0.53, + "learning_rate": 1.4187053728198524e-07, + "logits/chosen": -3.364145278930664, + "logits/rejected": -3.5119853019714355, + "logps/chosen": -247.12188720703125, + "logps/rejected": -287.3652038574219, + "loss": 0.6167, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1498367339372635, + "rewards/margins": 0.45358219742774963, + "rewards/rejected": -0.6034189462661743, + "step": 4634 + }, + { + "epoch": 0.53, + "learning_rate": 1.41835420812361e-07, + "logits/chosen": -2.6197762489318848, + "logits/rejected": -2.4730117321014404, + "logps/chosen": -264.7226257324219, + "logps/rejected": -211.59378051757812, + "loss": 0.1728, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7698895931243896, + "rewards/margins": 2.7609078884124756, + "rewards/rejected": -1.991018295288086, + "step": 4635 + }, + { + "epoch": 0.53, + "learning_rate": 1.4180030434273674e-07, + "logits/chosen": -2.725533962249756, + "logits/rejected": -2.8058531284332275, + "logps/chosen": -171.79757690429688, + "logps/rejected": -219.75584411621094, + "loss": 0.3394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1167459487915039, + "rewards/margins": 2.7010581493377686, + "rewards/rejected": -2.8178038597106934, + "step": 4636 + }, + { + "epoch": 0.53, + "learning_rate": 1.4176518787311247e-07, + "logits/chosen": -3.052982807159424, + "logits/rejected": -3.0893144607543945, + "logps/chosen": -255.37319946289062, + "logps/rejected": -334.57379150390625, + "loss": 0.9105, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4372723400592804, + "rewards/margins": 1.8997199535369873, + "rewards/rejected": -2.3369922637939453, + "step": 4637 + }, + { + "epoch": 0.53, + "learning_rate": 1.4173007140348823e-07, + "logits/chosen": -2.5956056118011475, + "logits/rejected": -2.904731035232544, + "logps/chosen": -249.20111083984375, + "logps/rejected": -245.632568359375, + "loss": 0.7877, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4067867398262024, + "rewards/margins": 1.210875153541565, + "rewards/rejected": -1.617661952972412, + "step": 4638 + }, + { + "epoch": 0.53, + "learning_rate": 1.4169495493386398e-07, + "logits/chosen": -3.4260201454162598, + "logits/rejected": -3.7473998069763184, + "logps/chosen": -214.485595703125, + "logps/rejected": -365.0674743652344, + "loss": 0.2743, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011061137542128563, + "rewards/margins": 3.0875535011291504, + "rewards/rejected": -3.0986146926879883, + "step": 4639 + }, + { + "epoch": 0.53, + "learning_rate": 1.4165983846423973e-07, + "logits/chosen": -2.5056331157684326, + "logits/rejected": -2.765486240386963, + "logps/chosen": -423.15118408203125, + "logps/rejected": -337.60528564453125, + "loss": 0.2667, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.008405506610870361, + "rewards/margins": 2.8290276527404785, + "rewards/rejected": -2.820622205734253, + "step": 4640 + }, + { + "epoch": 0.54, + "learning_rate": 1.4162472199461546e-07, + "logits/chosen": -3.0209665298461914, + "logits/rejected": -2.8075129985809326, + "logps/chosen": -334.4495849609375, + "logps/rejected": -214.6701202392578, + "loss": 0.3539, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49488046765327454, + "rewards/margins": 1.2564538717269897, + "rewards/rejected": -1.7513344287872314, + "step": 4641 + }, + { + "epoch": 0.54, + "learning_rate": 1.4158960552499122e-07, + "logits/chosen": -3.435163974761963, + "logits/rejected": -3.3027267456054688, + "logps/chosen": -325.6151123046875, + "logps/rejected": -243.59970092773438, + "loss": 0.7463, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48360806703567505, + "rewards/margins": 0.5647733211517334, + "rewards/rejected": -1.0483814477920532, + "step": 4642 + }, + { + "epoch": 0.54, + "learning_rate": 1.4155448905536694e-07, + "logits/chosen": -3.4138598442077637, + "logits/rejected": -3.23895525932312, + "logps/chosen": -274.93743896484375, + "logps/rejected": -279.0364990234375, + "loss": 0.4294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5230594873428345, + "rewards/margins": 2.832472801208496, + "rewards/rejected": -3.35553240776062, + "step": 4643 + }, + { + "epoch": 0.54, + "learning_rate": 1.4151937258574272e-07, + "logits/chosen": -2.8412296772003174, + "logits/rejected": -2.6380763053894043, + "logps/chosen": -221.533447265625, + "logps/rejected": -192.26138305664062, + "loss": 0.4012, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3895959258079529, + "rewards/margins": 1.2296205759048462, + "rewards/rejected": -0.8400247097015381, + "step": 4644 + }, + { + "epoch": 0.54, + "learning_rate": 1.4148425611611845e-07, + "logits/chosen": -3.062307119369507, + "logits/rejected": -3.032033920288086, + "logps/chosen": -183.82199096679688, + "logps/rejected": -188.78472900390625, + "loss": 0.3519, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.007539689540863037, + "rewards/margins": 1.270946979522705, + "rewards/rejected": -1.2634073495864868, + "step": 4645 + }, + { + "epoch": 0.54, + "learning_rate": 1.414491396464942e-07, + "logits/chosen": -3.6543631553649902, + "logits/rejected": -3.7638378143310547, + "logps/chosen": -177.5653076171875, + "logps/rejected": -213.0252227783203, + "loss": 0.2569, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.321727991104126, + "rewards/margins": 3.050652265548706, + "rewards/rejected": -2.728924512863159, + "step": 4646 + }, + { + "epoch": 0.54, + "learning_rate": 1.4141402317686993e-07, + "logits/chosen": -3.4963760375976562, + "logits/rejected": -3.193756103515625, + "logps/chosen": -186.1361541748047, + "logps/rejected": -177.0031280517578, + "loss": 1.0611, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0598514080047607, + "rewards/margins": 0.10249119997024536, + "rewards/rejected": -1.1623425483703613, + "step": 4647 + }, + { + "epoch": 0.54, + "learning_rate": 1.413789067072457e-07, + "logits/chosen": -3.4884800910949707, + "logits/rejected": -3.5109357833862305, + "logps/chosen": -266.54388427734375, + "logps/rejected": -261.41131591796875, + "loss": 0.596, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20411443710327148, + "rewards/margins": 1.427620530128479, + "rewards/rejected": -1.6317349672317505, + "step": 4648 + }, + { + "epoch": 0.54, + "learning_rate": 1.4134379023762144e-07, + "logits/chosen": -3.406165838241577, + "logits/rejected": -3.526381015777588, + "logps/chosen": -105.44722747802734, + "logps/rejected": -140.10626220703125, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1854010820388794, + "rewards/margins": 1.8816380500793457, + "rewards/rejected": -1.6962368488311768, + "step": 4649 + }, + { + "epoch": 0.54, + "learning_rate": 1.413086737679972e-07, + "logits/chosen": -3.075880765914917, + "logits/rejected": -3.1867194175720215, + "logps/chosen": -226.75888061523438, + "logps/rejected": -270.6036376953125, + "loss": 0.6404, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21310710906982422, + "rewards/margins": 1.0599406957626343, + "rewards/rejected": -1.273047685623169, + "step": 4650 + }, + { + "epoch": 0.54, + "learning_rate": 1.4127355729837292e-07, + "logits/chosen": -2.968837022781372, + "logits/rejected": -2.864225387573242, + "logps/chosen": -276.084716796875, + "logps/rejected": -237.89459228515625, + "loss": 0.5462, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11995664238929749, + "rewards/margins": 0.638728141784668, + "rewards/rejected": -0.7586847543716431, + "step": 4651 + }, + { + "epoch": 0.54, + "learning_rate": 1.4123844082874867e-07, + "logits/chosen": -3.6682209968566895, + "logits/rejected": -3.6186184883117676, + "logps/chosen": -168.29066467285156, + "logps/rejected": -130.3525390625, + "loss": 0.4596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44341227412223816, + "rewards/margins": 1.503288984298706, + "rewards/rejected": -1.946701169013977, + "step": 4652 + }, + { + "epoch": 0.54, + "learning_rate": 1.4120332435912443e-07, + "logits/chosen": -2.894619941711426, + "logits/rejected": -2.8599884510040283, + "logps/chosen": -150.656005859375, + "logps/rejected": -185.25869750976562, + "loss": 0.3589, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08228855580091476, + "rewards/margins": 2.2661499977111816, + "rewards/rejected": -2.3484387397766113, + "step": 4653 + }, + { + "epoch": 0.54, + "learning_rate": 1.4116820788950016e-07, + "logits/chosen": -4.018063068389893, + "logits/rejected": -3.637143611907959, + "logps/chosen": -270.9552917480469, + "logps/rejected": -244.3907470703125, + "loss": 0.2357, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04697135090827942, + "rewards/margins": 2.9214859008789062, + "rewards/rejected": -2.9684574604034424, + "step": 4654 + }, + { + "epoch": 0.54, + "learning_rate": 1.411330914198759e-07, + "logits/chosen": -2.8683011531829834, + "logits/rejected": -2.807751178741455, + "logps/chosen": -132.15916442871094, + "logps/rejected": -129.3094482421875, + "loss": 0.473, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3225565552711487, + "rewards/margins": 1.3265774250030518, + "rewards/rejected": -1.0040208101272583, + "step": 4655 + }, + { + "epoch": 0.54, + "learning_rate": 1.4109797495025166e-07, + "logits/chosen": -2.4702959060668945, + "logits/rejected": -2.5995278358459473, + "logps/chosen": -285.7110595703125, + "logps/rejected": -237.1140594482422, + "loss": 0.3736, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1175636500120163, + "rewards/margins": 1.5881903171539307, + "rewards/rejected": -1.4706265926361084, + "step": 4656 + }, + { + "epoch": 0.54, + "learning_rate": 1.4106285848062742e-07, + "logits/chosen": -3.650271415710449, + "logits/rejected": -3.368236541748047, + "logps/chosen": -250.63536071777344, + "logps/rejected": -226.20860290527344, + "loss": 0.3522, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35954898595809937, + "rewards/margins": 1.8012950420379639, + "rewards/rejected": -2.160843849182129, + "step": 4657 + }, + { + "epoch": 0.54, + "learning_rate": 1.4102774201100314e-07, + "logits/chosen": -2.871408700942993, + "logits/rejected": -3.0128707885742188, + "logps/chosen": -254.10948181152344, + "logps/rejected": -275.2237243652344, + "loss": 0.4799, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2701619565486908, + "rewards/margins": 1.6599647998809814, + "rewards/rejected": -1.3898028135299683, + "step": 4658 + }, + { + "epoch": 0.54, + "learning_rate": 1.409926255413789e-07, + "logits/chosen": -2.9990861415863037, + "logits/rejected": -2.586759328842163, + "logps/chosen": -212.7902374267578, + "logps/rejected": -194.28961181640625, + "loss": 0.2141, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29550760984420776, + "rewards/margins": 2.437692642211914, + "rewards/rejected": -2.1421849727630615, + "step": 4659 + }, + { + "epoch": 0.54, + "learning_rate": 1.4095750907175465e-07, + "logits/chosen": -2.8536975383758545, + "logits/rejected": -2.702028751373291, + "logps/chosen": -308.3877258300781, + "logps/rejected": -301.6170959472656, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08953350782394409, + "rewards/margins": 1.291090726852417, + "rewards/rejected": -1.3806240558624268, + "step": 4660 + }, + { + "epoch": 0.54, + "learning_rate": 1.409223926021304e-07, + "logits/chosen": -2.7628026008605957, + "logits/rejected": -3.0002951622009277, + "logps/chosen": -350.18115234375, + "logps/rejected": -236.19769287109375, + "loss": 0.3732, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07502049207687378, + "rewards/margins": 1.2234488725662231, + "rewards/rejected": -1.2984693050384521, + "step": 4661 + }, + { + "epoch": 0.54, + "learning_rate": 1.4088727613250613e-07, + "logits/chosen": -3.0279855728149414, + "logits/rejected": -3.0245797634124756, + "logps/chosen": -317.2729187011719, + "logps/rejected": -235.84738159179688, + "loss": 0.4762, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09522318840026855, + "rewards/margins": 1.5873258113861084, + "rewards/rejected": -1.682548999786377, + "step": 4662 + }, + { + "epoch": 0.54, + "learning_rate": 1.408521596628819e-07, + "logits/chosen": -2.8225667476654053, + "logits/rejected": -2.847048282623291, + "logps/chosen": -185.09768676757812, + "logps/rejected": -231.48963928222656, + "loss": 0.4843, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1774609088897705, + "rewards/margins": 1.846146821975708, + "rewards/rejected": -1.6686859130859375, + "step": 4663 + }, + { + "epoch": 0.54, + "learning_rate": 1.4081704319325764e-07, + "logits/chosen": -3.5010719299316406, + "logits/rejected": -3.5061142444610596, + "logps/chosen": -252.60552978515625, + "logps/rejected": -257.0217590332031, + "loss": 0.7429, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.903741717338562, + "rewards/margins": 0.6754860877990723, + "rewards/rejected": -1.5792276859283447, + "step": 4664 + }, + { + "epoch": 0.54, + "learning_rate": 1.407819267236334e-07, + "logits/chosen": -2.9090185165405273, + "logits/rejected": -2.731482744216919, + "logps/chosen": -144.25259399414062, + "logps/rejected": -199.80506896972656, + "loss": 0.3782, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35093986988067627, + "rewards/margins": 1.4023282527923584, + "rewards/rejected": -1.0513883829116821, + "step": 4665 + }, + { + "epoch": 0.54, + "learning_rate": 1.4074681025400912e-07, + "logits/chosen": -2.934305191040039, + "logits/rejected": -2.8084001541137695, + "logps/chosen": -231.37936401367188, + "logps/rejected": -273.19134521484375, + "loss": 0.2143, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22984401881694794, + "rewards/margins": 2.327094316482544, + "rewards/rejected": -2.097250461578369, + "step": 4666 + }, + { + "epoch": 0.54, + "learning_rate": 1.4071169378438488e-07, + "logits/chosen": -2.7702255249023438, + "logits/rejected": -2.746880292892456, + "logps/chosen": -242.52320861816406, + "logps/rejected": -218.83184814453125, + "loss": 0.3993, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11429911106824875, + "rewards/margins": 1.9123032093048096, + "rewards/rejected": -1.798004150390625, + "step": 4667 + }, + { + "epoch": 0.54, + "learning_rate": 1.406765773147606e-07, + "logits/chosen": -3.2397429943084717, + "logits/rejected": -3.1407721042633057, + "logps/chosen": -246.2645263671875, + "logps/rejected": -201.72218322753906, + "loss": 0.4119, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10836967080831528, + "rewards/margins": 0.9455664157867432, + "rewards/rejected": -1.0539361238479614, + "step": 4668 + }, + { + "epoch": 0.54, + "learning_rate": 1.4064146084513636e-07, + "logits/chosen": -3.1567304134368896, + "logits/rejected": -3.0415146350860596, + "logps/chosen": -297.19036865234375, + "logps/rejected": -330.3018798828125, + "loss": 0.4335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2226795256137848, + "rewards/margins": 0.91374671459198, + "rewards/rejected": -1.136426329612732, + "step": 4669 + }, + { + "epoch": 0.54, + "learning_rate": 1.406063443755121e-07, + "logits/chosen": -3.565927743911743, + "logits/rejected": -3.1886425018310547, + "logps/chosen": -293.0825500488281, + "logps/rejected": -215.7290802001953, + "loss": 0.2989, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19525247812271118, + "rewards/margins": 2.457744836807251, + "rewards/rejected": -2.2624924182891846, + "step": 4670 + }, + { + "epoch": 0.54, + "learning_rate": 1.4057122790588784e-07, + "logits/chosen": -2.6605098247528076, + "logits/rejected": -2.2641890048980713, + "logps/chosen": -243.30657958984375, + "logps/rejected": -273.71002197265625, + "loss": 0.152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2926764190196991, + "rewards/margins": 2.197005271911621, + "rewards/rejected": -2.4896817207336426, + "step": 4671 + }, + { + "epoch": 0.54, + "learning_rate": 1.405361114362636e-07, + "logits/chosen": -2.384458541870117, + "logits/rejected": -2.525826930999756, + "logps/chosen": -319.62860107421875, + "logps/rejected": -322.22601318359375, + "loss": 0.2194, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4804350733757019, + "rewards/margins": 2.4036972522735596, + "rewards/rejected": -1.923262357711792, + "step": 4672 + }, + { + "epoch": 0.54, + "learning_rate": 1.4050099496663935e-07, + "logits/chosen": -3.8208842277526855, + "logits/rejected": -3.843156337738037, + "logps/chosen": -268.55963134765625, + "logps/rejected": -259.74188232421875, + "loss": 0.1373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2539820671081543, + "rewards/margins": 3.4758763313293457, + "rewards/rejected": -3.7298583984375, + "step": 4673 + }, + { + "epoch": 0.54, + "learning_rate": 1.404658784970151e-07, + "logits/chosen": -2.485898733139038, + "logits/rejected": -2.3794684410095215, + "logps/chosen": -439.56707763671875, + "logps/rejected": -417.7510986328125, + "loss": 0.3646, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37318718433380127, + "rewards/margins": 1.3578860759735107, + "rewards/rejected": -0.9846988916397095, + "step": 4674 + }, + { + "epoch": 0.54, + "learning_rate": 1.4043076202739083e-07, + "logits/chosen": -3.1545543670654297, + "logits/rejected": -3.2717788219451904, + "logps/chosen": -230.1149139404297, + "logps/rejected": -251.31527709960938, + "loss": 0.5655, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009678855538368225, + "rewards/margins": 2.41192364692688, + "rewards/rejected": -2.402245044708252, + "step": 4675 + }, + { + "epoch": 0.54, + "learning_rate": 1.4039564555776658e-07, + "logits/chosen": -2.5920634269714355, + "logits/rejected": -2.561222553253174, + "logps/chosen": -245.43800354003906, + "logps/rejected": -330.5764465332031, + "loss": 0.4574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10159578919410706, + "rewards/margins": 2.0351943969726562, + "rewards/rejected": -2.1367902755737305, + "step": 4676 + }, + { + "epoch": 0.54, + "learning_rate": 1.4036052908814234e-07, + "logits/chosen": -3.501150608062744, + "logits/rejected": -3.8247456550598145, + "logps/chosen": -160.11151123046875, + "logps/rejected": -229.1749267578125, + "loss": 0.4396, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.023726943880319595, + "rewards/margins": 1.1383543014526367, + "rewards/rejected": -1.162081241607666, + "step": 4677 + }, + { + "epoch": 0.54, + "learning_rate": 1.403254126185181e-07, + "logits/chosen": -3.170712947845459, + "logits/rejected": -2.811497926712036, + "logps/chosen": -148.25515747070312, + "logps/rejected": -279.9303894042969, + "loss": 0.7245, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2003413587808609, + "rewards/margins": 0.8889018297195435, + "rewards/rejected": -1.0892431735992432, + "step": 4678 + }, + { + "epoch": 0.54, + "learning_rate": 1.4029029614889382e-07, + "logits/chosen": -2.363435745239258, + "logits/rejected": -2.544154644012451, + "logps/chosen": -309.96612548828125, + "logps/rejected": -437.32122802734375, + "loss": 0.5681, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3384389877319336, + "rewards/margins": 1.9136712551116943, + "rewards/rejected": -2.252110242843628, + "step": 4679 + }, + { + "epoch": 0.54, + "learning_rate": 1.4025517967926957e-07, + "logits/chosen": -3.5231094360351562, + "logits/rejected": -3.4379916191101074, + "logps/chosen": -195.85107421875, + "logps/rejected": -163.14642333984375, + "loss": 0.9995, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7417532205581665, + "rewards/margins": 0.7241140007972717, + "rewards/rejected": -1.465867280960083, + "step": 4680 + }, + { + "epoch": 0.54, + "learning_rate": 1.4022006320964532e-07, + "logits/chosen": -3.1691508293151855, + "logits/rejected": -3.076690435409546, + "logps/chosen": -440.95489501953125, + "logps/rejected": -501.3960266113281, + "loss": 0.357, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11357621848583221, + "rewards/margins": 1.9033855199813843, + "rewards/rejected": -2.0169615745544434, + "step": 4681 + }, + { + "epoch": 0.54, + "learning_rate": 1.4018494674002108e-07, + "logits/chosen": -1.9653334617614746, + "logits/rejected": -2.0161287784576416, + "logps/chosen": -425.2652587890625, + "logps/rejected": -312.5126037597656, + "loss": 0.3806, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23846927285194397, + "rewards/margins": 1.0259150266647339, + "rewards/rejected": -0.7874457836151123, + "step": 4682 + }, + { + "epoch": 0.54, + "learning_rate": 1.401498302703968e-07, + "logits/chosen": -2.9213643074035645, + "logits/rejected": -3.507218837738037, + "logps/chosen": -127.2892074584961, + "logps/rejected": -186.74868774414062, + "loss": 0.557, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21131734549999237, + "rewards/margins": 3.1384501457214355, + "rewards/rejected": -3.3497676849365234, + "step": 4683 + }, + { + "epoch": 0.54, + "learning_rate": 1.4011471380077256e-07, + "logits/chosen": -3.2747092247009277, + "logits/rejected": -3.0826008319854736, + "logps/chosen": -193.3975372314453, + "logps/rejected": -153.68399047851562, + "loss": 0.2664, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21724870800971985, + "rewards/margins": 1.5708012580871582, + "rewards/rejected": -1.7880499362945557, + "step": 4684 + }, + { + "epoch": 0.54, + "learning_rate": 1.400795973311483e-07, + "logits/chosen": -2.5389785766601562, + "logits/rejected": -2.555305242538452, + "logps/chosen": -217.6708984375, + "logps/rejected": -218.95616149902344, + "loss": 0.5918, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33195197582244873, + "rewards/margins": 0.6003660559654236, + "rewards/rejected": -0.9323179721832275, + "step": 4685 + }, + { + "epoch": 0.54, + "learning_rate": 1.4004448086152404e-07, + "logits/chosen": -2.608656883239746, + "logits/rejected": -2.7045297622680664, + "logps/chosen": -420.5458984375, + "logps/rejected": -375.48162841796875, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37921661138534546, + "rewards/margins": 0.7585627436637878, + "rewards/rejected": -1.1377793550491333, + "step": 4686 + }, + { + "epoch": 0.54, + "learning_rate": 1.400093643918998e-07, + "logits/chosen": -3.3471102714538574, + "logits/rejected": -3.0839757919311523, + "logps/chosen": -293.6072998046875, + "logps/rejected": -372.4629821777344, + "loss": 0.1094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17730873823165894, + "rewards/margins": 3.623513698577881, + "rewards/rejected": -3.446204662322998, + "step": 4687 + }, + { + "epoch": 0.54, + "learning_rate": 1.3997424792227552e-07, + "logits/chosen": -3.351966142654419, + "logits/rejected": -3.410693407058716, + "logps/chosen": -260.5738220214844, + "logps/rejected": -240.7497100830078, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1470785290002823, + "rewards/margins": 2.271552562713623, + "rewards/rejected": -2.418631076812744, + "step": 4688 + }, + { + "epoch": 0.54, + "learning_rate": 1.399391314526513e-07, + "logits/chosen": -3.0154452323913574, + "logits/rejected": -3.4063873291015625, + "logps/chosen": -231.49441528320312, + "logps/rejected": -323.3878173828125, + "loss": 0.3712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7253485918045044, + "rewards/margins": 2.124648094177246, + "rewards/rejected": -2.849996566772461, + "step": 4689 + }, + { + "epoch": 0.54, + "learning_rate": 1.3990401498302703e-07, + "logits/chosen": -2.568394899368286, + "logits/rejected": -2.6165900230407715, + "logps/chosen": -182.81642150878906, + "logps/rejected": -207.80699157714844, + "loss": 0.1635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009611807763576508, + "rewards/margins": 3.1242194175720215, + "rewards/rejected": -3.13383150100708, + "step": 4690 + }, + { + "epoch": 0.54, + "learning_rate": 1.3986889851340278e-07, + "logits/chosen": -2.581697940826416, + "logits/rejected": -2.884397506713867, + "logps/chosen": -354.98187255859375, + "logps/rejected": -253.52713012695312, + "loss": 0.6134, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22055235505104065, + "rewards/margins": 1.9010350704193115, + "rewards/rejected": -2.1215872764587402, + "step": 4691 + }, + { + "epoch": 0.54, + "learning_rate": 1.398337820437785e-07, + "logits/chosen": -3.029862880706787, + "logits/rejected": -2.9398303031921387, + "logps/chosen": -278.1956481933594, + "logps/rejected": -317.8135681152344, + "loss": 0.4342, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10420399904251099, + "rewards/margins": 1.1984055042266846, + "rewards/rejected": -1.3026095628738403, + "step": 4692 + }, + { + "epoch": 0.54, + "learning_rate": 1.397986655741543e-07, + "logits/chosen": -3.699066162109375, + "logits/rejected": -3.2209718227386475, + "logps/chosen": -431.55755615234375, + "logps/rejected": -289.0267333984375, + "loss": 0.2514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07281896471977234, + "rewards/margins": 2.0119473934173584, + "rewards/rejected": -2.084766387939453, + "step": 4693 + }, + { + "epoch": 0.54, + "learning_rate": 1.3976354910453002e-07, + "logits/chosen": -3.0211410522460938, + "logits/rejected": -2.8240106105804443, + "logps/chosen": -419.4019775390625, + "logps/rejected": -634.1710815429688, + "loss": 0.5182, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10153341293334961, + "rewards/margins": 1.6351358890533447, + "rewards/rejected": -1.5336024761199951, + "step": 4694 + }, + { + "epoch": 0.54, + "learning_rate": 1.3972843263490577e-07, + "logits/chosen": -3.682283639907837, + "logits/rejected": -3.818006992340088, + "logps/chosen": -123.84111022949219, + "logps/rejected": -173.22438049316406, + "loss": 0.5292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5851384401321411, + "rewards/margins": 0.47670483589172363, + "rewards/rejected": -1.0618432760238647, + "step": 4695 + }, + { + "epoch": 0.54, + "learning_rate": 1.396933161652815e-07, + "logits/chosen": -3.3117833137512207, + "logits/rejected": -3.4310519695281982, + "logps/chosen": -213.0489959716797, + "logps/rejected": -242.7095489501953, + "loss": 0.2543, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3056524693965912, + "rewards/margins": 1.458221435546875, + "rewards/rejected": -1.7638740539550781, + "step": 4696 + }, + { + "epoch": 0.54, + "learning_rate": 1.3965819969565725e-07, + "logits/chosen": -3.2982993125915527, + "logits/rejected": -3.339001178741455, + "logps/chosen": -156.62362670898438, + "logps/rejected": -141.66297912597656, + "loss": 0.4612, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2686714231967926, + "rewards/margins": 0.6488252282142639, + "rewards/rejected": -0.9174966812133789, + "step": 4697 + }, + { + "epoch": 0.54, + "learning_rate": 1.39623083226033e-07, + "logits/chosen": -3.4820759296417236, + "logits/rejected": -3.251004219055176, + "logps/chosen": -150.22772216796875, + "logps/rejected": -202.87637329101562, + "loss": 0.444, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2894437313079834, + "rewards/margins": 1.8850950002670288, + "rewards/rejected": -2.1745388507843018, + "step": 4698 + }, + { + "epoch": 0.54, + "learning_rate": 1.3958796675640876e-07, + "logits/chosen": -2.736966609954834, + "logits/rejected": -2.730278491973877, + "logps/chosen": -154.86595153808594, + "logps/rejected": -180.88189697265625, + "loss": 0.3503, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11602663993835449, + "rewards/margins": 1.5858906507492065, + "rewards/rejected": -1.4698638916015625, + "step": 4699 + }, + { + "epoch": 0.54, + "learning_rate": 1.395528502867845e-07, + "logits/chosen": -3.2960777282714844, + "logits/rejected": -3.3512511253356934, + "logps/chosen": -325.6885986328125, + "logps/rejected": -362.6979675292969, + "loss": 0.2672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23892849683761597, + "rewards/margins": 1.7996593713760376, + "rewards/rejected": -1.5607309341430664, + "step": 4700 + }, + { + "epoch": 0.54, + "learning_rate": 1.3951773381716024e-07, + "logits/chosen": -2.905561923980713, + "logits/rejected": -3.065286874771118, + "logps/chosen": -196.11239624023438, + "logps/rejected": -187.29652404785156, + "loss": 0.376, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03573183715343475, + "rewards/margins": 1.6489616632461548, + "rewards/rejected": -1.6132298707962036, + "step": 4701 + }, + { + "epoch": 0.54, + "learning_rate": 1.39482617347536e-07, + "logits/chosen": -2.988740921020508, + "logits/rejected": -2.919459342956543, + "logps/chosen": -140.0156707763672, + "logps/rejected": -194.56871032714844, + "loss": 0.3494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0001882016658782959, + "rewards/margins": 2.126237630844116, + "rewards/rejected": -2.1264257431030273, + "step": 4702 + }, + { + "epoch": 0.54, + "learning_rate": 1.3944750087791172e-07, + "logits/chosen": -4.135402679443359, + "logits/rejected": -4.040011405944824, + "logps/chosen": -203.88528442382812, + "logps/rejected": -225.95120239257812, + "loss": 0.2302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05629311501979828, + "rewards/margins": 2.4498515129089355, + "rewards/rejected": -2.3935585021972656, + "step": 4703 + }, + { + "epoch": 0.54, + "learning_rate": 1.3941238440828748e-07, + "logits/chosen": -3.3013529777526855, + "logits/rejected": -3.5394821166992188, + "logps/chosen": -261.58856201171875, + "logps/rejected": -266.6440734863281, + "loss": 0.2494, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0376463383436203, + "rewards/margins": 2.8416640758514404, + "rewards/rejected": -2.8040177822113037, + "step": 4704 + }, + { + "epoch": 0.54, + "learning_rate": 1.3937726793866323e-07, + "logits/chosen": -3.944586753845215, + "logits/rejected": -3.557952880859375, + "logps/chosen": -454.6298828125, + "logps/rejected": -387.8685302734375, + "loss": 0.291, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2164437472820282, + "rewards/margins": 3.886897087097168, + "rewards/rejected": -4.1033406257629395, + "step": 4705 + }, + { + "epoch": 0.54, + "learning_rate": 1.3934215146903899e-07, + "logits/chosen": -3.022952079772949, + "logits/rejected": -2.8368983268737793, + "logps/chosen": -464.324462890625, + "logps/rejected": -285.7774963378906, + "loss": 0.6515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19956420361995697, + "rewards/margins": 0.834123432636261, + "rewards/rejected": -1.0336875915527344, + "step": 4706 + }, + { + "epoch": 0.54, + "learning_rate": 1.393070349994147e-07, + "logits/chosen": -3.2620513439178467, + "logits/rejected": -3.228442668914795, + "logps/chosen": -401.785888671875, + "logps/rejected": -340.40545654296875, + "loss": 0.2517, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48069310188293457, + "rewards/margins": 2.308873414993286, + "rewards/rejected": -1.8281803131103516, + "step": 4707 + }, + { + "epoch": 0.54, + "learning_rate": 1.3927191852979047e-07, + "logits/chosen": -2.3521177768707275, + "logits/rejected": -2.612572193145752, + "logps/chosen": -168.17086791992188, + "logps/rejected": -160.645751953125, + "loss": 0.4614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16594013571739197, + "rewards/margins": 0.9671852588653564, + "rewards/rejected": -1.1331254243850708, + "step": 4708 + }, + { + "epoch": 0.54, + "learning_rate": 1.3923680206016622e-07, + "logits/chosen": -3.1569342613220215, + "logits/rejected": -3.1778571605682373, + "logps/chosen": -246.5291748046875, + "logps/rejected": -158.7622833251953, + "loss": 0.4864, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20952104032039642, + "rewards/margins": 1.1787139177322388, + "rewards/rejected": -1.3882349729537964, + "step": 4709 + }, + { + "epoch": 0.54, + "learning_rate": 1.3920168559054197e-07, + "logits/chosen": -2.6276261806488037, + "logits/rejected": -2.856135368347168, + "logps/chosen": -382.7586975097656, + "logps/rejected": -225.76675415039062, + "loss": 0.1615, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3186935782432556, + "rewards/margins": 2.545628786087036, + "rewards/rejected": -2.2269351482391357, + "step": 4710 + }, + { + "epoch": 0.54, + "learning_rate": 1.391665691209177e-07, + "logits/chosen": -3.2601020336151123, + "logits/rejected": -3.4754204750061035, + "logps/chosen": -261.68865966796875, + "logps/rejected": -290.8757019042969, + "loss": 0.2334, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.298372358083725, + "rewards/margins": 2.884760856628418, + "rewards/rejected": -2.58638858795166, + "step": 4711 + }, + { + "epoch": 0.54, + "learning_rate": 1.3913145265129346e-07, + "logits/chosen": -2.970379114151001, + "logits/rejected": -2.7609448432922363, + "logps/chosen": -323.9457092285156, + "logps/rejected": -220.30023193359375, + "loss": 0.682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30516403913497925, + "rewards/margins": 1.4469820261001587, + "rewards/rejected": -1.7521458864212036, + "step": 4712 + }, + { + "epoch": 0.54, + "learning_rate": 1.3909633618166918e-07, + "logits/chosen": -2.671915054321289, + "logits/rejected": -2.7442636489868164, + "logps/chosen": -319.6053161621094, + "logps/rejected": -272.1247253417969, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.502479612827301, + "rewards/margins": 2.685864210128784, + "rewards/rejected": -2.183384656906128, + "step": 4713 + }, + { + "epoch": 0.54, + "learning_rate": 1.3906121971204494e-07, + "logits/chosen": -2.9911317825317383, + "logits/rejected": -3.1705493927001953, + "logps/chosen": -156.8232421875, + "logps/rejected": -212.71681213378906, + "loss": 0.4764, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42897093296051025, + "rewards/margins": 1.50592839717865, + "rewards/rejected": -1.0769574642181396, + "step": 4714 + }, + { + "epoch": 0.54, + "learning_rate": 1.390261032424207e-07, + "logits/chosen": -2.5154919624328613, + "logits/rejected": -2.77345871925354, + "logps/chosen": -105.4158935546875, + "logps/rejected": -149.5496826171875, + "loss": 0.2542, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6032509207725525, + "rewards/margins": 2.117307186126709, + "rewards/rejected": -1.5140562057495117, + "step": 4715 + }, + { + "epoch": 0.54, + "learning_rate": 1.3899098677279644e-07, + "logits/chosen": -2.1190104484558105, + "logits/rejected": -2.1633810997009277, + "logps/chosen": -342.3714599609375, + "logps/rejected": -294.890869140625, + "loss": 0.8875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9499032497406006, + "rewards/margins": 0.37363824248313904, + "rewards/rejected": -1.3235414028167725, + "step": 4716 + }, + { + "epoch": 0.54, + "learning_rate": 1.3895587030317217e-07, + "logits/chosen": -2.767604112625122, + "logits/rejected": -2.847093105316162, + "logps/chosen": -362.0243225097656, + "logps/rejected": -306.44097900390625, + "loss": 0.2693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1335172802209854, + "rewards/margins": 1.8291388750076294, + "rewards/rejected": -1.962656021118164, + "step": 4717 + }, + { + "epoch": 0.54, + "learning_rate": 1.3892075383354793e-07, + "logits/chosen": -3.319711923599243, + "logits/rejected": -3.5005030632019043, + "logps/chosen": -156.6175079345703, + "logps/rejected": -214.26315307617188, + "loss": 0.2151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03350120782852173, + "rewards/margins": 1.8960381746292114, + "rewards/rejected": -1.8625370264053345, + "step": 4718 + }, + { + "epoch": 0.54, + "learning_rate": 1.3888563736392368e-07, + "logits/chosen": -2.622309446334839, + "logits/rejected": -2.633680582046509, + "logps/chosen": -233.0819091796875, + "logps/rejected": -139.0605010986328, + "loss": 0.4468, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.040934622287750244, + "rewards/margins": 1.4217700958251953, + "rewards/rejected": -1.3808354139328003, + "step": 4719 + }, + { + "epoch": 0.54, + "learning_rate": 1.388505208942994e-07, + "logits/chosen": -3.418431520462036, + "logits/rejected": -3.195725440979004, + "logps/chosen": -245.3814239501953, + "logps/rejected": -210.36465454101562, + "loss": 0.5691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0925130769610405, + "rewards/margins": 1.1795004606246948, + "rewards/rejected": -1.2720136642456055, + "step": 4720 + }, + { + "epoch": 0.54, + "learning_rate": 1.3881540442467516e-07, + "logits/chosen": -3.151397228240967, + "logits/rejected": -3.3218812942504883, + "logps/chosen": -310.5677490234375, + "logps/rejected": -293.6673278808594, + "loss": 0.2507, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.448203444480896, + "rewards/margins": 2.0137596130371094, + "rewards/rejected": -1.5655561685562134, + "step": 4721 + }, + { + "epoch": 0.54, + "learning_rate": 1.3878028795505091e-07, + "logits/chosen": -3.7744526863098145, + "logits/rejected": -3.5349133014678955, + "logps/chosen": -197.65277099609375, + "logps/rejected": -192.4982147216797, + "loss": 0.4295, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06469686329364777, + "rewards/margins": 1.572541356086731, + "rewards/rejected": -1.5078444480895996, + "step": 4722 + }, + { + "epoch": 0.54, + "learning_rate": 1.3874517148542667e-07, + "logits/chosen": -3.4163730144500732, + "logits/rejected": -3.4043445587158203, + "logps/chosen": -286.1738586425781, + "logps/rejected": -236.41534423828125, + "loss": 0.2187, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23494893312454224, + "rewards/margins": 2.4291608333587646, + "rewards/rejected": -2.664109706878662, + "step": 4723 + }, + { + "epoch": 0.54, + "learning_rate": 1.387100550158024e-07, + "logits/chosen": -2.988279342651367, + "logits/rejected": -2.858722686767578, + "logps/chosen": -286.3377380371094, + "logps/rejected": -289.56256103515625, + "loss": 0.2929, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24002760648727417, + "rewards/margins": 1.9144186973571777, + "rewards/rejected": -1.6743909120559692, + "step": 4724 + }, + { + "epoch": 0.54, + "learning_rate": 1.3867493854617815e-07, + "logits/chosen": -2.806779146194458, + "logits/rejected": -3.0547685623168945, + "logps/chosen": -171.98867797851562, + "logps/rejected": -144.65679931640625, + "loss": 0.4965, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030558064579963684, + "rewards/margins": 1.1799174547195435, + "rewards/rejected": -1.2104755640029907, + "step": 4725 + }, + { + "epoch": 0.54, + "learning_rate": 1.386398220765539e-07, + "logits/chosen": -2.7373886108398438, + "logits/rejected": -2.7626864910125732, + "logps/chosen": -394.13482666015625, + "logps/rejected": -245.0577392578125, + "loss": 0.3135, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5343860387802124, + "rewards/margins": 1.542799949645996, + "rewards/rejected": -2.077185869216919, + "step": 4726 + }, + { + "epoch": 0.54, + "learning_rate": 1.3860470560692966e-07, + "logits/chosen": -3.1168816089630127, + "logits/rejected": -3.2346205711364746, + "logps/chosen": -218.20220947265625, + "logps/rejected": -306.5650939941406, + "loss": 0.1624, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4989706575870514, + "rewards/margins": 2.6829659938812256, + "rewards/rejected": -2.183995246887207, + "step": 4727 + }, + { + "epoch": 0.55, + "learning_rate": 1.3856958913730538e-07, + "logits/chosen": -2.559622049331665, + "logits/rejected": -2.4015250205993652, + "logps/chosen": -172.04736328125, + "logps/rejected": -179.10598754882812, + "loss": 0.6075, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4799600839614868, + "rewards/margins": 1.6612842082977295, + "rewards/rejected": -2.1412441730499268, + "step": 4728 + }, + { + "epoch": 0.55, + "learning_rate": 1.3853447266768114e-07, + "logits/chosen": -2.6120941638946533, + "logits/rejected": -2.642918825149536, + "logps/chosen": -194.12249755859375, + "logps/rejected": -137.72784423828125, + "loss": 0.5036, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23825594782829285, + "rewards/margins": 0.6579334139823914, + "rewards/rejected": -0.8961893320083618, + "step": 4729 + }, + { + "epoch": 0.55, + "learning_rate": 1.384993561980569e-07, + "logits/chosen": -2.778114080429077, + "logits/rejected": -2.629110097885132, + "logps/chosen": -292.0668640136719, + "logps/rejected": -266.2926330566406, + "loss": 0.2706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33000218868255615, + "rewards/margins": 1.7206790447235107, + "rewards/rejected": -2.0506811141967773, + "step": 4730 + }, + { + "epoch": 0.55, + "learning_rate": 1.3846423972843262e-07, + "logits/chosen": -3.38948392868042, + "logits/rejected": -3.453326940536499, + "logps/chosen": -505.832275390625, + "logps/rejected": -402.2357177734375, + "loss": 0.1569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3381009101867676, + "rewards/margins": 2.5125632286071777, + "rewards/rejected": -2.850663900375366, + "step": 4731 + }, + { + "epoch": 0.55, + "learning_rate": 1.3842912325880837e-07, + "logits/chosen": -2.299744129180908, + "logits/rejected": -2.183795690536499, + "logps/chosen": -260.35626220703125, + "logps/rejected": -276.78289794921875, + "loss": 0.6366, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12211322784423828, + "rewards/margins": 0.9472222328186035, + "rewards/rejected": -1.0693355798721313, + "step": 4732 + }, + { + "epoch": 0.55, + "learning_rate": 1.3839400678918413e-07, + "logits/chosen": -3.7000043392181396, + "logits/rejected": -3.2421011924743652, + "logps/chosen": -153.04244995117188, + "logps/rejected": -228.89523315429688, + "loss": 0.4069, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19727803766727448, + "rewards/margins": 1.2480695247650146, + "rewards/rejected": -1.445347547531128, + "step": 4733 + }, + { + "epoch": 0.55, + "learning_rate": 1.3835889031955988e-07, + "logits/chosen": -2.833785057067871, + "logits/rejected": -2.817981719970703, + "logps/chosen": -289.9038391113281, + "logps/rejected": -200.1741180419922, + "loss": 0.6666, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36155134439468384, + "rewards/margins": 0.31675493717193604, + "rewards/rejected": 0.04479638487100601, + "step": 4734 + }, + { + "epoch": 0.55, + "learning_rate": 1.383237738499356e-07, + "logits/chosen": -3.6494758129119873, + "logits/rejected": -3.26887845993042, + "logps/chosen": -315.0583801269531, + "logps/rejected": -188.7802734375, + "loss": 0.3183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35809722542762756, + "rewards/margins": 1.6324986219406128, + "rewards/rejected": -1.9905959367752075, + "step": 4735 + }, + { + "epoch": 0.55, + "learning_rate": 1.3828865738031136e-07, + "logits/chosen": -2.348493814468384, + "logits/rejected": -2.388981342315674, + "logps/chosen": -278.9490051269531, + "logps/rejected": -335.938232421875, + "loss": 0.2205, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5573590993881226, + "rewards/margins": 2.697774887084961, + "rewards/rejected": -2.140415668487549, + "step": 4736 + }, + { + "epoch": 0.55, + "learning_rate": 1.382535409106871e-07, + "logits/chosen": -2.520266056060791, + "logits/rejected": -2.6790950298309326, + "logps/chosen": -131.79978942871094, + "logps/rejected": -258.58502197265625, + "loss": 0.5491, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06394777446985245, + "rewards/margins": 1.9960215091705322, + "rewards/rejected": -1.932073712348938, + "step": 4737 + }, + { + "epoch": 0.55, + "learning_rate": 1.3821842444106287e-07, + "logits/chosen": -3.3347573280334473, + "logits/rejected": -3.3170440196990967, + "logps/chosen": -132.14028930664062, + "logps/rejected": -283.2257080078125, + "loss": 0.1136, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06014992296695709, + "rewards/margins": 3.2007343769073486, + "rewards/rejected": -3.1405844688415527, + "step": 4738 + }, + { + "epoch": 0.55, + "learning_rate": 1.381833079714386e-07, + "logits/chosen": -3.2767345905303955, + "logits/rejected": -3.1271183490753174, + "logps/chosen": -140.1322784423828, + "logps/rejected": -264.0682373046875, + "loss": 0.345, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08590898662805557, + "rewards/margins": 2.190504550933838, + "rewards/rejected": -2.2764134407043457, + "step": 4739 + }, + { + "epoch": 0.55, + "learning_rate": 1.3814819150181435e-07, + "logits/chosen": -3.019453763961792, + "logits/rejected": -2.7211737632751465, + "logps/chosen": -335.6041259765625, + "logps/rejected": -306.0028991699219, + "loss": 0.1029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35958099365234375, + "rewards/margins": 2.7053897380828857, + "rewards/rejected": -2.345808982849121, + "step": 4740 + }, + { + "epoch": 0.55, + "learning_rate": 1.3811307503219008e-07, + "logits/chosen": -3.5259194374084473, + "logits/rejected": -3.7692911624908447, + "logps/chosen": -68.09434509277344, + "logps/rejected": -232.1933135986328, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20137208700180054, + "rewards/margins": 3.4941813945770264, + "rewards/rejected": -3.29280948638916, + "step": 4741 + }, + { + "epoch": 0.55, + "learning_rate": 1.3807795856256583e-07, + "logits/chosen": -3.0738956928253174, + "logits/rejected": -2.8246266841888428, + "logps/chosen": -422.4683837890625, + "logps/rejected": -364.1755676269531, + "loss": 0.5965, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08188116550445557, + "rewards/margins": 1.6663875579833984, + "rewards/rejected": -1.5845062732696533, + "step": 4742 + }, + { + "epoch": 0.55, + "learning_rate": 1.3804284209294159e-07, + "logits/chosen": -3.40942120552063, + "logits/rejected": -3.2601516246795654, + "logps/chosen": -284.62567138671875, + "logps/rejected": -340.7852478027344, + "loss": 0.5613, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23790070414543152, + "rewards/margins": 0.9650171995162964, + "rewards/rejected": -0.7271165251731873, + "step": 4743 + }, + { + "epoch": 0.55, + "learning_rate": 1.3800772562331734e-07, + "logits/chosen": -3.602189302444458, + "logits/rejected": -3.447265625, + "logps/chosen": -302.26953125, + "logps/rejected": -283.555908203125, + "loss": 0.5365, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07298246771097183, + "rewards/margins": 1.5904346704483032, + "rewards/rejected": -1.5174522399902344, + "step": 4744 + }, + { + "epoch": 0.55, + "learning_rate": 1.3797260915369307e-07, + "logits/chosen": -3.6791839599609375, + "logits/rejected": -3.621366500854492, + "logps/chosen": -240.1277313232422, + "logps/rejected": -209.745361328125, + "loss": 0.7993, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3555561602115631, + "rewards/margins": 0.8271989822387695, + "rewards/rejected": -1.1827552318572998, + "step": 4745 + }, + { + "epoch": 0.55, + "learning_rate": 1.3793749268406882e-07, + "logits/chosen": -2.9438323974609375, + "logits/rejected": -3.0170044898986816, + "logps/chosen": -300.80706787109375, + "logps/rejected": -386.40228271484375, + "loss": 0.1835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25196754932403564, + "rewards/margins": 2.5038819313049316, + "rewards/rejected": -2.755849838256836, + "step": 4746 + }, + { + "epoch": 0.55, + "learning_rate": 1.3790237621444458e-07, + "logits/chosen": -3.1970205307006836, + "logits/rejected": -3.5423922538757324, + "logps/chosen": -239.9945068359375, + "logps/rejected": -229.82025146484375, + "loss": 0.311, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3635498285293579, + "rewards/margins": 2.9292171001434326, + "rewards/rejected": -2.5656676292419434, + "step": 4747 + }, + { + "epoch": 0.55, + "learning_rate": 1.378672597448203e-07, + "logits/chosen": -2.9111506938934326, + "logits/rejected": -3.1504087448120117, + "logps/chosen": -221.18809509277344, + "logps/rejected": -253.2954864501953, + "loss": 0.2859, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5290796756744385, + "rewards/margins": 2.065487861633301, + "rewards/rejected": -1.5364079475402832, + "step": 4748 + }, + { + "epoch": 0.55, + "learning_rate": 1.3783214327519606e-07, + "logits/chosen": -2.6343464851379395, + "logits/rejected": -2.4619016647338867, + "logps/chosen": -341.64337158203125, + "logps/rejected": -333.05474853515625, + "loss": 0.2774, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21963489055633545, + "rewards/margins": 2.0280959606170654, + "rewards/rejected": -2.2477309703826904, + "step": 4749 + }, + { + "epoch": 0.55, + "learning_rate": 1.377970268055718e-07, + "logits/chosen": -3.430717945098877, + "logits/rejected": -3.4509775638580322, + "logps/chosen": -120.05661010742188, + "logps/rejected": -216.53517150878906, + "loss": 0.221, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34520742297172546, + "rewards/margins": 2.9100663661956787, + "rewards/rejected": -2.56485915184021, + "step": 4750 + }, + { + "epoch": 0.55, + "learning_rate": 1.3776191033594756e-07, + "logits/chosen": -2.9677910804748535, + "logits/rejected": -3.0517513751983643, + "logps/chosen": -308.7592468261719, + "logps/rejected": -257.6866760253906, + "loss": 0.3783, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4154060184955597, + "rewards/margins": 2.0111756324768066, + "rewards/rejected": -2.426581382751465, + "step": 4751 + }, + { + "epoch": 0.55, + "learning_rate": 1.377267938663233e-07, + "logits/chosen": -3.241328716278076, + "logits/rejected": -3.1433277130126953, + "logps/chosen": -392.1812438964844, + "logps/rejected": -280.78363037109375, + "loss": 0.7511, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.017477944493293762, + "rewards/margins": 0.6029520034790039, + "rewards/rejected": -0.5854740142822266, + "step": 4752 + }, + { + "epoch": 0.55, + "learning_rate": 1.3769167739669905e-07, + "logits/chosen": -3.722938299179077, + "logits/rejected": -3.385831594467163, + "logps/chosen": -303.8262634277344, + "logps/rejected": -239.1048583984375, + "loss": 0.4689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04275143891572952, + "rewards/margins": 2.1554694175720215, + "rewards/rejected": -2.198220729827881, + "step": 4753 + }, + { + "epoch": 0.55, + "learning_rate": 1.376565609270748e-07, + "logits/chosen": -2.7159626483917236, + "logits/rejected": -2.6557090282440186, + "logps/chosen": -336.16033935546875, + "logps/rejected": -331.54425048828125, + "loss": 0.2238, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28443628549575806, + "rewards/margins": 2.945359230041504, + "rewards/rejected": -2.6609230041503906, + "step": 4754 + }, + { + "epoch": 0.55, + "learning_rate": 1.3762144445745055e-07, + "logits/chosen": -3.290484666824341, + "logits/rejected": -3.150261878967285, + "logps/chosen": -211.47274780273438, + "logps/rejected": -266.88946533203125, + "loss": 0.4109, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14907371997833252, + "rewards/margins": 2.3213891983032227, + "rewards/rejected": -2.172315835952759, + "step": 4755 + }, + { + "epoch": 0.55, + "learning_rate": 1.3758632798782628e-07, + "logits/chosen": -3.523017168045044, + "logits/rejected": -3.6178290843963623, + "logps/chosen": -246.3368682861328, + "logps/rejected": -326.0635986328125, + "loss": 0.3711, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16913747787475586, + "rewards/margins": 1.975386142730713, + "rewards/rejected": -1.8062485456466675, + "step": 4756 + }, + { + "epoch": 0.55, + "learning_rate": 1.3755121151820203e-07, + "logits/chosen": -3.230670928955078, + "logits/rejected": -2.73875093460083, + "logps/chosen": -300.72222900390625, + "logps/rejected": -249.3724365234375, + "loss": 0.2861, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5103797316551208, + "rewards/margins": 1.8377691507339478, + "rewards/rejected": -2.348148822784424, + "step": 4757 + }, + { + "epoch": 0.55, + "learning_rate": 1.375160950485778e-07, + "logits/chosen": -3.124053478240967, + "logits/rejected": -2.880309581756592, + "logps/chosen": -278.70562744140625, + "logps/rejected": -325.6264953613281, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24388253688812256, + "rewards/margins": 1.9178682565689087, + "rewards/rejected": -1.6739857196807861, + "step": 4758 + }, + { + "epoch": 0.55, + "learning_rate": 1.3748097857895352e-07, + "logits/chosen": -3.0835227966308594, + "logits/rejected": -3.179074764251709, + "logps/chosen": -315.98004150390625, + "logps/rejected": -341.903564453125, + "loss": 0.297, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21314242482185364, + "rewards/margins": 2.545158624649048, + "rewards/rejected": -2.3320164680480957, + "step": 4759 + }, + { + "epoch": 0.55, + "learning_rate": 1.3744586210932927e-07, + "logits/chosen": -2.894813299179077, + "logits/rejected": -3.2617759704589844, + "logps/chosen": -310.975830078125, + "logps/rejected": -162.55955505371094, + "loss": 0.5817, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7916417121887207, + "rewards/margins": 1.0256259441375732, + "rewards/rejected": -1.8172677755355835, + "step": 4760 + }, + { + "epoch": 0.55, + "learning_rate": 1.3741074563970502e-07, + "logits/chosen": -2.90720796585083, + "logits/rejected": -2.8846826553344727, + "logps/chosen": -256.6512756347656, + "logps/rejected": -121.60044860839844, + "loss": 0.4394, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22586476802825928, + "rewards/margins": 1.1595749855041504, + "rewards/rejected": -0.9337102770805359, + "step": 4761 + }, + { + "epoch": 0.55, + "learning_rate": 1.3737562917008075e-07, + "logits/chosen": -2.7786357402801514, + "logits/rejected": -2.803652286529541, + "logps/chosen": -358.59515380859375, + "logps/rejected": -394.73077392578125, + "loss": 0.2176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06906656175851822, + "rewards/margins": 1.9555354118347168, + "rewards/rejected": -2.024601936340332, + "step": 4762 + }, + { + "epoch": 0.55, + "learning_rate": 1.373405127004565e-07, + "logits/chosen": -2.944713592529297, + "logits/rejected": -3.0749309062957764, + "logps/chosen": -217.2674560546875, + "logps/rejected": -182.97012329101562, + "loss": 0.3266, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16660384833812714, + "rewards/margins": 1.3222711086273193, + "rewards/rejected": -1.1556673049926758, + "step": 4763 + }, + { + "epoch": 0.55, + "learning_rate": 1.3730539623083226e-07, + "logits/chosen": -3.0557074546813965, + "logits/rejected": -2.946401357650757, + "logps/chosen": -163.19021606445312, + "logps/rejected": -240.97003173828125, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05854501202702522, + "rewards/margins": 1.8657103776931763, + "rewards/rejected": -1.8071653842926025, + "step": 4764 + }, + { + "epoch": 0.55, + "learning_rate": 1.3727027976120799e-07, + "logits/chosen": -3.425889015197754, + "logits/rejected": -3.277805805206299, + "logps/chosen": -373.53753662109375, + "logps/rejected": -452.89080810546875, + "loss": 0.648, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5641120672225952, + "rewards/margins": 2.3204426765441895, + "rewards/rejected": -2.884554386138916, + "step": 4765 + }, + { + "epoch": 0.55, + "learning_rate": 1.3723516329158374e-07, + "logits/chosen": -3.17521595954895, + "logits/rejected": -3.098491907119751, + "logps/chosen": -232.3490447998047, + "logps/rejected": -283.936767578125, + "loss": 0.7542, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.38145989179611206, + "rewards/margins": 1.3110957145690918, + "rewards/rejected": -1.692555546760559, + "step": 4766 + }, + { + "epoch": 0.55, + "learning_rate": 1.372000468219595e-07, + "logits/chosen": -3.1587986946105957, + "logits/rejected": -3.6165928840637207, + "logps/chosen": -206.18096923828125, + "logps/rejected": -216.03515625, + "loss": 0.119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7203680872917175, + "rewards/margins": 2.8779284954071045, + "rewards/rejected": -2.157560348510742, + "step": 4767 + }, + { + "epoch": 0.55, + "learning_rate": 1.3716493035233525e-07, + "logits/chosen": -2.966845989227295, + "logits/rejected": -3.5483169555664062, + "logps/chosen": -188.53749084472656, + "logps/rejected": -287.4612121582031, + "loss": 0.3192, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24445034563541412, + "rewards/margins": 2.1717090606689453, + "rewards/rejected": -1.9272587299346924, + "step": 4768 + }, + { + "epoch": 0.55, + "learning_rate": 1.3712981388271098e-07, + "logits/chosen": -2.7689146995544434, + "logits/rejected": -2.6971542835235596, + "logps/chosen": -349.046875, + "logps/rejected": -344.5328369140625, + "loss": 0.1835, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22790539264678955, + "rewards/margins": 2.462186813354492, + "rewards/rejected": -2.234281539916992, + "step": 4769 + }, + { + "epoch": 0.55, + "learning_rate": 1.3709469741308673e-07, + "logits/chosen": -2.5629732608795166, + "logits/rejected": -2.927661657333374, + "logps/chosen": -299.96820068359375, + "logps/rejected": -283.882080078125, + "loss": 0.4321, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.020544201135635376, + "rewards/margins": 2.650297164916992, + "rewards/rejected": -2.670841693878174, + "step": 4770 + }, + { + "epoch": 0.55, + "learning_rate": 1.3705958094346248e-07, + "logits/chosen": -3.1924710273742676, + "logits/rejected": -3.0200133323669434, + "logps/chosen": -381.2283935546875, + "logps/rejected": -180.1150360107422, + "loss": 0.5874, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5874235033988953, + "rewards/margins": 0.46681854128837585, + "rewards/rejected": -1.0542420148849487, + "step": 4771 + }, + { + "epoch": 0.55, + "learning_rate": 1.3702446447383824e-07, + "logits/chosen": -3.7137227058410645, + "logits/rejected": -3.7481746673583984, + "logps/chosen": -370.99176025390625, + "logps/rejected": -345.04376220703125, + "loss": 0.4851, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16713115572929382, + "rewards/margins": 1.2075865268707275, + "rewards/rejected": -1.3747177124023438, + "step": 4772 + }, + { + "epoch": 0.55, + "learning_rate": 1.3698934800421396e-07, + "logits/chosen": -3.4891738891601562, + "logits/rejected": -3.478107452392578, + "logps/chosen": -222.07122802734375, + "logps/rejected": -177.7850341796875, + "loss": 0.6224, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.508493185043335, + "rewards/margins": 0.9051705598831177, + "rewards/rejected": -1.4136638641357422, + "step": 4773 + }, + { + "epoch": 0.55, + "learning_rate": 1.3695423153458972e-07, + "logits/chosen": -3.403113603591919, + "logits/rejected": -3.3659071922302246, + "logps/chosen": -159.90444946289062, + "logps/rejected": -163.7713165283203, + "loss": 0.5965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07910767197608948, + "rewards/margins": 1.87180757522583, + "rewards/rejected": -1.9509150981903076, + "step": 4774 + }, + { + "epoch": 0.55, + "learning_rate": 1.3691911506496547e-07, + "logits/chosen": -3.2375857830047607, + "logits/rejected": -3.3286445140838623, + "logps/chosen": -203.01637268066406, + "logps/rejected": -386.72662353515625, + "loss": 0.4089, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06815581023693085, + "rewards/margins": 2.1461100578308105, + "rewards/rejected": -2.214265823364258, + "step": 4775 + }, + { + "epoch": 0.55, + "learning_rate": 1.368839985953412e-07, + "logits/chosen": -3.6438465118408203, + "logits/rejected": -3.6152353286743164, + "logps/chosen": -254.95199584960938, + "logps/rejected": -254.06752014160156, + "loss": 0.4163, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4058321416378021, + "rewards/margins": 1.363391399383545, + "rewards/rejected": -0.9575592279434204, + "step": 4776 + }, + { + "epoch": 0.55, + "learning_rate": 1.3684888212571695e-07, + "logits/chosen": -3.0051753520965576, + "logits/rejected": -3.066567897796631, + "logps/chosen": -356.7967224121094, + "logps/rejected": -275.3105773925781, + "loss": 0.3912, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23509468138217926, + "rewards/margins": 1.6956194639205933, + "rewards/rejected": -1.9307141304016113, + "step": 4777 + }, + { + "epoch": 0.55, + "learning_rate": 1.368137656560927e-07, + "logits/chosen": -2.690336227416992, + "logits/rejected": -2.590217113494873, + "logps/chosen": -583.7532348632812, + "logps/rejected": -191.1849365234375, + "loss": 0.6864, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2811686396598816, + "rewards/margins": 1.1447964906692505, + "rewards/rejected": -0.8636279106140137, + "step": 4778 + }, + { + "epoch": 0.55, + "learning_rate": 1.3677864918646846e-07, + "logits/chosen": -3.4166295528411865, + "logits/rejected": -3.526822090148926, + "logps/chosen": -225.54876708984375, + "logps/rejected": -283.5772705078125, + "loss": 0.3007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2571033537387848, + "rewards/margins": 1.8120750188827515, + "rewards/rejected": -1.5549718141555786, + "step": 4779 + }, + { + "epoch": 0.55, + "learning_rate": 1.367435327168442e-07, + "logits/chosen": -2.805954694747925, + "logits/rejected": -2.9064149856567383, + "logps/chosen": -293.5096130371094, + "logps/rejected": -330.1612854003906, + "loss": 0.4177, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08531782776117325, + "rewards/margins": 1.5060094594955444, + "rewards/rejected": -1.420691728591919, + "step": 4780 + }, + { + "epoch": 0.55, + "learning_rate": 1.3670841624721994e-07, + "logits/chosen": -3.5663695335388184, + "logits/rejected": -3.2378907203674316, + "logps/chosen": -232.62371826171875, + "logps/rejected": -323.1640625, + "loss": 0.2934, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06929884850978851, + "rewards/margins": 3.327594757080078, + "rewards/rejected": -3.3968937397003174, + "step": 4781 + }, + { + "epoch": 0.55, + "learning_rate": 1.3667329977759567e-07, + "logits/chosen": -2.9413044452667236, + "logits/rejected": -2.8289926052093506, + "logps/chosen": -270.1614685058594, + "logps/rejected": -228.17552185058594, + "loss": 0.6262, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4310479462146759, + "rewards/margins": 0.7258161306381226, + "rewards/rejected": -1.1568641662597656, + "step": 4782 + }, + { + "epoch": 0.55, + "learning_rate": 1.3663818330797145e-07, + "logits/chosen": -2.992344856262207, + "logits/rejected": -3.3419852256774902, + "logps/chosen": -137.5279541015625, + "logps/rejected": -292.6808166503906, + "loss": 0.5452, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.581852912902832, + "rewards/margins": 1.3821008205413818, + "rewards/rejected": -1.9639538526535034, + "step": 4783 + }, + { + "epoch": 0.55, + "learning_rate": 1.3660306683834718e-07, + "logits/chosen": -4.064541816711426, + "logits/rejected": -3.7459912300109863, + "logps/chosen": -305.4259033203125, + "logps/rejected": -347.2422790527344, + "loss": 0.5234, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24908877909183502, + "rewards/margins": 1.738128423690796, + "rewards/rejected": -1.9872173070907593, + "step": 4784 + }, + { + "epoch": 0.55, + "learning_rate": 1.3656795036872293e-07, + "logits/chosen": -3.4650232791900635, + "logits/rejected": -3.0478177070617676, + "logps/chosen": -251.04412841796875, + "logps/rejected": -149.64674377441406, + "loss": 0.1965, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14485712349414825, + "rewards/margins": 1.8390424251556396, + "rewards/rejected": -1.6941853761672974, + "step": 4785 + }, + { + "epoch": 0.55, + "learning_rate": 1.3653283389909866e-07, + "logits/chosen": -3.394439697265625, + "logits/rejected": -3.5003890991210938, + "logps/chosen": -104.1009521484375, + "logps/rejected": -211.92538452148438, + "loss": 0.2541, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38176268339157104, + "rewards/margins": 2.1907334327697754, + "rewards/rejected": -1.8089708089828491, + "step": 4786 + }, + { + "epoch": 0.55, + "learning_rate": 1.364977174294744e-07, + "logits/chosen": -3.2622852325439453, + "logits/rejected": -3.328895092010498, + "logps/chosen": -299.6871032714844, + "logps/rejected": -289.53302001953125, + "loss": 0.1756, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3952397108078003, + "rewards/margins": 2.314263343811035, + "rewards/rejected": -1.9190236330032349, + "step": 4787 + }, + { + "epoch": 0.55, + "learning_rate": 1.3646260095985017e-07, + "logits/chosen": -3.513597011566162, + "logits/rejected": -3.5034642219543457, + "logps/chosen": -161.7049560546875, + "logps/rejected": -216.4225311279297, + "loss": 0.4203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2205355167388916, + "rewards/margins": 2.2276575565338135, + "rewards/rejected": -2.448193073272705, + "step": 4788 + }, + { + "epoch": 0.55, + "learning_rate": 1.3642748449022592e-07, + "logits/chosen": -2.6898934841156006, + "logits/rejected": -2.591270923614502, + "logps/chosen": -329.4867248535156, + "logps/rejected": -391.5140686035156, + "loss": 0.4408, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5852369070053101, + "rewards/margins": 2.055316925048828, + "rewards/rejected": -1.470080018043518, + "step": 4789 + }, + { + "epoch": 0.55, + "learning_rate": 1.3639236802060165e-07, + "logits/chosen": -2.629234552383423, + "logits/rejected": -2.688204050064087, + "logps/chosen": -154.25674438476562, + "logps/rejected": -232.88870239257812, + "loss": 0.1959, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2423698902130127, + "rewards/margins": 3.418701648712158, + "rewards/rejected": -3.1763315200805664, + "step": 4790 + }, + { + "epoch": 0.55, + "learning_rate": 1.363572515509774e-07, + "logits/chosen": -2.697660207748413, + "logits/rejected": -2.5636579990386963, + "logps/chosen": -274.52862548828125, + "logps/rejected": -384.6920166015625, + "loss": 0.3056, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23290996253490448, + "rewards/margins": 2.1088058948516846, + "rewards/rejected": -1.8758959770202637, + "step": 4791 + }, + { + "epoch": 0.55, + "learning_rate": 1.3632213508135316e-07, + "logits/chosen": -2.831786632537842, + "logits/rejected": -2.716884136199951, + "logps/chosen": -414.04925537109375, + "logps/rejected": -347.8641052246094, + "loss": 0.3906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39507362246513367, + "rewards/margins": 2.0783607959747314, + "rewards/rejected": -2.4734344482421875, + "step": 4792 + }, + { + "epoch": 0.55, + "learning_rate": 1.3628701861172888e-07, + "logits/chosen": -3.36822509765625, + "logits/rejected": -3.4875707626342773, + "logps/chosen": -313.02471923828125, + "logps/rejected": -353.93646240234375, + "loss": 0.5727, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.390035480260849, + "rewards/margins": 0.4695722162723541, + "rewards/rejected": -0.8596076369285583, + "step": 4793 + }, + { + "epoch": 0.55, + "learning_rate": 1.3625190214210464e-07, + "logits/chosen": -2.6339492797851562, + "logits/rejected": -2.8939030170440674, + "logps/chosen": -257.9361572265625, + "logps/rejected": -298.8621826171875, + "loss": 0.3541, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20023319125175476, + "rewards/margins": 2.6396138668060303, + "rewards/rejected": -2.8398468494415283, + "step": 4794 + }, + { + "epoch": 0.55, + "learning_rate": 1.362167856724804e-07, + "logits/chosen": -2.952746629714966, + "logits/rejected": -2.8004980087280273, + "logps/chosen": -309.64385986328125, + "logps/rejected": -275.4190368652344, + "loss": 0.8406, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6241786479949951, + "rewards/margins": 0.6050137281417847, + "rewards/rejected": -1.2291922569274902, + "step": 4795 + }, + { + "epoch": 0.55, + "learning_rate": 1.3618166920285614e-07, + "logits/chosen": -3.093174934387207, + "logits/rejected": -3.395916223526001, + "logps/chosen": -359.6121826171875, + "logps/rejected": -322.474853515625, + "loss": 0.3001, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11656445264816284, + "rewards/margins": 1.5859254598617554, + "rewards/rejected": -1.4693608283996582, + "step": 4796 + }, + { + "epoch": 0.55, + "learning_rate": 1.3614655273323187e-07, + "logits/chosen": -2.72330904006958, + "logits/rejected": -2.7322347164154053, + "logps/chosen": -386.3936767578125, + "logps/rejected": -264.81646728515625, + "loss": 0.3924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35599571466445923, + "rewards/margins": 1.1342438459396362, + "rewards/rejected": -1.4902396202087402, + "step": 4797 + }, + { + "epoch": 0.55, + "learning_rate": 1.3611143626360763e-07, + "logits/chosen": -3.757974147796631, + "logits/rejected": -3.329442262649536, + "logps/chosen": -289.66925048828125, + "logps/rejected": -261.1196594238281, + "loss": 0.7946, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0787913054227829, + "rewards/margins": 0.4578053057193756, + "rewards/rejected": -0.3790140450000763, + "step": 4798 + }, + { + "epoch": 0.55, + "learning_rate": 1.3607631979398338e-07, + "logits/chosen": -3.269937038421631, + "logits/rejected": -3.1520469188690186, + "logps/chosen": -308.7500915527344, + "logps/rejected": -334.4271240234375, + "loss": 0.144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10839928686618805, + "rewards/margins": 2.4948372840881348, + "rewards/rejected": -2.603236436843872, + "step": 4799 + }, + { + "epoch": 0.55, + "learning_rate": 1.3604120332435913e-07, + "logits/chosen": -2.708425760269165, + "logits/rejected": -2.9537200927734375, + "logps/chosen": -199.80999755859375, + "logps/rejected": -256.37347412109375, + "loss": 0.2505, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7223774194717407, + "rewards/margins": 2.809574842453003, + "rewards/rejected": -2.0871973037719727, + "step": 4800 + }, + { + "epoch": 0.55, + "learning_rate": 1.3600608685473486e-07, + "logits/chosen": -3.7470955848693848, + "logits/rejected": -3.7523744106292725, + "logps/chosen": -406.14764404296875, + "logps/rejected": -446.1534729003906, + "loss": 0.4081, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24842742085456848, + "rewards/margins": 1.9037872552871704, + "rewards/rejected": -1.6553596258163452, + "step": 4801 + }, + { + "epoch": 0.55, + "learning_rate": 1.3597097038511061e-07, + "logits/chosen": -3.582993984222412, + "logits/rejected": -3.6742324829101562, + "logps/chosen": -199.32125854492188, + "logps/rejected": -169.59710693359375, + "loss": 0.2085, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6986535787582397, + "rewards/margins": 2.683745861053467, + "rewards/rejected": -1.9850924015045166, + "step": 4802 + }, + { + "epoch": 0.55, + "learning_rate": 1.3593585391548637e-07, + "logits/chosen": -3.098726272583008, + "logits/rejected": -3.030748128890991, + "logps/chosen": -305.23419189453125, + "logps/rejected": -259.2082824707031, + "loss": 0.7404, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40921056270599365, + "rewards/margins": 0.6607764959335327, + "rewards/rejected": -1.0699870586395264, + "step": 4803 + }, + { + "epoch": 0.55, + "learning_rate": 1.359007374458621e-07, + "logits/chosen": -2.99583101272583, + "logits/rejected": -3.0239319801330566, + "logps/chosen": -183.1527099609375, + "logps/rejected": -166.0958251953125, + "loss": 0.3348, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.400499165058136, + "rewards/margins": 1.7980079650878906, + "rewards/rejected": -1.3975088596343994, + "step": 4804 + }, + { + "epoch": 0.55, + "learning_rate": 1.3586562097623785e-07, + "logits/chosen": -3.8504719734191895, + "logits/rejected": -3.223623752593994, + "logps/chosen": -269.53009033203125, + "logps/rejected": -199.30810546875, + "loss": 0.2154, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014839313924312592, + "rewards/margins": 2.951749801635742, + "rewards/rejected": -2.9665892124176025, + "step": 4805 + }, + { + "epoch": 0.55, + "learning_rate": 1.358305045066136e-07, + "logits/chosen": -3.0472030639648438, + "logits/rejected": -3.147517204284668, + "logps/chosen": -164.67572021484375, + "logps/rejected": -276.1445617675781, + "loss": 0.5887, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4820007085800171, + "rewards/margins": 0.7346557378768921, + "rewards/rejected": -1.2166563272476196, + "step": 4806 + }, + { + "epoch": 0.55, + "learning_rate": 1.3579538803698936e-07, + "logits/chosen": -4.021679878234863, + "logits/rejected": -4.035638809204102, + "logps/chosen": -139.6142578125, + "logps/rejected": -132.82315063476562, + "loss": 0.3292, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7592330574989319, + "rewards/margins": 1.7872421741485596, + "rewards/rejected": -1.028009057044983, + "step": 4807 + }, + { + "epoch": 0.55, + "learning_rate": 1.3576027156736508e-07, + "logits/chosen": -2.7108912467956543, + "logits/rejected": -2.6139345169067383, + "logps/chosen": -289.5403747558594, + "logps/rejected": -303.59991455078125, + "loss": 0.42, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01578378677368164, + "rewards/margins": 2.0974960327148438, + "rewards/rejected": -2.081712007522583, + "step": 4808 + }, + { + "epoch": 0.55, + "learning_rate": 1.3572515509774084e-07, + "logits/chosen": -2.6065118312835693, + "logits/rejected": -2.6107654571533203, + "logps/chosen": -437.61395263671875, + "logps/rejected": -373.51678466796875, + "loss": 0.3496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07688727974891663, + "rewards/margins": 2.0648584365844727, + "rewards/rejected": -2.1417455673217773, + "step": 4809 + }, + { + "epoch": 0.55, + "learning_rate": 1.3569003862811657e-07, + "logits/chosen": -2.8912014961242676, + "logits/rejected": -2.8658504486083984, + "logps/chosen": -234.30279541015625, + "logps/rejected": -218.7839813232422, + "loss": 0.1316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7684351801872253, + "rewards/margins": 2.9947919845581055, + "rewards/rejected": -2.2263569831848145, + "step": 4810 + }, + { + "epoch": 0.55, + "learning_rate": 1.3565492215849232e-07, + "logits/chosen": -3.134061813354492, + "logits/rejected": -2.8946752548217773, + "logps/chosen": -199.5254364013672, + "logps/rejected": -204.36962890625, + "loss": 0.312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5705145001411438, + "rewards/margins": 1.340956449508667, + "rewards/rejected": -1.911470890045166, + "step": 4811 + }, + { + "epoch": 0.55, + "learning_rate": 1.3561980568886807e-07, + "logits/chosen": -2.400561571121216, + "logits/rejected": -2.366670608520508, + "logps/chosen": -213.44029235839844, + "logps/rejected": -262.0770263671875, + "loss": 0.2283, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014767736196517944, + "rewards/margins": 2.195666790008545, + "rewards/rejected": -2.2104344367980957, + "step": 4812 + }, + { + "epoch": 0.55, + "learning_rate": 1.3558468921924383e-07, + "logits/chosen": -3.2016470432281494, + "logits/rejected": -3.0882012844085693, + "logps/chosen": -203.08450317382812, + "logps/rejected": -283.7563781738281, + "loss": 0.3346, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14292840659618378, + "rewards/margins": 1.3093523979187012, + "rewards/rejected": -1.166424036026001, + "step": 4813 + }, + { + "epoch": 0.55, + "learning_rate": 1.3554957274961955e-07, + "logits/chosen": -2.70068621635437, + "logits/rejected": -2.8671205043792725, + "logps/chosen": -267.1661071777344, + "logps/rejected": -339.5758056640625, + "loss": 0.1799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04394865781068802, + "rewards/margins": 2.448624610900879, + "rewards/rejected": -2.4046759605407715, + "step": 4814 + }, + { + "epoch": 0.56, + "learning_rate": 1.355144562799953e-07, + "logits/chosen": -2.507544994354248, + "logits/rejected": -2.3712058067321777, + "logps/chosen": -115.41021728515625, + "logps/rejected": -148.20950317382812, + "loss": 0.492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1376158595085144, + "rewards/margins": 1.3625843524932861, + "rewards/rejected": -1.5002002716064453, + "step": 4815 + }, + { + "epoch": 0.56, + "learning_rate": 1.3547933981037106e-07, + "logits/chosen": -2.4999780654907227, + "logits/rejected": -2.5281057357788086, + "logps/chosen": -366.2011413574219, + "logps/rejected": -405.9465026855469, + "loss": 0.5795, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1387409120798111, + "rewards/margins": 0.6671519875526428, + "rewards/rejected": -0.8058929443359375, + "step": 4816 + }, + { + "epoch": 0.56, + "learning_rate": 1.3544422334074682e-07, + "logits/chosen": -3.052475690841675, + "logits/rejected": -3.20241641998291, + "logps/chosen": -194.8136749267578, + "logps/rejected": -205.76348876953125, + "loss": 0.4408, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0834440365433693, + "rewards/margins": 1.7555515766143799, + "rewards/rejected": -1.6721075773239136, + "step": 4817 + }, + { + "epoch": 0.56, + "learning_rate": 1.3540910687112254e-07, + "logits/chosen": -2.832705020904541, + "logits/rejected": -2.8966004848480225, + "logps/chosen": -227.5538330078125, + "logps/rejected": -232.8944091796875, + "loss": 0.2509, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12707018852233887, + "rewards/margins": 1.9887573719024658, + "rewards/rejected": -1.861687183380127, + "step": 4818 + }, + { + "epoch": 0.56, + "learning_rate": 1.353739904014983e-07, + "logits/chosen": -3.3973307609558105, + "logits/rejected": -3.32358455657959, + "logps/chosen": -374.03277587890625, + "logps/rejected": -297.51641845703125, + "loss": 0.3603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2773796319961548, + "rewards/margins": 1.5671623945236206, + "rewards/rejected": -1.8445419073104858, + "step": 4819 + }, + { + "epoch": 0.56, + "learning_rate": 1.3533887393187405e-07, + "logits/chosen": -2.989832878112793, + "logits/rejected": -3.230113983154297, + "logps/chosen": -215.4257049560547, + "logps/rejected": -155.75863647460938, + "loss": 0.4069, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2831127643585205, + "rewards/margins": 1.4341282844543457, + "rewards/rejected": -1.7172410488128662, + "step": 4820 + }, + { + "epoch": 0.56, + "learning_rate": 1.3530375746224978e-07, + "logits/chosen": -3.1014227867126465, + "logits/rejected": -3.2705137729644775, + "logps/chosen": -170.6185302734375, + "logps/rejected": -166.13600158691406, + "loss": 0.3311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20078733563423157, + "rewards/margins": 1.4193609952926636, + "rewards/rejected": -1.6201481819152832, + "step": 4821 + }, + { + "epoch": 0.56, + "learning_rate": 1.3526864099262553e-07, + "logits/chosen": -3.4650824069976807, + "logits/rejected": -3.531498432159424, + "logps/chosen": -370.0041809082031, + "logps/rejected": -258.9075012207031, + "loss": 0.2501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1490071564912796, + "rewards/margins": 1.9374613761901855, + "rewards/rejected": -2.0864686965942383, + "step": 4822 + }, + { + "epoch": 0.56, + "learning_rate": 1.3523352452300129e-07, + "logits/chosen": -2.775221109390259, + "logits/rejected": -2.6706061363220215, + "logps/chosen": -313.1297302246094, + "logps/rejected": -240.7827911376953, + "loss": 0.5632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27965545654296875, + "rewards/margins": 1.176781415939331, + "rewards/rejected": -1.4564369916915894, + "step": 4823 + }, + { + "epoch": 0.56, + "learning_rate": 1.3519840805337704e-07, + "logits/chosen": -3.0441746711730957, + "logits/rejected": -2.8616628646850586, + "logps/chosen": -184.55111694335938, + "logps/rejected": -257.4252014160156, + "loss": 0.499, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3580019176006317, + "rewards/margins": 1.341821312904358, + "rewards/rejected": -1.699823260307312, + "step": 4824 + }, + { + "epoch": 0.56, + "learning_rate": 1.3516329158375277e-07, + "logits/chosen": -3.325838088989258, + "logits/rejected": -3.4475948810577393, + "logps/chosen": -226.41952514648438, + "logps/rejected": -215.51629638671875, + "loss": 0.2754, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.320473849773407, + "rewards/margins": 2.2675461769104004, + "rewards/rejected": -1.9470725059509277, + "step": 4825 + }, + { + "epoch": 0.56, + "learning_rate": 1.3512817511412852e-07, + "logits/chosen": -3.012838363647461, + "logits/rejected": -3.366415500640869, + "logps/chosen": -299.0970153808594, + "logps/rejected": -248.8348388671875, + "loss": 0.3798, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19273032248020172, + "rewards/margins": 2.461395025253296, + "rewards/rejected": -2.268664836883545, + "step": 4826 + }, + { + "epoch": 0.56, + "learning_rate": 1.3509305864450425e-07, + "logits/chosen": -3.459104061126709, + "logits/rejected": -3.38262939453125, + "logps/chosen": -191.81817626953125, + "logps/rejected": -249.34674072265625, + "loss": 0.2117, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3030503988265991, + "rewards/margins": 2.5507140159606934, + "rewards/rejected": -2.2476634979248047, + "step": 4827 + }, + { + "epoch": 0.56, + "learning_rate": 1.3505794217488003e-07, + "logits/chosen": -3.1061248779296875, + "logits/rejected": -3.4108190536499023, + "logps/chosen": -232.8472442626953, + "logps/rejected": -286.937255859375, + "loss": 0.3343, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11478567868471146, + "rewards/margins": 1.961240530014038, + "rewards/rejected": -1.8464550971984863, + "step": 4828 + }, + { + "epoch": 0.56, + "learning_rate": 1.3502282570525576e-07, + "logits/chosen": -3.045334815979004, + "logits/rejected": -3.1079835891723633, + "logps/chosen": -243.71426391601562, + "logps/rejected": -149.33319091796875, + "loss": 0.7043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41339150071144104, + "rewards/margins": 0.4816927909851074, + "rewards/rejected": -0.8950843811035156, + "step": 4829 + }, + { + "epoch": 0.56, + "learning_rate": 1.349877092356315e-07, + "logits/chosen": -3.48516583442688, + "logits/rejected": -3.730537176132202, + "logps/chosen": -159.69068908691406, + "logps/rejected": -242.7703094482422, + "loss": 0.5968, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.262426495552063, + "rewards/margins": 2.048241138458252, + "rewards/rejected": -2.3106675148010254, + "step": 4830 + }, + { + "epoch": 0.56, + "learning_rate": 1.3495259276600724e-07, + "logits/chosen": -3.784970283508301, + "logits/rejected": -4.0897626876831055, + "logps/chosen": -158.39793395996094, + "logps/rejected": -282.8544616699219, + "loss": 0.2463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47611457109451294, + "rewards/margins": 1.9744093418121338, + "rewards/rejected": -2.450523853302002, + "step": 4831 + }, + { + "epoch": 0.56, + "learning_rate": 1.34917476296383e-07, + "logits/chosen": -2.930586576461792, + "logits/rejected": -2.8573992252349854, + "logps/chosen": -450.7336730957031, + "logps/rejected": -419.904296875, + "loss": 0.1355, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23014883697032928, + "rewards/margins": 2.7520954608917236, + "rewards/rejected": -2.521946430206299, + "step": 4832 + }, + { + "epoch": 0.56, + "learning_rate": 1.3488235982675875e-07, + "logits/chosen": -2.9913249015808105, + "logits/rejected": -2.9230878353118896, + "logps/chosen": -291.359619140625, + "logps/rejected": -184.53817749023438, + "loss": 0.5056, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08024879544973373, + "rewards/margins": 0.7657583355903625, + "rewards/rejected": -0.8460071682929993, + "step": 4833 + }, + { + "epoch": 0.56, + "learning_rate": 1.348472433571345e-07, + "logits/chosen": -2.9319801330566406, + "logits/rejected": -3.274892568588257, + "logps/chosen": -266.0690612792969, + "logps/rejected": -194.55953979492188, + "loss": 0.5484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3379361927509308, + "rewards/margins": 1.5490832328796387, + "rewards/rejected": -1.887019395828247, + "step": 4834 + }, + { + "epoch": 0.56, + "learning_rate": 1.3481212688751023e-07, + "logits/chosen": -3.1912598609924316, + "logits/rejected": -3.297440767288208, + "logps/chosen": -419.15704345703125, + "logps/rejected": -322.160400390625, + "loss": 0.267, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05621958523988724, + "rewards/margins": 2.4918596744537354, + "rewards/rejected": -2.4356398582458496, + "step": 4835 + }, + { + "epoch": 0.56, + "learning_rate": 1.3477701041788598e-07, + "logits/chosen": -3.407176971435547, + "logits/rejected": -3.4314703941345215, + "logps/chosen": -269.7158508300781, + "logps/rejected": -441.9659423828125, + "loss": 0.2547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.028047889471054077, + "rewards/margins": 1.861912727355957, + "rewards/rejected": -1.889960527420044, + "step": 4836 + }, + { + "epoch": 0.56, + "learning_rate": 1.3474189394826173e-07, + "logits/chosen": -3.571760654449463, + "logits/rejected": -3.4172940254211426, + "logps/chosen": -293.99151611328125, + "logps/rejected": -196.97222900390625, + "loss": 0.314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16517163813114166, + "rewards/margins": 2.8512794971466064, + "rewards/rejected": -2.686107873916626, + "step": 4837 + }, + { + "epoch": 0.56, + "learning_rate": 1.3470677747863746e-07, + "logits/chosen": -2.9262590408325195, + "logits/rejected": -2.8587985038757324, + "logps/chosen": -294.5781555175781, + "logps/rejected": -357.4911193847656, + "loss": 0.5016, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.042933061718940735, + "rewards/margins": 0.6989614963531494, + "rewards/rejected": -0.6560283899307251, + "step": 4838 + }, + { + "epoch": 0.56, + "learning_rate": 1.3467166100901322e-07, + "logits/chosen": -2.828927993774414, + "logits/rejected": -2.7206318378448486, + "logps/chosen": -278.33807373046875, + "logps/rejected": -235.42579650878906, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08209299296140671, + "rewards/margins": 1.8494737148284912, + "rewards/rejected": -1.767380714416504, + "step": 4839 + }, + { + "epoch": 0.56, + "learning_rate": 1.3463654453938897e-07, + "logits/chosen": -3.3520681858062744, + "logits/rejected": -3.4561681747436523, + "logps/chosen": -330.1048278808594, + "logps/rejected": -300.9739990234375, + "loss": 0.3267, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6589967608451843, + "rewards/margins": 2.8070249557495117, + "rewards/rejected": -3.4660215377807617, + "step": 4840 + }, + { + "epoch": 0.56, + "learning_rate": 1.3460142806976472e-07, + "logits/chosen": -2.923001527786255, + "logits/rejected": -2.761061191558838, + "logps/chosen": -194.61488342285156, + "logps/rejected": -282.61181640625, + "loss": 0.4844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5914462804794312, + "rewards/margins": 1.8088228702545166, + "rewards/rejected": -2.400269031524658, + "step": 4841 + }, + { + "epoch": 0.56, + "learning_rate": 1.3456631160014045e-07, + "logits/chosen": -3.000657081604004, + "logits/rejected": -2.9436168670654297, + "logps/chosen": -364.169921875, + "logps/rejected": -370.56103515625, + "loss": 0.6439, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4509199559688568, + "rewards/margins": 0.9152641296386719, + "rewards/rejected": -1.3661839962005615, + "step": 4842 + }, + { + "epoch": 0.56, + "learning_rate": 1.345311951305162e-07, + "logits/chosen": -2.811077117919922, + "logits/rejected": -2.747037410736084, + "logps/chosen": -130.36419677734375, + "logps/rejected": -176.325927734375, + "loss": 0.43, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13964799046516418, + "rewards/margins": 1.609838604927063, + "rewards/rejected": -1.4701906442642212, + "step": 4843 + }, + { + "epoch": 0.56, + "learning_rate": 1.3449607866089196e-07, + "logits/chosen": -2.7604429721832275, + "logits/rejected": -3.0650200843811035, + "logps/chosen": -248.79454040527344, + "logps/rejected": -297.4755554199219, + "loss": 0.3708, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04954535514116287, + "rewards/margins": 2.1999387741088867, + "rewards/rejected": -2.2494845390319824, + "step": 4844 + }, + { + "epoch": 0.56, + "learning_rate": 1.344609621912677e-07, + "logits/chosen": -3.021958112716675, + "logits/rejected": -3.329211950302124, + "logps/chosen": -288.27117919921875, + "logps/rejected": -396.48529052734375, + "loss": 0.2632, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.369141161441803, + "rewards/margins": 3.005490303039551, + "rewards/rejected": -2.6363492012023926, + "step": 4845 + }, + { + "epoch": 0.56, + "learning_rate": 1.3442584572164344e-07, + "logits/chosen": -2.798164129257202, + "logits/rejected": -2.4707658290863037, + "logps/chosen": -365.00738525390625, + "logps/rejected": -181.79739379882812, + "loss": 0.3903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5706957578659058, + "rewards/margins": 0.9513221979141235, + "rewards/rejected": -1.5220179557800293, + "step": 4846 + }, + { + "epoch": 0.56, + "learning_rate": 1.343907292520192e-07, + "logits/chosen": -2.6897623538970947, + "logits/rejected": -2.7125229835510254, + "logps/chosen": -185.63497924804688, + "logps/rejected": -251.07875061035156, + "loss": 0.5566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7228617668151855, + "rewards/margins": 0.850774884223938, + "rewards/rejected": -1.5736368894577026, + "step": 4847 + }, + { + "epoch": 0.56, + "learning_rate": 1.3435561278239495e-07, + "logits/chosen": -2.2424614429473877, + "logits/rejected": -2.366542339324951, + "logps/chosen": -296.88250732421875, + "logps/rejected": -222.3978271484375, + "loss": 0.3989, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2708043158054352, + "rewards/margins": 1.9222540855407715, + "rewards/rejected": -2.1930582523345947, + "step": 4848 + }, + { + "epoch": 0.56, + "learning_rate": 1.3432049631277067e-07, + "logits/chosen": -3.0405662059783936, + "logits/rejected": -3.073359727859497, + "logps/chosen": -144.9718780517578, + "logps/rejected": -209.8593292236328, + "loss": 0.6226, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43822360038757324, + "rewards/margins": 0.2565137445926666, + "rewards/rejected": -0.6947373747825623, + "step": 4849 + }, + { + "epoch": 0.56, + "learning_rate": 1.3428537984314643e-07, + "logits/chosen": -2.567556142807007, + "logits/rejected": -2.6248531341552734, + "logps/chosen": -267.56622314453125, + "logps/rejected": -355.36273193359375, + "loss": 0.4833, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16132131218910217, + "rewards/margins": 1.9581066370010376, + "rewards/rejected": -2.1194279193878174, + "step": 4850 + }, + { + "epoch": 0.56, + "learning_rate": 1.3425026337352218e-07, + "logits/chosen": -3.149705171585083, + "logits/rejected": -3.2515146732330322, + "logps/chosen": -273.7870178222656, + "logps/rejected": -307.6505126953125, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2879689633846283, + "rewards/margins": 2.1306846141815186, + "rewards/rejected": -1.8427156209945679, + "step": 4851 + }, + { + "epoch": 0.56, + "learning_rate": 1.3421514690389794e-07, + "logits/chosen": -2.9041621685028076, + "logits/rejected": -3.0985188484191895, + "logps/chosen": -218.04209899902344, + "logps/rejected": -201.47117614746094, + "loss": 0.4004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10240790992975235, + "rewards/margins": 1.3379887342453003, + "rewards/rejected": -1.4403966665267944, + "step": 4852 + }, + { + "epoch": 0.56, + "learning_rate": 1.3418003043427366e-07, + "logits/chosen": -2.756856679916382, + "logits/rejected": -2.6693592071533203, + "logps/chosen": -380.99359130859375, + "logps/rejected": -552.88916015625, + "loss": 0.5896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13441553711891174, + "rewards/margins": 1.7115068435668945, + "rewards/rejected": -1.5770913362503052, + "step": 4853 + }, + { + "epoch": 0.56, + "learning_rate": 1.3414491396464942e-07, + "logits/chosen": -3.142477512359619, + "logits/rejected": -3.204620838165283, + "logps/chosen": -229.48768615722656, + "logps/rejected": -340.72991943359375, + "loss": 0.2358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20311178267002106, + "rewards/margins": 4.246486663818359, + "rewards/rejected": -4.043375015258789, + "step": 4854 + }, + { + "epoch": 0.56, + "learning_rate": 1.3410979749502515e-07, + "logits/chosen": -3.6987924575805664, + "logits/rejected": -3.6687729358673096, + "logps/chosen": -381.3331604003906, + "logps/rejected": -201.689453125, + "loss": 0.5829, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32105523347854614, + "rewards/margins": 1.1463972330093384, + "rewards/rejected": -1.4674524068832397, + "step": 4855 + }, + { + "epoch": 0.56, + "learning_rate": 1.340746810254009e-07, + "logits/chosen": -2.589390277862549, + "logits/rejected": -2.2709686756134033, + "logps/chosen": -310.2882080078125, + "logps/rejected": -344.4661865234375, + "loss": 0.3742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1064542829990387, + "rewards/margins": 1.180747628211975, + "rewards/rejected": -1.2872018814086914, + "step": 4856 + }, + { + "epoch": 0.56, + "learning_rate": 1.3403956455577665e-07, + "logits/chosen": -3.822092056274414, + "logits/rejected": -4.0002031326293945, + "logps/chosen": -239.52542114257812, + "logps/rejected": -335.8203430175781, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29226887226104736, + "rewards/margins": 2.665064811706543, + "rewards/rejected": -2.957333564758301, + "step": 4857 + }, + { + "epoch": 0.56, + "learning_rate": 1.340044480861524e-07, + "logits/chosen": -2.62776780128479, + "logits/rejected": -2.612459182739258, + "logps/chosen": -322.82635498046875, + "logps/rejected": -277.3119201660156, + "loss": 0.635, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10354135185480118, + "rewards/margins": 1.2655569314956665, + "rewards/rejected": -1.162015676498413, + "step": 4858 + }, + { + "epoch": 0.56, + "learning_rate": 1.3396933161652813e-07, + "logits/chosen": -2.488922119140625, + "logits/rejected": -2.429255723953247, + "logps/chosen": -362.50244140625, + "logps/rejected": -326.0660400390625, + "loss": 0.2951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014360696077346802, + "rewards/margins": 2.083865165710449, + "rewards/rejected": -2.0982260704040527, + "step": 4859 + }, + { + "epoch": 0.56, + "learning_rate": 1.339342151469039e-07, + "logits/chosen": -2.264695405960083, + "logits/rejected": -2.3530220985412598, + "logps/chosen": -238.4569091796875, + "logps/rejected": -292.154052734375, + "loss": 0.4651, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17009392380714417, + "rewards/margins": 0.6949657201766968, + "rewards/rejected": -0.8650596141815186, + "step": 4860 + }, + { + "epoch": 0.56, + "learning_rate": 1.3389909867727964e-07, + "logits/chosen": -2.9196932315826416, + "logits/rejected": -3.095858097076416, + "logps/chosen": -257.50958251953125, + "logps/rejected": -201.7657012939453, + "loss": 0.3847, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16587576270103455, + "rewards/margins": 1.9857512712478638, + "rewards/rejected": -1.8198754787445068, + "step": 4861 + }, + { + "epoch": 0.56, + "learning_rate": 1.338639822076554e-07, + "logits/chosen": -3.3623828887939453, + "logits/rejected": -3.0689690113067627, + "logps/chosen": -303.13214111328125, + "logps/rejected": -155.29820251464844, + "loss": 0.6533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34094107151031494, + "rewards/margins": 0.659579873085022, + "rewards/rejected": -1.000520944595337, + "step": 4862 + }, + { + "epoch": 0.56, + "learning_rate": 1.3382886573803112e-07, + "logits/chosen": -3.2355270385742188, + "logits/rejected": -3.4388670921325684, + "logps/chosen": -162.42108154296875, + "logps/rejected": -267.4080810546875, + "loss": 0.3119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36710667610168457, + "rewards/margins": 1.5989726781845093, + "rewards/rejected": -1.9660792350769043, + "step": 4863 + }, + { + "epoch": 0.56, + "learning_rate": 1.3379374926840688e-07, + "logits/chosen": -3.60128116607666, + "logits/rejected": -3.4398837089538574, + "logps/chosen": -246.8362274169922, + "logps/rejected": -210.9217987060547, + "loss": 0.201, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13883084058761597, + "rewards/margins": 2.3321924209594727, + "rewards/rejected": -2.193361520767212, + "step": 4864 + }, + { + "epoch": 0.56, + "learning_rate": 1.3375863279878263e-07, + "logits/chosen": -3.412571907043457, + "logits/rejected": -3.04681396484375, + "logps/chosen": -188.23965454101562, + "logps/rejected": -131.96505737304688, + "loss": 0.6603, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4602607488632202, + "rewards/margins": 0.4447513818740845, + "rewards/rejected": -0.9050121307373047, + "step": 4865 + }, + { + "epoch": 0.56, + "learning_rate": 1.3372351632915836e-07, + "logits/chosen": -2.7117178440093994, + "logits/rejected": -2.9011940956115723, + "logps/chosen": -350.18780517578125, + "logps/rejected": -270.3172607421875, + "loss": 0.3901, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11174158751964569, + "rewards/margins": 1.5295052528381348, + "rewards/rejected": -1.4177637100219727, + "step": 4866 + }, + { + "epoch": 0.56, + "learning_rate": 1.336883998595341e-07, + "logits/chosen": -3.244351863861084, + "logits/rejected": -3.3928301334381104, + "logps/chosen": -261.85455322265625, + "logps/rejected": -209.16921997070312, + "loss": 0.5852, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5272128582000732, + "rewards/margins": 1.4830909967422485, + "rewards/rejected": -2.0103037357330322, + "step": 4867 + }, + { + "epoch": 0.56, + "learning_rate": 1.3365328338990987e-07, + "logits/chosen": -3.489499568939209, + "logits/rejected": -3.1806702613830566, + "logps/chosen": -228.4987335205078, + "logps/rejected": -296.7225341796875, + "loss": 0.2504, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3213415741920471, + "rewards/margins": 3.3911681175231934, + "rewards/rejected": -3.06982684135437, + "step": 4868 + }, + { + "epoch": 0.56, + "learning_rate": 1.3361816692028562e-07, + "logits/chosen": -3.2473902702331543, + "logits/rejected": -3.317274808883667, + "logps/chosen": -179.48220825195312, + "logps/rejected": -132.71267700195312, + "loss": 0.4831, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5110719203948975, + "rewards/margins": 1.3085829019546509, + "rewards/rejected": -0.7975109815597534, + "step": 4869 + }, + { + "epoch": 0.56, + "learning_rate": 1.3358305045066135e-07, + "logits/chosen": -3.4229869842529297, + "logits/rejected": -3.6082475185394287, + "logps/chosen": -251.3162841796875, + "logps/rejected": -287.1957092285156, + "loss": 0.3347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0703224390745163, + "rewards/margins": 2.5182015895843506, + "rewards/rejected": -2.5885238647460938, + "step": 4870 + }, + { + "epoch": 0.56, + "learning_rate": 1.335479339810371e-07, + "logits/chosen": -3.167391777038574, + "logits/rejected": -3.154592275619507, + "logps/chosen": -209.53909301757812, + "logps/rejected": -238.13009643554688, + "loss": 0.4809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27252107858657837, + "rewards/margins": 1.8771737813949585, + "rewards/rejected": -2.1496949195861816, + "step": 4871 + }, + { + "epoch": 0.56, + "learning_rate": 1.3351281751141283e-07, + "logits/chosen": -2.583113431930542, + "logits/rejected": -2.6187350749969482, + "logps/chosen": -251.20697021484375, + "logps/rejected": -229.94151306152344, + "loss": 0.3229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23218902945518494, + "rewards/margins": 2.5315258502960205, + "rewards/rejected": -2.7637150287628174, + "step": 4872 + }, + { + "epoch": 0.56, + "learning_rate": 1.334777010417886e-07, + "logits/chosen": -3.906602144241333, + "logits/rejected": -3.305623769760132, + "logps/chosen": -476.9632568359375, + "logps/rejected": -165.501220703125, + "loss": 0.2043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5393829345703125, + "rewards/margins": 2.1506009101867676, + "rewards/rejected": -1.611217975616455, + "step": 4873 + }, + { + "epoch": 0.56, + "learning_rate": 1.3344258457216434e-07, + "logits/chosen": -3.215756416320801, + "logits/rejected": -3.1138904094696045, + "logps/chosen": -234.2472381591797, + "logps/rejected": -269.13409423828125, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41102439165115356, + "rewards/margins": 1.8459210395812988, + "rewards/rejected": -1.4348968267440796, + "step": 4874 + }, + { + "epoch": 0.56, + "learning_rate": 1.334074681025401e-07, + "logits/chosen": -2.70554518699646, + "logits/rejected": -2.6370763778686523, + "logps/chosen": -326.5443115234375, + "logps/rejected": -226.34170532226562, + "loss": 0.474, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06508886814117432, + "rewards/margins": 0.9070602059364319, + "rewards/rejected": -0.972149133682251, + "step": 4875 + }, + { + "epoch": 0.56, + "learning_rate": 1.3337235163291582e-07, + "logits/chosen": -3.4912710189819336, + "logits/rejected": -3.4224023818969727, + "logps/chosen": -126.04425048828125, + "logps/rejected": -239.3314208984375, + "loss": 0.3509, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09109699726104736, + "rewards/margins": 1.4223251342773438, + "rewards/rejected": -1.331228256225586, + "step": 4876 + }, + { + "epoch": 0.56, + "learning_rate": 1.3333723516329157e-07, + "logits/chosen": -3.041226387023926, + "logits/rejected": -2.8421080112457275, + "logps/chosen": -390.6431884765625, + "logps/rejected": -197.08226013183594, + "loss": 0.3534, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5289754867553711, + "rewards/margins": 1.0216331481933594, + "rewards/rejected": -1.55060875415802, + "step": 4877 + }, + { + "epoch": 0.56, + "learning_rate": 1.3330211869366732e-07, + "logits/chosen": -3.5621020793914795, + "logits/rejected": -3.4112510681152344, + "logps/chosen": -216.70046997070312, + "logps/rejected": -261.461669921875, + "loss": 0.3477, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3270961046218872, + "rewards/margins": 1.328822135925293, + "rewards/rejected": -1.0017261505126953, + "step": 4878 + }, + { + "epoch": 0.56, + "learning_rate": 1.3326700222404308e-07, + "logits/chosen": -3.0315003395080566, + "logits/rejected": -2.9616591930389404, + "logps/chosen": -333.7755126953125, + "logps/rejected": -291.47711181640625, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5803419351577759, + "rewards/margins": 1.4283270835876465, + "rewards/rejected": -2.008668899536133, + "step": 4879 + }, + { + "epoch": 0.56, + "learning_rate": 1.332318857544188e-07, + "logits/chosen": -2.868744373321533, + "logits/rejected": -3.033236503601074, + "logps/chosen": -259.2884521484375, + "logps/rejected": -182.00111389160156, + "loss": 0.2573, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13484808802604675, + "rewards/margins": 1.9310410022735596, + "rewards/rejected": -1.7961928844451904, + "step": 4880 + }, + { + "epoch": 0.56, + "learning_rate": 1.3319676928479456e-07, + "logits/chosen": -2.076016664505005, + "logits/rejected": -1.9806132316589355, + "logps/chosen": -349.8100280761719, + "logps/rejected": -349.3825378417969, + "loss": 0.4444, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12152034044265747, + "rewards/margins": 1.422515630722046, + "rewards/rejected": -1.5440359115600586, + "step": 4881 + }, + { + "epoch": 0.56, + "learning_rate": 1.3316165281517031e-07, + "logits/chosen": -3.6869988441467285, + "logits/rejected": -3.1068663597106934, + "logps/chosen": -321.1912841796875, + "logps/rejected": -229.72854614257812, + "loss": 0.2961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6402081251144409, + "rewards/margins": 2.112602949142456, + "rewards/rejected": -2.7528111934661865, + "step": 4882 + }, + { + "epoch": 0.56, + "learning_rate": 1.3312653634554604e-07, + "logits/chosen": -3.2297158241271973, + "logits/rejected": -3.28519344329834, + "logps/chosen": -243.52206420898438, + "logps/rejected": -150.48068237304688, + "loss": 0.4912, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04699200391769409, + "rewards/margins": 0.974571943283081, + "rewards/rejected": -1.02156400680542, + "step": 4883 + }, + { + "epoch": 0.56, + "learning_rate": 1.330914198759218e-07, + "logits/chosen": -2.942178964614868, + "logits/rejected": -3.0404787063598633, + "logps/chosen": -293.04449462890625, + "logps/rejected": -186.2045135498047, + "loss": 0.3281, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38310950994491577, + "rewards/margins": 1.6764382123947144, + "rewards/rejected": -1.2933286428451538, + "step": 4884 + }, + { + "epoch": 0.56, + "learning_rate": 1.3305630340629755e-07, + "logits/chosen": -3.228635549545288, + "logits/rejected": -2.8563525676727295, + "logps/chosen": -278.8379821777344, + "logps/rejected": -279.0132751464844, + "loss": 0.4463, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25605177879333496, + "rewards/margins": 1.6962398290634155, + "rewards/rejected": -1.95229172706604, + "step": 4885 + }, + { + "epoch": 0.56, + "learning_rate": 1.330211869366733e-07, + "logits/chosen": -2.89609432220459, + "logits/rejected": -2.9884021282196045, + "logps/chosen": -222.19039916992188, + "logps/rejected": -175.80857849121094, + "loss": 0.3163, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32426342368125916, + "rewards/margins": 1.8459675312042236, + "rewards/rejected": -2.1702308654785156, + "step": 4886 + }, + { + "epoch": 0.56, + "learning_rate": 1.3298607046704903e-07, + "logits/chosen": -2.879338264465332, + "logits/rejected": -2.895312786102295, + "logps/chosen": -195.06227111816406, + "logps/rejected": -237.02513122558594, + "loss": 0.3715, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21890462934970856, + "rewards/margins": 2.262279510498047, + "rewards/rejected": -2.4811840057373047, + "step": 4887 + }, + { + "epoch": 0.56, + "learning_rate": 1.3295095399742478e-07, + "logits/chosen": -3.116154193878174, + "logits/rejected": -2.980287551879883, + "logps/chosen": -504.2994384765625, + "logps/rejected": -240.18603515625, + "loss": 0.6852, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1286935806274414, + "rewards/margins": 1.1999173164367676, + "rewards/rejected": -1.328610897064209, + "step": 4888 + }, + { + "epoch": 0.56, + "learning_rate": 1.3291583752780054e-07, + "logits/chosen": -3.5475594997406006, + "logits/rejected": -3.5474624633789062, + "logps/chosen": -190.1613006591797, + "logps/rejected": -196.43894958496094, + "loss": 0.5236, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26426127552986145, + "rewards/margins": 1.8001375198364258, + "rewards/rejected": -2.064398765563965, + "step": 4889 + }, + { + "epoch": 0.56, + "learning_rate": 1.328807210581763e-07, + "logits/chosen": -2.6877055168151855, + "logits/rejected": -2.7855048179626465, + "logps/chosen": -253.08489990234375, + "logps/rejected": -361.6291198730469, + "loss": 0.5091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06393276154994965, + "rewards/margins": 1.3458161354064941, + "rewards/rejected": -1.4097487926483154, + "step": 4890 + }, + { + "epoch": 0.56, + "learning_rate": 1.3284560458855202e-07, + "logits/chosen": -3.1982803344726562, + "logits/rejected": -3.3237831592559814, + "logps/chosen": -387.70697021484375, + "logps/rejected": -313.0211486816406, + "loss": 0.1832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3383297026157379, + "rewards/margins": 2.886847972869873, + "rewards/rejected": -3.2251780033111572, + "step": 4891 + }, + { + "epoch": 0.56, + "learning_rate": 1.3281048811892777e-07, + "logits/chosen": -2.916992664337158, + "logits/rejected": -2.759927749633789, + "logps/chosen": -251.814697265625, + "logps/rejected": -257.35211181640625, + "loss": 0.2704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23687852919101715, + "rewards/margins": 1.3340376615524292, + "rewards/rejected": -1.0971591472625732, + "step": 4892 + }, + { + "epoch": 0.56, + "learning_rate": 1.3277537164930353e-07, + "logits/chosen": -3.240095615386963, + "logits/rejected": -2.864713191986084, + "logps/chosen": -229.0552978515625, + "logps/rejected": -240.14401245117188, + "loss": 0.3682, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1837874948978424, + "rewards/margins": 2.5745575428009033, + "rewards/rejected": -2.758344888687134, + "step": 4893 + }, + { + "epoch": 0.56, + "learning_rate": 1.3274025517967925e-07, + "logits/chosen": -2.4051156044006348, + "logits/rejected": -2.198991298675537, + "logps/chosen": -400.2027282714844, + "logps/rejected": -327.56475830078125, + "loss": 0.6706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33958742022514343, + "rewards/margins": 0.5712059736251831, + "rewards/rejected": -0.9107934236526489, + "step": 4894 + }, + { + "epoch": 0.56, + "learning_rate": 1.32705138710055e-07, + "logits/chosen": -3.3251729011535645, + "logits/rejected": -3.405212879180908, + "logps/chosen": -127.55083465576172, + "logps/rejected": -179.42562866210938, + "loss": 0.4453, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24944451451301575, + "rewards/margins": 1.9824185371398926, + "rewards/rejected": -1.7329740524291992, + "step": 4895 + }, + { + "epoch": 0.56, + "learning_rate": 1.3267002224043076e-07, + "logits/chosen": -3.0091235637664795, + "logits/rejected": -3.131532669067383, + "logps/chosen": -338.9698486328125, + "logps/rejected": -267.18963623046875, + "loss": 0.1376, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39405956864356995, + "rewards/margins": 2.4632673263549805, + "rewards/rejected": -2.0692076683044434, + "step": 4896 + }, + { + "epoch": 0.56, + "learning_rate": 1.3263490577080652e-07, + "logits/chosen": -3.1258771419525146, + "logits/rejected": -2.738579273223877, + "logps/chosen": -231.29055786132812, + "logps/rejected": -162.5377655029297, + "loss": 0.4911, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18818935751914978, + "rewards/margins": 1.9141067266464233, + "rewards/rejected": -2.1022961139678955, + "step": 4897 + }, + { + "epoch": 0.56, + "learning_rate": 1.3259978930118224e-07, + "logits/chosen": -2.9624364376068115, + "logits/rejected": -3.0433218479156494, + "logps/chosen": -444.4438171386719, + "logps/rejected": -636.9229125976562, + "loss": 0.7902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40306004881858826, + "rewards/margins": 0.9127146601676941, + "rewards/rejected": -0.5096545815467834, + "step": 4898 + }, + { + "epoch": 0.56, + "learning_rate": 1.32564672831558e-07, + "logits/chosen": -3.2270336151123047, + "logits/rejected": -2.948911666870117, + "logps/chosen": -317.98828125, + "logps/rejected": -210.378173828125, + "loss": 0.4427, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19304323196411133, + "rewards/margins": 1.8410844802856445, + "rewards/rejected": -2.034127712249756, + "step": 4899 + }, + { + "epoch": 0.56, + "learning_rate": 1.3252955636193372e-07, + "logits/chosen": -2.9118876457214355, + "logits/rejected": -2.8420119285583496, + "logps/chosen": -298.6072998046875, + "logps/rejected": -333.1017761230469, + "loss": 0.5898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4325065612792969, + "rewards/margins": 0.8824589252471924, + "rewards/rejected": -1.3149654865264893, + "step": 4900 + }, + { + "epoch": 0.56, + "learning_rate": 1.324944398923095e-07, + "logits/chosen": -2.8226513862609863, + "logits/rejected": -2.9257545471191406, + "logps/chosen": -185.9884033203125, + "logps/rejected": -234.37887573242188, + "loss": 0.5428, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3633840084075928, + "rewards/margins": 1.5272854566574097, + "rewards/rejected": -1.890669345855713, + "step": 4901 + }, + { + "epoch": 0.57, + "learning_rate": 1.3245932342268523e-07, + "logits/chosen": -2.2878658771514893, + "logits/rejected": -2.267481803894043, + "logps/chosen": -181.4043731689453, + "logps/rejected": -210.738525390625, + "loss": 0.1961, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10961493849754333, + "rewards/margins": 1.8995696306228638, + "rewards/rejected": -2.0091843605041504, + "step": 4902 + }, + { + "epoch": 0.57, + "learning_rate": 1.3242420695306099e-07, + "logits/chosen": -4.1411285400390625, + "logits/rejected": -3.600309371948242, + "logps/chosen": -266.3943786621094, + "logps/rejected": -231.4290771484375, + "loss": 0.272, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30869436264038086, + "rewards/margins": 1.6257681846618652, + "rewards/rejected": -1.934462547302246, + "step": 4903 + }, + { + "epoch": 0.57, + "learning_rate": 1.3238909048343671e-07, + "logits/chosen": -2.721696138381958, + "logits/rejected": -2.9153900146484375, + "logps/chosen": -336.3002624511719, + "logps/rejected": -339.7408447265625, + "loss": 0.3325, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4062209129333496, + "rewards/margins": 2.216686487197876, + "rewards/rejected": -1.810465693473816, + "step": 4904 + }, + { + "epoch": 0.57, + "learning_rate": 1.3235397401381247e-07, + "logits/chosen": -3.0409438610076904, + "logits/rejected": -3.128696918487549, + "logps/chosen": -194.29405212402344, + "logps/rejected": -221.87432861328125, + "loss": 0.7538, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3714163303375244, + "rewards/margins": 0.3739473819732666, + "rewards/rejected": -0.745363712310791, + "step": 4905 + }, + { + "epoch": 0.57, + "learning_rate": 1.3231885754418822e-07, + "logits/chosen": -3.6239473819732666, + "logits/rejected": -3.068600654602051, + "logps/chosen": -282.214111328125, + "logps/rejected": -188.49365234375, + "loss": 0.3005, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13825896382331848, + "rewards/margins": 2.6470577716827393, + "rewards/rejected": -2.7853169441223145, + "step": 4906 + }, + { + "epoch": 0.57, + "learning_rate": 1.3228374107456397e-07, + "logits/chosen": -3.0840489864349365, + "logits/rejected": -3.2296829223632812, + "logps/chosen": -197.3389129638672, + "logps/rejected": -214.23655700683594, + "loss": 0.5496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6762574911117554, + "rewards/margins": 0.8705728650093079, + "rewards/rejected": -1.5468302965164185, + "step": 4907 + }, + { + "epoch": 0.57, + "learning_rate": 1.322486246049397e-07, + "logits/chosen": -2.6512768268585205, + "logits/rejected": -2.9251527786254883, + "logps/chosen": -353.0219421386719, + "logps/rejected": -388.5227355957031, + "loss": 0.5655, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24956826865673065, + "rewards/margins": 2.1941239833831787, + "rewards/rejected": -2.443692207336426, + "step": 4908 + }, + { + "epoch": 0.57, + "learning_rate": 1.3221350813531546e-07, + "logits/chosen": -2.487590789794922, + "logits/rejected": -2.6427364349365234, + "logps/chosen": -373.6787414550781, + "logps/rejected": -385.35650634765625, + "loss": 0.2058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3407079577445984, + "rewards/margins": 2.158514976501465, + "rewards/rejected": -1.8178067207336426, + "step": 4909 + }, + { + "epoch": 0.57, + "learning_rate": 1.321783916656912e-07, + "logits/chosen": -3.645325183868408, + "logits/rejected": -3.5673344135284424, + "logps/chosen": -293.7283935546875, + "logps/rejected": -264.0010681152344, + "loss": 0.3221, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7474640607833862, + "rewards/margins": 1.7244240045547485, + "rewards/rejected": -0.9769598245620728, + "step": 4910 + }, + { + "epoch": 0.57, + "learning_rate": 1.3214327519606694e-07, + "logits/chosen": -3.6679060459136963, + "logits/rejected": -3.796190023422241, + "logps/chosen": -181.4422149658203, + "logps/rejected": -291.94854736328125, + "loss": 0.4804, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10799375176429749, + "rewards/margins": 0.9891284108161926, + "rewards/rejected": -1.0971221923828125, + "step": 4911 + }, + { + "epoch": 0.57, + "learning_rate": 1.321081587264427e-07, + "logits/chosen": -2.6542954444885254, + "logits/rejected": -2.646676778793335, + "logps/chosen": -175.72618103027344, + "logps/rejected": -210.57452392578125, + "loss": 0.2487, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4149402976036072, + "rewards/margins": 2.5189316272735596, + "rewards/rejected": -2.1039915084838867, + "step": 4912 + }, + { + "epoch": 0.57, + "learning_rate": 1.3207304225681845e-07, + "logits/chosen": -2.654193639755249, + "logits/rejected": -2.824047565460205, + "logps/chosen": -209.71621704101562, + "logps/rejected": -294.3822937011719, + "loss": 0.2103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2966623902320862, + "rewards/margins": 2.2196731567382812, + "rewards/rejected": -1.9230107069015503, + "step": 4913 + }, + { + "epoch": 0.57, + "learning_rate": 1.320379257871942e-07, + "logits/chosen": -2.647714376449585, + "logits/rejected": -2.917083501815796, + "logps/chosen": -255.4839324951172, + "logps/rejected": -318.06103515625, + "loss": 0.3671, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12622520327568054, + "rewards/margins": 1.362691879272461, + "rewards/rejected": -1.2364667654037476, + "step": 4914 + }, + { + "epoch": 0.57, + "learning_rate": 1.3200280931756993e-07, + "logits/chosen": -3.5380845069885254, + "logits/rejected": -3.477008819580078, + "logps/chosen": -199.2886962890625, + "logps/rejected": -124.77758026123047, + "loss": 0.8601, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7538517117500305, + "rewards/margins": 0.053162574768066406, + "rewards/rejected": -0.8070142865180969, + "step": 4915 + }, + { + "epoch": 0.57, + "learning_rate": 1.3196769284794568e-07, + "logits/chosen": -2.753117561340332, + "logits/rejected": -2.453453540802002, + "logps/chosen": -259.119384765625, + "logps/rejected": -501.6586608886719, + "loss": 0.2173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5415043830871582, + "rewards/margins": 2.6375315189361572, + "rewards/rejected": -3.1790361404418945, + "step": 4916 + }, + { + "epoch": 0.57, + "learning_rate": 1.319325763783214e-07, + "logits/chosen": -3.4097723960876465, + "logits/rejected": -3.266212224960327, + "logps/chosen": -232.3413543701172, + "logps/rejected": -204.48028564453125, + "loss": 0.5325, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5029879808425903, + "rewards/margins": 0.472893089056015, + "rewards/rejected": -0.9758810997009277, + "step": 4917 + }, + { + "epoch": 0.57, + "learning_rate": 1.318974599086972e-07, + "logits/chosen": -2.788933277130127, + "logits/rejected": -2.9494869709014893, + "logps/chosen": -363.47259521484375, + "logps/rejected": -179.11434936523438, + "loss": 0.4688, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.006411215290427208, + "rewards/margins": 1.4677619934082031, + "rewards/rejected": -1.461350679397583, + "step": 4918 + }, + { + "epoch": 0.57, + "learning_rate": 1.3186234343907292e-07, + "logits/chosen": -3.520358085632324, + "logits/rejected": -3.615854024887085, + "logps/chosen": -237.20941162109375, + "logps/rejected": -310.415771484375, + "loss": 0.3146, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37379762530326843, + "rewards/margins": 1.78755521774292, + "rewards/rejected": -2.1613526344299316, + "step": 4919 + }, + { + "epoch": 0.57, + "learning_rate": 1.3182722696944867e-07, + "logits/chosen": -3.468078374862671, + "logits/rejected": -4.001546859741211, + "logps/chosen": -89.44667053222656, + "logps/rejected": -243.12420654296875, + "loss": 0.1656, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13077807426452637, + "rewards/margins": 2.8602848052978516, + "rewards/rejected": -2.729506731033325, + "step": 4920 + }, + { + "epoch": 0.57, + "learning_rate": 1.317921104998244e-07, + "logits/chosen": -2.8403260707855225, + "logits/rejected": -3.0432260036468506, + "logps/chosen": -198.11672973632812, + "logps/rejected": -249.2513427734375, + "loss": 0.1349, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20555776357650757, + "rewards/margins": 2.364628314971924, + "rewards/rejected": -2.1590704917907715, + "step": 4921 + }, + { + "epoch": 0.57, + "learning_rate": 1.3175699403020018e-07, + "logits/chosen": -3.4067773818969727, + "logits/rejected": -3.585719585418701, + "logps/chosen": -157.963623046875, + "logps/rejected": -172.57028198242188, + "loss": 0.9706, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5504069924354553, + "rewards/margins": 0.7901512384414673, + "rewards/rejected": -1.3405581712722778, + "step": 4922 + }, + { + "epoch": 0.57, + "learning_rate": 1.317218775605759e-07, + "logits/chosen": -2.902548313140869, + "logits/rejected": -2.780874252319336, + "logps/chosen": -366.435546875, + "logps/rejected": -368.0665588378906, + "loss": 0.3679, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06786558032035828, + "rewards/margins": 1.2423560619354248, + "rewards/rejected": -1.1744904518127441, + "step": 4923 + }, + { + "epoch": 0.57, + "learning_rate": 1.3168676109095166e-07, + "logits/chosen": -2.9311952590942383, + "logits/rejected": -2.7318408489227295, + "logps/chosen": -263.76934814453125, + "logps/rejected": -245.297119140625, + "loss": 0.4397, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5990593433380127, + "rewards/margins": 1.364341378211975, + "rewards/rejected": -1.9634007215499878, + "step": 4924 + }, + { + "epoch": 0.57, + "learning_rate": 1.3165164462132739e-07, + "logits/chosen": -2.691896915435791, + "logits/rejected": -2.9158291816711426, + "logps/chosen": -314.6687927246094, + "logps/rejected": -231.56893920898438, + "loss": 0.4682, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20459231734275818, + "rewards/margins": 1.2141802310943604, + "rewards/rejected": -1.418772578239441, + "step": 4925 + }, + { + "epoch": 0.57, + "learning_rate": 1.3161652815170314e-07, + "logits/chosen": -2.8826990127563477, + "logits/rejected": -3.0244998931884766, + "logps/chosen": -165.87738037109375, + "logps/rejected": -197.8976287841797, + "loss": 0.4684, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2988194227218628, + "rewards/margins": 1.136254906654358, + "rewards/rejected": -1.4350742101669312, + "step": 4926 + }, + { + "epoch": 0.57, + "learning_rate": 1.315814116820789e-07, + "logits/chosen": -3.3401851654052734, + "logits/rejected": -3.4247074127197266, + "logps/chosen": -184.37310791015625, + "logps/rejected": -185.7845458984375, + "loss": 0.4898, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3878783583641052, + "rewards/margins": 1.044654130935669, + "rewards/rejected": -1.432532548904419, + "step": 4927 + }, + { + "epoch": 0.57, + "learning_rate": 1.3154629521245462e-07, + "logits/chosen": -3.3403563499450684, + "logits/rejected": -3.1390328407287598, + "logps/chosen": -189.7566375732422, + "logps/rejected": -295.0284118652344, + "loss": 0.1959, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11122691631317139, + "rewards/margins": 2.3836684226989746, + "rewards/rejected": -2.2724413871765137, + "step": 4928 + }, + { + "epoch": 0.57, + "learning_rate": 1.3151117874283037e-07, + "logits/chosen": -3.452359676361084, + "logits/rejected": -3.2125983238220215, + "logps/chosen": -598.569580078125, + "logps/rejected": -291.1309814453125, + "loss": 0.3292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1856641173362732, + "rewards/margins": 1.721681833267212, + "rewards/rejected": -1.9073460102081299, + "step": 4929 + }, + { + "epoch": 0.57, + "learning_rate": 1.3147606227320613e-07, + "logits/chosen": -3.3267569541931152, + "logits/rejected": -3.52630877494812, + "logps/chosen": -193.173828125, + "logps/rejected": -273.3988037109375, + "loss": 0.4766, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07897251844406128, + "rewards/margins": 1.0750197172164917, + "rewards/rejected": -0.9960471987724304, + "step": 4930 + }, + { + "epoch": 0.57, + "learning_rate": 1.3144094580358188e-07, + "logits/chosen": -2.708320140838623, + "logits/rejected": -2.6287567615509033, + "logps/chosen": -74.66765594482422, + "logps/rejected": -198.45526123046875, + "loss": 0.8741, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9132437705993652, + "rewards/margins": 0.2134455442428589, + "rewards/rejected": -1.1266894340515137, + "step": 4931 + }, + { + "epoch": 0.57, + "learning_rate": 1.314058293339576e-07, + "logits/chosen": -2.5971577167510986, + "logits/rejected": -2.555586099624634, + "logps/chosen": -247.43585205078125, + "logps/rejected": -268.5702819824219, + "loss": 0.5352, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03060247004032135, + "rewards/margins": 1.708949089050293, + "rewards/rejected": -1.7395515441894531, + "step": 4932 + }, + { + "epoch": 0.57, + "learning_rate": 1.3137071286433336e-07, + "logits/chosen": -3.214047431945801, + "logits/rejected": -3.0868911743164062, + "logps/chosen": -343.50177001953125, + "logps/rejected": -256.94512939453125, + "loss": 0.4016, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2862287759780884, + "rewards/margins": 2.061235189437866, + "rewards/rejected": -1.7750064134597778, + "step": 4933 + }, + { + "epoch": 0.57, + "learning_rate": 1.3133559639470912e-07, + "logits/chosen": -2.4888644218444824, + "logits/rejected": -2.636190414428711, + "logps/chosen": -401.0740966796875, + "logps/rejected": -216.77523803710938, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06943672895431519, + "rewards/margins": 1.4723007678985596, + "rewards/rejected": -1.5417375564575195, + "step": 4934 + }, + { + "epoch": 0.57, + "learning_rate": 1.3130047992508487e-07, + "logits/chosen": -2.8792853355407715, + "logits/rejected": -2.895745277404785, + "logps/chosen": -213.0355224609375, + "logps/rejected": -358.20220947265625, + "loss": 0.1693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22030703723430634, + "rewards/margins": 2.918618679046631, + "rewards/rejected": -3.138925552368164, + "step": 4935 + }, + { + "epoch": 0.57, + "learning_rate": 1.312653634554606e-07, + "logits/chosen": -3.2108352184295654, + "logits/rejected": -3.3150007724761963, + "logps/chosen": -324.4971618652344, + "logps/rejected": -328.8071594238281, + "loss": 0.2518, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5123298764228821, + "rewards/margins": 2.5654871463775635, + "rewards/rejected": -2.053157091140747, + "step": 4936 + }, + { + "epoch": 0.57, + "learning_rate": 1.3123024698583635e-07, + "logits/chosen": -2.8900461196899414, + "logits/rejected": -2.489104747772217, + "logps/chosen": -151.1640625, + "logps/rejected": -158.85409545898438, + "loss": 0.3136, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08491766452789307, + "rewards/margins": 1.5959644317626953, + "rewards/rejected": -1.5110468864440918, + "step": 4937 + }, + { + "epoch": 0.57, + "learning_rate": 1.311951305162121e-07, + "logits/chosen": -3.1130785942077637, + "logits/rejected": -3.0720691680908203, + "logps/chosen": -210.61068725585938, + "logps/rejected": -357.0616455078125, + "loss": 0.273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13546809554100037, + "rewards/margins": 1.3826814889907837, + "rewards/rejected": -1.247213363647461, + "step": 4938 + }, + { + "epoch": 0.57, + "learning_rate": 1.3116001404658786e-07, + "logits/chosen": -3.9551098346710205, + "logits/rejected": -3.6575276851654053, + "logps/chosen": -238.61065673828125, + "logps/rejected": -218.1276397705078, + "loss": 0.7106, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2874159812927246, + "rewards/margins": 1.1699365377426147, + "rewards/rejected": -1.4573523998260498, + "step": 4939 + }, + { + "epoch": 0.57, + "learning_rate": 1.311248975769636e-07, + "logits/chosen": -3.440756320953369, + "logits/rejected": -3.40440034866333, + "logps/chosen": -394.1746826171875, + "logps/rejected": -441.11181640625, + "loss": 0.3234, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27452021837234497, + "rewards/margins": 2.6201705932617188, + "rewards/rejected": -2.3456499576568604, + "step": 4940 + }, + { + "epoch": 0.57, + "learning_rate": 1.3108978110733934e-07, + "logits/chosen": -2.6250715255737305, + "logits/rejected": -2.7495172023773193, + "logps/chosen": -222.3017578125, + "logps/rejected": -269.2227478027344, + "loss": 0.7082, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3660350441932678, + "rewards/margins": 1.5599664449691772, + "rewards/rejected": -1.9260013103485107, + "step": 4941 + }, + { + "epoch": 0.57, + "learning_rate": 1.310546646377151e-07, + "logits/chosen": -3.0981194972991943, + "logits/rejected": -2.930821657180786, + "logps/chosen": -372.7041015625, + "logps/rejected": -225.827880859375, + "loss": 0.444, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21049928665161133, + "rewards/margins": 0.8749426603317261, + "rewards/rejected": -1.0854419469833374, + "step": 4942 + }, + { + "epoch": 0.57, + "learning_rate": 1.3101954816809082e-07, + "logits/chosen": -2.919790029525757, + "logits/rejected": -2.80733060836792, + "logps/chosen": -226.75860595703125, + "logps/rejected": -322.7506103515625, + "loss": 0.5625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23190197348594666, + "rewards/margins": 0.9645235538482666, + "rewards/rejected": -1.1964255571365356, + "step": 4943 + }, + { + "epoch": 0.57, + "learning_rate": 1.3098443169846658e-07, + "logits/chosen": -3.527676582336426, + "logits/rejected": -3.2604856491088867, + "logps/chosen": -309.6979064941406, + "logps/rejected": -482.35015869140625, + "loss": 0.4363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7147489786148071, + "rewards/margins": 2.247283697128296, + "rewards/rejected": -2.9620323181152344, + "step": 4944 + }, + { + "epoch": 0.57, + "learning_rate": 1.309493152288423e-07, + "logits/chosen": -2.459625720977783, + "logits/rejected": -2.464909076690674, + "logps/chosen": -360.2669677734375, + "logps/rejected": -304.644287109375, + "loss": 0.4034, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06674538552761078, + "rewards/margins": 1.4802676439285278, + "rewards/rejected": -1.4135222434997559, + "step": 4945 + }, + { + "epoch": 0.57, + "learning_rate": 1.3091419875921808e-07, + "logits/chosen": -3.047049045562744, + "logits/rejected": -2.74877667427063, + "logps/chosen": -208.1866912841797, + "logps/rejected": -197.7428741455078, + "loss": 0.4204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0576503723859787, + "rewards/margins": 0.9361428618431091, + "rewards/rejected": -0.993793249130249, + "step": 4946 + }, + { + "epoch": 0.57, + "learning_rate": 1.308790822895938e-07, + "logits/chosen": -2.5349559783935547, + "logits/rejected": -2.4753901958465576, + "logps/chosen": -324.45916748046875, + "logps/rejected": -234.22232055664062, + "loss": 0.2112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04843682050704956, + "rewards/margins": 2.6658525466918945, + "rewards/rejected": -2.617415428161621, + "step": 4947 + }, + { + "epoch": 0.57, + "learning_rate": 1.3084396581996957e-07, + "logits/chosen": -3.1294267177581787, + "logits/rejected": -2.6886649131774902, + "logps/chosen": -226.78961181640625, + "logps/rejected": -145.67022705078125, + "loss": 0.4323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29552125930786133, + "rewards/margins": 1.0085690021514893, + "rewards/rejected": -1.3040904998779297, + "step": 4948 + }, + { + "epoch": 0.57, + "learning_rate": 1.308088493503453e-07, + "logits/chosen": -2.639169216156006, + "logits/rejected": -2.593147039413452, + "logps/chosen": -317.732177734375, + "logps/rejected": -387.45440673828125, + "loss": 0.49, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37562912702560425, + "rewards/margins": 0.843109667301178, + "rewards/rejected": -1.2187387943267822, + "step": 4949 + }, + { + "epoch": 0.57, + "learning_rate": 1.3077373288072107e-07, + "logits/chosen": -2.7017710208892822, + "logits/rejected": -2.9531190395355225, + "logps/chosen": -428.14129638671875, + "logps/rejected": -471.3913879394531, + "loss": 0.3615, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.008618343621492386, + "rewards/margins": 2.5370397567749023, + "rewards/rejected": -2.5456581115722656, + "step": 4950 + }, + { + "epoch": 0.57, + "learning_rate": 1.307386164110968e-07, + "logits/chosen": -3.5788164138793945, + "logits/rejected": -3.5686447620391846, + "logps/chosen": -285.7480773925781, + "logps/rejected": -237.45346069335938, + "loss": 0.1858, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24418598413467407, + "rewards/margins": 2.5306057929992676, + "rewards/rejected": -2.28641939163208, + "step": 4951 + }, + { + "epoch": 0.57, + "learning_rate": 1.3070349994147255e-07, + "logits/chosen": -2.940814971923828, + "logits/rejected": -3.0005006790161133, + "logps/chosen": -211.50926208496094, + "logps/rejected": -215.00997924804688, + "loss": 0.4796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22036145627498627, + "rewards/margins": 1.2387142181396484, + "rewards/rejected": -1.459075927734375, + "step": 4952 + }, + { + "epoch": 0.57, + "learning_rate": 1.3066838347184828e-07, + "logits/chosen": -3.748441696166992, + "logits/rejected": -3.5982909202575684, + "logps/chosen": -218.64715576171875, + "logps/rejected": -282.0867004394531, + "loss": 0.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28843116760253906, + "rewards/margins": 2.623072385787964, + "rewards/rejected": -2.911503553390503, + "step": 4953 + }, + { + "epoch": 0.57, + "learning_rate": 1.3063326700222404e-07, + "logits/chosen": -3.421612024307251, + "logits/rejected": -3.4151322841644287, + "logps/chosen": -333.5555725097656, + "logps/rejected": -382.7013854980469, + "loss": 0.3102, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5651994943618774, + "rewards/margins": 2.2862436771392822, + "rewards/rejected": -2.851442813873291, + "step": 4954 + }, + { + "epoch": 0.57, + "learning_rate": 1.305981505325998e-07, + "logits/chosen": -2.5706520080566406, + "logits/rejected": -2.4061429500579834, + "logps/chosen": -347.1488037109375, + "logps/rejected": -270.0299377441406, + "loss": 0.5195, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35082316398620605, + "rewards/margins": 1.316308856010437, + "rewards/rejected": -0.9654858112335205, + "step": 4955 + }, + { + "epoch": 0.57, + "learning_rate": 1.3056303406297554e-07, + "logits/chosen": -2.380948543548584, + "logits/rejected": -2.5193533897399902, + "logps/chosen": -225.32102966308594, + "logps/rejected": -214.96109008789062, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2842986285686493, + "rewards/margins": 0.9809271097183228, + "rewards/rejected": -1.2652257680892944, + "step": 4956 + }, + { + "epoch": 0.57, + "learning_rate": 1.3052791759335127e-07, + "logits/chosen": -3.865328311920166, + "logits/rejected": -3.5313165187835693, + "logps/chosen": -192.69833374023438, + "logps/rejected": -221.15577697753906, + "loss": 0.3168, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16514281928539276, + "rewards/margins": 1.2249484062194824, + "rewards/rejected": -1.0598055124282837, + "step": 4957 + }, + { + "epoch": 0.57, + "learning_rate": 1.3049280112372702e-07, + "logits/chosen": -2.8983750343322754, + "logits/rejected": -2.51039457321167, + "logps/chosen": -261.3135681152344, + "logps/rejected": -144.97738647460938, + "loss": 0.4696, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6397002935409546, + "rewards/margins": 0.7635846138000488, + "rewards/rejected": -1.403285026550293, + "step": 4958 + }, + { + "epoch": 0.57, + "learning_rate": 1.3045768465410278e-07, + "logits/chosen": -3.029940128326416, + "logits/rejected": -3.16182804107666, + "logps/chosen": -217.603759765625, + "logps/rejected": -181.3094940185547, + "loss": 0.5391, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4878467917442322, + "rewards/margins": 1.7707659006118774, + "rewards/rejected": -2.258612632751465, + "step": 4959 + }, + { + "epoch": 0.57, + "learning_rate": 1.304225681844785e-07, + "logits/chosen": -2.7982935905456543, + "logits/rejected": -2.7803802490234375, + "logps/chosen": -268.7843933105469, + "logps/rejected": -252.13873291015625, + "loss": 0.4478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10395797342061996, + "rewards/margins": 1.4850444793701172, + "rewards/rejected": -1.5890026092529297, + "step": 4960 + }, + { + "epoch": 0.57, + "learning_rate": 1.3038745171485426e-07, + "logits/chosen": -3.3695876598358154, + "logits/rejected": -3.5060722827911377, + "logps/chosen": -251.0154266357422, + "logps/rejected": -333.7469787597656, + "loss": 0.3169, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26864930987358093, + "rewards/margins": 2.23482084274292, + "rewards/rejected": -1.966171383857727, + "step": 4961 + }, + { + "epoch": 0.57, + "learning_rate": 1.3035233524523e-07, + "logits/chosen": -3.18544864654541, + "logits/rejected": -3.3944997787475586, + "logps/chosen": -295.8947448730469, + "logps/rejected": -253.98648071289062, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6138091683387756, + "rewards/margins": 2.1381993293762207, + "rewards/rejected": -1.5243899822235107, + "step": 4962 + }, + { + "epoch": 0.57, + "learning_rate": 1.3031721877560577e-07, + "logits/chosen": -3.6116018295288086, + "logits/rejected": -3.7100014686584473, + "logps/chosen": -285.93670654296875, + "logps/rejected": -347.6808166503906, + "loss": 0.1586, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48476165533065796, + "rewards/margins": 3.9011690616607666, + "rewards/rejected": -4.38593053817749, + "step": 4963 + }, + { + "epoch": 0.57, + "learning_rate": 1.302821023059815e-07, + "logits/chosen": -3.055772542953491, + "logits/rejected": -3.0889971256256104, + "logps/chosen": -169.44876098632812, + "logps/rejected": -222.90133666992188, + "loss": 0.3566, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.026242226362228394, + "rewards/margins": 1.058784008026123, + "rewards/rejected": -1.0325417518615723, + "step": 4964 + }, + { + "epoch": 0.57, + "learning_rate": 1.3024698583635725e-07, + "logits/chosen": -2.95626163482666, + "logits/rejected": -2.931473970413208, + "logps/chosen": -261.1575927734375, + "logps/rejected": -288.1734924316406, + "loss": 0.4206, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3226381540298462, + "rewards/margins": 1.2701525688171387, + "rewards/rejected": -1.5927908420562744, + "step": 4965 + }, + { + "epoch": 0.57, + "learning_rate": 1.3021186936673298e-07, + "logits/chosen": -2.9799439907073975, + "logits/rejected": -3.1734542846679688, + "logps/chosen": -144.12384033203125, + "logps/rejected": -167.71217346191406, + "loss": 0.5769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.014158889651298523, + "rewards/margins": 0.7376977205276489, + "rewards/rejected": -0.7518566250801086, + "step": 4966 + }, + { + "epoch": 0.57, + "learning_rate": 1.3017675289710876e-07, + "logits/chosen": -2.761080503463745, + "logits/rejected": -2.71244478225708, + "logps/chosen": -406.71026611328125, + "logps/rejected": -285.1802978515625, + "loss": 0.3038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5535622835159302, + "rewards/margins": 1.3473411798477173, + "rewards/rejected": -1.9009034633636475, + "step": 4967 + }, + { + "epoch": 0.57, + "learning_rate": 1.3014163642748448e-07, + "logits/chosen": -2.9888193607330322, + "logits/rejected": -2.759368896484375, + "logps/chosen": -175.24514770507812, + "logps/rejected": -214.9909210205078, + "loss": 0.7225, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36271438002586365, + "rewards/margins": 0.15871021151542664, + "rewards/rejected": -0.5214246511459351, + "step": 4968 + }, + { + "epoch": 0.57, + "learning_rate": 1.3010651995786024e-07, + "logits/chosen": -3.4790406227111816, + "logits/rejected": -3.534580707550049, + "logps/chosen": -289.99566650390625, + "logps/rejected": -168.06858825683594, + "loss": 0.3559, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12954498827457428, + "rewards/margins": 1.1969680786132812, + "rewards/rejected": -1.0674229860305786, + "step": 4969 + }, + { + "epoch": 0.57, + "learning_rate": 1.3007140348823596e-07, + "logits/chosen": -3.0978968143463135, + "logits/rejected": -2.810429096221924, + "logps/chosen": -246.46380615234375, + "logps/rejected": -372.7960205078125, + "loss": 0.2477, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24708181619644165, + "rewards/margins": 2.0090901851654053, + "rewards/rejected": -1.7620083093643188, + "step": 4970 + }, + { + "epoch": 0.57, + "learning_rate": 1.3003628701861172e-07, + "logits/chosen": -2.1795616149902344, + "logits/rejected": -2.157310962677002, + "logps/chosen": -230.5937042236328, + "logps/rejected": -272.43585205078125, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5251719355583191, + "rewards/margins": 1.9233736991882324, + "rewards/rejected": -1.398201823234558, + "step": 4971 + }, + { + "epoch": 0.57, + "learning_rate": 1.3000117054898747e-07, + "logits/chosen": -3.417649984359741, + "logits/rejected": -3.2250962257385254, + "logps/chosen": -322.67083740234375, + "logps/rejected": -269.47613525390625, + "loss": 1.009, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.32966947555542, + "rewards/margins": -0.13085809350013733, + "rewards/rejected": -1.1988112926483154, + "step": 4972 + }, + { + "epoch": 0.57, + "learning_rate": 1.2996605407936323e-07, + "logits/chosen": -3.0274643898010254, + "logits/rejected": -3.1419639587402344, + "logps/chosen": -265.71148681640625, + "logps/rejected": -303.3494873046875, + "loss": 0.4276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.008280612528324127, + "rewards/margins": 1.550323486328125, + "rewards/rejected": -1.5586040019989014, + "step": 4973 + }, + { + "epoch": 0.57, + "learning_rate": 1.2993093760973895e-07, + "logits/chosen": -2.9352335929870605, + "logits/rejected": -2.881777048110962, + "logps/chosen": -431.638671875, + "logps/rejected": -291.3288879394531, + "loss": 0.2504, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10204711556434631, + "rewards/margins": 2.2424111366271973, + "rewards/rejected": -2.140364170074463, + "step": 4974 + }, + { + "epoch": 0.57, + "learning_rate": 1.298958211401147e-07, + "logits/chosen": -2.822690486907959, + "logits/rejected": -3.1079559326171875, + "logps/chosen": -256.0130310058594, + "logps/rejected": -332.115234375, + "loss": 0.2997, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06218039244413376, + "rewards/margins": 2.771812915802002, + "rewards/rejected": -2.709632396697998, + "step": 4975 + }, + { + "epoch": 0.57, + "learning_rate": 1.2986070467049046e-07, + "logits/chosen": -2.963986873626709, + "logits/rejected": -2.7930469512939453, + "logps/chosen": -276.9523620605469, + "logps/rejected": -347.6455383300781, + "loss": 0.0815, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5524616837501526, + "rewards/margins": 4.511364936828613, + "rewards/rejected": -3.9589028358459473, + "step": 4976 + }, + { + "epoch": 0.57, + "learning_rate": 1.298255882008662e-07, + "logits/chosen": -3.1929097175598145, + "logits/rejected": -3.2397401332855225, + "logps/chosen": -219.9685821533203, + "logps/rejected": -266.9318542480469, + "loss": 0.2204, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06854460388422012, + "rewards/margins": 2.593303680419922, + "rewards/rejected": -2.524759292602539, + "step": 4977 + }, + { + "epoch": 0.57, + "learning_rate": 1.2979047173124194e-07, + "logits/chosen": -2.3190293312072754, + "logits/rejected": -2.4377365112304688, + "logps/chosen": -410.7904052734375, + "logps/rejected": -282.447509765625, + "loss": 0.5251, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20272788405418396, + "rewards/margins": 1.3600897789001465, + "rewards/rejected": -1.1573619842529297, + "step": 4978 + }, + { + "epoch": 0.57, + "learning_rate": 1.297553552616177e-07, + "logits/chosen": -3.8452324867248535, + "logits/rejected": -3.7175536155700684, + "logps/chosen": -178.24652099609375, + "logps/rejected": -185.779052734375, + "loss": 0.2639, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17466901242733002, + "rewards/margins": 2.2194957733154297, + "rewards/rejected": -2.0448265075683594, + "step": 4979 + }, + { + "epoch": 0.57, + "learning_rate": 1.2972023879199345e-07, + "logits/chosen": -2.8956191539764404, + "logits/rejected": -2.9622480869293213, + "logps/chosen": -226.55337524414062, + "logps/rejected": -124.57035064697266, + "loss": 0.3953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.62633216381073, + "rewards/margins": 1.3090336322784424, + "rewards/rejected": -1.9353657960891724, + "step": 4980 + }, + { + "epoch": 0.57, + "learning_rate": 1.2968512232236918e-07, + "logits/chosen": -3.0936715602874756, + "logits/rejected": -3.286043882369995, + "logps/chosen": -102.5303726196289, + "logps/rejected": -99.36160278320312, + "loss": 0.6965, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0346473455429077, + "rewards/margins": 0.2572775185108185, + "rewards/rejected": -1.2919249534606934, + "step": 4981 + }, + { + "epoch": 0.57, + "learning_rate": 1.2965000585274493e-07, + "logits/chosen": -2.982877731323242, + "logits/rejected": -3.151062250137329, + "logps/chosen": -271.1722717285156, + "logps/rejected": -270.23291015625, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37522900104522705, + "rewards/margins": 2.494188070297241, + "rewards/rejected": -2.1189589500427246, + "step": 4982 + }, + { + "epoch": 0.57, + "learning_rate": 1.2961488938312069e-07, + "logits/chosen": -3.476167678833008, + "logits/rejected": -3.4448540210723877, + "logps/chosen": -363.12005615234375, + "logps/rejected": -244.47769165039062, + "loss": 0.7593, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11011549830436707, + "rewards/margins": 1.4350695610046387, + "rewards/rejected": -1.5451852083206177, + "step": 4983 + }, + { + "epoch": 0.57, + "learning_rate": 1.2957977291349644e-07, + "logits/chosen": -3.26519775390625, + "logits/rejected": -3.210214138031006, + "logps/chosen": -251.0633544921875, + "logps/rejected": -253.66310119628906, + "loss": 0.1766, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03160540759563446, + "rewards/margins": 3.868051290512085, + "rewards/rejected": -3.8364460468292236, + "step": 4984 + }, + { + "epoch": 0.57, + "learning_rate": 1.2954465644387217e-07, + "logits/chosen": -2.7529828548431396, + "logits/rejected": -2.686272144317627, + "logps/chosen": -300.24365234375, + "logps/rejected": -265.9488220214844, + "loss": 0.3701, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2547810673713684, + "rewards/margins": 1.9681310653686523, + "rewards/rejected": -2.222912073135376, + "step": 4985 + }, + { + "epoch": 0.57, + "learning_rate": 1.2950953997424792e-07, + "logits/chosen": -2.576279640197754, + "logits/rejected": -2.701568603515625, + "logps/chosen": -435.45977783203125, + "logps/rejected": -294.992431640625, + "loss": 0.4067, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34801918268203735, + "rewards/margins": 1.2943801879882812, + "rewards/rejected": -1.6423994302749634, + "step": 4986 + }, + { + "epoch": 0.57, + "learning_rate": 1.2947442350462367e-07, + "logits/chosen": -3.6411819458007812, + "logits/rejected": -3.500779151916504, + "logps/chosen": -286.2313537597656, + "logps/rejected": -190.35989379882812, + "loss": 0.4886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024443045258522034, + "rewards/margins": 1.3339817523956299, + "rewards/rejected": -1.3584247827529907, + "step": 4987 + }, + { + "epoch": 0.58, + "learning_rate": 1.294393070349994e-07, + "logits/chosen": -3.300842523574829, + "logits/rejected": -3.0408682823181152, + "logps/chosen": -224.90708923339844, + "logps/rejected": -244.97755432128906, + "loss": 0.4946, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.010892353951931, + "rewards/margins": 2.1356139183044434, + "rewards/rejected": -2.1247217655181885, + "step": 4988 + }, + { + "epoch": 0.58, + "learning_rate": 1.2940419056537516e-07, + "logits/chosen": -3.3927745819091797, + "logits/rejected": -3.22948956489563, + "logps/chosen": -203.30419921875, + "logps/rejected": -247.09532165527344, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039208024740219116, + "rewards/margins": 2.0847160816192627, + "rewards/rejected": -2.1239237785339355, + "step": 4989 + }, + { + "epoch": 0.58, + "learning_rate": 1.293690740957509e-07, + "logits/chosen": -2.8651180267333984, + "logits/rejected": -2.9935035705566406, + "logps/chosen": -78.1316146850586, + "logps/rejected": -166.19151306152344, + "loss": 0.3983, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07077307999134064, + "rewards/margins": 1.4879200458526611, + "rewards/rejected": -1.417146921157837, + "step": 4990 + }, + { + "epoch": 0.58, + "learning_rate": 1.2933395762612666e-07, + "logits/chosen": -2.452582836151123, + "logits/rejected": -2.4943652153015137, + "logps/chosen": -332.3829650878906, + "logps/rejected": -163.5178985595703, + "loss": 0.4641, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2305031418800354, + "rewards/margins": 1.5296592712402344, + "rewards/rejected": -1.760162353515625, + "step": 4991 + }, + { + "epoch": 0.58, + "learning_rate": 1.292988411565024e-07, + "logits/chosen": -3.7815663814544678, + "logits/rejected": -3.917253017425537, + "logps/chosen": -150.0355224609375, + "logps/rejected": -224.300537109375, + "loss": 0.7931, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0316247940063477, + "rewards/margins": 1.3243701457977295, + "rewards/rejected": -2.355994939804077, + "step": 4992 + }, + { + "epoch": 0.58, + "learning_rate": 1.2926372468687814e-07, + "logits/chosen": -2.853198528289795, + "logits/rejected": -2.635854482650757, + "logps/chosen": -531.5020751953125, + "logps/rejected": -340.3063659667969, + "loss": 0.356, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23451195657253265, + "rewards/margins": 1.9588029384613037, + "rewards/rejected": -2.193315029144287, + "step": 4993 + }, + { + "epoch": 0.58, + "learning_rate": 1.2922860821725387e-07, + "logits/chosen": -2.662381172180176, + "logits/rejected": -2.8075764179229736, + "logps/chosen": -182.6499481201172, + "logps/rejected": -244.21336364746094, + "loss": 0.1157, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4673260450363159, + "rewards/margins": 3.4276375770568848, + "rewards/rejected": -2.9603118896484375, + "step": 4994 + }, + { + "epoch": 0.58, + "learning_rate": 1.2919349174762965e-07, + "logits/chosen": -3.3101654052734375, + "logits/rejected": -3.065042734146118, + "logps/chosen": -338.4549865722656, + "logps/rejected": -168.30606079101562, + "loss": 0.5179, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34526893496513367, + "rewards/margins": 0.9949976205825806, + "rewards/rejected": -1.340266466140747, + "step": 4995 + }, + { + "epoch": 0.58, + "learning_rate": 1.2915837527800538e-07, + "logits/chosen": -3.098053455352783, + "logits/rejected": -3.0035243034362793, + "logps/chosen": -287.24951171875, + "logps/rejected": -238.79983520507812, + "loss": 0.3705, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5066121220588684, + "rewards/margins": 1.2992783784866333, + "rewards/rejected": -1.805890440940857, + "step": 4996 + }, + { + "epoch": 0.58, + "learning_rate": 1.2912325880838113e-07, + "logits/chosen": -3.003601551055908, + "logits/rejected": -3.206362009048462, + "logps/chosen": -266.2685546875, + "logps/rejected": -226.38819885253906, + "loss": 0.23, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1355937123298645, + "rewards/margins": 1.8285493850708008, + "rewards/rejected": -1.6929556131362915, + "step": 4997 + }, + { + "epoch": 0.58, + "learning_rate": 1.2908814233875686e-07, + "logits/chosen": -2.7873265743255615, + "logits/rejected": -2.7481977939605713, + "logps/chosen": -217.50782775878906, + "logps/rejected": -217.24594116210938, + "loss": 0.554, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1744399070739746, + "rewards/margins": 1.4546840190887451, + "rewards/rejected": -1.6291239261627197, + "step": 4998 + }, + { + "epoch": 0.58, + "learning_rate": 1.2905302586913261e-07, + "logits/chosen": -3.1659250259399414, + "logits/rejected": -3.040562629699707, + "logps/chosen": -401.05560302734375, + "logps/rejected": -330.4620666503906, + "loss": 0.7975, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8107323050498962, + "rewards/margins": 0.6146043539047241, + "rewards/rejected": -1.4253365993499756, + "step": 4999 + }, + { + "epoch": 0.58, + "learning_rate": 1.2901790939950837e-07, + "logits/chosen": -3.0244407653808594, + "logits/rejected": -2.946230888366699, + "logps/chosen": -354.1905517578125, + "logps/rejected": -312.01837158203125, + "loss": 0.7752, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22785690426826477, + "rewards/margins": 0.7793235182762146, + "rewards/rejected": -1.0071804523468018, + "step": 5000 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.837320327758789, + "eval_logits/rejected": -2.79986572265625, + "eval_logps/chosen": -293.6067199707031, + "eval_logps/rejected": -237.07310485839844, + "eval_loss": 0.42721131443977356, + "eval_rewards/accuracies": 0.8142856955528259, + "eval_rewards/chosen": 0.044821999967098236, + "eval_rewards/margins": 1.3178679943084717, + "eval_rewards/rejected": -1.2730460166931152, + "eval_runtime": 32.6061, + "eval_samples_per_second": 2.147, + "eval_steps_per_second": 1.073, + "step": 5000 + }, + { + "epoch": 0.58, + "learning_rate": 1.2898279292988412e-07, + "logits/chosen": -3.196345567703247, + "logits/rejected": -3.465787410736084, + "logps/chosen": -172.88491821289062, + "logps/rejected": -211.34996032714844, + "loss": 0.2597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10982292890548706, + "rewards/margins": 1.9138081073760986, + "rewards/rejected": -2.0236310958862305, + "step": 5001 + }, + { + "epoch": 0.58, + "learning_rate": 1.2894767646025985e-07, + "logits/chosen": -2.886819362640381, + "logits/rejected": -2.9753036499023438, + "logps/chosen": -343.15582275390625, + "logps/rejected": -163.3333740234375, + "loss": 0.5452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5525258183479309, + "rewards/margins": 1.1655480861663818, + "rewards/rejected": -1.718073844909668, + "step": 5002 + }, + { + "epoch": 0.58, + "learning_rate": 1.289125599906356e-07, + "logits/chosen": -3.0178840160369873, + "logits/rejected": -3.0405256748199463, + "logps/chosen": -171.6208038330078, + "logps/rejected": -252.6309814453125, + "loss": 0.2355, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33177024126052856, + "rewards/margins": 2.49822998046875, + "rewards/rejected": -2.166459798812866, + "step": 5003 + }, + { + "epoch": 0.58, + "learning_rate": 1.2887744352101136e-07, + "logits/chosen": -3.2035675048828125, + "logits/rejected": -3.2850818634033203, + "logps/chosen": -147.7393798828125, + "logps/rejected": -265.4011535644531, + "loss": 0.2684, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2587752640247345, + "rewards/margins": 3.088487148284912, + "rewards/rejected": -2.8297119140625, + "step": 5004 + }, + { + "epoch": 0.58, + "learning_rate": 1.2884232705138709e-07, + "logits/chosen": -3.3158082962036133, + "logits/rejected": -3.358936309814453, + "logps/chosen": -437.8317565917969, + "logps/rejected": -259.3781433105469, + "loss": 0.3682, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5269705653190613, + "rewards/margins": 2.456503391265869, + "rewards/rejected": -1.929532766342163, + "step": 5005 + }, + { + "epoch": 0.58, + "learning_rate": 1.2880721058176284e-07, + "logits/chosen": -2.594208002090454, + "logits/rejected": -2.533041000366211, + "logps/chosen": -216.81845092773438, + "logps/rejected": -167.12594604492188, + "loss": 0.2341, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8146194219589233, + "rewards/margins": 2.0637366771698, + "rewards/rejected": -1.2491172552108765, + "step": 5006 + }, + { + "epoch": 0.58, + "learning_rate": 1.287720941121386e-07, + "logits/chosen": -2.6467738151550293, + "logits/rejected": -2.8627889156341553, + "logps/chosen": -343.0054016113281, + "logps/rejected": -226.44276428222656, + "loss": 0.5019, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07453954219818115, + "rewards/margins": 1.3627009391784668, + "rewards/rejected": -1.288161277770996, + "step": 5007 + }, + { + "epoch": 0.58, + "learning_rate": 1.2873697764251435e-07, + "logits/chosen": -3.085057020187378, + "logits/rejected": -3.11794376373291, + "logps/chosen": -385.4532165527344, + "logps/rejected": -305.91650390625, + "loss": 0.4043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20448081195354462, + "rewards/margins": 1.4547417163848877, + "rewards/rejected": -1.6592226028442383, + "step": 5008 + }, + { + "epoch": 0.58, + "learning_rate": 1.2870186117289007e-07, + "logits/chosen": -2.9463887214660645, + "logits/rejected": -3.1767072677612305, + "logps/chosen": -131.1376190185547, + "logps/rejected": -215.8402099609375, + "loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18317821621894836, + "rewards/margins": 2.2946653366088867, + "rewards/rejected": -2.4778435230255127, + "step": 5009 + }, + { + "epoch": 0.58, + "learning_rate": 1.2866674470326583e-07, + "logits/chosen": -3.422701597213745, + "logits/rejected": -3.350619077682495, + "logps/chosen": -360.34503173828125, + "logps/rejected": -301.46795654296875, + "loss": 0.2086, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22531238198280334, + "rewards/margins": 2.7226028442382812, + "rewards/rejected": -2.4972903728485107, + "step": 5010 + }, + { + "epoch": 0.58, + "learning_rate": 1.2863162823364156e-07, + "logits/chosen": -3.394075632095337, + "logits/rejected": -3.5729434490203857, + "logps/chosen": -297.84710693359375, + "logps/rejected": -318.5992736816406, + "loss": 0.2556, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0665550231933594, + "rewards/margins": 2.5473179817199707, + "rewards/rejected": -1.4807628393173218, + "step": 5011 + }, + { + "epoch": 0.58, + "learning_rate": 1.2859651176401734e-07, + "logits/chosen": -3.83728289604187, + "logits/rejected": -3.7644519805908203, + "logps/chosen": -274.34893798828125, + "logps/rejected": -223.5430145263672, + "loss": 0.1591, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6592786312103271, + "rewards/margins": 2.5636937618255615, + "rewards/rejected": -1.9044151306152344, + "step": 5012 + }, + { + "epoch": 0.58, + "learning_rate": 1.2856139529439306e-07, + "logits/chosen": -2.727691650390625, + "logits/rejected": -2.5819878578186035, + "logps/chosen": -257.12420654296875, + "logps/rejected": -235.3209991455078, + "loss": 0.4319, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24498219788074493, + "rewards/margins": 1.3439409732818604, + "rewards/rejected": -1.0989587306976318, + "step": 5013 + }, + { + "epoch": 0.58, + "learning_rate": 1.2852627882476882e-07, + "logits/chosen": -2.499825954437256, + "logits/rejected": -2.466615676879883, + "logps/chosen": -159.66748046875, + "logps/rejected": -204.96197509765625, + "loss": 0.2702, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10663923621177673, + "rewards/margins": 1.5640869140625, + "rewards/rejected": -1.4574477672576904, + "step": 5014 + }, + { + "epoch": 0.58, + "learning_rate": 1.2849116235514454e-07, + "logits/chosen": -3.636932611465454, + "logits/rejected": -3.9565200805664062, + "logps/chosen": -139.72499084472656, + "logps/rejected": -183.90576171875, + "loss": 0.3288, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17759914696216583, + "rewards/margins": 2.6581737995147705, + "rewards/rejected": -2.480574607849121, + "step": 5015 + }, + { + "epoch": 0.58, + "learning_rate": 1.284560458855203e-07, + "logits/chosen": -2.714207172393799, + "logits/rejected": -2.934499740600586, + "logps/chosen": -375.4806823730469, + "logps/rejected": -342.8158264160156, + "loss": 0.5292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19260017573833466, + "rewards/margins": 0.6767192482948303, + "rewards/rejected": -0.8693193793296814, + "step": 5016 + }, + { + "epoch": 0.58, + "learning_rate": 1.2842092941589605e-07, + "logits/chosen": -2.6220529079437256, + "logits/rejected": -2.760239601135254, + "logps/chosen": -262.85760498046875, + "logps/rejected": -237.30552673339844, + "loss": 0.502, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3737100064754486, + "rewards/margins": 0.5180554389953613, + "rewards/rejected": -0.8917654752731323, + "step": 5017 + }, + { + "epoch": 0.58, + "learning_rate": 1.283858129462718e-07, + "logits/chosen": -3.041126251220703, + "logits/rejected": -3.422842264175415, + "logps/chosen": -279.63323974609375, + "logps/rejected": -450.98590087890625, + "loss": 0.5608, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3456031382083893, + "rewards/margins": 0.9249062538146973, + "rewards/rejected": -1.2705093622207642, + "step": 5018 + }, + { + "epoch": 0.58, + "learning_rate": 1.2835069647664753e-07, + "logits/chosen": -3.0662806034088135, + "logits/rejected": -2.848104953765869, + "logps/chosen": -291.6426696777344, + "logps/rejected": -248.36431884765625, + "loss": 0.4162, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.00980764627456665, + "rewards/margins": 1.4155420064926147, + "rewards/rejected": -1.4253497123718262, + "step": 5019 + }, + { + "epoch": 0.58, + "learning_rate": 1.283155800070233e-07, + "logits/chosen": -2.800686836242676, + "logits/rejected": -2.7071354389190674, + "logps/chosen": -325.3921813964844, + "logps/rejected": -228.54530334472656, + "loss": 0.2382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6018565893173218, + "rewards/margins": 2.194550037384033, + "rewards/rejected": -1.5926933288574219, + "step": 5020 + }, + { + "epoch": 0.58, + "learning_rate": 1.2828046353739904e-07, + "logits/chosen": -2.2432100772857666, + "logits/rejected": -2.250680446624756, + "logps/chosen": -487.625244140625, + "logps/rejected": -343.1990051269531, + "loss": 0.2412, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3829605281352997, + "rewards/margins": 1.7036561965942383, + "rewards/rejected": -1.3206956386566162, + "step": 5021 + }, + { + "epoch": 0.58, + "learning_rate": 1.2824534706777477e-07, + "logits/chosen": -3.2176921367645264, + "logits/rejected": -3.1002306938171387, + "logps/chosen": -174.8712921142578, + "logps/rejected": -260.817138671875, + "loss": 0.4185, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18640320003032684, + "rewards/margins": 2.413351058959961, + "rewards/rejected": -2.226947784423828, + "step": 5022 + }, + { + "epoch": 0.58, + "learning_rate": 1.2821023059815052e-07, + "logits/chosen": -3.8915762901306152, + "logits/rejected": -3.7809152603149414, + "logps/chosen": -310.43829345703125, + "logps/rejected": -226.51119995117188, + "loss": 0.4476, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1441909670829773, + "rewards/margins": 1.0664031505584717, + "rewards/rejected": -0.9222121238708496, + "step": 5023 + }, + { + "epoch": 0.58, + "learning_rate": 1.2817511412852628e-07, + "logits/chosen": -3.3521389961242676, + "logits/rejected": -3.1616625785827637, + "logps/chosen": -225.028076171875, + "logps/rejected": -292.8263244628906, + "loss": 0.6162, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22215688228607178, + "rewards/margins": 1.2828539609909058, + "rewards/rejected": -1.5050108432769775, + "step": 5024 + }, + { + "epoch": 0.58, + "learning_rate": 1.2813999765890203e-07, + "logits/chosen": -2.570096015930176, + "logits/rejected": -2.611603260040283, + "logps/chosen": -580.6859130859375, + "logps/rejected": -350.2803955078125, + "loss": 0.2718, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7026288509368896, + "rewards/margins": 1.7139778137207031, + "rewards/rejected": -1.0113489627838135, + "step": 5025 + }, + { + "epoch": 0.58, + "learning_rate": 1.2810488118927776e-07, + "logits/chosen": -2.9622299671173096, + "logits/rejected": -2.8956992626190186, + "logps/chosen": -233.87106323242188, + "logps/rejected": -294.0300598144531, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5673831105232239, + "rewards/margins": 2.315446376800537, + "rewards/rejected": -2.882829427719116, + "step": 5026 + }, + { + "epoch": 0.58, + "learning_rate": 1.280697647196535e-07, + "logits/chosen": -2.5463812351226807, + "logits/rejected": -2.6340372562408447, + "logps/chosen": -441.39752197265625, + "logps/rejected": -286.4170227050781, + "loss": 0.2949, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41872280836105347, + "rewards/margins": 1.9148333072662354, + "rewards/rejected": -2.3335559368133545, + "step": 5027 + }, + { + "epoch": 0.58, + "learning_rate": 1.2803464825002926e-07, + "logits/chosen": -2.8648364543914795, + "logits/rejected": -3.0014097690582275, + "logps/chosen": -255.07516479492188, + "logps/rejected": -400.2191162109375, + "loss": 0.4424, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1512080281972885, + "rewards/margins": 2.0829834938049316, + "rewards/rejected": -1.9317755699157715, + "step": 5028 + }, + { + "epoch": 0.58, + "learning_rate": 1.2799953178040502e-07, + "logits/chosen": -3.2900381088256836, + "logits/rejected": -3.147045850753784, + "logps/chosen": -201.22671508789062, + "logps/rejected": -220.6361846923828, + "loss": 0.314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4388643205165863, + "rewards/margins": 1.5479562282562256, + "rewards/rejected": -1.9868204593658447, + "step": 5029 + }, + { + "epoch": 0.58, + "learning_rate": 1.2796441531078075e-07, + "logits/chosen": -3.60080885887146, + "logits/rejected": -3.733241081237793, + "logps/chosen": -240.65692138671875, + "logps/rejected": -255.76438903808594, + "loss": 0.2361, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4005383849143982, + "rewards/margins": 2.566586971282959, + "rewards/rejected": -2.166048526763916, + "step": 5030 + }, + { + "epoch": 0.58, + "learning_rate": 1.279292988411565e-07, + "logits/chosen": -3.122959852218628, + "logits/rejected": -3.2530298233032227, + "logps/chosen": -201.73202514648438, + "logps/rejected": -179.4698944091797, + "loss": 0.5353, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7513167858123779, + "rewards/margins": 1.6176173686981201, + "rewards/rejected": -2.368934154510498, + "step": 5031 + }, + { + "epoch": 0.58, + "learning_rate": 1.2789418237153225e-07, + "logits/chosen": -3.2062222957611084, + "logits/rejected": -3.208982229232788, + "logps/chosen": -123.01377868652344, + "logps/rejected": -133.63888549804688, + "loss": 0.2888, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1813463270664215, + "rewards/margins": 1.688930869102478, + "rewards/rejected": -1.8702771663665771, + "step": 5032 + }, + { + "epoch": 0.58, + "learning_rate": 1.2785906590190798e-07, + "logits/chosen": -3.83549427986145, + "logits/rejected": -3.7102231979370117, + "logps/chosen": -224.90518188476562, + "logps/rejected": -209.16561889648438, + "loss": 0.3337, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06444337964057922, + "rewards/margins": 1.5488038063049316, + "rewards/rejected": -1.4843604564666748, + "step": 5033 + }, + { + "epoch": 0.58, + "learning_rate": 1.2782394943228374e-07, + "logits/chosen": -3.3604860305786133, + "logits/rejected": -3.398679733276367, + "logps/chosen": -354.6896667480469, + "logps/rejected": -321.07269287109375, + "loss": 0.2954, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05834078788757324, + "rewards/margins": 2.6479005813598633, + "rewards/rejected": -2.58955979347229, + "step": 5034 + }, + { + "epoch": 0.58, + "learning_rate": 1.277888329626595e-07, + "logits/chosen": -2.6741292476654053, + "logits/rejected": -2.771068572998047, + "logps/chosen": -317.76690673828125, + "logps/rejected": -236.85256958007812, + "loss": 0.4228, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2403039038181305, + "rewards/margins": 1.4570119380950928, + "rewards/rejected": -1.6973159313201904, + "step": 5035 + }, + { + "epoch": 0.58, + "learning_rate": 1.2775371649303524e-07, + "logits/chosen": -3.3747763633728027, + "logits/rejected": -3.176621437072754, + "logps/chosen": -184.29248046875, + "logps/rejected": -220.4187469482422, + "loss": 0.2022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3587665557861328, + "rewards/margins": 2.018613815307617, + "rewards/rejected": -1.6598472595214844, + "step": 5036 + }, + { + "epoch": 0.58, + "learning_rate": 1.2771860002341097e-07, + "logits/chosen": -2.7813496589660645, + "logits/rejected": -2.5995054244995117, + "logps/chosen": -382.8707275390625, + "logps/rejected": -281.0979919433594, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25480368733406067, + "rewards/margins": 2.0126893520355225, + "rewards/rejected": -1.7578855752944946, + "step": 5037 + }, + { + "epoch": 0.58, + "learning_rate": 1.2768348355378672e-07, + "logits/chosen": -2.7973992824554443, + "logits/rejected": -3.121063709259033, + "logps/chosen": -276.9651794433594, + "logps/rejected": -197.33160400390625, + "loss": 0.4069, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38399460911750793, + "rewards/margins": 1.3929458856582642, + "rewards/rejected": -1.7769403457641602, + "step": 5038 + }, + { + "epoch": 0.58, + "learning_rate": 1.2764836708416245e-07, + "logits/chosen": -3.359433650970459, + "logits/rejected": -3.6237974166870117, + "logps/chosen": -163.86814880371094, + "logps/rejected": -303.46124267578125, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5372501015663147, + "rewards/margins": 3.1151843070983887, + "rewards/rejected": -2.5779342651367188, + "step": 5039 + }, + { + "epoch": 0.58, + "learning_rate": 1.2761325061453823e-07, + "logits/chosen": -3.3076095581054688, + "logits/rejected": -3.6929259300231934, + "logps/chosen": -141.2142333984375, + "logps/rejected": -267.79583740234375, + "loss": 0.2649, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1145787239074707, + "rewards/margins": 2.82140851020813, + "rewards/rejected": -2.706829786300659, + "step": 5040 + }, + { + "epoch": 0.58, + "learning_rate": 1.2757813414491396e-07, + "logits/chosen": -3.026960849761963, + "logits/rejected": -3.09399151802063, + "logps/chosen": -523.8560180664062, + "logps/rejected": -365.1963806152344, + "loss": 0.5293, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16027438640594482, + "rewards/margins": 0.6240905523300171, + "rewards/rejected": -0.7843649387359619, + "step": 5041 + }, + { + "epoch": 0.58, + "learning_rate": 1.275430176752897e-07, + "logits/chosen": -3.7586920261383057, + "logits/rejected": -3.2695913314819336, + "logps/chosen": -382.9727783203125, + "logps/rejected": -294.2950439453125, + "loss": 0.3162, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33849620819091797, + "rewards/margins": 1.2544217109680176, + "rewards/rejected": -0.9159255027770996, + "step": 5042 + }, + { + "epoch": 0.58, + "learning_rate": 1.2750790120566544e-07, + "logits/chosen": -3.217665910720825, + "logits/rejected": -3.3796916007995605, + "logps/chosen": -150.95706176757812, + "logps/rejected": -327.947021484375, + "loss": 0.2167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27542123198509216, + "rewards/margins": 2.630263328552246, + "rewards/rejected": -2.905684471130371, + "step": 5043 + }, + { + "epoch": 0.58, + "learning_rate": 1.274727847360412e-07, + "logits/chosen": -3.571077823638916, + "logits/rejected": -3.3481037616729736, + "logps/chosen": -203.13087463378906, + "logps/rejected": -212.49935913085938, + "loss": 0.997, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9830268025398254, + "rewards/margins": 1.4347411394119263, + "rewards/rejected": -2.4177680015563965, + "step": 5044 + }, + { + "epoch": 0.58, + "learning_rate": 1.2743766826641695e-07, + "logits/chosen": -2.69978666305542, + "logits/rejected": -3.1136157512664795, + "logps/chosen": -151.898193359375, + "logps/rejected": -242.90869140625, + "loss": 0.4703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1295502781867981, + "rewards/margins": 2.659722328186035, + "rewards/rejected": -2.7892727851867676, + "step": 5045 + }, + { + "epoch": 0.58, + "learning_rate": 1.274025517967927e-07, + "logits/chosen": -3.000504493713379, + "logits/rejected": -3.0767440795898438, + "logps/chosen": -298.09686279296875, + "logps/rejected": -349.932373046875, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13690738379955292, + "rewards/margins": 2.0991227626800537, + "rewards/rejected": -1.962215542793274, + "step": 5046 + }, + { + "epoch": 0.58, + "learning_rate": 1.2736743532716843e-07, + "logits/chosen": -2.9589009284973145, + "logits/rejected": -2.848604679107666, + "logps/chosen": -301.490478515625, + "logps/rejected": -225.5598602294922, + "loss": 0.6178, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4710807800292969, + "rewards/margins": 0.9879388213157654, + "rewards/rejected": -1.459019660949707, + "step": 5047 + }, + { + "epoch": 0.58, + "learning_rate": 1.2733231885754418e-07, + "logits/chosen": -3.125692844390869, + "logits/rejected": -2.695746898651123, + "logps/chosen": -283.38897705078125, + "logps/rejected": -255.02989196777344, + "loss": 0.0955, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3765089511871338, + "rewards/margins": 3.6900973320007324, + "rewards/rejected": -3.3135886192321777, + "step": 5048 + }, + { + "epoch": 0.58, + "learning_rate": 1.2729720238791994e-07, + "logits/chosen": -3.4725213050842285, + "logits/rejected": -3.8035953044891357, + "logps/chosen": -241.58206176757812, + "logps/rejected": -209.90032958984375, + "loss": 0.3333, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6175392270088196, + "rewards/margins": 2.5831849575042725, + "rewards/rejected": -1.9656457901000977, + "step": 5049 + }, + { + "epoch": 0.58, + "learning_rate": 1.2726208591829566e-07, + "logits/chosen": -3.6497132778167725, + "logits/rejected": -3.5948026180267334, + "logps/chosen": -180.8606414794922, + "logps/rejected": -144.51934814453125, + "loss": 0.4962, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27254343032836914, + "rewards/margins": 1.9792249202728271, + "rewards/rejected": -2.2517683506011963, + "step": 5050 + }, + { + "epoch": 0.58, + "learning_rate": 1.2722696944867142e-07, + "logits/chosen": -3.6335620880126953, + "logits/rejected": -3.6673502922058105, + "logps/chosen": -157.08267211914062, + "logps/rejected": -185.21987915039062, + "loss": 0.3749, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2823788821697235, + "rewards/margins": 2.3970425128936768, + "rewards/rejected": -2.114663600921631, + "step": 5051 + }, + { + "epoch": 0.58, + "learning_rate": 1.2719185297904717e-07, + "logits/chosen": -3.716050863265991, + "logits/rejected": -3.6318862438201904, + "logps/chosen": -276.47503662109375, + "logps/rejected": -182.00587463378906, + "loss": 0.4966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43334800004959106, + "rewards/margins": 0.7634434700012207, + "rewards/rejected": -1.196791410446167, + "step": 5052 + }, + { + "epoch": 0.58, + "learning_rate": 1.2715673650942293e-07, + "logits/chosen": -4.0758585929870605, + "logits/rejected": -3.609835624694824, + "logps/chosen": -272.31866455078125, + "logps/rejected": -243.53866577148438, + "loss": 0.2366, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2793155908584595, + "rewards/margins": 2.5818521976470947, + "rewards/rejected": -2.3025364875793457, + "step": 5053 + }, + { + "epoch": 0.58, + "learning_rate": 1.2712162003979865e-07, + "logits/chosen": -3.664198398590088, + "logits/rejected": -3.5196340084075928, + "logps/chosen": -253.3223876953125, + "logps/rejected": -179.81265258789062, + "loss": 0.7124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1484735608100891, + "rewards/margins": 1.009164571762085, + "rewards/rejected": -1.1576380729675293, + "step": 5054 + }, + { + "epoch": 0.58, + "learning_rate": 1.270865035701744e-07, + "logits/chosen": -3.508932590484619, + "logits/rejected": -3.3474268913269043, + "logps/chosen": -380.0367736816406, + "logps/rejected": -314.1165466308594, + "loss": 0.377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14284448325634003, + "rewards/margins": 2.476856231689453, + "rewards/rejected": -2.6197004318237305, + "step": 5055 + }, + { + "epoch": 0.58, + "learning_rate": 1.2705138710055013e-07, + "logits/chosen": -3.9125895500183105, + "logits/rejected": -3.9237680435180664, + "logps/chosen": -184.95849609375, + "logps/rejected": -173.5776824951172, + "loss": 0.616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46787717938423157, + "rewards/margins": 0.4966202974319458, + "rewards/rejected": -0.964497447013855, + "step": 5056 + }, + { + "epoch": 0.58, + "learning_rate": 1.2701627063092591e-07, + "logits/chosen": -3.0725293159484863, + "logits/rejected": -2.801165819168091, + "logps/chosen": -288.439208984375, + "logps/rejected": -215.64703369140625, + "loss": 0.6249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3049061894416809, + "rewards/margins": 1.1041843891143799, + "rewards/rejected": -1.4090906381607056, + "step": 5057 + }, + { + "epoch": 0.58, + "learning_rate": 1.2698115416130164e-07, + "logits/chosen": -3.1248087882995605, + "logits/rejected": -3.2253081798553467, + "logps/chosen": -119.39644622802734, + "logps/rejected": -144.3485107421875, + "loss": 0.4683, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3015996217727661, + "rewards/margins": 2.0588626861572266, + "rewards/rejected": -2.3604624271392822, + "step": 5058 + }, + { + "epoch": 0.58, + "learning_rate": 1.269460376916774e-07, + "logits/chosen": -2.791971445083618, + "logits/rejected": -2.7963523864746094, + "logps/chosen": -323.49853515625, + "logps/rejected": -192.822021484375, + "loss": 0.4968, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6141326427459717, + "rewards/margins": 1.7042126655578613, + "rewards/rejected": -1.0900801420211792, + "step": 5059 + }, + { + "epoch": 0.58, + "learning_rate": 1.2691092122205312e-07, + "logits/chosen": -3.592034101486206, + "logits/rejected": -3.8011045455932617, + "logps/chosen": -236.0926513671875, + "logps/rejected": -362.00421142578125, + "loss": 0.1288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5348492860794067, + "rewards/margins": 2.520763397216797, + "rewards/rejected": -3.055612564086914, + "step": 5060 + }, + { + "epoch": 0.58, + "learning_rate": 1.2687580475242888e-07, + "logits/chosen": -2.9570603370666504, + "logits/rejected": -2.8587281703948975, + "logps/chosen": -179.15188598632812, + "logps/rejected": -219.90992736816406, + "loss": 0.439, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27063390612602234, + "rewards/margins": 2.301243543624878, + "rewards/rejected": -2.030609607696533, + "step": 5061 + }, + { + "epoch": 0.58, + "learning_rate": 1.2684068828280463e-07, + "logits/chosen": -3.278244972229004, + "logits/rejected": -3.2030293941497803, + "logps/chosen": -170.67337036132812, + "logps/rejected": -204.93435668945312, + "loss": 0.4728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.530208945274353, + "rewards/margins": 1.1478404998779297, + "rewards/rejected": -1.6780494451522827, + "step": 5062 + }, + { + "epoch": 0.58, + "learning_rate": 1.2680557181318039e-07, + "logits/chosen": -3.3655543327331543, + "logits/rejected": -3.226430892944336, + "logps/chosen": -276.15582275390625, + "logps/rejected": -237.97308349609375, + "loss": 0.3083, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12420712411403656, + "rewards/margins": 1.7512704133987427, + "rewards/rejected": -1.627063274383545, + "step": 5063 + }, + { + "epoch": 0.58, + "learning_rate": 1.267704553435561e-07, + "logits/chosen": -3.6587796211242676, + "logits/rejected": -3.214343786239624, + "logps/chosen": -201.4375762939453, + "logps/rejected": -124.67745208740234, + "loss": 0.2668, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1228506565093994, + "rewards/margins": 1.8152803182601929, + "rewards/rejected": -0.6924296617507935, + "step": 5064 + }, + { + "epoch": 0.58, + "learning_rate": 1.2673533887393187e-07, + "logits/chosen": -2.635091781616211, + "logits/rejected": -2.7422800064086914, + "logps/chosen": -413.92254638671875, + "logps/rejected": -427.0927429199219, + "loss": 0.4357, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.266759991645813, + "rewards/margins": 1.4970612525939941, + "rewards/rejected": -1.2303013801574707, + "step": 5065 + }, + { + "epoch": 0.58, + "learning_rate": 1.2670022240430762e-07, + "logits/chosen": -2.5339131355285645, + "logits/rejected": -2.4504597187042236, + "logps/chosen": -274.673583984375, + "logps/rejected": -346.19049072265625, + "loss": 0.5285, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20619097352027893, + "rewards/margins": 1.6395108699798584, + "rewards/rejected": -1.8457016944885254, + "step": 5066 + }, + { + "epoch": 0.58, + "learning_rate": 1.2666510593468335e-07, + "logits/chosen": -2.6928839683532715, + "logits/rejected": -2.690260171890259, + "logps/chosen": -432.06317138671875, + "logps/rejected": -352.77777099609375, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36328935623168945, + "rewards/margins": 4.899564266204834, + "rewards/rejected": -4.5362749099731445, + "step": 5067 + }, + { + "epoch": 0.58, + "learning_rate": 1.266299894650591e-07, + "logits/chosen": -2.850738286972046, + "logits/rejected": -2.9828696250915527, + "logps/chosen": -520.4832153320312, + "logps/rejected": -397.86468505859375, + "loss": 0.3542, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15189455449581146, + "rewards/margins": 2.4175562858581543, + "rewards/rejected": -2.569450616836548, + "step": 5068 + }, + { + "epoch": 0.58, + "learning_rate": 1.2659487299543486e-07, + "logits/chosen": -3.745903491973877, + "logits/rejected": -3.9414281845092773, + "logps/chosen": -240.80059814453125, + "logps/rejected": -315.74066162109375, + "loss": 0.4399, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17542952299118042, + "rewards/margins": 2.731564998626709, + "rewards/rejected": -2.906994342803955, + "step": 5069 + }, + { + "epoch": 0.58, + "learning_rate": 1.265597565258106e-07, + "logits/chosen": -2.5919289588928223, + "logits/rejected": -3.0560688972473145, + "logps/chosen": -247.66384887695312, + "logps/rejected": -288.96783447265625, + "loss": 0.3724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14901769161224365, + "rewards/margins": 1.90613853931427, + "rewards/rejected": -2.0551562309265137, + "step": 5070 + }, + { + "epoch": 0.58, + "learning_rate": 1.2652464005618634e-07, + "logits/chosen": -2.4298906326293945, + "logits/rejected": -2.5612735748291016, + "logps/chosen": -296.75848388671875, + "logps/rejected": -280.7152099609375, + "loss": 0.6773, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4941185712814331, + "rewards/margins": 1.4057281017303467, + "rewards/rejected": -1.8998465538024902, + "step": 5071 + }, + { + "epoch": 0.58, + "learning_rate": 1.264895235865621e-07, + "logits/chosen": -2.7841343879699707, + "logits/rejected": -2.810114860534668, + "logps/chosen": -172.5916290283203, + "logps/rejected": -232.9892120361328, + "loss": 0.4604, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2140403836965561, + "rewards/margins": 2.355552911758423, + "rewards/rejected": -2.141512393951416, + "step": 5072 + }, + { + "epoch": 0.58, + "learning_rate": 1.2645440711693784e-07, + "logits/chosen": -2.7614309787750244, + "logits/rejected": -2.7017533779144287, + "logps/chosen": -236.35214233398438, + "logps/rejected": -265.38330078125, + "loss": 0.4667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3127371072769165, + "rewards/margins": 1.2616267204284668, + "rewards/rejected": -1.5743638277053833, + "step": 5073 + }, + { + "epoch": 0.58, + "learning_rate": 1.264192906473136e-07, + "logits/chosen": -3.6879162788391113, + "logits/rejected": -3.875938653945923, + "logps/chosen": -343.4891357421875, + "logps/rejected": -337.6551208496094, + "loss": 0.592, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19598700106143951, + "rewards/margins": 1.043034553527832, + "rewards/rejected": -1.2390215396881104, + "step": 5074 + }, + { + "epoch": 0.59, + "learning_rate": 1.2638417417768933e-07, + "logits/chosen": -3.291682720184326, + "logits/rejected": -3.2164254188537598, + "logps/chosen": -271.5522766113281, + "logps/rejected": -161.20501708984375, + "loss": 0.3629, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16440042853355408, + "rewards/margins": 1.5468573570251465, + "rewards/rejected": -1.3824567794799805, + "step": 5075 + }, + { + "epoch": 0.59, + "learning_rate": 1.2634905770806508e-07, + "logits/chosen": -2.965287446975708, + "logits/rejected": -3.394080400466919, + "logps/chosen": -278.1961364746094, + "logps/rejected": -269.2933349609375, + "loss": 0.4719, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5391241908073425, + "rewards/margins": 1.36617112159729, + "rewards/rejected": -1.9052952527999878, + "step": 5076 + }, + { + "epoch": 0.59, + "learning_rate": 1.2631394123844083e-07, + "logits/chosen": -2.8547449111938477, + "logits/rejected": -3.0556201934814453, + "logps/chosen": -305.9022216796875, + "logps/rejected": -224.9374237060547, + "loss": 0.1149, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6179894804954529, + "rewards/margins": 3.270540237426758, + "rewards/rejected": -2.6525509357452393, + "step": 5077 + }, + { + "epoch": 0.59, + "learning_rate": 1.2627882476881656e-07, + "logits/chosen": -3.004833698272705, + "logits/rejected": -2.725419759750366, + "logps/chosen": -198.72137451171875, + "logps/rejected": -232.59539794921875, + "loss": 0.349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2596345543861389, + "rewards/margins": 2.522596597671509, + "rewards/rejected": -2.782231330871582, + "step": 5078 + }, + { + "epoch": 0.59, + "learning_rate": 1.2624370829919231e-07, + "logits/chosen": -2.3458752632141113, + "logits/rejected": -2.365662097930908, + "logps/chosen": -251.01634216308594, + "logps/rejected": -245.94439697265625, + "loss": 0.5736, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36321520805358887, + "rewards/margins": 0.9257047176361084, + "rewards/rejected": -1.2889199256896973, + "step": 5079 + }, + { + "epoch": 0.59, + "learning_rate": 1.2620859182956807e-07, + "logits/chosen": -2.771125316619873, + "logits/rejected": -2.6224145889282227, + "logps/chosen": -395.3077087402344, + "logps/rejected": -211.20298767089844, + "loss": 0.5588, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6337928771972656, + "rewards/margins": 0.8307833671569824, + "rewards/rejected": -1.4645761251449585, + "step": 5080 + }, + { + "epoch": 0.59, + "learning_rate": 1.2617347535994382e-07, + "logits/chosen": -2.65049409866333, + "logits/rejected": -2.061173439025879, + "logps/chosen": -174.20001220703125, + "logps/rejected": -254.60610961914062, + "loss": 0.3058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05493832379579544, + "rewards/margins": 2.1030774116516113, + "rewards/rejected": -2.158015727996826, + "step": 5081 + }, + { + "epoch": 0.59, + "learning_rate": 1.2613835889031955e-07, + "logits/chosen": -3.1570653915405273, + "logits/rejected": -3.0731801986694336, + "logps/chosen": -245.9629669189453, + "logps/rejected": -196.310546875, + "loss": 0.4268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49099090695381165, + "rewards/margins": 0.9770956039428711, + "rewards/rejected": -1.4680863618850708, + "step": 5082 + }, + { + "epoch": 0.59, + "learning_rate": 1.261032424206953e-07, + "logits/chosen": -3.55169939994812, + "logits/rejected": -3.4565110206604004, + "logps/chosen": -257.9891357421875, + "logps/rejected": -342.3607482910156, + "loss": 0.7543, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1295592784881592, + "rewards/margins": 0.920102596282959, + "rewards/rejected": -2.049661874771118, + "step": 5083 + }, + { + "epoch": 0.59, + "learning_rate": 1.2606812595107103e-07, + "logits/chosen": -2.961394786834717, + "logits/rejected": -3.238657236099243, + "logps/chosen": -219.01072692871094, + "logps/rejected": -302.8183288574219, + "loss": 0.5823, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5494468212127686, + "rewards/margins": 1.3739678859710693, + "rewards/rejected": -1.923414707183838, + "step": 5084 + }, + { + "epoch": 0.59, + "learning_rate": 1.260330094814468e-07, + "logits/chosen": -3.379826784133911, + "logits/rejected": -3.6602985858917236, + "logps/chosen": -189.70741271972656, + "logps/rejected": -310.4198913574219, + "loss": 0.2593, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21904106438159943, + "rewards/margins": 3.63997745513916, + "rewards/rejected": -3.420936107635498, + "step": 5085 + }, + { + "epoch": 0.59, + "learning_rate": 1.2599789301182254e-07, + "logits/chosen": -3.975299119949341, + "logits/rejected": -3.833606243133545, + "logps/chosen": -147.48446655273438, + "logps/rejected": -171.4473876953125, + "loss": 0.2761, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4759521484375, + "rewards/margins": 1.5857096910476685, + "rewards/rejected": -1.1097575426101685, + "step": 5086 + }, + { + "epoch": 0.59, + "learning_rate": 1.259627765421983e-07, + "logits/chosen": -2.3709487915039062, + "logits/rejected": -2.256443500518799, + "logps/chosen": -365.72991943359375, + "logps/rejected": -353.644287109375, + "loss": 0.6442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41030019521713257, + "rewards/margins": 0.9880859851837158, + "rewards/rejected": -1.3983861207962036, + "step": 5087 + }, + { + "epoch": 0.59, + "learning_rate": 1.2592766007257402e-07, + "logits/chosen": -3.2282509803771973, + "logits/rejected": -3.366941213607788, + "logps/chosen": -207.70994567871094, + "logps/rejected": -237.38648986816406, + "loss": 0.4184, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.054413825273513794, + "rewards/margins": 1.370856523513794, + "rewards/rejected": -1.4252703189849854, + "step": 5088 + }, + { + "epoch": 0.59, + "learning_rate": 1.2589254360294977e-07, + "logits/chosen": -3.161414623260498, + "logits/rejected": -2.958066463470459, + "logps/chosen": -234.46304321289062, + "logps/rejected": -158.98114013671875, + "loss": 0.6888, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9040212631225586, + "rewards/margins": 0.9930480718612671, + "rewards/rejected": -1.8970693349838257, + "step": 5089 + }, + { + "epoch": 0.59, + "learning_rate": 1.2585742713332553e-07, + "logits/chosen": -3.3154497146606445, + "logits/rejected": -3.4296998977661133, + "logps/chosen": -226.7612762451172, + "logps/rejected": -134.78831481933594, + "loss": 0.8794, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0726767778396606, + "rewards/margins": -0.13235639035701752, + "rewards/rejected": -0.9403204917907715, + "step": 5090 + }, + { + "epoch": 0.59, + "learning_rate": 1.2582231066370128e-07, + "logits/chosen": -2.833193063735962, + "logits/rejected": -3.1983954906463623, + "logps/chosen": -358.6549072265625, + "logps/rejected": -249.66159057617188, + "loss": 0.6428, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1164134293794632, + "rewards/margins": 0.7592272162437439, + "rewards/rejected": -0.6428138017654419, + "step": 5091 + }, + { + "epoch": 0.59, + "learning_rate": 1.25787194194077e-07, + "logits/chosen": -3.092123031616211, + "logits/rejected": -3.229050636291504, + "logps/chosen": -357.05364990234375, + "logps/rejected": -215.02664184570312, + "loss": 0.4016, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24447083473205566, + "rewards/margins": 1.3640906810760498, + "rewards/rejected": -1.1196197271347046, + "step": 5092 + }, + { + "epoch": 0.59, + "learning_rate": 1.2575207772445276e-07, + "logits/chosen": -3.471397638320923, + "logits/rejected": -3.2719600200653076, + "logps/chosen": -243.06199645996094, + "logps/rejected": -345.8392333984375, + "loss": 0.6261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28838104009628296, + "rewards/margins": 0.6559892892837524, + "rewards/rejected": -0.9443702697753906, + "step": 5093 + }, + { + "epoch": 0.59, + "learning_rate": 1.2571696125482852e-07, + "logits/chosen": -3.005966901779175, + "logits/rejected": -2.860386610031128, + "logps/chosen": -125.33768463134766, + "logps/rejected": -216.98428344726562, + "loss": 0.4373, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10151642560958862, + "rewards/margins": 1.3632721900939941, + "rewards/rejected": -1.4647884368896484, + "step": 5094 + }, + { + "epoch": 0.59, + "learning_rate": 1.2568184478520424e-07, + "logits/chosen": -3.4604074954986572, + "logits/rejected": -2.967810869216919, + "logps/chosen": -223.649169921875, + "logps/rejected": -155.54026794433594, + "loss": 0.2758, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35581523180007935, + "rewards/margins": 1.7625510692596436, + "rewards/rejected": -1.4067357778549194, + "step": 5095 + }, + { + "epoch": 0.59, + "learning_rate": 1.2564672831558e-07, + "logits/chosen": -3.1967105865478516, + "logits/rejected": -3.034351348876953, + "logps/chosen": -281.3988952636719, + "logps/rejected": -268.4374084472656, + "loss": 0.2854, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23806801438331604, + "rewards/margins": 2.042045831680298, + "rewards/rejected": -1.8039780855178833, + "step": 5096 + }, + { + "epoch": 0.59, + "learning_rate": 1.2561161184595575e-07, + "logits/chosen": -3.43220591545105, + "logits/rejected": -3.0261197090148926, + "logps/chosen": -249.7388916015625, + "logps/rejected": -232.57327270507812, + "loss": 0.247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6207653284072876, + "rewards/margins": 2.2236382961273193, + "rewards/rejected": -1.6028728485107422, + "step": 5097 + }, + { + "epoch": 0.59, + "learning_rate": 1.255764953763315e-07, + "logits/chosen": -3.4867067337036133, + "logits/rejected": -3.6398983001708984, + "logps/chosen": -233.5177001953125, + "logps/rejected": -320.90240478515625, + "loss": 0.3254, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21252700686454773, + "rewards/margins": 3.3579001426696777, + "rewards/rejected": -3.570427417755127, + "step": 5098 + }, + { + "epoch": 0.59, + "learning_rate": 1.2554137890670723e-07, + "logits/chosen": -3.1799564361572266, + "logits/rejected": -3.0686426162719727, + "logps/chosen": -224.99356079101562, + "logps/rejected": -191.45594787597656, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02809259295463562, + "rewards/margins": 1.8410484790802002, + "rewards/rejected": -1.8691411018371582, + "step": 5099 + }, + { + "epoch": 0.59, + "learning_rate": 1.2550626243708299e-07, + "logits/chosen": -3.4630966186523438, + "logits/rejected": -3.6557281017303467, + "logps/chosen": -282.0107421875, + "logps/rejected": -252.32705688476562, + "loss": 0.2895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18214941024780273, + "rewards/margins": 2.1229639053344727, + "rewards/rejected": -2.3051135540008545, + "step": 5100 + }, + { + "epoch": 0.59, + "learning_rate": 1.2547114596745871e-07, + "logits/chosen": -2.3197498321533203, + "logits/rejected": -2.5977914333343506, + "logps/chosen": -365.92022705078125, + "logps/rejected": -286.7470703125, + "loss": 0.3031, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40431421995162964, + "rewards/margins": 1.635265588760376, + "rewards/rejected": -2.0395798683166504, + "step": 5101 + }, + { + "epoch": 0.59, + "learning_rate": 1.254360294978345e-07, + "logits/chosen": -3.2668089866638184, + "logits/rejected": -3.0273594856262207, + "logps/chosen": -168.8604736328125, + "logps/rejected": -205.32867431640625, + "loss": 0.7513, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.614285945892334, + "rewards/margins": 0.31139981746673584, + "rewards/rejected": -0.9256857633590698, + "step": 5102 + }, + { + "epoch": 0.59, + "learning_rate": 1.2540091302821022e-07, + "logits/chosen": -3.794553518295288, + "logits/rejected": -3.7923269271850586, + "logps/chosen": -165.3007049560547, + "logps/rejected": -200.82066345214844, + "loss": 0.2393, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39332714676856995, + "rewards/margins": 2.1092162132263184, + "rewards/rejected": -1.7158890962600708, + "step": 5103 + }, + { + "epoch": 0.59, + "learning_rate": 1.2536579655858598e-07, + "logits/chosen": -3.2049660682678223, + "logits/rejected": -3.161163330078125, + "logps/chosen": -248.8014373779297, + "logps/rejected": -375.4140625, + "loss": 0.532, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07316935807466507, + "rewards/margins": 1.8015270233154297, + "rewards/rejected": -1.7283576726913452, + "step": 5104 + }, + { + "epoch": 0.59, + "learning_rate": 1.253306800889617e-07, + "logits/chosen": -2.981464385986328, + "logits/rejected": -2.7328383922576904, + "logps/chosen": -418.39544677734375, + "logps/rejected": -282.54193115234375, + "loss": 0.7656, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5402126908302307, + "rewards/margins": 0.11329154670238495, + "rewards/rejected": -0.6535042524337769, + "step": 5105 + }, + { + "epoch": 0.59, + "learning_rate": 1.2529556361933746e-07, + "logits/chosen": -2.722846508026123, + "logits/rejected": -3.145650863647461, + "logps/chosen": -397.30169677734375, + "logps/rejected": -251.1290283203125, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5785143375396729, + "rewards/margins": 2.051023244857788, + "rewards/rejected": -1.4725090265274048, + "step": 5106 + }, + { + "epoch": 0.59, + "learning_rate": 1.252604471497132e-07, + "logits/chosen": -2.6955244541168213, + "logits/rejected": -3.220076084136963, + "logps/chosen": -172.3206787109375, + "logps/rejected": -182.2061767578125, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11834937334060669, + "rewards/margins": 2.2305028438568115, + "rewards/rejected": -2.1121532917022705, + "step": 5107 + }, + { + "epoch": 0.59, + "learning_rate": 1.2522533068008896e-07, + "logits/chosen": -3.074620246887207, + "logits/rejected": -2.980717182159424, + "logps/chosen": -319.91424560546875, + "logps/rejected": -327.15728759765625, + "loss": 0.4391, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4174458384513855, + "rewards/margins": 1.3168244361877441, + "rewards/rejected": -0.8993785977363586, + "step": 5108 + }, + { + "epoch": 0.59, + "learning_rate": 1.251902142104647e-07, + "logits/chosen": -3.1302223205566406, + "logits/rejected": -3.093503475189209, + "logps/chosen": -303.3023376464844, + "logps/rejected": -290.64898681640625, + "loss": 0.5396, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5212228298187256, + "rewards/margins": 0.6032912135124207, + "rewards/rejected": -1.1245139837265015, + "step": 5109 + }, + { + "epoch": 0.59, + "learning_rate": 1.2515509774084045e-07, + "logits/chosen": -2.592440128326416, + "logits/rejected": -2.6826672554016113, + "logps/chosen": -223.41751098632812, + "logps/rejected": -270.8307800292969, + "loss": 0.2948, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6458752155303955, + "rewards/margins": 2.5807454586029053, + "rewards/rejected": -3.22662091255188, + "step": 5110 + }, + { + "epoch": 0.59, + "learning_rate": 1.251199812712162e-07, + "logits/chosen": -2.7077412605285645, + "logits/rejected": -2.7753868103027344, + "logps/chosen": -276.3486022949219, + "logps/rejected": -285.5839538574219, + "loss": 0.3592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10496076941490173, + "rewards/margins": 1.5069990158081055, + "rewards/rejected": -1.6119598150253296, + "step": 5111 + }, + { + "epoch": 0.59, + "learning_rate": 1.2508486480159193e-07, + "logits/chosen": -3.3332247734069824, + "logits/rejected": -3.0399951934814453, + "logps/chosen": -233.8594512939453, + "logps/rejected": -191.0212860107422, + "loss": 0.2969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2931063771247864, + "rewards/margins": 2.3192198276519775, + "rewards/rejected": -2.612326145172119, + "step": 5112 + }, + { + "epoch": 0.59, + "learning_rate": 1.2504974833196768e-07, + "logits/chosen": -2.8460159301757812, + "logits/rejected": -2.6648716926574707, + "logps/chosen": -316.8892822265625, + "logps/rejected": -206.99224853515625, + "loss": 0.3231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04325654357671738, + "rewards/margins": 1.5111477375030518, + "rewards/rejected": -1.5544042587280273, + "step": 5113 + }, + { + "epoch": 0.59, + "learning_rate": 1.2501463186234343e-07, + "logits/chosen": -3.2239773273468018, + "logits/rejected": -2.935886859893799, + "logps/chosen": -262.8858337402344, + "logps/rejected": -238.65518188476562, + "loss": 0.4349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06051987409591675, + "rewards/margins": 1.341530442237854, + "rewards/rejected": -1.402050256729126, + "step": 5114 + }, + { + "epoch": 0.59, + "learning_rate": 1.249795153927192e-07, + "logits/chosen": -3.9414167404174805, + "logits/rejected": -3.9088540077209473, + "logps/chosen": -262.698486328125, + "logps/rejected": -351.52734375, + "loss": 0.3499, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0574973821640015, + "rewards/margins": 2.0938568115234375, + "rewards/rejected": -3.1513540744781494, + "step": 5115 + }, + { + "epoch": 0.59, + "learning_rate": 1.2494439892309492e-07, + "logits/chosen": -2.84096097946167, + "logits/rejected": -3.148622989654541, + "logps/chosen": -284.6800231933594, + "logps/rejected": -286.5943298339844, + "loss": 0.3847, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03002943843603134, + "rewards/margins": 1.3376684188842773, + "rewards/rejected": -1.3676979541778564, + "step": 5116 + }, + { + "epoch": 0.59, + "learning_rate": 1.2490928245347067e-07, + "logits/chosen": -2.510728120803833, + "logits/rejected": -2.775230884552002, + "logps/chosen": -276.8660888671875, + "logps/rejected": -182.60079956054688, + "loss": 0.6986, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5546553134918213, + "rewards/margins": 0.4451044201850891, + "rewards/rejected": -0.9997596740722656, + "step": 5117 + }, + { + "epoch": 0.59, + "learning_rate": 1.2487416598384642e-07, + "logits/chosen": -3.403625726699829, + "logits/rejected": -3.6277008056640625, + "logps/chosen": -228.04000854492188, + "logps/rejected": -214.87258911132812, + "loss": 0.1237, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43365222215652466, + "rewards/margins": 3.4498209953308105, + "rewards/rejected": -3.0161688327789307, + "step": 5118 + }, + { + "epoch": 0.59, + "learning_rate": 1.2483904951422218e-07, + "logits/chosen": -3.393980026245117, + "logits/rejected": -3.198084831237793, + "logps/chosen": -311.3647155761719, + "logps/rejected": -211.47418212890625, + "loss": 0.3923, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0815800279378891, + "rewards/margins": 0.7866491675376892, + "rewards/rejected": -0.8682292103767395, + "step": 5119 + }, + { + "epoch": 0.59, + "learning_rate": 1.248039330445979e-07, + "logits/chosen": -2.5145881175994873, + "logits/rejected": -2.6762452125549316, + "logps/chosen": -213.59185791015625, + "logps/rejected": -144.87527465820312, + "loss": 0.3309, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22275319695472717, + "rewards/margins": 1.267820119857788, + "rewards/rejected": -1.0450668334960938, + "step": 5120 + }, + { + "epoch": 0.59, + "learning_rate": 1.2476881657497366e-07, + "logits/chosen": -2.9267120361328125, + "logits/rejected": -2.855530261993408, + "logps/chosen": -137.56900024414062, + "logps/rejected": -231.34202575683594, + "loss": 0.2916, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16564035415649414, + "rewards/margins": 2.381357192993164, + "rewards/rejected": -2.21571683883667, + "step": 5121 + }, + { + "epoch": 0.59, + "learning_rate": 1.247337001053494e-07, + "logits/chosen": -3.0637199878692627, + "logits/rejected": -3.181149482727051, + "logps/chosen": -273.14910888671875, + "logps/rejected": -292.7840576171875, + "loss": 0.4791, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25485843420028687, + "rewards/margins": 1.5640875101089478, + "rewards/rejected": -1.8189458847045898, + "step": 5122 + }, + { + "epoch": 0.59, + "learning_rate": 1.2469858363572514e-07, + "logits/chosen": -2.9009571075439453, + "logits/rejected": -2.716930389404297, + "logps/chosen": -408.74090576171875, + "logps/rejected": -416.535400390625, + "loss": 0.6914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1782316118478775, + "rewards/margins": 1.0850670337677002, + "rewards/rejected": -1.2632986307144165, + "step": 5123 + }, + { + "epoch": 0.59, + "learning_rate": 1.246634671661009e-07, + "logits/chosen": -3.664851427078247, + "logits/rejected": -4.0036187171936035, + "logps/chosen": -102.59632873535156, + "logps/rejected": -219.96957397460938, + "loss": 0.3498, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08753354847431183, + "rewards/margins": 1.4200209379196167, + "rewards/rejected": -1.3324873447418213, + "step": 5124 + }, + { + "epoch": 0.59, + "learning_rate": 1.2462835069647665e-07, + "logits/chosen": -3.18683123588562, + "logits/rejected": -3.395913600921631, + "logps/chosen": -298.7718811035156, + "logps/rejected": -251.67849731445312, + "loss": 0.2347, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7214133143424988, + "rewards/margins": 3.402377128601074, + "rewards/rejected": -2.6809637546539307, + "step": 5125 + }, + { + "epoch": 0.59, + "learning_rate": 1.245932342268524e-07, + "logits/chosen": -2.568044424057007, + "logits/rejected": -2.509856700897217, + "logps/chosen": -209.97274780273438, + "logps/rejected": -280.86871337890625, + "loss": 0.3376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16808778047561646, + "rewards/margins": 1.1652448177337646, + "rewards/rejected": -1.3333325386047363, + "step": 5126 + }, + { + "epoch": 0.59, + "learning_rate": 1.2455811775722813e-07, + "logits/chosen": -3.292430877685547, + "logits/rejected": -3.012542963027954, + "logps/chosen": -285.42822265625, + "logps/rejected": -222.84686279296875, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3531725406646729, + "rewards/margins": 0.7035712003707886, + "rewards/rejected": -2.056743621826172, + "step": 5127 + }, + { + "epoch": 0.59, + "learning_rate": 1.2452300128760388e-07, + "logits/chosen": -2.6094465255737305, + "logits/rejected": -2.8407602310180664, + "logps/chosen": -383.1266784667969, + "logps/rejected": -309.13916015625, + "loss": 0.2689, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28032296895980835, + "rewards/margins": 2.163501024246216, + "rewards/rejected": -1.8831779956817627, + "step": 5128 + }, + { + "epoch": 0.59, + "learning_rate": 1.244878848179796e-07, + "logits/chosen": -2.8577542304992676, + "logits/rejected": -2.7162580490112305, + "logps/chosen": -268.920166015625, + "logps/rejected": -358.79058837890625, + "loss": 0.5009, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05660443753004074, + "rewards/margins": 2.738135814666748, + "rewards/rejected": -2.6815311908721924, + "step": 5129 + }, + { + "epoch": 0.59, + "learning_rate": 1.244527683483554e-07, + "logits/chosen": -2.692230224609375, + "logits/rejected": -3.1477882862091064, + "logps/chosen": -215.27191162109375, + "logps/rejected": -237.44729614257812, + "loss": 0.4166, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4310786724090576, + "rewards/margins": 1.7590134143829346, + "rewards/rejected": -1.327934980392456, + "step": 5130 + }, + { + "epoch": 0.59, + "learning_rate": 1.2441765187873112e-07, + "logits/chosen": -2.5057621002197266, + "logits/rejected": -2.8244967460632324, + "logps/chosen": -161.97804260253906, + "logps/rejected": -193.97213745117188, + "loss": 0.3752, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6203717589378357, + "rewards/margins": 1.5864571332931519, + "rewards/rejected": -0.9660854339599609, + "step": 5131 + }, + { + "epoch": 0.59, + "learning_rate": 1.2438253540910687e-07, + "logits/chosen": -2.2675700187683105, + "logits/rejected": -2.3567779064178467, + "logps/chosen": -418.916015625, + "logps/rejected": -200.3754425048828, + "loss": 0.3388, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10373665392398834, + "rewards/margins": 1.4681721925735474, + "rewards/rejected": -1.3644354343414307, + "step": 5132 + }, + { + "epoch": 0.59, + "learning_rate": 1.243474189394826e-07, + "logits/chosen": -2.542189836502075, + "logits/rejected": -2.658172845840454, + "logps/chosen": -282.65771484375, + "logps/rejected": -216.7683563232422, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5716956853866577, + "rewards/margins": 2.353724479675293, + "rewards/rejected": -1.7820289134979248, + "step": 5133 + }, + { + "epoch": 0.59, + "learning_rate": 1.2431230246985835e-07, + "logits/chosen": -2.9660253524780273, + "logits/rejected": -2.8505380153656006, + "logps/chosen": -395.95263671875, + "logps/rejected": -235.45089721679688, + "loss": 0.6385, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4191504120826721, + "rewards/margins": 0.9702684879302979, + "rewards/rejected": -1.3894188404083252, + "step": 5134 + }, + { + "epoch": 0.59, + "learning_rate": 1.242771860002341e-07, + "logits/chosen": -3.4007906913757324, + "logits/rejected": -3.733142614364624, + "logps/chosen": -315.6585388183594, + "logps/rejected": -278.4137878417969, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005838632583618164, + "rewards/margins": 2.1229376792907715, + "rewards/rejected": -2.1170990467071533, + "step": 5135 + }, + { + "epoch": 0.59, + "learning_rate": 1.2424206953060986e-07, + "logits/chosen": -3.526848793029785, + "logits/rejected": -3.306994915008545, + "logps/chosen": -286.5155944824219, + "logps/rejected": -170.82574462890625, + "loss": 0.4011, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20262190699577332, + "rewards/margins": 0.9343003034591675, + "rewards/rejected": -0.7316783666610718, + "step": 5136 + }, + { + "epoch": 0.59, + "learning_rate": 1.242069530609856e-07, + "logits/chosen": -2.4026148319244385, + "logits/rejected": -2.529700756072998, + "logps/chosen": -306.0565185546875, + "logps/rejected": -346.70477294921875, + "loss": 0.3879, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10417544841766357, + "rewards/margins": 2.3160579204559326, + "rewards/rejected": -2.4202332496643066, + "step": 5137 + }, + { + "epoch": 0.59, + "learning_rate": 1.2417183659136134e-07, + "logits/chosen": -3.093322515487671, + "logits/rejected": -3.5482943058013916, + "logps/chosen": -265.547607421875, + "logps/rejected": -286.43084716796875, + "loss": 0.3587, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06096839904785156, + "rewards/margins": 1.304715871810913, + "rewards/rejected": -1.3656842708587646, + "step": 5138 + }, + { + "epoch": 0.59, + "learning_rate": 1.241367201217371e-07, + "logits/chosen": -3.9022932052612305, + "logits/rejected": -3.9884281158447266, + "logps/chosen": -199.88873291015625, + "logps/rejected": -164.64747619628906, + "loss": 0.5254, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4884019196033478, + "rewards/margins": 1.1995644569396973, + "rewards/rejected": -1.6879663467407227, + "step": 5139 + }, + { + "epoch": 0.59, + "learning_rate": 1.2410160365211282e-07, + "logits/chosen": -3.5390703678131104, + "logits/rejected": -3.4585611820220947, + "logps/chosen": -341.9511413574219, + "logps/rejected": -354.1522521972656, + "loss": 0.2497, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07511122524738312, + "rewards/margins": 2.7844161987304688, + "rewards/rejected": -2.7093048095703125, + "step": 5140 + }, + { + "epoch": 0.59, + "learning_rate": 1.2406648718248858e-07, + "logits/chosen": -3.0640769004821777, + "logits/rejected": -3.085679531097412, + "logps/chosen": -193.04376220703125, + "logps/rejected": -213.20973205566406, + "loss": 0.4788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3558647334575653, + "rewards/margins": 1.142473816871643, + "rewards/rejected": -1.4983384609222412, + "step": 5141 + }, + { + "epoch": 0.59, + "learning_rate": 1.2403137071286433e-07, + "logits/chosen": -2.8975396156311035, + "logits/rejected": -3.4024884700775146, + "logps/chosen": -163.31744384765625, + "logps/rejected": -251.74160766601562, + "loss": 0.3609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06632895767688751, + "rewards/margins": 1.6642582416534424, + "rewards/rejected": -1.7305872440338135, + "step": 5142 + }, + { + "epoch": 0.59, + "learning_rate": 1.2399625424324008e-07, + "logits/chosen": -3.1446986198425293, + "logits/rejected": -3.116234302520752, + "logps/chosen": -221.79779052734375, + "logps/rejected": -340.4810791015625, + "loss": 0.1868, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.039515942335128784, + "rewards/margins": 2.5974526405334473, + "rewards/rejected": -2.557936668395996, + "step": 5143 + }, + { + "epoch": 0.59, + "learning_rate": 1.239611377736158e-07, + "logits/chosen": -3.0500571727752686, + "logits/rejected": -2.977799892425537, + "logps/chosen": -420.51318359375, + "logps/rejected": -341.87811279296875, + "loss": 0.7214, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19016128778457642, + "rewards/margins": 1.105961561203003, + "rewards/rejected": -1.2961229085922241, + "step": 5144 + }, + { + "epoch": 0.59, + "learning_rate": 1.2392602130399157e-07, + "logits/chosen": -2.2986419200897217, + "logits/rejected": -2.478050470352173, + "logps/chosen": -287.5576171875, + "logps/rejected": -286.1418151855469, + "loss": 0.8768, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2835155129432678, + "rewards/margins": 0.39522460103034973, + "rewards/rejected": -0.6787400841712952, + "step": 5145 + }, + { + "epoch": 0.59, + "learning_rate": 1.238909048343673e-07, + "logits/chosen": -2.921954870223999, + "logits/rejected": -3.044980049133301, + "logps/chosen": -358.2756652832031, + "logps/rejected": -351.0960693359375, + "loss": 0.5455, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5187159776687622, + "rewards/margins": 0.9334307909011841, + "rewards/rejected": -1.4521467685699463, + "step": 5146 + }, + { + "epoch": 0.59, + "learning_rate": 1.2385578836474307e-07, + "logits/chosen": -2.869605779647827, + "logits/rejected": -3.122382879257202, + "logps/chosen": -361.6567687988281, + "logps/rejected": -172.15972900390625, + "loss": 0.2119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2931489050388336, + "rewards/margins": 2.1417300701141357, + "rewards/rejected": -1.848581314086914, + "step": 5147 + }, + { + "epoch": 0.59, + "learning_rate": 1.238206718951188e-07, + "logits/chosen": -2.826767921447754, + "logits/rejected": -2.8080570697784424, + "logps/chosen": -316.5094909667969, + "logps/rejected": -220.82273864746094, + "loss": 0.2032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009382015094161034, + "rewards/margins": 1.6772688627243042, + "rewards/rejected": -1.6678868532180786, + "step": 5148 + }, + { + "epoch": 0.59, + "learning_rate": 1.2378555542549455e-07, + "logits/chosen": -2.4643707275390625, + "logits/rejected": -2.3738160133361816, + "logps/chosen": -230.30197143554688, + "logps/rejected": -400.03924560546875, + "loss": 0.4456, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19337430596351624, + "rewards/margins": 1.6614190340042114, + "rewards/rejected": -1.4680447578430176, + "step": 5149 + }, + { + "epoch": 0.59, + "learning_rate": 1.2375043895587028e-07, + "logits/chosen": -3.0783121585845947, + "logits/rejected": -2.9951138496398926, + "logps/chosen": -311.6236572265625, + "logps/rejected": -331.98797607421875, + "loss": 0.201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4024505913257599, + "rewards/margins": 2.564390182495117, + "rewards/rejected": -2.9668407440185547, + "step": 5150 + }, + { + "epoch": 0.59, + "learning_rate": 1.2371532248624604e-07, + "logits/chosen": -3.188065528869629, + "logits/rejected": -3.387887716293335, + "logps/chosen": -163.85888671875, + "logps/rejected": -191.5136260986328, + "loss": 0.391, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3803991377353668, + "rewards/margins": 1.5723127126693726, + "rewards/rejected": -1.952711820602417, + "step": 5151 + }, + { + "epoch": 0.59, + "learning_rate": 1.236802060166218e-07, + "logits/chosen": -2.56868052482605, + "logits/rejected": -2.7470569610595703, + "logps/chosen": -264.04254150390625, + "logps/rejected": -417.3427734375, + "loss": 0.5542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43324828147888184, + "rewards/margins": 0.6258847117424011, + "rewards/rejected": -1.0591329336166382, + "step": 5152 + }, + { + "epoch": 0.59, + "learning_rate": 1.2364508954699754e-07, + "logits/chosen": -3.365210771560669, + "logits/rejected": -3.1227645874023438, + "logps/chosen": -262.933837890625, + "logps/rejected": -336.25677490234375, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3530784547328949, + "rewards/margins": 3.0454087257385254, + "rewards/rejected": -3.398487091064453, + "step": 5153 + }, + { + "epoch": 0.59, + "learning_rate": 1.2360997307737327e-07, + "logits/chosen": -3.9119112491607666, + "logits/rejected": -3.6367697715759277, + "logps/chosen": -184.5072021484375, + "logps/rejected": -171.3543701171875, + "loss": 0.4592, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8210390210151672, + "rewards/margins": 1.0980569124221802, + "rewards/rejected": -1.9190958738327026, + "step": 5154 + }, + { + "epoch": 0.59, + "learning_rate": 1.2357485660774903e-07, + "logits/chosen": -2.72275972366333, + "logits/rejected": -2.8188462257385254, + "logps/chosen": -227.47532653808594, + "logps/rejected": -299.435546875, + "loss": 0.1807, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2197084128856659, + "rewards/margins": 2.520461082458496, + "rewards/rejected": -2.3007524013519287, + "step": 5155 + }, + { + "epoch": 0.59, + "learning_rate": 1.2353974013812478e-07, + "logits/chosen": -3.1895558834075928, + "logits/rejected": -3.1331796646118164, + "logps/chosen": -288.162109375, + "logps/rejected": -213.41281127929688, + "loss": 0.6024, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5914193987846375, + "rewards/margins": 0.756540834903717, + "rewards/rejected": -1.3479602336883545, + "step": 5156 + }, + { + "epoch": 0.59, + "learning_rate": 1.235046236685005e-07, + "logits/chosen": -3.620007038116455, + "logits/rejected": -3.8738200664520264, + "logps/chosen": -141.0775909423828, + "logps/rejected": -243.56143188476562, + "loss": 0.211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4806353449821472, + "rewards/margins": 2.7707862854003906, + "rewards/rejected": -3.2514216899871826, + "step": 5157 + }, + { + "epoch": 0.59, + "learning_rate": 1.2346950719887626e-07, + "logits/chosen": -3.41267991065979, + "logits/rejected": -3.5527589321136475, + "logps/chosen": -370.16900634765625, + "logps/rejected": -284.5588073730469, + "loss": 0.2764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2605476379394531, + "rewards/margins": 2.0956666469573975, + "rewards/rejected": -2.3562142848968506, + "step": 5158 + }, + { + "epoch": 0.59, + "learning_rate": 1.2343439072925201e-07, + "logits/chosen": -3.213653087615967, + "logits/rejected": -3.373002052307129, + "logps/chosen": -384.2462463378906, + "logps/rejected": -295.5800476074219, + "loss": 0.6025, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.032629773020744324, + "rewards/margins": 1.5159854888916016, + "rewards/rejected": -1.5486153364181519, + "step": 5159 + }, + { + "epoch": 0.59, + "learning_rate": 1.2339927425962777e-07, + "logits/chosen": -3.0373315811157227, + "logits/rejected": -3.1449544429779053, + "logps/chosen": -249.20455932617188, + "logps/rejected": -237.3478240966797, + "loss": 0.8793, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3325718939304352, + "rewards/margins": 0.3562734127044678, + "rewards/rejected": -0.6888452768325806, + "step": 5160 + }, + { + "epoch": 0.59, + "learning_rate": 1.233641577900035e-07, + "logits/chosen": -3.1594860553741455, + "logits/rejected": -3.129887580871582, + "logps/chosen": -275.6136779785156, + "logps/rejected": -245.92340087890625, + "loss": 0.4689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.040759071707725525, + "rewards/margins": 1.006184697151184, + "rewards/rejected": -1.0469439029693604, + "step": 5161 + }, + { + "epoch": 0.6, + "learning_rate": 1.2332904132037925e-07, + "logits/chosen": -2.2091431617736816, + "logits/rejected": -2.653167724609375, + "logps/chosen": -444.1913146972656, + "logps/rejected": -193.44659423828125, + "loss": 0.4206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.037239450961351395, + "rewards/margins": 1.749734878540039, + "rewards/rejected": -1.7869744300842285, + "step": 5162 + }, + { + "epoch": 0.6, + "learning_rate": 1.23293924850755e-07, + "logits/chosen": -2.809598445892334, + "logits/rejected": -2.874854564666748, + "logps/chosen": -466.6560974121094, + "logps/rejected": -404.8394470214844, + "loss": 0.2971, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6390507221221924, + "rewards/margins": 3.0227246284484863, + "rewards/rejected": -2.383673906326294, + "step": 5163 + }, + { + "epoch": 0.6, + "learning_rate": 1.2325880838113076e-07, + "logits/chosen": -2.729487895965576, + "logits/rejected": -2.9293627738952637, + "logps/chosen": -500.4903564453125, + "logps/rejected": -310.22930908203125, + "loss": 0.1662, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25695616006851196, + "rewards/margins": 2.245103597640991, + "rewards/rejected": -1.9881473779678345, + "step": 5164 + }, + { + "epoch": 0.6, + "learning_rate": 1.2322369191150648e-07, + "logits/chosen": -2.759629011154175, + "logits/rejected": -2.6159956455230713, + "logps/chosen": -494.8719482421875, + "logps/rejected": -302.10479736328125, + "loss": 0.3733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09541263431310654, + "rewards/margins": 1.8479219675064087, + "rewards/rejected": -1.9433344602584839, + "step": 5165 + }, + { + "epoch": 0.6, + "learning_rate": 1.2318857544188224e-07, + "logits/chosen": -3.351102352142334, + "logits/rejected": -3.1951398849487305, + "logps/chosen": -239.81430053710938, + "logps/rejected": -227.35720825195312, + "loss": 0.3937, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08247008919715881, + "rewards/margins": 2.307767391204834, + "rewards/rejected": -2.225297212600708, + "step": 5166 + }, + { + "epoch": 0.6, + "learning_rate": 1.23153458972258e-07, + "logits/chosen": -2.204308032989502, + "logits/rejected": -2.259019374847412, + "logps/chosen": -370.3982238769531, + "logps/rejected": -269.4747009277344, + "loss": 0.2341, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45790329575538635, + "rewards/margins": 2.18782901763916, + "rewards/rejected": -1.7299258708953857, + "step": 5167 + }, + { + "epoch": 0.6, + "learning_rate": 1.2311834250263372e-07, + "logits/chosen": -2.803575038909912, + "logits/rejected": -2.5925161838531494, + "logps/chosen": -417.7294006347656, + "logps/rejected": -300.6839599609375, + "loss": 0.6917, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3312559425830841, + "rewards/margins": 1.2431349754333496, + "rewards/rejected": -0.9118790030479431, + "step": 5168 + }, + { + "epoch": 0.6, + "learning_rate": 1.2308322603300947e-07, + "logits/chosen": -3.617661237716675, + "logits/rejected": -3.525425434112549, + "logps/chosen": -283.5308837890625, + "logps/rejected": -181.88235473632812, + "loss": 0.4695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32231175899505615, + "rewards/margins": 0.8831068277359009, + "rewards/rejected": -1.205418586730957, + "step": 5169 + }, + { + "epoch": 0.6, + "learning_rate": 1.2304810956338523e-07, + "logits/chosen": -3.0581657886505127, + "logits/rejected": -2.835472583770752, + "logps/chosen": -176.3271026611328, + "logps/rejected": -164.31805419921875, + "loss": 0.4601, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5914827585220337, + "rewards/margins": 1.153172492980957, + "rewards/rejected": -1.7446553707122803, + "step": 5170 + }, + { + "epoch": 0.6, + "learning_rate": 1.2301299309376098e-07, + "logits/chosen": -2.9150075912475586, + "logits/rejected": -3.131957769393921, + "logps/chosen": -355.2781982421875, + "logps/rejected": -271.6290588378906, + "loss": 0.4646, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1303451508283615, + "rewards/margins": 1.7111685276031494, + "rewards/rejected": -1.580823540687561, + "step": 5171 + }, + { + "epoch": 0.6, + "learning_rate": 1.229778766241367e-07, + "logits/chosen": -3.3008627891540527, + "logits/rejected": -3.500185012817383, + "logps/chosen": -227.70870971679688, + "logps/rejected": -286.99676513671875, + "loss": 0.9137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4265899658203125, + "rewards/margins": 1.3762304782867432, + "rewards/rejected": -1.8028204441070557, + "step": 5172 + }, + { + "epoch": 0.6, + "learning_rate": 1.2294276015451246e-07, + "logits/chosen": -3.062892436981201, + "logits/rejected": -3.0610804557800293, + "logps/chosen": -337.1151123046875, + "logps/rejected": -220.88787841796875, + "loss": 0.2163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4035223424434662, + "rewards/margins": 2.117227554321289, + "rewards/rejected": -1.71370530128479, + "step": 5173 + }, + { + "epoch": 0.6, + "learning_rate": 1.229076436848882e-07, + "logits/chosen": -3.16302227973938, + "logits/rejected": -2.93937611579895, + "logps/chosen": -431.26702880859375, + "logps/rejected": -457.2254638671875, + "loss": 0.3975, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04858148843050003, + "rewards/margins": 1.5382827520370483, + "rewards/rejected": -1.5868642330169678, + "step": 5174 + }, + { + "epoch": 0.6, + "learning_rate": 1.2287252721526397e-07, + "logits/chosen": -2.721254348754883, + "logits/rejected": -2.4619572162628174, + "logps/chosen": -149.45738220214844, + "logps/rejected": -97.16966247558594, + "loss": 0.6138, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4412391185760498, + "rewards/margins": 0.8806108236312866, + "rewards/rejected": -1.3218498229980469, + "step": 5175 + }, + { + "epoch": 0.6, + "learning_rate": 1.228374107456397e-07, + "logits/chosen": -3.1360621452331543, + "logits/rejected": -3.457526445388794, + "logps/chosen": -300.4748229980469, + "logps/rejected": -274.537109375, + "loss": 0.7781, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5320114493370056, + "rewards/margins": 0.6593219041824341, + "rewards/rejected": -1.191333293914795, + "step": 5176 + }, + { + "epoch": 0.6, + "learning_rate": 1.2280229427601545e-07, + "logits/chosen": -2.9052743911743164, + "logits/rejected": -2.901937246322632, + "logps/chosen": -262.151611328125, + "logps/rejected": -213.69956970214844, + "loss": 0.2505, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21114599704742432, + "rewards/margins": 1.6099231243133545, + "rewards/rejected": -1.3987771272659302, + "step": 5177 + }, + { + "epoch": 0.6, + "learning_rate": 1.2276717780639118e-07, + "logits/chosen": -3.143329381942749, + "logits/rejected": -3.0886950492858887, + "logps/chosen": -165.79611206054688, + "logps/rejected": -289.0396423339844, + "loss": 0.3345, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4432068467140198, + "rewards/margins": 2.25163197517395, + "rewards/rejected": -2.694838762283325, + "step": 5178 + }, + { + "epoch": 0.6, + "learning_rate": 1.2273206133676693e-07, + "logits/chosen": -3.8247275352478027, + "logits/rejected": -4.282573223114014, + "logps/chosen": -102.87472534179688, + "logps/rejected": -190.88671875, + "loss": 0.3997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7284829616546631, + "rewards/margins": 2.4708025455474854, + "rewards/rejected": -3.1992855072021484, + "step": 5179 + }, + { + "epoch": 0.6, + "learning_rate": 1.2269694486714269e-07, + "logits/chosen": -2.9088070392608643, + "logits/rejected": -3.211364269256592, + "logps/chosen": -268.937744140625, + "logps/rejected": -358.1583557128906, + "loss": 0.351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44695067405700684, + "rewards/margins": 2.32647705078125, + "rewards/rejected": -2.773427724838257, + "step": 5180 + }, + { + "epoch": 0.6, + "learning_rate": 1.2266182839751844e-07, + "logits/chosen": -2.664862871170044, + "logits/rejected": -3.0151631832122803, + "logps/chosen": -314.10125732421875, + "logps/rejected": -191.87794494628906, + "loss": 0.8745, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7956207394599915, + "rewards/margins": 0.1315751075744629, + "rewards/rejected": -0.9271959662437439, + "step": 5181 + }, + { + "epoch": 0.6, + "learning_rate": 1.2262671192789417e-07, + "logits/chosen": -4.011450290679932, + "logits/rejected": -3.760204315185547, + "logps/chosen": -379.6104736328125, + "logps/rejected": -390.9476013183594, + "loss": 0.5916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1002708375453949, + "rewards/margins": 1.4575798511505127, + "rewards/rejected": -1.3573089838027954, + "step": 5182 + }, + { + "epoch": 0.6, + "learning_rate": 1.2259159545826992e-07, + "logits/chosen": -2.348848819732666, + "logits/rejected": -2.8163280487060547, + "logps/chosen": -446.20574951171875, + "logps/rejected": -251.6975555419922, + "loss": 0.4218, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2143925279378891, + "rewards/margins": 2.2070815563201904, + "rewards/rejected": -1.9926890134811401, + "step": 5183 + }, + { + "epoch": 0.6, + "learning_rate": 1.2255647898864568e-07, + "logits/chosen": -3.4913954734802246, + "logits/rejected": -3.388582944869995, + "logps/chosen": -285.34344482421875, + "logps/rejected": -275.7471923828125, + "loss": 0.7703, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3055725693702698, + "rewards/margins": 1.5932461023330688, + "rewards/rejected": -1.8988187313079834, + "step": 5184 + }, + { + "epoch": 0.6, + "learning_rate": 1.225213625190214e-07, + "logits/chosen": -2.2849247455596924, + "logits/rejected": -2.199320077896118, + "logps/chosen": -336.43896484375, + "logps/rejected": -326.4547424316406, + "loss": 0.5351, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32438692450523376, + "rewards/margins": 1.9414520263671875, + "rewards/rejected": -1.617065191268921, + "step": 5185 + }, + { + "epoch": 0.6, + "learning_rate": 1.2248624604939716e-07, + "logits/chosen": -2.9985289573669434, + "logits/rejected": -2.9439406394958496, + "logps/chosen": -253.88369750976562, + "logps/rejected": -272.1767578125, + "loss": 0.1663, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05855366587638855, + "rewards/margins": 3.2972700595855713, + "rewards/rejected": -3.3558237552642822, + "step": 5186 + }, + { + "epoch": 0.6, + "learning_rate": 1.224511295797729e-07, + "logits/chosen": -3.2378978729248047, + "logits/rejected": -2.902519941329956, + "logps/chosen": -267.2054443359375, + "logps/rejected": -169.57061767578125, + "loss": 0.462, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10090598464012146, + "rewards/margins": 1.0095982551574707, + "rewards/rejected": -1.110504150390625, + "step": 5187 + }, + { + "epoch": 0.6, + "learning_rate": 1.2241601311014866e-07, + "logits/chosen": -2.7439441680908203, + "logits/rejected": -2.5505380630493164, + "logps/chosen": -432.6770324707031, + "logps/rejected": -374.92626953125, + "loss": 0.8003, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14458434283733368, + "rewards/margins": 0.4205944240093231, + "rewards/rejected": -0.5651787519454956, + "step": 5188 + }, + { + "epoch": 0.6, + "learning_rate": 1.223808966405244e-07, + "logits/chosen": -2.9126179218292236, + "logits/rejected": -2.9583144187927246, + "logps/chosen": -219.92019653320312, + "logps/rejected": -376.5243225097656, + "loss": 0.2212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05798976868391037, + "rewards/margins": 2.5000534057617188, + "rewards/rejected": -2.5580430030822754, + "step": 5189 + }, + { + "epoch": 0.6, + "learning_rate": 1.2234578017090015e-07, + "logits/chosen": -2.811135768890381, + "logits/rejected": -2.784280776977539, + "logps/chosen": -255.41860961914062, + "logps/rejected": -279.0583801269531, + "loss": 0.6034, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25849589705467224, + "rewards/margins": 0.8028140664100647, + "rewards/rejected": -0.5443181395530701, + "step": 5190 + }, + { + "epoch": 0.6, + "learning_rate": 1.2231066370127587e-07, + "logits/chosen": -2.988229513168335, + "logits/rejected": -2.9353456497192383, + "logps/chosen": -292.20159912109375, + "logps/rejected": -398.951416015625, + "loss": 0.6775, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42451125383377075, + "rewards/margins": 0.22684772312641144, + "rewards/rejected": -0.651358962059021, + "step": 5191 + }, + { + "epoch": 0.6, + "learning_rate": 1.2227554723165165e-07, + "logits/chosen": -3.631828784942627, + "logits/rejected": -3.5053586959838867, + "logps/chosen": -231.27569580078125, + "logps/rejected": -250.5157012939453, + "loss": 0.4285, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17528848350048065, + "rewards/margins": 1.556211233139038, + "rewards/rejected": -1.380922794342041, + "step": 5192 + }, + { + "epoch": 0.6, + "learning_rate": 1.2224043076202738e-07, + "logits/chosen": -3.0040669441223145, + "logits/rejected": -2.959940195083618, + "logps/chosen": -281.1169738769531, + "logps/rejected": -302.4212951660156, + "loss": 0.1298, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.765924334526062, + "rewards/margins": 4.009528160095215, + "rewards/rejected": -3.2436037063598633, + "step": 5193 + }, + { + "epoch": 0.6, + "learning_rate": 1.2220531429240313e-07, + "logits/chosen": -3.0549678802490234, + "logits/rejected": -2.9327633380889893, + "logps/chosen": -170.459228515625, + "logps/rejected": -202.31314086914062, + "loss": 0.554, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3828567862510681, + "rewards/margins": 1.072320580482483, + "rewards/rejected": -0.6894637942314148, + "step": 5194 + }, + { + "epoch": 0.6, + "learning_rate": 1.2217019782277886e-07, + "logits/chosen": -3.3431754112243652, + "logits/rejected": -3.401245594024658, + "logps/chosen": -203.76792907714844, + "logps/rejected": -276.31671142578125, + "loss": 0.3384, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16383762657642365, + "rewards/margins": 1.644264817237854, + "rewards/rejected": -1.4804272651672363, + "step": 5195 + }, + { + "epoch": 0.6, + "learning_rate": 1.2213508135315462e-07, + "logits/chosen": -3.3390755653381348, + "logits/rejected": -3.496105194091797, + "logps/chosen": -198.2833251953125, + "logps/rejected": -214.4532012939453, + "loss": 0.2186, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2153109908103943, + "rewards/margins": 1.8719356060028076, + "rewards/rejected": -1.6566245555877686, + "step": 5196 + }, + { + "epoch": 0.6, + "learning_rate": 1.2209996488353037e-07, + "logits/chosen": -2.543335437774658, + "logits/rejected": -2.8048582077026367, + "logps/chosen": -474.1484069824219, + "logps/rejected": -217.5401611328125, + "loss": 1.3623, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.6533265113830566, + "rewards/margins": -0.558960497379303, + "rewards/rejected": -1.0943658351898193, + "step": 5197 + }, + { + "epoch": 0.6, + "learning_rate": 1.2206484841390612e-07, + "logits/chosen": -3.136906862258911, + "logits/rejected": -3.1081228256225586, + "logps/chosen": -268.7641906738281, + "logps/rejected": -438.28350830078125, + "loss": 0.5438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45171698927879333, + "rewards/margins": 1.0462948083877563, + "rewards/rejected": -1.498011827468872, + "step": 5198 + }, + { + "epoch": 0.6, + "learning_rate": 1.2202973194428185e-07, + "logits/chosen": -3.0721583366394043, + "logits/rejected": -2.9507555961608887, + "logps/chosen": -182.5630340576172, + "logps/rejected": -335.062255859375, + "loss": 0.3415, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004327267408370972, + "rewards/margins": 3.069530487060547, + "rewards/rejected": -3.0652031898498535, + "step": 5199 + }, + { + "epoch": 0.6, + "learning_rate": 1.219946154746576e-07, + "logits/chosen": -3.632455348968506, + "logits/rejected": -3.5064611434936523, + "logps/chosen": -273.2044372558594, + "logps/rejected": -248.01242065429688, + "loss": 0.3583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23595888912677765, + "rewards/margins": 2.087024688720703, + "rewards/rejected": -2.322983741760254, + "step": 5200 + }, + { + "epoch": 0.6, + "learning_rate": 1.2195949900503336e-07, + "logits/chosen": -2.843179225921631, + "logits/rejected": -2.407686471939087, + "logps/chosen": -297.7922668457031, + "logps/rejected": -349.45745849609375, + "loss": 0.5948, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15714681148529053, + "rewards/margins": 0.7533746957778931, + "rewards/rejected": -0.9105215072631836, + "step": 5201 + }, + { + "epoch": 0.6, + "learning_rate": 1.2192438253540909e-07, + "logits/chosen": -3.062192916870117, + "logits/rejected": -2.9286723136901855, + "logps/chosen": -416.03021240234375, + "logps/rejected": -348.7435302734375, + "loss": 0.2691, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33860456943511963, + "rewards/margins": 2.4508655071258545, + "rewards/rejected": -2.1122610569000244, + "step": 5202 + }, + { + "epoch": 0.6, + "learning_rate": 1.2188926606578484e-07, + "logits/chosen": -2.5191261768341064, + "logits/rejected": -2.333369731903076, + "logps/chosen": -235.30941772460938, + "logps/rejected": -322.86285400390625, + "loss": 0.4108, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2562994062900543, + "rewards/margins": 1.2130556106567383, + "rewards/rejected": -0.9567563533782959, + "step": 5203 + }, + { + "epoch": 0.6, + "learning_rate": 1.218541495961606e-07, + "logits/chosen": -1.9033299684524536, + "logits/rejected": -1.866495132446289, + "logps/chosen": -191.02035522460938, + "logps/rejected": -246.0564727783203, + "loss": 0.3568, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11245231330394745, + "rewards/margins": 1.682666540145874, + "rewards/rejected": -1.5702142715454102, + "step": 5204 + }, + { + "epoch": 0.6, + "learning_rate": 1.2181903312653635e-07, + "logits/chosen": -3.030803680419922, + "logits/rejected": -3.0483882427215576, + "logps/chosen": -363.9146728515625, + "logps/rejected": -333.9434509277344, + "loss": 0.1629, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3156580328941345, + "rewards/margins": 2.3087282180786133, + "rewards/rejected": -1.993070363998413, + "step": 5205 + }, + { + "epoch": 0.6, + "learning_rate": 1.2178391665691207e-07, + "logits/chosen": -2.2458279132843018, + "logits/rejected": -2.2214837074279785, + "logps/chosen": -403.5029602050781, + "logps/rejected": -270.57659912109375, + "loss": 0.4705, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.059605419635772705, + "rewards/margins": 1.6680136919021606, + "rewards/rejected": -1.6084082126617432, + "step": 5206 + }, + { + "epoch": 0.6, + "learning_rate": 1.2174880018728783e-07, + "logits/chosen": -2.8312740325927734, + "logits/rejected": -2.9783174991607666, + "logps/chosen": -341.7548828125, + "logps/rejected": -371.0419921875, + "loss": 0.2661, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10385347902774811, + "rewards/margins": 3.9140877723693848, + "rewards/rejected": -3.8102340698242188, + "step": 5207 + }, + { + "epoch": 0.6, + "learning_rate": 1.2171368371766358e-07, + "logits/chosen": -3.7071492671966553, + "logits/rejected": -3.680072069168091, + "logps/chosen": -239.55862426757812, + "logps/rejected": -337.0492858886719, + "loss": 0.5341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6820453405380249, + "rewards/margins": 2.6396238803863525, + "rewards/rejected": -3.321669340133667, + "step": 5208 + }, + { + "epoch": 0.6, + "learning_rate": 1.2167856724803934e-07, + "logits/chosen": -3.2911148071289062, + "logits/rejected": -3.235689640045166, + "logps/chosen": -207.2025146484375, + "logps/rejected": -197.9849853515625, + "loss": 0.3571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2274639904499054, + "rewards/margins": 2.2422285079956055, + "rewards/rejected": -2.4696924686431885, + "step": 5209 + }, + { + "epoch": 0.6, + "learning_rate": 1.2164345077841506e-07, + "logits/chosen": -3.667083263397217, + "logits/rejected": -3.7753007411956787, + "logps/chosen": -202.90139770507812, + "logps/rejected": -233.2733154296875, + "loss": 0.5496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2940638065338135, + "rewards/margins": 1.0239603519439697, + "rewards/rejected": -1.3180241584777832, + "step": 5210 + }, + { + "epoch": 0.6, + "learning_rate": 1.2160833430879082e-07, + "logits/chosen": -3.4138057231903076, + "logits/rejected": -3.7077465057373047, + "logps/chosen": -123.68051147460938, + "logps/rejected": -201.48062133789062, + "loss": 0.1272, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0331110954284668, + "rewards/margins": 2.8269762992858887, + "rewards/rejected": -1.7938652038574219, + "step": 5211 + }, + { + "epoch": 0.6, + "learning_rate": 1.2157321783916657e-07, + "logits/chosen": -3.430452585220337, + "logits/rejected": -3.331547975540161, + "logps/chosen": -277.80694580078125, + "logps/rejected": -206.60984802246094, + "loss": 0.5471, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01072278618812561, + "rewards/margins": 1.1077311038970947, + "rewards/rejected": -1.118453860282898, + "step": 5212 + }, + { + "epoch": 0.6, + "learning_rate": 1.215381013695423e-07, + "logits/chosen": -3.344498872756958, + "logits/rejected": -3.419649600982666, + "logps/chosen": -267.4978942871094, + "logps/rejected": -247.4333953857422, + "loss": 0.5408, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39230096340179443, + "rewards/margins": 1.9568054676055908, + "rewards/rejected": -1.564504623413086, + "step": 5213 + }, + { + "epoch": 0.6, + "learning_rate": 1.2150298489991805e-07, + "logits/chosen": -2.718519926071167, + "logits/rejected": -3.027796745300293, + "logps/chosen": -199.85960388183594, + "logps/rejected": -308.43450927734375, + "loss": 0.4801, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5378886461257935, + "rewards/margins": 1.626355767250061, + "rewards/rejected": -2.1642444133758545, + "step": 5214 + }, + { + "epoch": 0.6, + "learning_rate": 1.214678684302938e-07, + "logits/chosen": -2.26768159866333, + "logits/rejected": -2.3641233444213867, + "logps/chosen": -269.4970397949219, + "logps/rejected": -178.1766357421875, + "loss": 0.8523, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.658146858215332, + "rewards/margins": 1.2302989959716797, + "rewards/rejected": -1.8884457349777222, + "step": 5215 + }, + { + "epoch": 0.6, + "learning_rate": 1.2143275196066956e-07, + "logits/chosen": -3.828339099884033, + "logits/rejected": -3.438283920288086, + "logps/chosen": -312.7521667480469, + "logps/rejected": -286.481201171875, + "loss": 0.4256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08610451221466064, + "rewards/margins": 2.0246944427490234, + "rewards/rejected": -2.1107988357543945, + "step": 5216 + }, + { + "epoch": 0.6, + "learning_rate": 1.213976354910453e-07, + "logits/chosen": -3.1766724586486816, + "logits/rejected": -3.0177972316741943, + "logps/chosen": -516.7952880859375, + "logps/rejected": -344.722900390625, + "loss": 0.2547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31025010347366333, + "rewards/margins": 2.1715352535247803, + "rewards/rejected": -2.481785297393799, + "step": 5217 + }, + { + "epoch": 0.6, + "learning_rate": 1.2136251902142104e-07, + "logits/chosen": -3.464975357055664, + "logits/rejected": -3.0747368335723877, + "logps/chosen": -229.38645935058594, + "logps/rejected": -283.61602783203125, + "loss": 0.4442, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28603190183639526, + "rewards/margins": 1.935234546661377, + "rewards/rejected": -1.649202585220337, + "step": 5218 + }, + { + "epoch": 0.6, + "learning_rate": 1.2132740255179677e-07, + "logits/chosen": -3.1373889446258545, + "logits/rejected": -2.7044944763183594, + "logps/chosen": -286.94097900390625, + "logps/rejected": -219.58468627929688, + "loss": 0.4144, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.576767086982727, + "rewards/margins": 1.6572036743164062, + "rewards/rejected": -2.2339706420898438, + "step": 5219 + }, + { + "epoch": 0.6, + "learning_rate": 1.2129228608217255e-07, + "logits/chosen": -2.6105387210845947, + "logits/rejected": -2.6753692626953125, + "logps/chosen": -369.8568115234375, + "logps/rejected": -217.5819549560547, + "loss": 0.5244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3486417531967163, + "rewards/margins": 0.8533391356468201, + "rewards/rejected": -1.2019808292388916, + "step": 5220 + }, + { + "epoch": 0.6, + "learning_rate": 1.2125716961254828e-07, + "logits/chosen": -2.804413080215454, + "logits/rejected": -2.661242723464966, + "logps/chosen": -366.040283203125, + "logps/rejected": -367.112548828125, + "loss": 0.2683, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.247966006398201, + "rewards/margins": 2.6256794929504395, + "rewards/rejected": -2.377713441848755, + "step": 5221 + }, + { + "epoch": 0.6, + "learning_rate": 1.2122205314292403e-07, + "logits/chosen": -2.782045841217041, + "logits/rejected": -3.1047775745391846, + "logps/chosen": -159.67384338378906, + "logps/rejected": -303.17169189453125, + "loss": 0.5111, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.019940122961997986, + "rewards/margins": 1.6275315284729004, + "rewards/rejected": -1.6075913906097412, + "step": 5222 + }, + { + "epoch": 0.6, + "learning_rate": 1.2118693667329976e-07, + "logits/chosen": -2.593538761138916, + "logits/rejected": -2.3662805557250977, + "logps/chosen": -228.2781219482422, + "logps/rejected": -239.7020721435547, + "loss": 0.2843, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1524525135755539, + "rewards/margins": 1.3987566232681274, + "rewards/rejected": -1.2463042736053467, + "step": 5223 + }, + { + "epoch": 0.6, + "learning_rate": 1.2115182020367554e-07, + "logits/chosen": -3.44091534614563, + "logits/rejected": -3.477853775024414, + "logps/chosen": -115.70633697509766, + "logps/rejected": -182.95123291015625, + "loss": 0.4854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02864648401737213, + "rewards/margins": 1.3936532735824585, + "rewards/rejected": -1.4222997426986694, + "step": 5224 + }, + { + "epoch": 0.6, + "learning_rate": 1.2111670373405127e-07, + "logits/chosen": -2.741844892501831, + "logits/rejected": -2.879951000213623, + "logps/chosen": -240.1184539794922, + "logps/rejected": -273.3140563964844, + "loss": 0.3415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4107971787452698, + "rewards/margins": 1.5568939447402954, + "rewards/rejected": -1.9676910638809204, + "step": 5225 + }, + { + "epoch": 0.6, + "learning_rate": 1.2108158726442702e-07, + "logits/chosen": -3.196927070617676, + "logits/rejected": -3.275141716003418, + "logps/chosen": -195.29122924804688, + "logps/rejected": -234.3670654296875, + "loss": 0.5083, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07945594191551208, + "rewards/margins": 1.788076400756836, + "rewards/rejected": -1.8675322532653809, + "step": 5226 + }, + { + "epoch": 0.6, + "learning_rate": 1.2104647079480275e-07, + "logits/chosen": -2.665973663330078, + "logits/rejected": -2.7045369148254395, + "logps/chosen": -232.06036376953125, + "logps/rejected": -186.01589965820312, + "loss": 0.4748, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12026205658912659, + "rewards/margins": 0.8856464624404907, + "rewards/rejected": -0.7653844356536865, + "step": 5227 + }, + { + "epoch": 0.6, + "learning_rate": 1.210113543251785e-07, + "logits/chosen": -2.227813482284546, + "logits/rejected": -2.634087562561035, + "logps/chosen": -332.4530334472656, + "logps/rejected": -444.0452575683594, + "loss": 0.3683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15424798429012299, + "rewards/margins": 1.6662102937698364, + "rewards/rejected": -1.8204580545425415, + "step": 5228 + }, + { + "epoch": 0.6, + "learning_rate": 1.2097623785555425e-07, + "logits/chosen": -3.177196502685547, + "logits/rejected": -2.92907977104187, + "logps/chosen": -162.54293823242188, + "logps/rejected": -154.77243041992188, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17443682253360748, + "rewards/margins": 1.363032341003418, + "rewards/rejected": -1.5374690294265747, + "step": 5229 + }, + { + "epoch": 0.6, + "learning_rate": 1.2094112138592998e-07, + "logits/chosen": -3.122189521789551, + "logits/rejected": -3.016157865524292, + "logps/chosen": -227.82876586914062, + "logps/rejected": -248.71435546875, + "loss": 0.3712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.088976189494133, + "rewards/margins": 0.8735700845718384, + "rewards/rejected": -0.9625463485717773, + "step": 5230 + }, + { + "epoch": 0.6, + "learning_rate": 1.2090600491630574e-07, + "logits/chosen": -3.0521514415740967, + "logits/rejected": -2.9738926887512207, + "logps/chosen": -329.7683410644531, + "logps/rejected": -276.89349365234375, + "loss": 0.3762, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20828092098236084, + "rewards/margins": 1.4070470333099365, + "rewards/rejected": -1.6153278350830078, + "step": 5231 + }, + { + "epoch": 0.6, + "learning_rate": 1.208708884466815e-07, + "logits/chosen": -3.993516445159912, + "logits/rejected": -3.5869407653808594, + "logps/chosen": -324.1094665527344, + "logps/rejected": -201.15512084960938, + "loss": 0.9559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8842984437942505, + "rewards/margins": 0.6926509737968445, + "rewards/rejected": -1.5769493579864502, + "step": 5232 + }, + { + "epoch": 0.6, + "learning_rate": 1.2083577197705724e-07, + "logits/chosen": -2.706437110900879, + "logits/rejected": -2.5001683235168457, + "logps/chosen": -247.0949249267578, + "logps/rejected": -440.0496520996094, + "loss": 0.1252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25635719299316406, + "rewards/margins": 2.8700578212738037, + "rewards/rejected": -2.6137006282806396, + "step": 5233 + }, + { + "epoch": 0.6, + "learning_rate": 1.2080065550743297e-07, + "logits/chosen": -3.5473976135253906, + "logits/rejected": -3.609318733215332, + "logps/chosen": -204.4401397705078, + "logps/rejected": -232.96856689453125, + "loss": 0.2044, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03684224188327789, + "rewards/margins": 2.7051258087158203, + "rewards/rejected": -2.668283700942993, + "step": 5234 + }, + { + "epoch": 0.6, + "learning_rate": 1.2076553903780872e-07, + "logits/chosen": -3.4690723419189453, + "logits/rejected": -4.082224369049072, + "logps/chosen": -172.59579467773438, + "logps/rejected": -335.96820068359375, + "loss": 0.1719, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.791286051273346, + "rewards/margins": 3.591503858566284, + "rewards/rejected": -2.800217628479004, + "step": 5235 + }, + { + "epoch": 0.6, + "learning_rate": 1.2073042256818448e-07, + "logits/chosen": -2.8833184242248535, + "logits/rejected": -2.8643088340759277, + "logps/chosen": -324.9278564453125, + "logps/rejected": -269.1184387207031, + "loss": 0.2847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4237309396266937, + "rewards/margins": 1.2329367399215698, + "rewards/rejected": -1.656667709350586, + "step": 5236 + }, + { + "epoch": 0.6, + "learning_rate": 1.2069530609856023e-07, + "logits/chosen": -3.2410712242126465, + "logits/rejected": -3.141294240951538, + "logps/chosen": -318.8669128417969, + "logps/rejected": -263.40020751953125, + "loss": 0.2148, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5544151663780212, + "rewards/margins": 2.312713384628296, + "rewards/rejected": -2.867128610610962, + "step": 5237 + }, + { + "epoch": 0.6, + "learning_rate": 1.2066018962893596e-07, + "logits/chosen": -4.029869079589844, + "logits/rejected": -3.990821361541748, + "logps/chosen": -429.2679443359375, + "logps/rejected": -218.78884887695312, + "loss": 0.5345, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2725607752799988, + "rewards/margins": 1.8695515394210815, + "rewards/rejected": -2.1421122550964355, + "step": 5238 + }, + { + "epoch": 0.6, + "learning_rate": 1.2062507315931171e-07, + "logits/chosen": -2.9293506145477295, + "logits/rejected": -3.1132500171661377, + "logps/chosen": -274.43902587890625, + "logps/rejected": -281.4142150878906, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4382481575012207, + "rewards/margins": 1.106748104095459, + "rewards/rejected": -1.5449962615966797, + "step": 5239 + }, + { + "epoch": 0.6, + "learning_rate": 1.2058995668968744e-07, + "logits/chosen": -2.7817864418029785, + "logits/rejected": -2.712226390838623, + "logps/chosen": -363.6641845703125, + "logps/rejected": -339.00439453125, + "loss": 0.7376, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07091806828975677, + "rewards/margins": 1.5792995691299438, + "rewards/rejected": -1.6502177715301514, + "step": 5240 + }, + { + "epoch": 0.6, + "learning_rate": 1.2055484022006322e-07, + "logits/chosen": -2.2478151321411133, + "logits/rejected": -2.2379281520843506, + "logps/chosen": -345.2948303222656, + "logps/rejected": -207.29713439941406, + "loss": 0.1363, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3363218605518341, + "rewards/margins": 2.583042621612549, + "rewards/rejected": -2.246720790863037, + "step": 5241 + }, + { + "epoch": 0.6, + "learning_rate": 1.2051972375043895e-07, + "logits/chosen": -2.9631898403167725, + "logits/rejected": -3.1933281421661377, + "logps/chosen": -326.3813171386719, + "logps/rejected": -303.8610534667969, + "loss": 0.588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01886656880378723, + "rewards/margins": 2.3021111488342285, + "rewards/rejected": -2.3209774494171143, + "step": 5242 + }, + { + "epoch": 0.6, + "learning_rate": 1.204846072808147e-07, + "logits/chosen": -3.171943187713623, + "logits/rejected": -2.9726765155792236, + "logps/chosen": -303.70343017578125, + "logps/rejected": -288.77178955078125, + "loss": 0.4872, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.060552701354026794, + "rewards/margins": 2.048556089401245, + "rewards/rejected": -1.9880033731460571, + "step": 5243 + }, + { + "epoch": 0.6, + "learning_rate": 1.2044949081119043e-07, + "logits/chosen": -2.5231645107269287, + "logits/rejected": -2.7960026264190674, + "logps/chosen": -239.24803161621094, + "logps/rejected": -311.535400390625, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19614994525909424, + "rewards/margins": 2.4837446212768555, + "rewards/rejected": -2.287594795227051, + "step": 5244 + }, + { + "epoch": 0.6, + "learning_rate": 1.2041437434156618e-07, + "logits/chosen": -2.4150071144104004, + "logits/rejected": -2.3548648357391357, + "logps/chosen": -402.8688659667969, + "logps/rejected": -357.9402160644531, + "loss": 0.5995, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6132261157035828, + "rewards/margins": 1.1073737144470215, + "rewards/rejected": -1.720599889755249, + "step": 5245 + }, + { + "epoch": 0.6, + "learning_rate": 1.2037925787194194e-07, + "logits/chosen": -3.1932194232940674, + "logits/rejected": -3.2763094902038574, + "logps/chosen": -267.24530029296875, + "logps/rejected": -412.2287902832031, + "loss": 0.4563, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2812500596046448, + "rewards/margins": 1.5769884586334229, + "rewards/rejected": -1.2957382202148438, + "step": 5246 + }, + { + "epoch": 0.6, + "learning_rate": 1.2034414140231767e-07, + "logits/chosen": -2.9148612022399902, + "logits/rejected": -2.9143834114074707, + "logps/chosen": -302.7430114746094, + "logps/rejected": -226.05499267578125, + "loss": 0.4157, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2111915498971939, + "rewards/margins": 1.7067968845367432, + "rewards/rejected": -1.9179884195327759, + "step": 5247 + }, + { + "epoch": 0.6, + "learning_rate": 1.2030902493269342e-07, + "logits/chosen": -3.700883150100708, + "logits/rejected": -3.5394699573516846, + "logps/chosen": -252.55007934570312, + "logps/rejected": -296.26751708984375, + "loss": 0.3648, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16407988965511322, + "rewards/margins": 1.4042044878005981, + "rewards/rejected": -1.568284273147583, + "step": 5248 + }, + { + "epoch": 0.61, + "learning_rate": 1.2027390846306917e-07, + "logits/chosen": -3.3435847759246826, + "logits/rejected": -3.6531989574432373, + "logps/chosen": -281.9114685058594, + "logps/rejected": -149.0885467529297, + "loss": 0.4394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3497462272644043, + "rewards/margins": 0.9830405712127686, + "rewards/rejected": -1.3327867984771729, + "step": 5249 + }, + { + "epoch": 0.61, + "learning_rate": 1.2023879199344493e-07, + "logits/chosen": -3.815300941467285, + "logits/rejected": -3.3272323608398438, + "logps/chosen": -317.18035888671875, + "logps/rejected": -256.69805908203125, + "loss": 0.4156, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04160308837890625, + "rewards/margins": 2.046069383621216, + "rewards/rejected": -2.087672472000122, + "step": 5250 + }, + { + "epoch": 0.61, + "learning_rate": 1.2020367552382065e-07, + "logits/chosen": -2.4591777324676514, + "logits/rejected": -2.8692550659179688, + "logps/chosen": -167.66961669921875, + "logps/rejected": -271.9220886230469, + "loss": 0.2176, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09552323818206787, + "rewards/margins": 3.819779396057129, + "rewards/rejected": -3.7242558002471924, + "step": 5251 + }, + { + "epoch": 0.61, + "learning_rate": 1.201685590541964e-07, + "logits/chosen": -3.1461150646209717, + "logits/rejected": -3.147984743118286, + "logps/chosen": -196.43641662597656, + "logps/rejected": -325.5423583984375, + "loss": 0.1833, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18382255733013153, + "rewards/margins": 2.732227325439453, + "rewards/rejected": -2.5484049320220947, + "step": 5252 + }, + { + "epoch": 0.61, + "learning_rate": 1.2013344258457216e-07, + "logits/chosen": -2.6345555782318115, + "logits/rejected": -2.861504316329956, + "logps/chosen": -325.9178466796875, + "logps/rejected": -274.4767150878906, + "loss": 0.4591, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.518799901008606, + "rewards/margins": 1.3637924194335938, + "rewards/rejected": -1.8825924396514893, + "step": 5253 + }, + { + "epoch": 0.61, + "learning_rate": 1.2009832611494792e-07, + "logits/chosen": -2.8335118293762207, + "logits/rejected": -2.546264171600342, + "logps/chosen": -333.8028564453125, + "logps/rejected": -265.1710205078125, + "loss": 0.8559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21729569137096405, + "rewards/margins": 0.4414712190628052, + "rewards/rejected": -0.6587669253349304, + "step": 5254 + }, + { + "epoch": 0.61, + "learning_rate": 1.2006320964532364e-07, + "logits/chosen": -2.810483455657959, + "logits/rejected": -2.869291305541992, + "logps/chosen": -271.9112548828125, + "logps/rejected": -205.33668518066406, + "loss": 0.4359, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04997949302196503, + "rewards/margins": 1.3843660354614258, + "rewards/rejected": -1.3343865871429443, + "step": 5255 + }, + { + "epoch": 0.61, + "learning_rate": 1.200280931756994e-07, + "logits/chosen": -3.0189099311828613, + "logits/rejected": -3.150319814682007, + "logps/chosen": -314.3910217285156, + "logps/rejected": -274.66558837890625, + "loss": 0.4225, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08220675587654114, + "rewards/margins": 1.5590273141860962, + "rewards/rejected": -1.4768205881118774, + "step": 5256 + }, + { + "epoch": 0.61, + "learning_rate": 1.1999297670607515e-07, + "logits/chosen": -3.6237306594848633, + "logits/rejected": -3.667257785797119, + "logps/chosen": -138.72219848632812, + "logps/rejected": -225.99139404296875, + "loss": 0.3974, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3245377540588379, + "rewards/margins": 1.8181201219558716, + "rewards/rejected": -2.142657995223999, + "step": 5257 + }, + { + "epoch": 0.61, + "learning_rate": 1.199578602364509e-07, + "logits/chosen": -2.6627988815307617, + "logits/rejected": -3.0449540615081787, + "logps/chosen": -229.34292602539062, + "logps/rejected": -360.1202392578125, + "loss": 0.2943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08262103796005249, + "rewards/margins": 2.7906956672668457, + "rewards/rejected": -2.873316526412964, + "step": 5258 + }, + { + "epoch": 0.61, + "learning_rate": 1.1992274376682663e-07, + "logits/chosen": -2.2099671363830566, + "logits/rejected": -2.3426947593688965, + "logps/chosen": -262.30718994140625, + "logps/rejected": -304.1112365722656, + "loss": 0.8665, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5446299314498901, + "rewards/margins": 1.161249041557312, + "rewards/rejected": -1.7058790922164917, + "step": 5259 + }, + { + "epoch": 0.61, + "learning_rate": 1.1988762729720239e-07, + "logits/chosen": -3.844816207885742, + "logits/rejected": -4.067734718322754, + "logps/chosen": -145.03819274902344, + "logps/rejected": -282.3373718261719, + "loss": 0.3722, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2555675208568573, + "rewards/margins": 3.2537617683410645, + "rewards/rejected": -3.509329319000244, + "step": 5260 + }, + { + "epoch": 0.61, + "learning_rate": 1.1985251082757814e-07, + "logits/chosen": -3.0935184955596924, + "logits/rejected": -3.371138572692871, + "logps/chosen": -281.9754638671875, + "logps/rejected": -285.90301513671875, + "loss": 0.2597, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24172136187553406, + "rewards/margins": 2.565387725830078, + "rewards/rejected": -2.3236663341522217, + "step": 5261 + }, + { + "epoch": 0.61, + "learning_rate": 1.1981739435795387e-07, + "logits/chosen": -2.8428263664245605, + "logits/rejected": -2.8018476963043213, + "logps/chosen": -305.2217102050781, + "logps/rejected": -300.6449890136719, + "loss": 0.5926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7708445191383362, + "rewards/margins": 1.2593092918395996, + "rewards/rejected": -2.030153751373291, + "step": 5262 + }, + { + "epoch": 0.61, + "learning_rate": 1.1978227788832962e-07, + "logits/chosen": -3.1400656700134277, + "logits/rejected": -3.187441110610962, + "logps/chosen": -177.19375610351562, + "logps/rejected": -205.10202026367188, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10997073352336884, + "rewards/margins": 2.0718464851379395, + "rewards/rejected": -1.9618757963180542, + "step": 5263 + }, + { + "epoch": 0.61, + "learning_rate": 1.1974716141870535e-07, + "logits/chosen": -2.060303211212158, + "logits/rejected": -1.9244282245635986, + "logps/chosen": -248.10623168945312, + "logps/rejected": -204.86111450195312, + "loss": 0.5644, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10428924113512039, + "rewards/margins": 0.6631040573120117, + "rewards/rejected": -0.7673932313919067, + "step": 5264 + }, + { + "epoch": 0.61, + "learning_rate": 1.1971204494908113e-07, + "logits/chosen": -2.9772720336914062, + "logits/rejected": -2.8204245567321777, + "logps/chosen": -343.1988220214844, + "logps/rejected": -278.396728515625, + "loss": 0.6047, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18444335460662842, + "rewards/margins": 0.6080455780029297, + "rewards/rejected": -0.4236021637916565, + "step": 5265 + }, + { + "epoch": 0.61, + "learning_rate": 1.1967692847945686e-07, + "logits/chosen": -3.187317132949829, + "logits/rejected": -3.0652551651000977, + "logps/chosen": -154.0248260498047, + "logps/rejected": -182.4474639892578, + "loss": 0.3734, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.024674534797668457, + "rewards/margins": 1.5339751243591309, + "rewards/rejected": -1.5586495399475098, + "step": 5266 + }, + { + "epoch": 0.61, + "learning_rate": 1.196418120098326e-07, + "logits/chosen": -3.5659375190734863, + "logits/rejected": -3.2536399364471436, + "logps/chosen": -319.11846923828125, + "logps/rejected": -219.90380859375, + "loss": 0.3997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20624789595603943, + "rewards/margins": 1.5621837377548218, + "rewards/rejected": -1.7684316635131836, + "step": 5267 + }, + { + "epoch": 0.61, + "learning_rate": 1.1960669554020834e-07, + "logits/chosen": -2.498556137084961, + "logits/rejected": -2.259441614151001, + "logps/chosen": -458.2297058105469, + "logps/rejected": -367.47039794921875, + "loss": 0.1813, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08534112572669983, + "rewards/margins": 2.304173707962036, + "rewards/rejected": -2.218832492828369, + "step": 5268 + }, + { + "epoch": 0.61, + "learning_rate": 1.1957157907058412e-07, + "logits/chosen": -2.806627035140991, + "logits/rejected": -2.6817054748535156, + "logps/chosen": -282.81439208984375, + "logps/rejected": -341.064453125, + "loss": 0.3562, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02530829608440399, + "rewards/margins": 1.363014578819275, + "rewards/rejected": -1.3883228302001953, + "step": 5269 + }, + { + "epoch": 0.61, + "learning_rate": 1.1953646260095984e-07, + "logits/chosen": -3.093775987625122, + "logits/rejected": -3.0295724868774414, + "logps/chosen": -261.1454162597656, + "logps/rejected": -266.12225341796875, + "loss": 0.4608, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04410265386104584, + "rewards/margins": 1.73930025100708, + "rewards/rejected": -1.7834028005599976, + "step": 5270 + }, + { + "epoch": 0.61, + "learning_rate": 1.195013461313356e-07, + "logits/chosen": -2.2236580848693848, + "logits/rejected": -2.1952908039093018, + "logps/chosen": -423.15618896484375, + "logps/rejected": -427.3813781738281, + "loss": 0.4028, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30229640007019043, + "rewards/margins": 1.8198291063308716, + "rewards/rejected": -1.5175325870513916, + "step": 5271 + }, + { + "epoch": 0.61, + "learning_rate": 1.1946622966171133e-07, + "logits/chosen": -3.947577476501465, + "logits/rejected": -3.789827346801758, + "logps/chosen": -256.69659423828125, + "logps/rejected": -223.3997802734375, + "loss": 0.4425, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11041039973497391, + "rewards/margins": 1.3474764823913574, + "rewards/rejected": -1.4578869342803955, + "step": 5272 + }, + { + "epoch": 0.61, + "learning_rate": 1.1943111319208708e-07, + "logits/chosen": -3.1630654335021973, + "logits/rejected": -3.676985740661621, + "logps/chosen": -212.4100341796875, + "logps/rejected": -255.36795043945312, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43209031224250793, + "rewards/margins": 3.3255698680877686, + "rewards/rejected": -2.893479347229004, + "step": 5273 + }, + { + "epoch": 0.61, + "learning_rate": 1.1939599672246283e-07, + "logits/chosen": -2.889444351196289, + "logits/rejected": -2.722604274749756, + "logps/chosen": -341.0267639160156, + "logps/rejected": -287.3748474121094, + "loss": 0.3746, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.677532434463501, + "rewards/margins": 2.2087178230285645, + "rewards/rejected": -2.8862500190734863, + "step": 5274 + }, + { + "epoch": 0.61, + "learning_rate": 1.193608802528386e-07, + "logits/chosen": -2.539132595062256, + "logits/rejected": -2.625786304473877, + "logps/chosen": -208.70423889160156, + "logps/rejected": -242.12620544433594, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05602100491523743, + "rewards/margins": 1.6638544797897339, + "rewards/rejected": -1.719875454902649, + "step": 5275 + }, + { + "epoch": 0.61, + "learning_rate": 1.1932576378321432e-07, + "logits/chosen": -2.5883026123046875, + "logits/rejected": -2.6176249980926514, + "logps/chosen": -244.16883850097656, + "logps/rejected": -257.6977233886719, + "loss": 0.5963, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3560296297073364, + "rewards/margins": 2.2562255859375, + "rewards/rejected": -2.612255334854126, + "step": 5276 + }, + { + "epoch": 0.61, + "learning_rate": 1.1929064731359007e-07, + "logits/chosen": -2.9746947288513184, + "logits/rejected": -2.738182544708252, + "logps/chosen": -133.9920654296875, + "logps/rejected": -292.6603698730469, + "loss": 0.5034, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2177162766456604, + "rewards/margins": 1.9542315006256104, + "rewards/rejected": -2.171947956085205, + "step": 5277 + }, + { + "epoch": 0.61, + "learning_rate": 1.1925553084396582e-07, + "logits/chosen": -3.16219425201416, + "logits/rejected": -3.2775774002075195, + "logps/chosen": -301.2888488769531, + "logps/rejected": -271.2750244140625, + "loss": 0.2548, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13881921768188477, + "rewards/margins": 2.6677494049072266, + "rewards/rejected": -2.528930425643921, + "step": 5278 + }, + { + "epoch": 0.61, + "learning_rate": 1.1922041437434155e-07, + "logits/chosen": -3.3047657012939453, + "logits/rejected": -3.3548595905303955, + "logps/chosen": -239.56040954589844, + "logps/rejected": -255.6653289794922, + "loss": 0.8374, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.837917685508728, + "rewards/margins": 0.8629946708679199, + "rewards/rejected": -1.7009124755859375, + "step": 5279 + }, + { + "epoch": 0.61, + "learning_rate": 1.191852979047173e-07, + "logits/chosen": -3.5522146224975586, + "logits/rejected": -3.5528056621551514, + "logps/chosen": -212.634521484375, + "logps/rejected": -294.1792907714844, + "loss": 0.1479, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33663004636764526, + "rewards/margins": 2.5563061237335205, + "rewards/rejected": -2.2196762561798096, + "step": 5280 + }, + { + "epoch": 0.61, + "learning_rate": 1.1915018143509306e-07, + "logits/chosen": -3.1175241470336914, + "logits/rejected": -3.093761920928955, + "logps/chosen": -450.05426025390625, + "logps/rejected": -337.426025390625, + "loss": 0.4192, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04596281051635742, + "rewards/margins": 2.2999346256256104, + "rewards/rejected": -2.3458971977233887, + "step": 5281 + }, + { + "epoch": 0.61, + "learning_rate": 1.191150649654688e-07, + "logits/chosen": -3.022493839263916, + "logits/rejected": -3.0333921909332275, + "logps/chosen": -142.62327575683594, + "logps/rejected": -248.74234008789062, + "loss": 0.4135, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2471529096364975, + "rewards/margins": 1.4975841045379639, + "rewards/rejected": -1.744737148284912, + "step": 5282 + }, + { + "epoch": 0.61, + "learning_rate": 1.1907994849584454e-07, + "logits/chosen": -3.1336073875427246, + "logits/rejected": -2.5719711780548096, + "logps/chosen": -368.74749755859375, + "logps/rejected": -241.32363891601562, + "loss": 0.419, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8870683908462524, + "rewards/margins": 1.1168632507324219, + "rewards/rejected": -2.003931760787964, + "step": 5283 + }, + { + "epoch": 0.61, + "learning_rate": 1.1904483202622029e-07, + "logits/chosen": -3.1582255363464355, + "logits/rejected": -3.1383910179138184, + "logps/chosen": -270.9600830078125, + "logps/rejected": -328.2778015136719, + "loss": 0.308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34702032804489136, + "rewards/margins": 2.6627840995788574, + "rewards/rejected": -3.0098042488098145, + "step": 5284 + }, + { + "epoch": 0.61, + "learning_rate": 1.1900971555659605e-07, + "logits/chosen": -3.577345848083496, + "logits/rejected": -3.2390990257263184, + "logps/chosen": -249.53189086914062, + "logps/rejected": -143.78677368164062, + "loss": 0.2213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2536546587944031, + "rewards/margins": 1.6147410869598389, + "rewards/rejected": -1.8683958053588867, + "step": 5285 + }, + { + "epoch": 0.61, + "learning_rate": 1.1897459908697179e-07, + "logits/chosen": -3.1280903816223145, + "logits/rejected": -3.047313690185547, + "logps/chosen": -223.62655639648438, + "logps/rejected": -235.99679565429688, + "loss": 0.6442, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6268102526664734, + "rewards/margins": 1.199461817741394, + "rewards/rejected": -1.8262720108032227, + "step": 5286 + }, + { + "epoch": 0.61, + "learning_rate": 1.1893948261734753e-07, + "logits/chosen": -3.4842052459716797, + "logits/rejected": -2.7985804080963135, + "logps/chosen": -252.07977294921875, + "logps/rejected": -250.7867431640625, + "loss": 0.2884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41090714931488037, + "rewards/margins": 1.8253885507583618, + "rewards/rejected": -2.236295700073242, + "step": 5287 + }, + { + "epoch": 0.61, + "learning_rate": 1.1890436614772327e-07, + "logits/chosen": -3.4553141593933105, + "logits/rejected": -3.8400914669036865, + "logps/chosen": -238.926025390625, + "logps/rejected": -483.93255615234375, + "loss": 0.2362, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08708344399929047, + "rewards/margins": 2.877342939376831, + "rewards/rejected": -2.790259599685669, + "step": 5288 + }, + { + "epoch": 0.61, + "learning_rate": 1.1886924967809901e-07, + "logits/chosen": -3.441469192504883, + "logits/rejected": -3.3018500804901123, + "logps/chosen": -265.0951232910156, + "logps/rejected": -172.10955810546875, + "loss": 0.8662, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5917856097221375, + "rewards/margins": 0.12730883061885834, + "rewards/rejected": -0.7190945148468018, + "step": 5289 + }, + { + "epoch": 0.61, + "learning_rate": 1.1883413320847478e-07, + "logits/chosen": -3.458289623260498, + "logits/rejected": -3.163661003112793, + "logps/chosen": -301.8624267578125, + "logps/rejected": -285.4290771484375, + "loss": 0.1464, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2466646134853363, + "rewards/margins": 2.864305019378662, + "rewards/rejected": -2.617640256881714, + "step": 5290 + }, + { + "epoch": 0.61, + "learning_rate": 1.1879901673885052e-07, + "logits/chosen": -3.9664559364318848, + "logits/rejected": -3.89241886138916, + "logps/chosen": -218.73037719726562, + "logps/rejected": -169.672607421875, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3425738513469696, + "rewards/margins": 1.9347800016403198, + "rewards/rejected": -2.2773537635803223, + "step": 5291 + }, + { + "epoch": 0.61, + "learning_rate": 1.1876390026922626e-07, + "logits/chosen": -2.964613199234009, + "logits/rejected": -2.864636182785034, + "logps/chosen": -209.5089569091797, + "logps/rejected": -266.8509216308594, + "loss": 0.5364, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4176611304283142, + "rewards/margins": 1.1993352174758911, + "rewards/rejected": -1.61699640750885, + "step": 5292 + }, + { + "epoch": 0.61, + "learning_rate": 1.18728783799602e-07, + "logits/chosen": -3.19462251663208, + "logits/rejected": -3.375669479370117, + "logps/chosen": -165.86605834960938, + "logps/rejected": -169.89500427246094, + "loss": 0.4917, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19701166450977325, + "rewards/margins": 1.4314093589782715, + "rewards/rejected": -1.2343976497650146, + "step": 5293 + }, + { + "epoch": 0.61, + "learning_rate": 1.1869366732997777e-07, + "logits/chosen": -2.4611167907714844, + "logits/rejected": -2.904947519302368, + "logps/chosen": -150.60958862304688, + "logps/rejected": -286.30218505859375, + "loss": 0.5097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1646120548248291, + "rewards/margins": 1.449049711227417, + "rewards/rejected": -1.613661766052246, + "step": 5294 + }, + { + "epoch": 0.61, + "learning_rate": 1.186585508603535e-07, + "logits/chosen": -2.7439794540405273, + "logits/rejected": -2.790468692779541, + "logps/chosen": -117.99046325683594, + "logps/rejected": -242.6410675048828, + "loss": 0.3835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16659428179264069, + "rewards/margins": 1.763291358947754, + "rewards/rejected": -1.929885745048523, + "step": 5295 + }, + { + "epoch": 0.61, + "learning_rate": 1.1862343439072925e-07, + "logits/chosen": -2.7106122970581055, + "logits/rejected": -2.639359951019287, + "logps/chosen": -336.0431213378906, + "logps/rejected": -280.45977783203125, + "loss": 0.5486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03731174021959305, + "rewards/margins": 1.3584873676300049, + "rewards/rejected": -1.395799160003662, + "step": 5296 + }, + { + "epoch": 0.61, + "learning_rate": 1.1858831792110499e-07, + "logits/chosen": -3.347933053970337, + "logits/rejected": -3.309882402420044, + "logps/chosen": -217.00970458984375, + "logps/rejected": -189.84866333007812, + "loss": 0.3179, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19347253441810608, + "rewards/margins": 1.369217872619629, + "rewards/rejected": -1.5626904964447021, + "step": 5297 + }, + { + "epoch": 0.61, + "learning_rate": 1.1855320145148074e-07, + "logits/chosen": -3.3293817043304443, + "logits/rejected": -2.966381549835205, + "logps/chosen": -204.59609985351562, + "logps/rejected": -258.4775390625, + "loss": 0.5751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3843788206577301, + "rewards/margins": 0.7772958874702454, + "rewards/rejected": -1.1616747379302979, + "step": 5298 + }, + { + "epoch": 0.61, + "learning_rate": 1.1851808498185648e-07, + "logits/chosen": -3.2447218894958496, + "logits/rejected": -3.351438522338867, + "logps/chosen": -185.56190490722656, + "logps/rejected": -254.40341186523438, + "loss": 0.3343, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41642457246780396, + "rewards/margins": 2.1154158115386963, + "rewards/rejected": -2.5318403244018555, + "step": 5299 + }, + { + "epoch": 0.61, + "learning_rate": 1.1848296851223222e-07, + "logits/chosen": -3.2801432609558105, + "logits/rejected": -2.935929298400879, + "logps/chosen": -265.004150390625, + "logps/rejected": -269.19580078125, + "loss": 0.751, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.111660435795784, + "rewards/margins": 1.7392542362213135, + "rewards/rejected": -1.8509145975112915, + "step": 5300 + }, + { + "epoch": 0.61, + "learning_rate": 1.1844785204260798e-07, + "logits/chosen": -3.5987958908081055, + "logits/rejected": -3.50144362449646, + "logps/chosen": -234.84075927734375, + "logps/rejected": -238.41683959960938, + "loss": 0.3501, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24521403014659882, + "rewards/margins": 1.6754508018493652, + "rewards/rejected": -1.4302366971969604, + "step": 5301 + }, + { + "epoch": 0.61, + "learning_rate": 1.1841273557298373e-07, + "logits/chosen": -3.4161148071289062, + "logits/rejected": -3.103753089904785, + "logps/chosen": -136.54122924804688, + "logps/rejected": -205.8348388671875, + "loss": 0.9316, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5553924441337585, + "rewards/margins": 0.5703779458999634, + "rewards/rejected": -1.1257704496383667, + "step": 5302 + }, + { + "epoch": 0.61, + "learning_rate": 1.1837761910335947e-07, + "logits/chosen": -3.3338518142700195, + "logits/rejected": -3.2054007053375244, + "logps/chosen": -472.4631652832031, + "logps/rejected": -259.8365478515625, + "loss": 0.2248, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5355492830276489, + "rewards/margins": 2.5621094703674316, + "rewards/rejected": -2.0265605449676514, + "step": 5303 + }, + { + "epoch": 0.61, + "learning_rate": 1.1834250263373521e-07, + "logits/chosen": -3.085245132446289, + "logits/rejected": -3.0464587211608887, + "logps/chosen": -454.04937744140625, + "logps/rejected": -367.3216857910156, + "loss": 0.4047, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22949331998825073, + "rewards/margins": 1.540716528892517, + "rewards/rejected": -1.770209789276123, + "step": 5304 + }, + { + "epoch": 0.61, + "learning_rate": 1.1830738616411095e-07, + "logits/chosen": -3.0679941177368164, + "logits/rejected": -2.881873846054077, + "logps/chosen": -376.23602294921875, + "logps/rejected": -266.7079162597656, + "loss": 0.3172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2813473641872406, + "rewards/margins": 2.0690431594848633, + "rewards/rejected": -2.3503904342651367, + "step": 5305 + }, + { + "epoch": 0.61, + "learning_rate": 1.1827226969448672e-07, + "logits/chosen": -3.325892925262451, + "logits/rejected": -3.5403671264648438, + "logps/chosen": -146.9102783203125, + "logps/rejected": -244.6565704345703, + "loss": 0.32, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24200797080993652, + "rewards/margins": 2.2373180389404297, + "rewards/rejected": -2.479326009750366, + "step": 5306 + }, + { + "epoch": 0.61, + "learning_rate": 1.1823715322486246e-07, + "logits/chosen": -3.0236501693725586, + "logits/rejected": -3.1799793243408203, + "logps/chosen": -194.70164489746094, + "logps/rejected": -193.9634552001953, + "loss": 0.3651, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16030502319335938, + "rewards/margins": 2.7424092292785645, + "rewards/rejected": -2.582104444503784, + "step": 5307 + }, + { + "epoch": 0.61, + "learning_rate": 1.182020367552382e-07, + "logits/chosen": -3.430779457092285, + "logits/rejected": -3.5485873222351074, + "logps/chosen": -215.32264709472656, + "logps/rejected": -279.6903991699219, + "loss": 0.6442, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5702784061431885, + "rewards/margins": 0.4695984125137329, + "rewards/rejected": -1.0398766994476318, + "step": 5308 + }, + { + "epoch": 0.61, + "learning_rate": 1.1816692028561394e-07, + "logits/chosen": -3.5047433376312256, + "logits/rejected": -3.1535704135894775, + "logps/chosen": -475.0372314453125, + "logps/rejected": -330.7916564941406, + "loss": 0.2436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1843947172164917, + "rewards/margins": 3.481719732284546, + "rewards/rejected": -3.666114568710327, + "step": 5309 + }, + { + "epoch": 0.61, + "learning_rate": 1.181318038159897e-07, + "logits/chosen": -2.5618462562561035, + "logits/rejected": -2.5857903957366943, + "logps/chosen": -343.82763671875, + "logps/rejected": -261.24713134765625, + "loss": 0.4493, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1268748790025711, + "rewards/margins": 0.7392730712890625, + "rewards/rejected": -0.8661479353904724, + "step": 5310 + }, + { + "epoch": 0.61, + "learning_rate": 1.1809668734636545e-07, + "logits/chosen": -2.926823139190674, + "logits/rejected": -3.2674179077148438, + "logps/chosen": -274.2376403808594, + "logps/rejected": -322.9490966796875, + "loss": 0.1251, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.246821641921997, + "rewards/margins": 3.441399574279785, + "rewards/rejected": -2.194577693939209, + "step": 5311 + }, + { + "epoch": 0.61, + "learning_rate": 1.1806157087674119e-07, + "logits/chosen": -2.785946846008301, + "logits/rejected": -3.198619842529297, + "logps/chosen": -347.98626708984375, + "logps/rejected": -524.7647705078125, + "loss": 0.6095, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.38966768980026245, + "rewards/margins": 1.7886133193969727, + "rewards/rejected": -2.178280830383301, + "step": 5312 + }, + { + "epoch": 0.61, + "learning_rate": 1.1802645440711693e-07, + "logits/chosen": -2.4815726280212402, + "logits/rejected": -2.538074016571045, + "logps/chosen": -125.90350341796875, + "logps/rejected": -125.7213134765625, + "loss": 0.3306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16973868012428284, + "rewards/margins": 1.1876661777496338, + "rewards/rejected": -1.3574048280715942, + "step": 5313 + }, + { + "epoch": 0.61, + "learning_rate": 1.1799133793749268e-07, + "logits/chosen": -3.2826974391937256, + "logits/rejected": -3.6301088333129883, + "logps/chosen": -465.04888916015625, + "logps/rejected": -274.7007141113281, + "loss": 0.5097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5384688377380371, + "rewards/margins": 0.7397345304489136, + "rewards/rejected": -1.2782033681869507, + "step": 5314 + }, + { + "epoch": 0.61, + "learning_rate": 1.1795622146786842e-07, + "logits/chosen": -3.179577350616455, + "logits/rejected": -3.2284016609191895, + "logps/chosen": -307.47021484375, + "logps/rejected": -299.35601806640625, + "loss": 0.5358, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.043544963002204895, + "rewards/margins": 1.4730011224746704, + "rewards/rejected": -1.429456114768982, + "step": 5315 + }, + { + "epoch": 0.61, + "learning_rate": 1.1792110499824416e-07, + "logits/chosen": -3.4529571533203125, + "logits/rejected": -3.3142149448394775, + "logps/chosen": -260.7003479003906, + "logps/rejected": -185.76760864257812, + "loss": 0.4209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38796547055244446, + "rewards/margins": 1.4888010025024414, + "rewards/rejected": -1.876766324043274, + "step": 5316 + }, + { + "epoch": 0.61, + "learning_rate": 1.178859885286199e-07, + "logits/chosen": -3.271704912185669, + "logits/rejected": -3.2487998008728027, + "logps/chosen": -238.91781616210938, + "logps/rejected": -226.2290496826172, + "loss": 0.5238, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16189655661582947, + "rewards/margins": 1.5519782304763794, + "rewards/rejected": -1.7138746976852417, + "step": 5317 + }, + { + "epoch": 0.61, + "learning_rate": 1.1785087205899567e-07, + "logits/chosen": -2.915142059326172, + "logits/rejected": -2.652663469314575, + "logps/chosen": -278.468017578125, + "logps/rejected": -192.33761596679688, + "loss": 0.5338, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5215287804603577, + "rewards/margins": 1.0504300594329834, + "rewards/rejected": -1.5719587802886963, + "step": 5318 + }, + { + "epoch": 0.61, + "learning_rate": 1.1781575558937141e-07, + "logits/chosen": -3.5180277824401855, + "logits/rejected": -3.282268524169922, + "logps/chosen": -241.83883666992188, + "logps/rejected": -200.3997344970703, + "loss": 0.2812, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15620733797550201, + "rewards/margins": 1.6773450374603271, + "rewards/rejected": -1.5211377143859863, + "step": 5319 + }, + { + "epoch": 0.61, + "learning_rate": 1.1778063911974715e-07, + "logits/chosen": -3.0525553226470947, + "logits/rejected": -3.3098504543304443, + "logps/chosen": -201.21319580078125, + "logps/rejected": -275.56060791015625, + "loss": 0.3635, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3383446931838989, + "rewards/margins": 3.3931355476379395, + "rewards/rejected": -3.731480598449707, + "step": 5320 + }, + { + "epoch": 0.61, + "learning_rate": 1.177455226501229e-07, + "logits/chosen": -2.9487924575805664, + "logits/rejected": -2.9715561866760254, + "logps/chosen": -362.1551513671875, + "logps/rejected": -333.4438171386719, + "loss": 0.5509, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14812664687633514, + "rewards/margins": 1.1062774658203125, + "rewards/rejected": -1.254404067993164, + "step": 5321 + }, + { + "epoch": 0.61, + "learning_rate": 1.1771040618049866e-07, + "logits/chosen": -3.723816394805908, + "logits/rejected": -3.8011326789855957, + "logps/chosen": -373.8204650878906, + "logps/rejected": -288.3244934082031, + "loss": 0.5041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5491907596588135, + "rewards/margins": 1.336431622505188, + "rewards/rejected": -1.885622501373291, + "step": 5322 + }, + { + "epoch": 0.61, + "learning_rate": 1.176752897108744e-07, + "logits/chosen": -2.927650213241577, + "logits/rejected": -2.8360512256622314, + "logps/chosen": -284.93756103515625, + "logps/rejected": -205.94961547851562, + "loss": 0.2744, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3633912205696106, + "rewards/margins": 1.5899953842163086, + "rewards/rejected": -1.2266043424606323, + "step": 5323 + }, + { + "epoch": 0.61, + "learning_rate": 1.1764017324125014e-07, + "logits/chosen": -3.083552598953247, + "logits/rejected": -3.436340808868408, + "logps/chosen": -356.75830078125, + "logps/rejected": -297.759765625, + "loss": 0.6191, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2587321400642395, + "rewards/margins": 0.8752649426460266, + "rewards/rejected": -0.6165328025817871, + "step": 5324 + }, + { + "epoch": 0.61, + "learning_rate": 1.1760505677162588e-07, + "logits/chosen": -2.9998779296875, + "logits/rejected": -2.7792530059814453, + "logps/chosen": -282.32183837890625, + "logps/rejected": -268.5966796875, + "loss": 0.3836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14703623950481415, + "rewards/margins": 3.291665554046631, + "rewards/rejected": -3.1446292400360107, + "step": 5325 + }, + { + "epoch": 0.61, + "learning_rate": 1.1756994030200164e-07, + "logits/chosen": -2.91245698928833, + "logits/rejected": -2.907775402069092, + "logps/chosen": -320.2054443359375, + "logps/rejected": -227.05935668945312, + "loss": 0.189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1989031285047531, + "rewards/margins": 1.9366915225982666, + "rewards/rejected": -2.135594367980957, + "step": 5326 + }, + { + "epoch": 0.61, + "learning_rate": 1.1753482383237738e-07, + "logits/chosen": -2.9528565406799316, + "logits/rejected": -3.032849073410034, + "logps/chosen": -183.32455444335938, + "logps/rejected": -265.6575622558594, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3145020604133606, + "rewards/margins": 1.8275034427642822, + "rewards/rejected": -2.142005443572998, + "step": 5327 + }, + { + "epoch": 0.61, + "learning_rate": 1.1749970736275313e-07, + "logits/chosen": -3.0894155502319336, + "logits/rejected": -2.8906004428863525, + "logps/chosen": -362.88482666015625, + "logps/rejected": -428.5622863769531, + "loss": 0.076, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.331216424703598, + "rewards/margins": 3.1433463096618652, + "rewards/rejected": -2.8121302127838135, + "step": 5328 + }, + { + "epoch": 0.61, + "learning_rate": 1.1746459089312887e-07, + "logits/chosen": -2.402616500854492, + "logits/rejected": -2.1959052085876465, + "logps/chosen": -321.28521728515625, + "logps/rejected": -364.29913330078125, + "loss": 0.3752, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.762101411819458, + "rewards/margins": 2.0562899112701416, + "rewards/rejected": -2.8183910846710205, + "step": 5329 + }, + { + "epoch": 0.61, + "learning_rate": 1.1742947442350463e-07, + "logits/chosen": -3.1478371620178223, + "logits/rejected": -2.9943008422851562, + "logps/chosen": -351.0337219238281, + "logps/rejected": -258.2586364746094, + "loss": 0.253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6636461019515991, + "rewards/margins": 1.955299973487854, + "rewards/rejected": -2.618946075439453, + "step": 5330 + }, + { + "epoch": 0.61, + "learning_rate": 1.1739435795388037e-07, + "logits/chosen": -3.491133213043213, + "logits/rejected": -3.777883529663086, + "logps/chosen": -135.02462768554688, + "logps/rejected": -318.19952392578125, + "loss": 0.3777, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.015131175518035889, + "rewards/margins": 3.5479683876037598, + "rewards/rejected": -3.532837152481079, + "step": 5331 + }, + { + "epoch": 0.61, + "learning_rate": 1.1735924148425611e-07, + "logits/chosen": -3.3519742488861084, + "logits/rejected": -3.6445095539093018, + "logps/chosen": -180.16610717773438, + "logps/rejected": -304.76409912109375, + "loss": 0.5737, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8437068462371826, + "rewards/margins": 4.059483528137207, + "rewards/rejected": -4.903190612792969, + "step": 5332 + }, + { + "epoch": 0.61, + "learning_rate": 1.1732412501463185e-07, + "logits/chosen": -3.0656003952026367, + "logits/rejected": -3.0494801998138428, + "logps/chosen": -284.759033203125, + "logps/rejected": -347.4619140625, + "loss": 0.3902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.288232684135437, + "rewards/margins": 1.8624756336212158, + "rewards/rejected": -2.1507084369659424, + "step": 5333 + }, + { + "epoch": 0.61, + "learning_rate": 1.1728900854500759e-07, + "logits/chosen": -3.800502300262451, + "logits/rejected": -3.53464937210083, + "logps/chosen": -287.4187316894531, + "logps/rejected": -187.14830017089844, + "loss": 0.3117, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46296465396881104, + "rewards/margins": 2.2086329460144043, + "rewards/rejected": -2.671597719192505, + "step": 5334 + }, + { + "epoch": 0.62, + "learning_rate": 1.1725389207538336e-07, + "logits/chosen": -2.9256017208099365, + "logits/rejected": -2.8205676078796387, + "logps/chosen": -178.267578125, + "logps/rejected": -233.16204833984375, + "loss": 0.3904, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.328305721282959, + "rewards/margins": 2.335015296936035, + "rewards/rejected": -2.663321018218994, + "step": 5335 + }, + { + "epoch": 0.62, + "learning_rate": 1.172187756057591e-07, + "logits/chosen": -3.3814122676849365, + "logits/rejected": -3.501513719558716, + "logps/chosen": -121.938720703125, + "logps/rejected": -173.59361267089844, + "loss": 0.6302, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23246164619922638, + "rewards/margins": 0.9932323694229126, + "rewards/rejected": -1.2256940603256226, + "step": 5336 + }, + { + "epoch": 0.62, + "learning_rate": 1.1718365913613484e-07, + "logits/chosen": -3.236764907836914, + "logits/rejected": -3.1434755325317383, + "logps/chosen": -236.4114990234375, + "logps/rejected": -311.14337158203125, + "loss": 0.1755, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.273426353931427, + "rewards/margins": 2.69964861869812, + "rewards/rejected": -2.9730751514434814, + "step": 5337 + }, + { + "epoch": 0.62, + "learning_rate": 1.1714854266651058e-07, + "logits/chosen": -3.455944538116455, + "logits/rejected": -3.6186256408691406, + "logps/chosen": -176.22402954101562, + "logps/rejected": -181.53271484375, + "loss": 0.366, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14881527423858643, + "rewards/margins": 1.9560400247573853, + "rewards/rejected": -1.8072246313095093, + "step": 5338 + }, + { + "epoch": 0.62, + "learning_rate": 1.1711342619688634e-07, + "logits/chosen": -3.065885543823242, + "logits/rejected": -3.2828285694122314, + "logps/chosen": -191.8654327392578, + "logps/rejected": -170.66455078125, + "loss": 0.6744, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3719821572303772, + "rewards/margins": 0.31412550806999207, + "rewards/rejected": -0.6861076354980469, + "step": 5339 + }, + { + "epoch": 0.62, + "learning_rate": 1.1707830972726209e-07, + "logits/chosen": -3.1119391918182373, + "logits/rejected": -2.972588062286377, + "logps/chosen": -284.97113037109375, + "logps/rejected": -236.74998474121094, + "loss": 0.184, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.426011323928833, + "rewards/margins": 2.314789295196533, + "rewards/rejected": -1.8887779712677002, + "step": 5340 + }, + { + "epoch": 0.62, + "learning_rate": 1.1704319325763783e-07, + "logits/chosen": -3.187002182006836, + "logits/rejected": -3.2886123657226562, + "logps/chosen": -204.85812377929688, + "logps/rejected": -185.04075622558594, + "loss": 0.3148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06176924705505371, + "rewards/margins": 1.9781360626220703, + "rewards/rejected": -1.9163668155670166, + "step": 5341 + }, + { + "epoch": 0.62, + "learning_rate": 1.1700807678801357e-07, + "logits/chosen": -2.8209750652313232, + "logits/rejected": -2.702106237411499, + "logps/chosen": -205.88223266601562, + "logps/rejected": -136.56605529785156, + "loss": 0.8899, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5664583444595337, + "rewards/margins": 0.4206068813800812, + "rewards/rejected": -0.9870651960372925, + "step": 5342 + }, + { + "epoch": 0.62, + "learning_rate": 1.1697296031838932e-07, + "logits/chosen": -2.800090789794922, + "logits/rejected": -3.3524303436279297, + "logps/chosen": -218.87380981445312, + "logps/rejected": -282.2682189941406, + "loss": 0.488, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16046997904777527, + "rewards/margins": 1.9108846187591553, + "rewards/rejected": -1.7504147291183472, + "step": 5343 + }, + { + "epoch": 0.62, + "learning_rate": 1.1693784384876506e-07, + "logits/chosen": -3.130704879760742, + "logits/rejected": -2.9810471534729004, + "logps/chosen": -357.56427001953125, + "logps/rejected": -426.0700988769531, + "loss": 0.2248, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24282976984977722, + "rewards/margins": 2.745765447616577, + "rewards/rejected": -2.5029354095458984, + "step": 5344 + }, + { + "epoch": 0.62, + "learning_rate": 1.1690272737914081e-07, + "logits/chosen": -2.4825663566589355, + "logits/rejected": -2.5842790603637695, + "logps/chosen": -271.2116394042969, + "logps/rejected": -171.34786987304688, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5077495574951172, + "rewards/margins": 1.6565362215042114, + "rewards/rejected": -1.1487867832183838, + "step": 5345 + }, + { + "epoch": 0.62, + "learning_rate": 1.1686761090951656e-07, + "logits/chosen": -3.0953383445739746, + "logits/rejected": -2.850071430206299, + "logps/chosen": -435.32421875, + "logps/rejected": -388.5845031738281, + "loss": 0.3082, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1500154435634613, + "rewards/margins": 1.8172104358673096, + "rewards/rejected": -1.9672259092330933, + "step": 5346 + }, + { + "epoch": 0.62, + "learning_rate": 1.1683249443989231e-07, + "logits/chosen": -2.1106648445129395, + "logits/rejected": -2.185969591140747, + "logps/chosen": -442.5934143066406, + "logps/rejected": -398.11981201171875, + "loss": 0.595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12288632243871689, + "rewards/margins": 0.7967591881752014, + "rewards/rejected": -0.9196454882621765, + "step": 5347 + }, + { + "epoch": 0.62, + "learning_rate": 1.1679737797026805e-07, + "logits/chosen": -2.9698996543884277, + "logits/rejected": -2.7327969074249268, + "logps/chosen": -237.7808837890625, + "logps/rejected": -211.74642944335938, + "loss": 0.5587, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.001416921615600586, + "rewards/margins": 1.8736257553100586, + "rewards/rejected": -1.8722089529037476, + "step": 5348 + }, + { + "epoch": 0.62, + "learning_rate": 1.1676226150064379e-07, + "logits/chosen": -2.8923890590667725, + "logits/rejected": -2.6324353218078613, + "logps/chosen": -195.33990478515625, + "logps/rejected": -187.3834228515625, + "loss": 0.5373, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.26573634147644043, + "rewards/margins": 1.328852653503418, + "rewards/rejected": -1.5945888757705688, + "step": 5349 + }, + { + "epoch": 0.62, + "learning_rate": 1.1672714503101953e-07, + "logits/chosen": -3.5589406490325928, + "logits/rejected": -3.4424633979797363, + "logps/chosen": -211.50296020507812, + "logps/rejected": -217.49502563476562, + "loss": 0.6954, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9044696092605591, + "rewards/margins": 0.6498827338218689, + "rewards/rejected": -1.5543524026870728, + "step": 5350 + }, + { + "epoch": 0.62, + "learning_rate": 1.166920285613953e-07, + "logits/chosen": -3.0086312294006348, + "logits/rejected": -2.9488587379455566, + "logps/chosen": -391.87847900390625, + "logps/rejected": -321.02691650390625, + "loss": 0.3231, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49690067768096924, + "rewards/margins": 1.2262020111083984, + "rewards/rejected": -0.729301393032074, + "step": 5351 + }, + { + "epoch": 0.62, + "learning_rate": 1.1665691209177104e-07, + "logits/chosen": -2.7800304889678955, + "logits/rejected": -2.7538325786590576, + "logps/chosen": -288.1872253417969, + "logps/rejected": -257.7411804199219, + "loss": 0.7158, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3013516962528229, + "rewards/margins": 0.32236841320991516, + "rewards/rejected": -0.623720109462738, + "step": 5352 + }, + { + "epoch": 0.62, + "learning_rate": 1.1662179562214678e-07, + "logits/chosen": -2.281803607940674, + "logits/rejected": -2.277806282043457, + "logps/chosen": -473.61590576171875, + "logps/rejected": -395.5416564941406, + "loss": 0.5023, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5726956129074097, + "rewards/margins": 0.8667303919792175, + "rewards/rejected": -0.29403483867645264, + "step": 5353 + }, + { + "epoch": 0.62, + "learning_rate": 1.1658667915252252e-07, + "logits/chosen": -2.492711067199707, + "logits/rejected": -2.419447183609009, + "logps/chosen": -412.77294921875, + "logps/rejected": -319.8740539550781, + "loss": 0.601, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29719799757003784, + "rewards/margins": 0.5715266466140747, + "rewards/rejected": -0.8687245845794678, + "step": 5354 + }, + { + "epoch": 0.62, + "learning_rate": 1.1655156268289827e-07, + "logits/chosen": -3.4408748149871826, + "logits/rejected": -3.4335925579071045, + "logps/chosen": -359.4884033203125, + "logps/rejected": -282.474365234375, + "loss": 0.9117, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1333894729614258, + "rewards/margins": 0.7807798385620117, + "rewards/rejected": -1.914169430732727, + "step": 5355 + }, + { + "epoch": 0.62, + "learning_rate": 1.1651644621327403e-07, + "logits/chosen": -2.9157872200012207, + "logits/rejected": -2.74361515045166, + "logps/chosen": -266.62646484375, + "logps/rejected": -285.52301025390625, + "loss": 0.3993, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2488541603088379, + "rewards/margins": 2.2976036071777344, + "rewards/rejected": -2.546457529067993, + "step": 5356 + }, + { + "epoch": 0.62, + "learning_rate": 1.1648132974364977e-07, + "logits/chosen": -2.7847070693969727, + "logits/rejected": -3.3674535751342773, + "logps/chosen": -134.5704345703125, + "logps/rejected": -188.64144897460938, + "loss": 0.1762, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6167689561843872, + "rewards/margins": 2.695558547973633, + "rewards/rejected": -2.078789710998535, + "step": 5357 + }, + { + "epoch": 0.62, + "learning_rate": 1.1644621327402551e-07, + "logits/chosen": -2.9598584175109863, + "logits/rejected": -2.8595986366271973, + "logps/chosen": -261.0899658203125, + "logps/rejected": -238.88121032714844, + "loss": 0.3561, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16842323541641235, + "rewards/margins": 1.4701862335205078, + "rewards/rejected": -1.6386094093322754, + "step": 5358 + }, + { + "epoch": 0.62, + "learning_rate": 1.1641109680440126e-07, + "logits/chosen": -3.641542911529541, + "logits/rejected": -3.7010583877563477, + "logps/chosen": -171.93292236328125, + "logps/rejected": -216.55474853515625, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20446255803108215, + "rewards/margins": 1.739989995956421, + "rewards/rejected": -1.9444525241851807, + "step": 5359 + }, + { + "epoch": 0.62, + "learning_rate": 1.16375980334777e-07, + "logits/chosen": -2.7971901893615723, + "logits/rejected": -2.84483003616333, + "logps/chosen": -293.93487548828125, + "logps/rejected": -276.29827880859375, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4803926944732666, + "rewards/margins": 2.2325050830841064, + "rewards/rejected": -2.712897539138794, + "step": 5360 + }, + { + "epoch": 0.62, + "learning_rate": 1.1634086386515274e-07, + "logits/chosen": -3.4643607139587402, + "logits/rejected": -3.2944631576538086, + "logps/chosen": -175.54051208496094, + "logps/rejected": -193.58775329589844, + "loss": 0.4159, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32840365171432495, + "rewards/margins": 2.3999476432800293, + "rewards/rejected": -2.728351354598999, + "step": 5361 + }, + { + "epoch": 0.62, + "learning_rate": 1.163057473955285e-07, + "logits/chosen": -3.506871223449707, + "logits/rejected": -3.2820370197296143, + "logps/chosen": -201.41183471679688, + "logps/rejected": -213.82748413085938, + "loss": 0.5613, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7490542531013489, + "rewards/margins": 0.6519057154655457, + "rewards/rejected": -1.4009599685668945, + "step": 5362 + }, + { + "epoch": 0.62, + "learning_rate": 1.1627063092590425e-07, + "logits/chosen": -3.064243793487549, + "logits/rejected": -2.6860570907592773, + "logps/chosen": -170.38438415527344, + "logps/rejected": -225.47384643554688, + "loss": 0.5712, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34198832511901855, + "rewards/margins": 1.3039499521255493, + "rewards/rejected": -1.6459382772445679, + "step": 5363 + }, + { + "epoch": 0.62, + "learning_rate": 1.1623551445627999e-07, + "logits/chosen": -4.304924964904785, + "logits/rejected": -3.9734325408935547, + "logps/chosen": -380.6185607910156, + "logps/rejected": -189.29541015625, + "loss": 0.3436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11818194389343262, + "rewards/margins": 1.7870562076568604, + "rewards/rejected": -1.9052382707595825, + "step": 5364 + }, + { + "epoch": 0.62, + "learning_rate": 1.1620039798665573e-07, + "logits/chosen": -2.360538959503174, + "logits/rejected": -2.377342700958252, + "logps/chosen": -313.07025146484375, + "logps/rejected": -407.6640625, + "loss": 0.4556, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6989781260490417, + "rewards/margins": 1.9685337543487549, + "rewards/rejected": -2.6675119400024414, + "step": 5365 + }, + { + "epoch": 0.62, + "learning_rate": 1.1616528151703147e-07, + "logits/chosen": -3.8501389026641846, + "logits/rejected": -3.4594995975494385, + "logps/chosen": -238.63307189941406, + "logps/rejected": -188.6956329345703, + "loss": 0.5013, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6884474158287048, + "rewards/margins": 1.5536322593688965, + "rewards/rejected": -2.242079734802246, + "step": 5366 + }, + { + "epoch": 0.62, + "learning_rate": 1.1613016504740724e-07, + "logits/chosen": -3.5160298347473145, + "logits/rejected": -2.789252281188965, + "logps/chosen": -309.52423095703125, + "logps/rejected": -162.95404052734375, + "loss": 0.6461, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15706217288970947, + "rewards/margins": 1.5007129907608032, + "rewards/rejected": -1.6577751636505127, + "step": 5367 + }, + { + "epoch": 0.62, + "learning_rate": 1.1609504857778298e-07, + "logits/chosen": -3.1422603130340576, + "logits/rejected": -2.9530131816864014, + "logps/chosen": -138.58863830566406, + "logps/rejected": -172.74046325683594, + "loss": 0.3, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3287200331687927, + "rewards/margins": 2.238513946533203, + "rewards/rejected": -1.909793734550476, + "step": 5368 + }, + { + "epoch": 0.62, + "learning_rate": 1.1605993210815872e-07, + "logits/chosen": -2.3680989742279053, + "logits/rejected": -2.3658642768859863, + "logps/chosen": -331.8570251464844, + "logps/rejected": -191.47860717773438, + "loss": 0.459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29823246598243713, + "rewards/margins": 1.2475379705429077, + "rewards/rejected": -1.545770525932312, + "step": 5369 + }, + { + "epoch": 0.62, + "learning_rate": 1.1602481563853446e-07, + "logits/chosen": -2.9775118827819824, + "logits/rejected": -2.706105947494507, + "logps/chosen": -183.79908752441406, + "logps/rejected": -177.86422729492188, + "loss": 0.3683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19985660910606384, + "rewards/margins": 1.490896463394165, + "rewards/rejected": -1.6907532215118408, + "step": 5370 + }, + { + "epoch": 0.62, + "learning_rate": 1.1598969916891022e-07, + "logits/chosen": -3.2645456790924072, + "logits/rejected": -3.371328353881836, + "logps/chosen": -172.08164978027344, + "logps/rejected": -193.70401000976562, + "loss": 0.4403, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2348976582288742, + "rewards/margins": 1.964170217514038, + "rewards/rejected": -1.7292726039886475, + "step": 5371 + }, + { + "epoch": 0.62, + "learning_rate": 1.1595458269928596e-07, + "logits/chosen": -3.0191707611083984, + "logits/rejected": -3.1529541015625, + "logps/chosen": -142.548583984375, + "logps/rejected": -225.72662353515625, + "loss": 0.1109, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46746212244033813, + "rewards/margins": 2.7991256713867188, + "rewards/rejected": -2.3316633701324463, + "step": 5372 + }, + { + "epoch": 0.62, + "learning_rate": 1.1591946622966171e-07, + "logits/chosen": -2.8772644996643066, + "logits/rejected": -2.7923009395599365, + "logps/chosen": -398.38177490234375, + "logps/rejected": -186.63479614257812, + "loss": 0.2628, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.616472601890564, + "rewards/margins": 1.9396283626556396, + "rewards/rejected": -1.3231558799743652, + "step": 5373 + }, + { + "epoch": 0.62, + "learning_rate": 1.1588434976003745e-07, + "logits/chosen": -3.531045436859131, + "logits/rejected": -3.352996349334717, + "logps/chosen": -327.17327880859375, + "logps/rejected": -321.76068115234375, + "loss": 0.4477, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14260751008987427, + "rewards/margins": 1.86981999874115, + "rewards/rejected": -1.7272124290466309, + "step": 5374 + }, + { + "epoch": 0.62, + "learning_rate": 1.158492332904132e-07, + "logits/chosen": -2.60917067527771, + "logits/rejected": -2.534966468811035, + "logps/chosen": -235.608642578125, + "logps/rejected": -372.5523681640625, + "loss": 0.6513, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1976119875907898, + "rewards/margins": 1.8307408094406128, + "rewards/rejected": -1.6331288814544678, + "step": 5375 + }, + { + "epoch": 0.62, + "learning_rate": 1.1581411682078895e-07, + "logits/chosen": -4.025502681732178, + "logits/rejected": -3.471933364868164, + "logps/chosen": -278.642822265625, + "logps/rejected": -208.4122314453125, + "loss": 0.8403, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4096136093139648, + "rewards/margins": 1.4459164142608643, + "rewards/rejected": -2.855530023574829, + "step": 5376 + }, + { + "epoch": 0.62, + "learning_rate": 1.1577900035116469e-07, + "logits/chosen": -2.206879138946533, + "logits/rejected": -2.642988443374634, + "logps/chosen": -269.44342041015625, + "logps/rejected": -260.6522216796875, + "loss": 0.3802, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3760431408882141, + "rewards/margins": 1.7626538276672363, + "rewards/rejected": -2.1386971473693848, + "step": 5377 + }, + { + "epoch": 0.62, + "learning_rate": 1.1574388388154043e-07, + "logits/chosen": -3.2079758644104004, + "logits/rejected": -3.2585935592651367, + "logps/chosen": -373.4296875, + "logps/rejected": -374.28857421875, + "loss": 0.5142, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1068644523620605, + "rewards/margins": 0.6792547106742859, + "rewards/rejected": -1.7861191034317017, + "step": 5378 + }, + { + "epoch": 0.62, + "learning_rate": 1.157087674119162e-07, + "logits/chosen": -2.5816521644592285, + "logits/rejected": -2.7543318271636963, + "logps/chosen": -286.53887939453125, + "logps/rejected": -231.24273681640625, + "loss": 0.2489, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22637930512428284, + "rewards/margins": 2.225533962249756, + "rewards/rejected": -2.451913356781006, + "step": 5379 + }, + { + "epoch": 0.62, + "learning_rate": 1.1567365094229194e-07, + "logits/chosen": -3.822096586227417, + "logits/rejected": -3.7211849689483643, + "logps/chosen": -374.24298095703125, + "logps/rejected": -276.4981689453125, + "loss": 0.4712, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19458897411823273, + "rewards/margins": 1.0528664588928223, + "rewards/rejected": -1.2474554777145386, + "step": 5380 + }, + { + "epoch": 0.62, + "learning_rate": 1.1563853447266768e-07, + "logits/chosen": -3.264277696609497, + "logits/rejected": -3.530611991882324, + "logps/chosen": -304.64068603515625, + "logps/rejected": -331.2483825683594, + "loss": 0.3091, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04678189754486084, + "rewards/margins": 2.0136659145355225, + "rewards/rejected": -1.9668841361999512, + "step": 5381 + }, + { + "epoch": 0.62, + "learning_rate": 1.1560341800304342e-07, + "logits/chosen": -3.364964723587036, + "logits/rejected": -3.331735849380493, + "logps/chosen": -269.4156494140625, + "logps/rejected": -225.2075653076172, + "loss": 0.2944, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06383839249610901, + "rewards/margins": 2.0007333755493164, + "rewards/rejected": -1.9368950128555298, + "step": 5382 + }, + { + "epoch": 0.62, + "learning_rate": 1.1556830153341916e-07, + "logits/chosen": -3.3190815448760986, + "logits/rejected": -3.3456435203552246, + "logps/chosen": -307.66278076171875, + "logps/rejected": -259.479736328125, + "loss": 0.5095, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1309036910533905, + "rewards/margins": 1.546416997909546, + "rewards/rejected": -1.415513277053833, + "step": 5383 + }, + { + "epoch": 0.62, + "learning_rate": 1.1553318506379492e-07, + "logits/chosen": -3.6437695026397705, + "logits/rejected": -3.157029628753662, + "logps/chosen": -284.73406982421875, + "logps/rejected": -169.3768768310547, + "loss": 0.5391, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34945350885391235, + "rewards/margins": 1.341076374053955, + "rewards/rejected": -0.9916229248046875, + "step": 5384 + }, + { + "epoch": 0.62, + "learning_rate": 1.1549806859417066e-07, + "logits/chosen": -3.7959232330322266, + "logits/rejected": -3.4397354125976562, + "logps/chosen": -277.3111877441406, + "logps/rejected": -149.9631805419922, + "loss": 0.3229, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.048348478972911835, + "rewards/margins": 1.3206932544708252, + "rewards/rejected": -1.2723448276519775, + "step": 5385 + }, + { + "epoch": 0.62, + "learning_rate": 1.154629521245464e-07, + "logits/chosen": -3.120267152786255, + "logits/rejected": -2.985335111618042, + "logps/chosen": -420.81866455078125, + "logps/rejected": -433.0013122558594, + "loss": 0.1791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21078455448150635, + "rewards/margins": 3.5479841232299805, + "rewards/rejected": -3.7587685585021973, + "step": 5386 + }, + { + "epoch": 0.62, + "learning_rate": 1.1542783565492215e-07, + "logits/chosen": -3.4664766788482666, + "logits/rejected": -3.244804859161377, + "logps/chosen": -160.06060791015625, + "logps/rejected": -139.59918212890625, + "loss": 0.5818, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6167044639587402, + "rewards/margins": 0.6924952864646912, + "rewards/rejected": -1.3091998100280762, + "step": 5387 + }, + { + "epoch": 0.62, + "learning_rate": 1.153927191852979e-07, + "logits/chosen": -2.6381890773773193, + "logits/rejected": -2.712780237197876, + "logps/chosen": -393.1153564453125, + "logps/rejected": -367.4856872558594, + "loss": 0.4411, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5455946922302246, + "rewards/margins": 0.8453782796859741, + "rewards/rejected": -0.2997836470603943, + "step": 5388 + }, + { + "epoch": 0.62, + "learning_rate": 1.1535760271567364e-07, + "logits/chosen": -2.6452341079711914, + "logits/rejected": -2.7302184104919434, + "logps/chosen": -408.2210998535156, + "logps/rejected": -444.42034912109375, + "loss": 0.8822, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42330971360206604, + "rewards/margins": 0.8302160501480103, + "rewards/rejected": -1.253525733947754, + "step": 5389 + }, + { + "epoch": 0.62, + "learning_rate": 1.153224862460494e-07, + "logits/chosen": -3.5458033084869385, + "logits/rejected": -3.072679042816162, + "logps/chosen": -263.4439392089844, + "logps/rejected": -141.41001892089844, + "loss": 0.3718, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4121408760547638, + "rewards/margins": 1.2258577346801758, + "rewards/rejected": -1.6379987001419067, + "step": 5390 + }, + { + "epoch": 0.62, + "learning_rate": 1.1528736977642513e-07, + "logits/chosen": -2.7225780487060547, + "logits/rejected": -2.771023750305176, + "logps/chosen": -201.1732635498047, + "logps/rejected": -229.7334442138672, + "loss": 0.1262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.051396504044532776, + "rewards/margins": 3.471224546432495, + "rewards/rejected": -3.419827938079834, + "step": 5391 + }, + { + "epoch": 0.62, + "learning_rate": 1.1525225330680089e-07, + "logits/chosen": -2.082326889038086, + "logits/rejected": -2.0754449367523193, + "logps/chosen": -495.4266357421875, + "logps/rejected": -344.55792236328125, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2101336121559143, + "rewards/margins": 2.1621766090393066, + "rewards/rejected": -2.372310161590576, + "step": 5392 + }, + { + "epoch": 0.62, + "learning_rate": 1.1521713683717663e-07, + "logits/chosen": -2.6029341220855713, + "logits/rejected": -2.7744336128234863, + "logps/chosen": -293.91827392578125, + "logps/rejected": -192.15835571289062, + "loss": 0.3035, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06664273142814636, + "rewards/margins": 2.1556262969970703, + "rewards/rejected": -2.222269058227539, + "step": 5393 + }, + { + "epoch": 0.62, + "learning_rate": 1.1518202036755237e-07, + "logits/chosen": -3.458683729171753, + "logits/rejected": -3.314277172088623, + "logps/chosen": -298.3847961425781, + "logps/rejected": -193.99102783203125, + "loss": 0.4917, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4525948464870453, + "rewards/margins": 1.4432734251022339, + "rewards/rejected": -1.8958684206008911, + "step": 5394 + }, + { + "epoch": 0.62, + "learning_rate": 1.1514690389792811e-07, + "logits/chosen": -3.58998441696167, + "logits/rejected": -3.6365833282470703, + "logps/chosen": -210.98135375976562, + "logps/rejected": -211.5127410888672, + "loss": 0.5932, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38476985692977905, + "rewards/margins": 1.6893210411071777, + "rewards/rejected": -2.0740909576416016, + "step": 5395 + }, + { + "epoch": 0.62, + "learning_rate": 1.1511178742830388e-07, + "logits/chosen": -2.6051440238952637, + "logits/rejected": -3.0333023071289062, + "logps/chosen": -278.6952819824219, + "logps/rejected": -159.6924591064453, + "loss": 0.6043, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35068565607070923, + "rewards/margins": 1.330560326576233, + "rewards/rejected": -1.681246042251587, + "step": 5396 + }, + { + "epoch": 0.62, + "learning_rate": 1.1507667095867962e-07, + "logits/chosen": -3.63861083984375, + "logits/rejected": -3.3933768272399902, + "logps/chosen": -173.93887329101562, + "logps/rejected": -209.50067138671875, + "loss": 0.4215, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12903009355068207, + "rewards/margins": 1.3305171728134155, + "rewards/rejected": -1.2014870643615723, + "step": 5397 + }, + { + "epoch": 0.62, + "learning_rate": 1.1504155448905536e-07, + "logits/chosen": -3.0496575832366943, + "logits/rejected": -3.16556715965271, + "logps/chosen": -342.45849609375, + "logps/rejected": -245.93946838378906, + "loss": 0.2666, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.640344500541687, + "rewards/margins": 2.2235093116760254, + "rewards/rejected": -1.5831648111343384, + "step": 5398 + }, + { + "epoch": 0.62, + "learning_rate": 1.150064380194311e-07, + "logits/chosen": -3.1905038356781006, + "logits/rejected": -3.363406181335449, + "logps/chosen": -283.7483215332031, + "logps/rejected": -280.486328125, + "loss": 0.4722, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008265011012554169, + "rewards/margins": 1.7873954772949219, + "rewards/rejected": -1.7791303396224976, + "step": 5399 + }, + { + "epoch": 0.62, + "learning_rate": 1.1497132154980687e-07, + "logits/chosen": -3.301525115966797, + "logits/rejected": -3.8508386611938477, + "logps/chosen": -193.31004333496094, + "logps/rejected": -366.68011474609375, + "loss": 0.619, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5141566395759583, + "rewards/margins": 2.0829615592956543, + "rewards/rejected": -2.597118377685547, + "step": 5400 + }, + { + "epoch": 0.62, + "learning_rate": 1.1493620508018261e-07, + "logits/chosen": -2.770860433578491, + "logits/rejected": -2.551589012145996, + "logps/chosen": -314.78253173828125, + "logps/rejected": -422.1291198730469, + "loss": 0.3633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39257174730300903, + "rewards/margins": 1.58377206325531, + "rewards/rejected": -1.9763438701629639, + "step": 5401 + }, + { + "epoch": 0.62, + "learning_rate": 1.1490108861055835e-07, + "logits/chosen": -3.0580742359161377, + "logits/rejected": -3.1320137977600098, + "logps/chosen": -219.77944946289062, + "logps/rejected": -139.26168823242188, + "loss": 0.8642, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21715977787971497, + "rewards/margins": 0.8461529612541199, + "rewards/rejected": -1.0633126497268677, + "step": 5402 + }, + { + "epoch": 0.62, + "learning_rate": 1.1486597214093409e-07, + "logits/chosen": -3.369581937789917, + "logits/rejected": -3.575672149658203, + "logps/chosen": -125.51608276367188, + "logps/rejected": -255.1027374267578, + "loss": 0.3569, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43118661642074585, + "rewards/margins": 2.2483248710632324, + "rewards/rejected": -1.8171381950378418, + "step": 5403 + }, + { + "epoch": 0.62, + "learning_rate": 1.1483085567130984e-07, + "logits/chosen": -4.299713611602783, + "logits/rejected": -3.7780957221984863, + "logps/chosen": -456.3119201660156, + "logps/rejected": -261.55059814453125, + "loss": 0.3405, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18194884061813354, + "rewards/margins": 1.8531893491744995, + "rewards/rejected": -2.0351383686065674, + "step": 5404 + }, + { + "epoch": 0.62, + "learning_rate": 1.1479573920168558e-07, + "logits/chosen": -2.953155994415283, + "logits/rejected": -3.108332633972168, + "logps/chosen": -187.6568145751953, + "logps/rejected": -411.9206237792969, + "loss": 0.1787, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13976708054542542, + "rewards/margins": 2.5863378047943115, + "rewards/rejected": -2.446570634841919, + "step": 5405 + }, + { + "epoch": 0.62, + "learning_rate": 1.1476062273206132e-07, + "logits/chosen": -3.154958486557007, + "logits/rejected": -2.8166441917419434, + "logps/chosen": -263.3016357421875, + "logps/rejected": -274.4705505371094, + "loss": 0.1373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13420172035694122, + "rewards/margins": 3.1139426231384277, + "rewards/rejected": -3.2481441497802734, + "step": 5406 + }, + { + "epoch": 0.62, + "learning_rate": 1.1472550626243708e-07, + "logits/chosen": -2.9040894508361816, + "logits/rejected": -2.9279346466064453, + "logps/chosen": -264.76708984375, + "logps/rejected": -189.4815673828125, + "loss": 0.3979, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12637919187545776, + "rewards/margins": 2.0214807987213135, + "rewards/rejected": -1.8951016664505005, + "step": 5407 + }, + { + "epoch": 0.62, + "learning_rate": 1.1469038979281283e-07, + "logits/chosen": -3.070676565170288, + "logits/rejected": -2.966632604598999, + "logps/chosen": -480.0934143066406, + "logps/rejected": -333.8505554199219, + "loss": 0.5298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21648862957954407, + "rewards/margins": 1.086332082748413, + "rewards/rejected": -1.3028206825256348, + "step": 5408 + }, + { + "epoch": 0.62, + "learning_rate": 1.1465527332318857e-07, + "logits/chosen": -2.7101669311523438, + "logits/rejected": -2.6957597732543945, + "logps/chosen": -162.93040466308594, + "logps/rejected": -255.97894287109375, + "loss": 0.1754, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3285563886165619, + "rewards/margins": 2.7797298431396484, + "rewards/rejected": -2.4511733055114746, + "step": 5409 + }, + { + "epoch": 0.62, + "learning_rate": 1.1462015685356431e-07, + "logits/chosen": -2.925961494445801, + "logits/rejected": -2.6428089141845703, + "logps/chosen": -271.47564697265625, + "logps/rejected": -222.16268920898438, + "loss": 0.4027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4405193030834198, + "rewards/margins": 2.243133544921875, + "rewards/rejected": -2.683652639389038, + "step": 5410 + }, + { + "epoch": 0.62, + "learning_rate": 1.1458504038394005e-07, + "logits/chosen": -3.0183939933776855, + "logits/rejected": -3.2148308753967285, + "logps/chosen": -131.65011596679688, + "logps/rejected": -162.43545532226562, + "loss": 0.324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16675469279289246, + "rewards/margins": 1.6619699001312256, + "rewards/rejected": -1.8287245035171509, + "step": 5411 + }, + { + "epoch": 0.62, + "learning_rate": 1.1454992391431582e-07, + "logits/chosen": -2.8220229148864746, + "logits/rejected": -3.366724967956543, + "logps/chosen": -126.24058532714844, + "logps/rejected": -144.64662170410156, + "loss": 0.7044, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03696702420711517, + "rewards/margins": 1.1342765092849731, + "rewards/rejected": -1.17124342918396, + "step": 5412 + }, + { + "epoch": 0.62, + "learning_rate": 1.1451480744469156e-07, + "logits/chosen": -3.0047519207000732, + "logits/rejected": -3.090839147567749, + "logps/chosen": -176.64801025390625, + "logps/rejected": -344.8848876953125, + "loss": 0.6468, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29135653376579285, + "rewards/margins": 2.4813733100891113, + "rewards/rejected": -2.7727296352386475, + "step": 5413 + }, + { + "epoch": 0.62, + "learning_rate": 1.144796909750673e-07, + "logits/chosen": -2.925032615661621, + "logits/rejected": -3.062242031097412, + "logps/chosen": -321.96514892578125, + "logps/rejected": -296.39019775390625, + "loss": 0.4248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22239811718463898, + "rewards/margins": 1.4773352146148682, + "rewards/rejected": -1.6997333765029907, + "step": 5414 + }, + { + "epoch": 0.62, + "learning_rate": 1.1444457450544304e-07, + "logits/chosen": -2.4675376415252686, + "logits/rejected": -2.4189505577087402, + "logps/chosen": -237.29827880859375, + "logps/rejected": -231.3621063232422, + "loss": 0.3968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4489559531211853, + "rewards/margins": 1.4412122964859009, + "rewards/rejected": -1.8901681900024414, + "step": 5415 + }, + { + "epoch": 0.62, + "learning_rate": 1.144094580358188e-07, + "logits/chosen": -2.884814977645874, + "logits/rejected": -2.778104543685913, + "logps/chosen": -404.9049072265625, + "logps/rejected": -200.90936279296875, + "loss": 0.2806, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24723833799362183, + "rewards/margins": 2.126283884048462, + "rewards/rejected": -1.8790454864501953, + "step": 5416 + }, + { + "epoch": 0.62, + "learning_rate": 1.1437434156619455e-07, + "logits/chosen": -2.202568292617798, + "logits/rejected": -2.464547634124756, + "logps/chosen": -376.1035461425781, + "logps/rejected": -354.1040344238281, + "loss": 0.7762, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7168967723846436, + "rewards/margins": 0.5304185152053833, + "rewards/rejected": -1.2473154067993164, + "step": 5417 + }, + { + "epoch": 0.62, + "learning_rate": 1.1433922509657029e-07, + "logits/chosen": -3.5774011611938477, + "logits/rejected": -4.000446796417236, + "logps/chosen": -107.77477264404297, + "logps/rejected": -282.89703369140625, + "loss": 0.1697, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8016239404678345, + "rewards/margins": 4.8462395668029785, + "rewards/rejected": -4.044615745544434, + "step": 5418 + }, + { + "epoch": 0.62, + "learning_rate": 1.1430410862694603e-07, + "logits/chosen": -2.620431900024414, + "logits/rejected": -3.0785717964172363, + "logps/chosen": -175.94989013671875, + "logps/rejected": -218.52227783203125, + "loss": 0.4215, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.032325759530067444, + "rewards/margins": 1.2509998083114624, + "rewards/rejected": -1.2833256721496582, + "step": 5419 + }, + { + "epoch": 0.62, + "learning_rate": 1.1426899215732178e-07, + "logits/chosen": -3.1671366691589355, + "logits/rejected": -2.99135160446167, + "logps/chosen": -185.8384552001953, + "logps/rejected": -243.1055908203125, + "loss": 0.3873, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0011806488037109375, + "rewards/margins": 1.0082817077636719, + "rewards/rejected": -1.0094622373580933, + "step": 5420 + }, + { + "epoch": 0.62, + "learning_rate": 1.1423387568769753e-07, + "logits/chosen": -2.63264536857605, + "logits/rejected": -3.0324509143829346, + "logps/chosen": -104.58489227294922, + "logps/rejected": -185.0447998046875, + "loss": 0.5335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5698172450065613, + "rewards/margins": 1.0728511810302734, + "rewards/rejected": -1.6426682472229004, + "step": 5421 + }, + { + "epoch": 0.63, + "learning_rate": 1.1419875921807327e-07, + "logits/chosen": -2.9067025184631348, + "logits/rejected": -2.6670165061950684, + "logps/chosen": -208.9619140625, + "logps/rejected": -238.3024139404297, + "loss": 0.5933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8235546350479126, + "rewards/margins": 1.0167022943496704, + "rewards/rejected": -1.8402568101882935, + "step": 5422 + }, + { + "epoch": 0.63, + "learning_rate": 1.1416364274844901e-07, + "logits/chosen": -2.978832960128784, + "logits/rejected": -3.1148366928100586, + "logps/chosen": -383.68096923828125, + "logps/rejected": -390.9912109375, + "loss": 0.6266, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26824572682380676, + "rewards/margins": 2.4279966354370117, + "rewards/rejected": -2.1597509384155273, + "step": 5423 + }, + { + "epoch": 0.63, + "learning_rate": 1.1412852627882477e-07, + "logits/chosen": -2.9072251319885254, + "logits/rejected": -3.3236148357391357, + "logps/chosen": -190.90957641601562, + "logps/rejected": -256.1713562011719, + "loss": 0.2069, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01433553360402584, + "rewards/margins": 2.844028949737549, + "rewards/rejected": -2.82969331741333, + "step": 5424 + }, + { + "epoch": 0.63, + "learning_rate": 1.1409340980920051e-07, + "logits/chosen": -3.109469413757324, + "logits/rejected": -2.5608115196228027, + "logps/chosen": -463.7767333984375, + "logps/rejected": -420.4684753417969, + "loss": 0.1323, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2778787612915039, + "rewards/margins": 2.9256393909454346, + "rewards/rejected": -2.6477603912353516, + "step": 5425 + }, + { + "epoch": 0.63, + "learning_rate": 1.1405829333957626e-07, + "logits/chosen": -3.141941785812378, + "logits/rejected": -3.2391011714935303, + "logps/chosen": -147.60377502441406, + "logps/rejected": -264.9602966308594, + "loss": 0.2218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.047821879386901855, + "rewards/margins": 3.1390957832336426, + "rewards/rejected": -3.186917543411255, + "step": 5426 + }, + { + "epoch": 0.63, + "learning_rate": 1.14023176869952e-07, + "logits/chosen": -3.6093525886535645, + "logits/rejected": -3.3298633098602295, + "logps/chosen": -204.36322021484375, + "logps/rejected": -151.15078735351562, + "loss": 0.5657, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5423285961151123, + "rewards/margins": 0.8560734391212463, + "rewards/rejected": -1.3984020948410034, + "step": 5427 + }, + { + "epoch": 0.63, + "learning_rate": 1.1398806040032776e-07, + "logits/chosen": -3.6177897453308105, + "logits/rejected": -3.276336669921875, + "logps/chosen": -217.2994842529297, + "logps/rejected": -246.02566528320312, + "loss": 0.2333, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15047326683998108, + "rewards/margins": 2.269132614135742, + "rewards/rejected": -2.4196062088012695, + "step": 5428 + }, + { + "epoch": 0.63, + "learning_rate": 1.139529439307035e-07, + "logits/chosen": -3.082894802093506, + "logits/rejected": -3.460049629211426, + "logps/chosen": -281.1061706542969, + "logps/rejected": -295.80780029296875, + "loss": 0.2662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12408819794654846, + "rewards/margins": 3.4467334747314453, + "rewards/rejected": -3.570821762084961, + "step": 5429 + }, + { + "epoch": 0.63, + "learning_rate": 1.1391782746107924e-07, + "logits/chosen": -2.802238702774048, + "logits/rejected": -2.550931930541992, + "logps/chosen": -416.0614318847656, + "logps/rejected": -267.0864562988281, + "loss": 0.6602, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12466681003570557, + "rewards/margins": 1.1529356241226196, + "rewards/rejected": -1.2776024341583252, + "step": 5430 + }, + { + "epoch": 0.63, + "learning_rate": 1.1388271099145498e-07, + "logits/chosen": -2.8321495056152344, + "logits/rejected": -3.1490602493286133, + "logps/chosen": -149.52276611328125, + "logps/rejected": -232.69137573242188, + "loss": 0.2792, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11203056573867798, + "rewards/margins": 1.9958739280700684, + "rewards/rejected": -2.1079044342041016, + "step": 5431 + }, + { + "epoch": 0.63, + "learning_rate": 1.1384759452183073e-07, + "logits/chosen": -3.296407461166382, + "logits/rejected": -2.870182991027832, + "logps/chosen": -305.19390869140625, + "logps/rejected": -218.38131713867188, + "loss": 0.2697, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5059475302696228, + "rewards/margins": 2.417079448699951, + "rewards/rejected": -1.9111319780349731, + "step": 5432 + }, + { + "epoch": 0.63, + "learning_rate": 1.1381247805220648e-07, + "logits/chosen": -2.7658138275146484, + "logits/rejected": -2.8423752784729004, + "logps/chosen": -271.23333740234375, + "logps/rejected": -478.18267822265625, + "loss": 0.4862, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16587768495082855, + "rewards/margins": 1.790069580078125, + "rewards/rejected": -1.9559471607208252, + "step": 5433 + }, + { + "epoch": 0.63, + "learning_rate": 1.1377736158258223e-07, + "logits/chosen": -2.8306634426116943, + "logits/rejected": -2.794461965560913, + "logps/chosen": -261.24853515625, + "logps/rejected": -193.36582946777344, + "loss": 0.5264, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5096862316131592, + "rewards/margins": 1.4253833293914795, + "rewards/rejected": -1.9350695610046387, + "step": 5434 + }, + { + "epoch": 0.63, + "learning_rate": 1.1374224511295797e-07, + "logits/chosen": -2.6660594940185547, + "logits/rejected": -3.0235042572021484, + "logps/chosen": -109.76969909667969, + "logps/rejected": -319.69097900390625, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34447622299194336, + "rewards/margins": 3.847911834716797, + "rewards/rejected": -3.5034356117248535, + "step": 5435 + }, + { + "epoch": 0.63, + "learning_rate": 1.1370712864333371e-07, + "logits/chosen": -3.0998990535736084, + "logits/rejected": -3.336967945098877, + "logps/chosen": -414.9439392089844, + "logps/rejected": -232.19619750976562, + "loss": 0.354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45807647705078125, + "rewards/margins": 1.7607529163360596, + "rewards/rejected": -2.21882963180542, + "step": 5436 + }, + { + "epoch": 0.63, + "learning_rate": 1.1367201217370947e-07, + "logits/chosen": -3.728574275970459, + "logits/rejected": -3.5528323650360107, + "logps/chosen": -287.1342468261719, + "logps/rejected": -238.96002197265625, + "loss": 0.1266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38072076439857483, + "rewards/margins": 3.321661949157715, + "rewards/rejected": -2.9409408569335938, + "step": 5437 + }, + { + "epoch": 0.63, + "learning_rate": 1.1363689570408521e-07, + "logits/chosen": -3.526336193084717, + "logits/rejected": -3.264965057373047, + "logps/chosen": -345.6750793457031, + "logps/rejected": -116.9435806274414, + "loss": 0.7122, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15803363919258118, + "rewards/margins": 0.8013694286346436, + "rewards/rejected": -0.6433357000350952, + "step": 5438 + }, + { + "epoch": 0.63, + "learning_rate": 1.1360177923446095e-07, + "logits/chosen": -3.0087521076202393, + "logits/rejected": -2.824171543121338, + "logps/chosen": -223.07186889648438, + "logps/rejected": -213.9716339111328, + "loss": 0.426, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37828201055526733, + "rewards/margins": 1.7054604291915894, + "rewards/rejected": -2.083742380142212, + "step": 5439 + }, + { + "epoch": 0.63, + "learning_rate": 1.1356666276483669e-07, + "logits/chosen": -3.228221893310547, + "logits/rejected": -2.894627809524536, + "logps/chosen": -296.3581848144531, + "logps/rejected": -286.0030822753906, + "loss": 0.5976, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3554047644138336, + "rewards/margins": 1.2417845726013184, + "rewards/rejected": -1.5971894264221191, + "step": 5440 + }, + { + "epoch": 0.63, + "learning_rate": 1.1353154629521246e-07, + "logits/chosen": -3.2218034267425537, + "logits/rejected": -2.8591294288635254, + "logps/chosen": -285.0145263671875, + "logps/rejected": -274.23272705078125, + "loss": 0.8723, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8490760326385498, + "rewards/margins": 0.13216370344161987, + "rewards/rejected": -0.9812397956848145, + "step": 5441 + }, + { + "epoch": 0.63, + "learning_rate": 1.134964298255882e-07, + "logits/chosen": -4.100832462310791, + "logits/rejected": -3.80203914642334, + "logps/chosen": -235.61962890625, + "logps/rejected": -227.03167724609375, + "loss": 0.3129, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26808854937553406, + "rewards/margins": 1.6245558261871338, + "rewards/rejected": -1.3564671277999878, + "step": 5442 + }, + { + "epoch": 0.63, + "learning_rate": 1.1346131335596394e-07, + "logits/chosen": -3.4097819328308105, + "logits/rejected": -3.295990228652954, + "logps/chosen": -211.964599609375, + "logps/rejected": -180.66329956054688, + "loss": 0.4157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5080854892730713, + "rewards/margins": 1.5150783061981201, + "rewards/rejected": -2.0231637954711914, + "step": 5443 + }, + { + "epoch": 0.63, + "learning_rate": 1.1342619688633968e-07, + "logits/chosen": -3.1121020317077637, + "logits/rejected": -3.2889585494995117, + "logps/chosen": -207.82928466796875, + "logps/rejected": -289.00762939453125, + "loss": 0.0863, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14625754952430725, + "rewards/margins": 4.511458396911621, + "rewards/rejected": -4.657716274261475, + "step": 5444 + }, + { + "epoch": 0.63, + "learning_rate": 1.1339108041671545e-07, + "logits/chosen": -3.490854263305664, + "logits/rejected": -3.321404457092285, + "logps/chosen": -238.72340393066406, + "logps/rejected": -205.87388610839844, + "loss": 0.234, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3354250192642212, + "rewards/margins": 1.9860202074050903, + "rewards/rejected": -1.6505951881408691, + "step": 5445 + }, + { + "epoch": 0.63, + "learning_rate": 1.1335596394709119e-07, + "logits/chosen": -3.2389137744903564, + "logits/rejected": -3.1520161628723145, + "logps/chosen": -253.183349609375, + "logps/rejected": -242.78567504882812, + "loss": 0.2368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3328303396701813, + "rewards/margins": 1.6772754192352295, + "rewards/rejected": -2.010105609893799, + "step": 5446 + }, + { + "epoch": 0.63, + "learning_rate": 1.1332084747746693e-07, + "logits/chosen": -2.9628305435180664, + "logits/rejected": -2.890165328979492, + "logps/chosen": -212.53640747070312, + "logps/rejected": -182.61257934570312, + "loss": 0.5047, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37939080595970154, + "rewards/margins": 0.7861454486846924, + "rewards/rejected": -0.406754732131958, + "step": 5447 + }, + { + "epoch": 0.63, + "learning_rate": 1.1328573100784267e-07, + "logits/chosen": -3.116598606109619, + "logits/rejected": -2.9375576972961426, + "logps/chosen": -337.99609375, + "logps/rejected": -235.3089599609375, + "loss": 0.5024, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39569348096847534, + "rewards/margins": 2.57637095451355, + "rewards/rejected": -2.972064971923828, + "step": 5448 + }, + { + "epoch": 0.63, + "learning_rate": 1.1325061453821842e-07, + "logits/chosen": -2.991877555847168, + "logits/rejected": -3.1429474353790283, + "logps/chosen": -371.9602355957031, + "logps/rejected": -249.68043518066406, + "loss": 0.4805, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26950603723526, + "rewards/margins": 1.5438337326049805, + "rewards/rejected": -1.8133398294448853, + "step": 5449 + }, + { + "epoch": 0.63, + "learning_rate": 1.1321549806859416e-07, + "logits/chosen": -2.960064649581909, + "logits/rejected": -3.080139636993408, + "logps/chosen": -203.80941772460938, + "logps/rejected": -315.38623046875, + "loss": 0.2668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12071201205253601, + "rewards/margins": 2.563567638397217, + "rewards/rejected": -2.4428555965423584, + "step": 5450 + }, + { + "epoch": 0.63, + "learning_rate": 1.1318038159896992e-07, + "logits/chosen": -2.9071130752563477, + "logits/rejected": -2.765225648880005, + "logps/chosen": -289.81298828125, + "logps/rejected": -280.88726806640625, + "loss": 0.5503, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07644108682870865, + "rewards/margins": 1.2755508422851562, + "rewards/rejected": -1.1991097927093506, + "step": 5451 + }, + { + "epoch": 0.63, + "learning_rate": 1.1314526512934566e-07, + "logits/chosen": -3.750321388244629, + "logits/rejected": -3.59281587600708, + "logps/chosen": -242.33827209472656, + "logps/rejected": -119.08585357666016, + "loss": 0.1899, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0153803825378418, + "rewards/margins": 2.235485553741455, + "rewards/rejected": -1.2201051712036133, + "step": 5452 + }, + { + "epoch": 0.63, + "learning_rate": 1.1311014865972141e-07, + "logits/chosen": -3.1659464836120605, + "logits/rejected": -3.2590179443359375, + "logps/chosen": -545.3598022460938, + "logps/rejected": -289.70953369140625, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21787193417549133, + "rewards/margins": 1.8483103513717651, + "rewards/rejected": -2.0661821365356445, + "step": 5453 + }, + { + "epoch": 0.63, + "learning_rate": 1.1307503219009715e-07, + "logits/chosen": -2.451460838317871, + "logits/rejected": -2.4063472747802734, + "logps/chosen": -170.24411010742188, + "logps/rejected": -192.6683807373047, + "loss": 0.4421, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10424965620040894, + "rewards/margins": 1.3747406005859375, + "rewards/rejected": -1.4789901971817017, + "step": 5454 + }, + { + "epoch": 0.63, + "learning_rate": 1.1303991572047289e-07, + "logits/chosen": -3.2338876724243164, + "logits/rejected": -3.245044231414795, + "logps/chosen": -241.0927276611328, + "logps/rejected": -271.0386047363281, + "loss": 0.7426, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34863513708114624, + "rewards/margins": 2.761258363723755, + "rewards/rejected": -3.109893798828125, + "step": 5455 + }, + { + "epoch": 0.63, + "learning_rate": 1.1300479925084863e-07, + "logits/chosen": -2.910937547683716, + "logits/rejected": -2.9161999225616455, + "logps/chosen": -244.00106811523438, + "logps/rejected": -340.05047607421875, + "loss": 0.2815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4220176339149475, + "rewards/margins": 2.5856494903564453, + "rewards/rejected": -3.007667303085327, + "step": 5456 + }, + { + "epoch": 0.63, + "learning_rate": 1.129696827812244e-07, + "logits/chosen": -3.194343090057373, + "logits/rejected": -2.9654996395111084, + "logps/chosen": -308.35601806640625, + "logps/rejected": -175.7947998046875, + "loss": 0.3087, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1461782604455948, + "rewards/margins": 1.2838854789733887, + "rewards/rejected": -1.1377073526382446, + "step": 5457 + }, + { + "epoch": 0.63, + "learning_rate": 1.1293456631160014e-07, + "logits/chosen": -2.6091010570526123, + "logits/rejected": -2.7087013721466064, + "logps/chosen": -405.87109375, + "logps/rejected": -340.6890869140625, + "loss": 0.2325, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2965024709701538, + "rewards/margins": 2.8282132148742676, + "rewards/rejected": -2.531710624694824, + "step": 5458 + }, + { + "epoch": 0.63, + "learning_rate": 1.1289944984197588e-07, + "logits/chosen": -2.9300050735473633, + "logits/rejected": -3.0163733959198, + "logps/chosen": -152.17926025390625, + "logps/rejected": -238.68359375, + "loss": 0.6028, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2626815736293793, + "rewards/margins": 1.0172045230865479, + "rewards/rejected": -1.2798861265182495, + "step": 5459 + }, + { + "epoch": 0.63, + "learning_rate": 1.1286433337235162e-07, + "logits/chosen": -2.729191541671753, + "logits/rejected": -2.871738910675049, + "logps/chosen": -249.9425048828125, + "logps/rejected": -265.055908203125, + "loss": 0.8353, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11828944832086563, + "rewards/margins": 0.9151426553726196, + "rewards/rejected": -1.033432126045227, + "step": 5460 + }, + { + "epoch": 0.63, + "learning_rate": 1.1282921690272738e-07, + "logits/chosen": -3.222806930541992, + "logits/rejected": -3.399954319000244, + "logps/chosen": -201.75534057617188, + "logps/rejected": -236.1642303466797, + "loss": 0.7093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4881255030632019, + "rewards/margins": 0.8929263949394226, + "rewards/rejected": -1.3810518980026245, + "step": 5461 + }, + { + "epoch": 0.63, + "learning_rate": 1.1279410043310313e-07, + "logits/chosen": -2.8979408740997314, + "logits/rejected": -3.125361919403076, + "logps/chosen": -322.6529541015625, + "logps/rejected": -300.826171875, + "loss": 0.4069, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31613582372665405, + "rewards/margins": 1.1837774515151978, + "rewards/rejected": -1.499913215637207, + "step": 5462 + }, + { + "epoch": 0.63, + "learning_rate": 1.1275898396347887e-07, + "logits/chosen": -2.647444248199463, + "logits/rejected": -2.7109642028808594, + "logps/chosen": -319.679931640625, + "logps/rejected": -317.677001953125, + "loss": 0.468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19447019696235657, + "rewards/margins": 1.4868981838226318, + "rewards/rejected": -1.681368350982666, + "step": 5463 + }, + { + "epoch": 0.63, + "learning_rate": 1.1272386749385461e-07, + "logits/chosen": -3.595109701156616, + "logits/rejected": -3.2960875034332275, + "logps/chosen": -279.94647216796875, + "logps/rejected": -192.41842651367188, + "loss": 0.2299, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04962420463562012, + "rewards/margins": 1.8095622062683105, + "rewards/rejected": -1.7599380016326904, + "step": 5464 + }, + { + "epoch": 0.63, + "learning_rate": 1.1268875102423036e-07, + "logits/chosen": -2.961522102355957, + "logits/rejected": -2.8921151161193848, + "logps/chosen": -245.94371032714844, + "logps/rejected": -164.75112915039062, + "loss": 0.5996, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20202457904815674, + "rewards/margins": 1.1998381614685059, + "rewards/rejected": -0.9978134632110596, + "step": 5465 + }, + { + "epoch": 0.63, + "learning_rate": 1.126536345546061e-07, + "logits/chosen": -3.589834690093994, + "logits/rejected": -3.3231847286224365, + "logps/chosen": -217.22613525390625, + "logps/rejected": -249.88613891601562, + "loss": 0.2193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24851754307746887, + "rewards/margins": 2.4803733825683594, + "rewards/rejected": -2.231855869293213, + "step": 5466 + }, + { + "epoch": 0.63, + "learning_rate": 1.1261851808498185e-07, + "logits/chosen": -2.4813520908355713, + "logits/rejected": -2.3675708770751953, + "logps/chosen": -316.1728210449219, + "logps/rejected": -301.47607421875, + "loss": 0.2915, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.036406297236680984, + "rewards/margins": 2.8549630641937256, + "rewards/rejected": -2.891369342803955, + "step": 5467 + }, + { + "epoch": 0.63, + "learning_rate": 1.1258340161535759e-07, + "logits/chosen": -2.8529398441314697, + "logits/rejected": -2.5666921138763428, + "logps/chosen": -290.71295166015625, + "logps/rejected": -201.75439453125, + "loss": 0.4166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.013438880443572998, + "rewards/margins": 1.3818333148956299, + "rewards/rejected": -1.3952722549438477, + "step": 5468 + }, + { + "epoch": 0.63, + "learning_rate": 1.1254828514573335e-07, + "logits/chosen": -3.5478787422180176, + "logits/rejected": -3.6383392810821533, + "logps/chosen": -285.2642822265625, + "logps/rejected": -231.12814331054688, + "loss": 0.5834, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1902227401733398, + "rewards/margins": 0.7500325441360474, + "rewards/rejected": -1.9402554035186768, + "step": 5469 + }, + { + "epoch": 0.63, + "learning_rate": 1.125131686761091e-07, + "logits/chosen": -3.1576550006866455, + "logits/rejected": -2.8569412231445312, + "logps/chosen": -133.49795532226562, + "logps/rejected": -266.6334533691406, + "loss": 0.2957, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5071113109588623, + "rewards/margins": 2.5961060523986816, + "rewards/rejected": -3.103217124938965, + "step": 5470 + }, + { + "epoch": 0.63, + "learning_rate": 1.1247805220648483e-07, + "logits/chosen": -2.686901569366455, + "logits/rejected": -2.7907938957214355, + "logps/chosen": -396.9640808105469, + "logps/rejected": -359.2188720703125, + "loss": 0.4158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11026449501514435, + "rewards/margins": 1.0140016078948975, + "rewards/rejected": -1.1242660284042358, + "step": 5471 + }, + { + "epoch": 0.63, + "learning_rate": 1.1244293573686058e-07, + "logits/chosen": -2.5132343769073486, + "logits/rejected": -2.519845485687256, + "logps/chosen": -133.67977905273438, + "logps/rejected": -251.34068298339844, + "loss": 0.5071, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022243410348892212, + "rewards/margins": 1.8780794143676758, + "rewards/rejected": -1.8558359146118164, + "step": 5472 + }, + { + "epoch": 0.63, + "learning_rate": 1.1240781926723634e-07, + "logits/chosen": -2.7375595569610596, + "logits/rejected": -2.8225605487823486, + "logps/chosen": -311.9199523925781, + "logps/rejected": -179.55682373046875, + "loss": 0.5972, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5273056030273438, + "rewards/margins": 0.3976099491119385, + "rewards/rejected": -0.9249155521392822, + "step": 5473 + }, + { + "epoch": 0.63, + "learning_rate": 1.1237270279761208e-07, + "logits/chosen": -2.752690315246582, + "logits/rejected": -3.0547902584075928, + "logps/chosen": -289.3938293457031, + "logps/rejected": -252.96621704101562, + "loss": 0.3935, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1708342432975769, + "rewards/margins": 1.4646451473236084, + "rewards/rejected": -1.2938108444213867, + "step": 5474 + }, + { + "epoch": 0.63, + "learning_rate": 1.1233758632798782e-07, + "logits/chosen": -2.8565263748168945, + "logits/rejected": -2.9993391036987305, + "logps/chosen": -414.2953796386719, + "logps/rejected": -222.9341583251953, + "loss": 0.3523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.026893839240074158, + "rewards/margins": 1.9569569826126099, + "rewards/rejected": -1.9838507175445557, + "step": 5475 + }, + { + "epoch": 0.63, + "learning_rate": 1.1230246985836356e-07, + "logits/chosen": -3.0080456733703613, + "logits/rejected": -2.7970476150512695, + "logps/chosen": -155.52613830566406, + "logps/rejected": -107.7236328125, + "loss": 0.6354, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35474684834480286, + "rewards/margins": 0.37442582845687866, + "rewards/rejected": -0.7291726469993591, + "step": 5476 + }, + { + "epoch": 0.63, + "learning_rate": 1.122673533887393e-07, + "logits/chosen": -3.6482114791870117, + "logits/rejected": -3.4958415031433105, + "logps/chosen": -235.63941955566406, + "logps/rejected": -230.5106201171875, + "loss": 0.9269, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08770638704299927, + "rewards/margins": 1.9582582712173462, + "rewards/rejected": -1.8705518245697021, + "step": 5477 + }, + { + "epoch": 0.63, + "learning_rate": 1.1223223691911506e-07, + "logits/chosen": -2.789910078048706, + "logits/rejected": -2.3334789276123047, + "logps/chosen": -131.17750549316406, + "logps/rejected": -449.4748229980469, + "loss": 0.2376, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027030013501644135, + "rewards/margins": 3.3140439987182617, + "rewards/rejected": -3.2870140075683594, + "step": 5478 + }, + { + "epoch": 0.63, + "learning_rate": 1.1219712044949081e-07, + "logits/chosen": -3.126075506210327, + "logits/rejected": -2.8245368003845215, + "logps/chosen": -234.1618194580078, + "logps/rejected": -258.89892578125, + "loss": 0.1442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6773309111595154, + "rewards/margins": 3.2553558349609375, + "rewards/rejected": -2.5780248641967773, + "step": 5479 + }, + { + "epoch": 0.63, + "learning_rate": 1.1216200397986655e-07, + "logits/chosen": -2.8615169525146484, + "logits/rejected": -2.5520660877227783, + "logps/chosen": -224.00656127929688, + "logps/rejected": -268.4437255859375, + "loss": 0.2738, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3212311863899231, + "rewards/margins": 1.435390830039978, + "rewards/rejected": -1.1141595840454102, + "step": 5480 + }, + { + "epoch": 0.63, + "learning_rate": 1.121268875102423e-07, + "logits/chosen": -3.1347765922546387, + "logits/rejected": -3.1705238819122314, + "logps/chosen": -357.8116760253906, + "logps/rejected": -292.01727294921875, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31670644879341125, + "rewards/margins": 2.730114698410034, + "rewards/rejected": -3.046821355819702, + "step": 5481 + }, + { + "epoch": 0.63, + "learning_rate": 1.1209177104061805e-07, + "logits/chosen": -2.2741241455078125, + "logits/rejected": -2.504934310913086, + "logps/chosen": -339.88153076171875, + "logps/rejected": -273.9819030761719, + "loss": 0.1556, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1699130535125732, + "rewards/margins": 3.0218961238861084, + "rewards/rejected": -1.8519833087921143, + "step": 5482 + }, + { + "epoch": 0.63, + "learning_rate": 1.1205665457099379e-07, + "logits/chosen": -3.467454671859741, + "logits/rejected": -3.2936248779296875, + "logps/chosen": -334.1725158691406, + "logps/rejected": -244.14776611328125, + "loss": 0.4266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02453860640525818, + "rewards/margins": 1.6729447841644287, + "rewards/rejected": -1.6974833011627197, + "step": 5483 + }, + { + "epoch": 0.63, + "learning_rate": 1.1202153810136953e-07, + "logits/chosen": -3.3341166973114014, + "logits/rejected": -3.0226941108703613, + "logps/chosen": -377.73419189453125, + "logps/rejected": -231.8177947998047, + "loss": 0.4177, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024500221014022827, + "rewards/margins": 1.2430144548416138, + "rewards/rejected": -1.2185142040252686, + "step": 5484 + }, + { + "epoch": 0.63, + "learning_rate": 1.1198642163174527e-07, + "logits/chosen": -3.0285165309906006, + "logits/rejected": -2.7580361366271973, + "logps/chosen": -436.3427429199219, + "logps/rejected": -353.1871337890625, + "loss": 0.2927, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6433258056640625, + "rewards/margins": 2.3950695991516113, + "rewards/rejected": -1.7517437934875488, + "step": 5485 + }, + { + "epoch": 0.63, + "learning_rate": 1.1195130516212104e-07, + "logits/chosen": -3.093221426010132, + "logits/rejected": -3.1678154468536377, + "logps/chosen": -337.2660827636719, + "logps/rejected": -369.57666015625, + "loss": 0.8245, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.500946044921875, + "rewards/margins": 0.6713109612464905, + "rewards/rejected": -1.1722568273544312, + "step": 5486 + }, + { + "epoch": 0.63, + "learning_rate": 1.1191618869249678e-07, + "logits/chosen": -2.9627132415771484, + "logits/rejected": -3.2397356033325195, + "logps/chosen": -189.01760864257812, + "logps/rejected": -379.73077392578125, + "loss": 0.1948, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.263182669878006, + "rewards/margins": 4.189352989196777, + "rewards/rejected": -3.9261703491210938, + "step": 5487 + }, + { + "epoch": 0.63, + "learning_rate": 1.1188107222287252e-07, + "logits/chosen": -2.622494697570801, + "logits/rejected": -2.824293851852417, + "logps/chosen": -202.5093231201172, + "logps/rejected": -236.33265686035156, + "loss": 0.4966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5236482620239258, + "rewards/margins": 1.5963563919067383, + "rewards/rejected": -2.120004653930664, + "step": 5488 + }, + { + "epoch": 0.63, + "learning_rate": 1.1184595575324826e-07, + "logits/chosen": -3.165426254272461, + "logits/rejected": -3.333479404449463, + "logps/chosen": -175.8100128173828, + "logps/rejected": -325.0626220703125, + "loss": 0.3701, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04856446385383606, + "rewards/margins": 1.4174546003341675, + "rewards/rejected": -1.4660191535949707, + "step": 5489 + }, + { + "epoch": 0.63, + "learning_rate": 1.1181083928362403e-07, + "logits/chosen": -3.4511826038360596, + "logits/rejected": -3.328671932220459, + "logps/chosen": -200.65191650390625, + "logps/rejected": -185.54669189453125, + "loss": 0.2928, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3867371082305908, + "rewards/margins": 2.0300190448760986, + "rewards/rejected": -2.4167561531066895, + "step": 5490 + }, + { + "epoch": 0.63, + "learning_rate": 1.1177572281399977e-07, + "logits/chosen": -3.573338508605957, + "logits/rejected": -3.4713687896728516, + "logps/chosen": -449.6094665527344, + "logps/rejected": -394.2887878417969, + "loss": 0.1549, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7046130895614624, + "rewards/margins": 2.857367515563965, + "rewards/rejected": -2.152754306793213, + "step": 5491 + }, + { + "epoch": 0.63, + "learning_rate": 1.117406063443755e-07, + "logits/chosen": -3.0129055976867676, + "logits/rejected": -3.2479326725006104, + "logps/chosen": -319.97259521484375, + "logps/rejected": -261.19915771484375, + "loss": 0.4255, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02799205482006073, + "rewards/margins": 1.9949949979782104, + "rewards/rejected": -1.9670027494430542, + "step": 5492 + }, + { + "epoch": 0.63, + "learning_rate": 1.1170548987475125e-07, + "logits/chosen": -3.35306453704834, + "logits/rejected": -3.1737165451049805, + "logps/chosen": -296.0086669921875, + "logps/rejected": -210.87933349609375, + "loss": 0.4761, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019969038665294647, + "rewards/margins": 0.8099215626716614, + "rewards/rejected": -0.8298905491828918, + "step": 5493 + }, + { + "epoch": 0.63, + "learning_rate": 1.11670373405127e-07, + "logits/chosen": -3.1186866760253906, + "logits/rejected": -3.2538411617279053, + "logps/chosen": -208.56736755371094, + "logps/rejected": -252.6584014892578, + "loss": 0.3489, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25513797998428345, + "rewards/margins": 1.531419038772583, + "rewards/rejected": -1.2762811183929443, + "step": 5494 + }, + { + "epoch": 0.63, + "learning_rate": 1.1163525693550274e-07, + "logits/chosen": -3.1133861541748047, + "logits/rejected": -2.958989143371582, + "logps/chosen": -224.08558654785156, + "logps/rejected": -247.2935791015625, + "loss": 0.258, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28028395771980286, + "rewards/margins": 2.087554931640625, + "rewards/rejected": -1.807271122932434, + "step": 5495 + }, + { + "epoch": 0.63, + "learning_rate": 1.116001404658785e-07, + "logits/chosen": -3.2378695011138916, + "logits/rejected": -3.3117024898529053, + "logps/chosen": -294.3490295410156, + "logps/rejected": -199.15228271484375, + "loss": 0.3782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3890104591846466, + "rewards/margins": 0.9755556583404541, + "rewards/rejected": -1.3645660877227783, + "step": 5496 + }, + { + "epoch": 0.63, + "learning_rate": 1.1156502399625424e-07, + "logits/chosen": -3.28847074508667, + "logits/rejected": -3.117636203765869, + "logps/chosen": -276.21246337890625, + "logps/rejected": -194.04251098632812, + "loss": 0.5365, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05329914391040802, + "rewards/margins": 0.7173891067504883, + "rewards/rejected": -0.7706882953643799, + "step": 5497 + }, + { + "epoch": 0.63, + "learning_rate": 1.1152990752662999e-07, + "logits/chosen": -3.267228126525879, + "logits/rejected": -3.4802141189575195, + "logps/chosen": -183.12152099609375, + "logps/rejected": -243.7259979248047, + "loss": 0.2143, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4132990837097168, + "rewards/margins": 2.1349406242370605, + "rewards/rejected": -1.7216416597366333, + "step": 5498 + }, + { + "epoch": 0.63, + "learning_rate": 1.1149479105700573e-07, + "logits/chosen": -3.1874938011169434, + "logits/rejected": -2.705043077468872, + "logps/chosen": -197.82650756835938, + "logps/rejected": -176.7633514404297, + "loss": 0.5871, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4271780252456665, + "rewards/margins": 0.9877208471298218, + "rewards/rejected": -1.4148989915847778, + "step": 5499 + }, + { + "epoch": 0.63, + "learning_rate": 1.1145967458738147e-07, + "logits/chosen": -3.4292654991149902, + "logits/rejected": -3.6989030838012695, + "logps/chosen": -213.780029296875, + "logps/rejected": -333.119873046875, + "loss": 0.5713, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49021971225738525, + "rewards/margins": 1.1087383031845093, + "rewards/rejected": -1.598958134651184, + "step": 5500 + }, + { + "epoch": 0.63, + "learning_rate": 1.1142455811775721e-07, + "logits/chosen": -3.675584316253662, + "logits/rejected": -3.445375919342041, + "logps/chosen": -159.32354736328125, + "logps/rejected": -151.65151977539062, + "loss": 0.4831, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.037028104066848755, + "rewards/margins": 0.8467313647270203, + "rewards/rejected": -0.8097033500671387, + "step": 5501 + }, + { + "epoch": 0.63, + "learning_rate": 1.1138944164813298e-07, + "logits/chosen": -2.632962226867676, + "logits/rejected": -2.6882009506225586, + "logps/chosen": -167.53314208984375, + "logps/rejected": -232.84783935546875, + "loss": 0.3084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17690376937389374, + "rewards/margins": 2.4795751571655273, + "rewards/rejected": -2.6564788818359375, + "step": 5502 + }, + { + "epoch": 0.63, + "learning_rate": 1.1135432517850872e-07, + "logits/chosen": -2.5546200275421143, + "logits/rejected": -2.5025768280029297, + "logps/chosen": -349.92431640625, + "logps/rejected": -327.72802734375, + "loss": 0.4806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4213631749153137, + "rewards/margins": 1.9757661819458008, + "rewards/rejected": -2.397129535675049, + "step": 5503 + }, + { + "epoch": 0.63, + "learning_rate": 1.1131920870888446e-07, + "logits/chosen": -2.577798843383789, + "logits/rejected": -2.746854066848755, + "logps/chosen": -183.76918029785156, + "logps/rejected": -207.2682342529297, + "loss": 0.4892, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3552616834640503, + "rewards/margins": 0.9401463866233826, + "rewards/rejected": -1.2954081296920776, + "step": 5504 + }, + { + "epoch": 0.63, + "learning_rate": 1.112840922392602e-07, + "logits/chosen": -3.2074503898620605, + "logits/rejected": -3.1832382678985596, + "logps/chosen": -285.6666564941406, + "logps/rejected": -241.51795959472656, + "loss": 0.2973, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.175079807639122, + "rewards/margins": 3.100816249847412, + "rewards/rejected": -3.275895833969116, + "step": 5505 + }, + { + "epoch": 0.63, + "learning_rate": 1.1124897576963595e-07, + "logits/chosen": -3.2883517742156982, + "logits/rejected": -3.1571602821350098, + "logps/chosen": -219.53919982910156, + "logps/rejected": -305.21295166015625, + "loss": 0.231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4753226339817047, + "rewards/margins": 2.1593844890594482, + "rewards/rejected": -2.634706974029541, + "step": 5506 + }, + { + "epoch": 0.63, + "learning_rate": 1.1121385930001171e-07, + "logits/chosen": -3.3992414474487305, + "logits/rejected": -3.351496696472168, + "logps/chosen": -295.09014892578125, + "logps/rejected": -160.6492919921875, + "loss": 0.4097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23790279030799866, + "rewards/margins": 2.086582899093628, + "rewards/rejected": -2.3244857788085938, + "step": 5507 + }, + { + "epoch": 0.63, + "learning_rate": 1.1117874283038745e-07, + "logits/chosen": -2.9079818725585938, + "logits/rejected": -3.1298067569732666, + "logps/chosen": -392.1679992675781, + "logps/rejected": -333.46246337890625, + "loss": 0.7285, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1759652942419052, + "rewards/margins": 2.1347365379333496, + "rewards/rejected": -2.310701608657837, + "step": 5508 + }, + { + "epoch": 0.64, + "learning_rate": 1.1114362636076319e-07, + "logits/chosen": -2.898730993270874, + "logits/rejected": -2.7392079830169678, + "logps/chosen": -273.8663635253906, + "logps/rejected": -279.561767578125, + "loss": 0.3804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1743800789117813, + "rewards/margins": 1.2793071269989014, + "rewards/rejected": -1.4536871910095215, + "step": 5509 + }, + { + "epoch": 0.64, + "learning_rate": 1.1110850989113894e-07, + "logits/chosen": -3.3898472785949707, + "logits/rejected": -3.0971789360046387, + "logps/chosen": -145.29794311523438, + "logps/rejected": -105.98072814941406, + "loss": 0.7936, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5425716638565063, + "rewards/margins": 0.16475412249565125, + "rewards/rejected": -0.70732581615448, + "step": 5510 + }, + { + "epoch": 0.64, + "learning_rate": 1.1107339342151468e-07, + "logits/chosen": -3.1593804359436035, + "logits/rejected": -3.061960458755493, + "logps/chosen": -186.45339965820312, + "logps/rejected": -199.9506072998047, + "loss": 0.3424, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07303686439990997, + "rewards/margins": 1.3414933681488037, + "rewards/rejected": -1.2684564590454102, + "step": 5511 + }, + { + "epoch": 0.64, + "learning_rate": 1.1103827695189042e-07, + "logits/chosen": -3.246035575866699, + "logits/rejected": -2.9584169387817383, + "logps/chosen": -186.2506103515625, + "logps/rejected": -213.4463348388672, + "loss": 0.5063, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08119082450866699, + "rewards/margins": 1.144974708557129, + "rewards/rejected": -1.226165533065796, + "step": 5512 + }, + { + "epoch": 0.64, + "learning_rate": 1.1100316048226618e-07, + "logits/chosen": -3.345869541168213, + "logits/rejected": -2.9399325847625732, + "logps/chosen": -266.8641357421875, + "logps/rejected": -268.50604248046875, + "loss": 0.1933, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05347121134400368, + "rewards/margins": 2.6958200931549072, + "rewards/rejected": -2.6423490047454834, + "step": 5513 + }, + { + "epoch": 0.64, + "learning_rate": 1.1096804401264193e-07, + "logits/chosen": -3.7046170234680176, + "logits/rejected": -3.8143067359924316, + "logps/chosen": -225.65672302246094, + "logps/rejected": -219.10513305664062, + "loss": 0.3799, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09888547658920288, + "rewards/margins": 1.8516778945922852, + "rewards/rejected": -1.9505633115768433, + "step": 5514 + }, + { + "epoch": 0.64, + "learning_rate": 1.1093292754301767e-07, + "logits/chosen": -3.517336368560791, + "logits/rejected": -3.5595571994781494, + "logps/chosen": -167.04100036621094, + "logps/rejected": -172.13621520996094, + "loss": 0.592, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0002405792474746704, + "rewards/margins": 1.1323533058166504, + "rewards/rejected": -1.132112741470337, + "step": 5515 + }, + { + "epoch": 0.64, + "learning_rate": 1.1089781107339341e-07, + "logits/chosen": -3.27402663230896, + "logits/rejected": -3.2704644203186035, + "logps/chosen": -187.83917236328125, + "logps/rejected": -196.63037109375, + "loss": 0.1935, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.00946022942662239, + "rewards/margins": 2.1864519119262695, + "rewards/rejected": -2.1959123611450195, + "step": 5516 + }, + { + "epoch": 0.64, + "learning_rate": 1.1086269460376915e-07, + "logits/chosen": -2.6184921264648438, + "logits/rejected": -2.687542200088501, + "logps/chosen": -133.64236450195312, + "logps/rejected": -264.16558837890625, + "loss": 0.1079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22215205430984497, + "rewards/margins": 3.58726167678833, + "rewards/rejected": -3.8094139099121094, + "step": 5517 + }, + { + "epoch": 0.64, + "learning_rate": 1.1082757813414492e-07, + "logits/chosen": -3.6257870197296143, + "logits/rejected": -3.62660551071167, + "logps/chosen": -224.2109375, + "logps/rejected": -225.47927856445312, + "loss": 0.1755, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.022235482931137085, + "rewards/margins": 2.357403039932251, + "rewards/rejected": -2.335167407989502, + "step": 5518 + }, + { + "epoch": 0.64, + "learning_rate": 1.1079246166452066e-07, + "logits/chosen": -3.2906100749969482, + "logits/rejected": -3.007499933242798, + "logps/chosen": -369.81866455078125, + "logps/rejected": -219.42481994628906, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30697324872016907, + "rewards/margins": 1.4028041362762451, + "rewards/rejected": -1.0958307981491089, + "step": 5519 + }, + { + "epoch": 0.64, + "learning_rate": 1.107573451948964e-07, + "logits/chosen": -2.570202350616455, + "logits/rejected": -2.583059787750244, + "logps/chosen": -510.99200439453125, + "logps/rejected": -262.88958740234375, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01783415675163269, + "rewards/margins": 2.1254565715789795, + "rewards/rejected": -2.1432905197143555, + "step": 5520 + }, + { + "epoch": 0.64, + "learning_rate": 1.1072222872527214e-07, + "logits/chosen": -3.207482099533081, + "logits/rejected": -2.9200170040130615, + "logps/chosen": -234.81971740722656, + "logps/rejected": -192.84413146972656, + "loss": 0.3051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3027254045009613, + "rewards/margins": 1.2280296087265015, + "rewards/rejected": -0.9253041744232178, + "step": 5521 + }, + { + "epoch": 0.64, + "learning_rate": 1.106871122556479e-07, + "logits/chosen": -2.925812244415283, + "logits/rejected": -3.4228882789611816, + "logps/chosen": -200.66900634765625, + "logps/rejected": -242.32943725585938, + "loss": 0.0963, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20015142858028412, + "rewards/margins": 2.682528018951416, + "rewards/rejected": -2.48237681388855, + "step": 5522 + }, + { + "epoch": 0.64, + "learning_rate": 1.1065199578602364e-07, + "logits/chosen": -3.066866159439087, + "logits/rejected": -3.4221444129943848, + "logps/chosen": -241.11843872070312, + "logps/rejected": -175.12295532226562, + "loss": 0.3941, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12313181161880493, + "rewards/margins": 2.260941982269287, + "rewards/rejected": -2.137809991836548, + "step": 5523 + }, + { + "epoch": 0.64, + "learning_rate": 1.1061687931639939e-07, + "logits/chosen": -2.6994361877441406, + "logits/rejected": -2.991220712661743, + "logps/chosen": -252.54864501953125, + "logps/rejected": -307.52001953125, + "loss": 0.3453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1569393128156662, + "rewards/margins": 2.6649858951568604, + "rewards/rejected": -2.821925401687622, + "step": 5524 + }, + { + "epoch": 0.64, + "learning_rate": 1.1058176284677513e-07, + "logits/chosen": -2.7881903648376465, + "logits/rejected": -3.095353603363037, + "logps/chosen": -174.21531677246094, + "logps/rejected": -354.4920959472656, + "loss": 0.4865, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03687632083892822, + "rewards/margins": 1.4258685111999512, + "rewards/rejected": -1.3889920711517334, + "step": 5525 + }, + { + "epoch": 0.64, + "learning_rate": 1.1054664637715087e-07, + "logits/chosen": -3.7372450828552246, + "logits/rejected": -3.546281337738037, + "logps/chosen": -200.64089965820312, + "logps/rejected": -160.68017578125, + "loss": 0.6084, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.246497243642807, + "rewards/margins": 0.6750624775886536, + "rewards/rejected": -0.9215598106384277, + "step": 5526 + }, + { + "epoch": 0.64, + "learning_rate": 1.1051152990752663e-07, + "logits/chosen": -2.849851131439209, + "logits/rejected": -3.046604871749878, + "logps/chosen": -292.5186462402344, + "logps/rejected": -327.0318298339844, + "loss": 0.2668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17079542577266693, + "rewards/margins": 1.8089994192123413, + "rewards/rejected": -1.6382039785385132, + "step": 5527 + }, + { + "epoch": 0.64, + "learning_rate": 1.1047641343790237e-07, + "logits/chosen": -3.5392556190490723, + "logits/rejected": -4.038215637207031, + "logps/chosen": -140.36862182617188, + "logps/rejected": -206.01290893554688, + "loss": 0.2809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33839088678359985, + "rewards/margins": 1.8980073928833008, + "rewards/rejected": -2.236398220062256, + "step": 5528 + }, + { + "epoch": 0.64, + "learning_rate": 1.1044129696827811e-07, + "logits/chosen": -3.5254416465759277, + "logits/rejected": -3.6530165672302246, + "logps/chosen": -180.14093017578125, + "logps/rejected": -188.54705810546875, + "loss": 0.2533, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12502899765968323, + "rewards/margins": 2.089557647705078, + "rewards/rejected": -1.9645286798477173, + "step": 5529 + }, + { + "epoch": 0.64, + "learning_rate": 1.1040618049865386e-07, + "logits/chosen": -2.4629435539245605, + "logits/rejected": -2.5604774951934814, + "logps/chosen": -399.15191650390625, + "logps/rejected": -222.55772399902344, + "loss": 0.2493, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11885417252779007, + "rewards/margins": 2.4662117958068848, + "rewards/rejected": -2.3473575115203857, + "step": 5530 + }, + { + "epoch": 0.64, + "learning_rate": 1.1037106402902962e-07, + "logits/chosen": -3.161863327026367, + "logits/rejected": -3.0353612899780273, + "logps/chosen": -283.15850830078125, + "logps/rejected": -319.2021484375, + "loss": 0.5942, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42695289850234985, + "rewards/margins": 1.6866494417190552, + "rewards/rejected": -2.11360239982605, + "step": 5531 + }, + { + "epoch": 0.64, + "learning_rate": 1.1033594755940536e-07, + "logits/chosen": -3.0062460899353027, + "logits/rejected": -2.817401647567749, + "logps/chosen": -346.0284423828125, + "logps/rejected": -325.31866455078125, + "loss": 0.5783, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22136089205741882, + "rewards/margins": 0.5394108295440674, + "rewards/rejected": -0.7607717514038086, + "step": 5532 + }, + { + "epoch": 0.64, + "learning_rate": 1.103008310897811e-07, + "logits/chosen": -3.115601062774658, + "logits/rejected": -2.8772406578063965, + "logps/chosen": -266.0494384765625, + "logps/rejected": -293.56298828125, + "loss": 0.619, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6358642578125, + "rewards/margins": 0.9450751543045044, + "rewards/rejected": -1.5809394121170044, + "step": 5533 + }, + { + "epoch": 0.64, + "learning_rate": 1.1026571462015684e-07, + "logits/chosen": -3.480530023574829, + "logits/rejected": -3.662848949432373, + "logps/chosen": -117.86821746826172, + "logps/rejected": -189.725341796875, + "loss": 0.4649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.004321090877056122, + "rewards/margins": 2.006197214126587, + "rewards/rejected": -2.0105185508728027, + "step": 5534 + }, + { + "epoch": 0.64, + "learning_rate": 1.102305981505326e-07, + "logits/chosen": -3.716085195541382, + "logits/rejected": -3.654440402984619, + "logps/chosen": -220.44017028808594, + "logps/rejected": -286.96490478515625, + "loss": 0.3354, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4073021113872528, + "rewards/margins": 2.4694888591766357, + "rewards/rejected": -2.0621869564056396, + "step": 5535 + }, + { + "epoch": 0.64, + "learning_rate": 1.1019548168090835e-07, + "logits/chosen": -2.8730549812316895, + "logits/rejected": -3.0500383377075195, + "logps/chosen": -349.35247802734375, + "logps/rejected": -264.81915283203125, + "loss": 0.4437, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4813695549964905, + "rewards/margins": 2.3494057655334473, + "rewards/rejected": -1.8680362701416016, + "step": 5536 + }, + { + "epoch": 0.64, + "learning_rate": 1.1016036521128409e-07, + "logits/chosen": -3.096554756164551, + "logits/rejected": -3.180095911026001, + "logps/chosen": -172.13525390625, + "logps/rejected": -198.8572235107422, + "loss": 0.6499, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5295023918151855, + "rewards/margins": 0.6526376008987427, + "rewards/rejected": -1.1821398735046387, + "step": 5537 + }, + { + "epoch": 0.64, + "learning_rate": 1.1012524874165983e-07, + "logits/chosen": -3.1378531455993652, + "logits/rejected": -3.0356433391571045, + "logps/chosen": -148.58685302734375, + "logps/rejected": -188.75653076171875, + "loss": 0.229, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3977190852165222, + "rewards/margins": 2.714970588684082, + "rewards/rejected": -2.317251443862915, + "step": 5538 + }, + { + "epoch": 0.64, + "learning_rate": 1.1009013227203558e-07, + "logits/chosen": -3.0705313682556152, + "logits/rejected": -3.3714499473571777, + "logps/chosen": -193.19554138183594, + "logps/rejected": -220.8395538330078, + "loss": 0.4075, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0800720751285553, + "rewards/margins": 2.1725826263427734, + "rewards/rejected": -2.252654552459717, + "step": 5539 + }, + { + "epoch": 0.64, + "learning_rate": 1.1005501580241132e-07, + "logits/chosen": -2.7951762676239014, + "logits/rejected": -2.848287582397461, + "logps/chosen": -365.0451354980469, + "logps/rejected": -252.99771118164062, + "loss": 0.3297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03073069453239441, + "rewards/margins": 1.3673338890075684, + "rewards/rejected": -1.3980646133422852, + "step": 5540 + }, + { + "epoch": 0.64, + "learning_rate": 1.1001989933278707e-07, + "logits/chosen": -3.3802058696746826, + "logits/rejected": -3.7321338653564453, + "logps/chosen": -208.88134765625, + "logps/rejected": -333.64434814453125, + "loss": 0.378, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2732570767402649, + "rewards/margins": 2.0252866744995117, + "rewards/rejected": -2.298543691635132, + "step": 5541 + }, + { + "epoch": 0.64, + "learning_rate": 1.0998478286316282e-07, + "logits/chosen": -3.1483664512634277, + "logits/rejected": -3.3012075424194336, + "logps/chosen": -291.90533447265625, + "logps/rejected": -316.12078857421875, + "loss": 1.0098, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6501993536949158, + "rewards/margins": -0.13786569237709045, + "rewards/rejected": -0.5123336911201477, + "step": 5542 + }, + { + "epoch": 0.64, + "learning_rate": 1.0994966639353857e-07, + "logits/chosen": -3.832737445831299, + "logits/rejected": -3.5804250240325928, + "logps/chosen": -241.39788818359375, + "logps/rejected": -238.79476928710938, + "loss": 0.4502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6003098487854004, + "rewards/margins": 1.7608039379119873, + "rewards/rejected": -2.361114025115967, + "step": 5543 + }, + { + "epoch": 0.64, + "learning_rate": 1.0991454992391431e-07, + "logits/chosen": -3.0383307933807373, + "logits/rejected": -2.8511760234832764, + "logps/chosen": -178.01217651367188, + "logps/rejected": -200.23818969726562, + "loss": 0.5132, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5584123730659485, + "rewards/margins": 0.9145832657814026, + "rewards/rejected": -1.472995638847351, + "step": 5544 + }, + { + "epoch": 0.64, + "learning_rate": 1.0987943345429005e-07, + "logits/chosen": -3.308229923248291, + "logits/rejected": -3.42641544342041, + "logps/chosen": -204.2576141357422, + "logps/rejected": -284.22882080078125, + "loss": 0.7145, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.951913595199585, + "rewards/margins": 1.8994026184082031, + "rewards/rejected": -2.851316452026367, + "step": 5545 + }, + { + "epoch": 0.64, + "learning_rate": 1.0984431698466579e-07, + "logits/chosen": -2.736722469329834, + "logits/rejected": -2.847074031829834, + "logps/chosen": -212.12921142578125, + "logps/rejected": -300.3636474609375, + "loss": 0.4496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4790023863315582, + "rewards/margins": 0.8247756958007812, + "rewards/rejected": -1.3037781715393066, + "step": 5546 + }, + { + "epoch": 0.64, + "learning_rate": 1.0980920051504156e-07, + "logits/chosen": -2.9862406253814697, + "logits/rejected": -2.9840121269226074, + "logps/chosen": -308.8196716308594, + "logps/rejected": -266.7723693847656, + "loss": 0.2281, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05951756238937378, + "rewards/margins": 1.5760716199874878, + "rewards/rejected": -1.5165541172027588, + "step": 5547 + }, + { + "epoch": 0.64, + "learning_rate": 1.097740840454173e-07, + "logits/chosen": -3.0478577613830566, + "logits/rejected": -3.2702858448028564, + "logps/chosen": -160.64198303222656, + "logps/rejected": -237.85113525390625, + "loss": 0.6475, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5977204442024231, + "rewards/margins": 1.1510095596313477, + "rewards/rejected": -1.748729944229126, + "step": 5548 + }, + { + "epoch": 0.64, + "learning_rate": 1.0973896757579304e-07, + "logits/chosen": -3.814485788345337, + "logits/rejected": -3.588376760482788, + "logps/chosen": -143.7447509765625, + "logps/rejected": -278.7994384765625, + "loss": 0.1919, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10172916948795319, + "rewards/margins": 2.988987922668457, + "rewards/rejected": -3.090717077255249, + "step": 5549 + }, + { + "epoch": 0.64, + "learning_rate": 1.0970385110616878e-07, + "logits/chosen": -3.194347858428955, + "logits/rejected": -3.0685272216796875, + "logps/chosen": -243.16021728515625, + "logps/rejected": -215.71728515625, + "loss": 0.3894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19392141699790955, + "rewards/margins": 1.0382965803146362, + "rewards/rejected": -1.2322180271148682, + "step": 5550 + }, + { + "epoch": 0.64, + "learning_rate": 1.0966873463654455e-07, + "logits/chosen": -3.2503294944763184, + "logits/rejected": -2.9025073051452637, + "logps/chosen": -331.2242126464844, + "logps/rejected": -341.3046569824219, + "loss": 0.6131, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1706608533859253, + "rewards/margins": 0.782849133014679, + "rewards/rejected": -1.95350980758667, + "step": 5551 + }, + { + "epoch": 0.64, + "learning_rate": 1.0963361816692029e-07, + "logits/chosen": -3.2189316749572754, + "logits/rejected": -2.8718652725219727, + "logps/chosen": -416.78631591796875, + "logps/rejected": -246.37713623046875, + "loss": 0.1247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07266174256801605, + "rewards/margins": 2.642735242843628, + "rewards/rejected": -2.7153968811035156, + "step": 5552 + }, + { + "epoch": 0.64, + "learning_rate": 1.0959850169729603e-07, + "logits/chosen": -3.575732707977295, + "logits/rejected": -3.4063620567321777, + "logps/chosen": -327.4098205566406, + "logps/rejected": -285.12481689453125, + "loss": 0.1059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45916301012039185, + "rewards/margins": 2.975454092025757, + "rewards/rejected": -2.5162911415100098, + "step": 5553 + }, + { + "epoch": 0.64, + "learning_rate": 1.0956338522767177e-07, + "logits/chosen": -3.0119707584381104, + "logits/rejected": -3.1084108352661133, + "logps/chosen": -223.31387329101562, + "logps/rejected": -291.1942443847656, + "loss": 0.5748, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3868541717529297, + "rewards/margins": 1.4171669483184814, + "rewards/rejected": -1.8040211200714111, + "step": 5554 + }, + { + "epoch": 0.64, + "learning_rate": 1.0952826875804752e-07, + "logits/chosen": -3.3900554180145264, + "logits/rejected": -3.0866732597351074, + "logps/chosen": -321.4344177246094, + "logps/rejected": -292.443603515625, + "loss": 0.3488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33955633640289307, + "rewards/margins": 1.567571759223938, + "rewards/rejected": -1.907128095626831, + "step": 5555 + }, + { + "epoch": 0.64, + "learning_rate": 1.0949315228842326e-07, + "logits/chosen": -2.999690055847168, + "logits/rejected": -2.804152250289917, + "logps/chosen": -256.8074645996094, + "logps/rejected": -204.11727905273438, + "loss": 0.3829, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17519989609718323, + "rewards/margins": 1.1310919523239136, + "rewards/rejected": -1.3062918186187744, + "step": 5556 + }, + { + "epoch": 0.64, + "learning_rate": 1.09458035818799e-07, + "logits/chosen": -2.6042256355285645, + "logits/rejected": -2.86977219581604, + "logps/chosen": -183.22239685058594, + "logps/rejected": -197.10342407226562, + "loss": 0.4679, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07626602798700333, + "rewards/margins": 1.3117234706878662, + "rewards/rejected": -1.235457420349121, + "step": 5557 + }, + { + "epoch": 0.64, + "learning_rate": 1.0942291934917476e-07, + "logits/chosen": -3.276357650756836, + "logits/rejected": -3.391195774078369, + "logps/chosen": -337.4051818847656, + "logps/rejected": -303.994140625, + "loss": 0.481, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2578878104686737, + "rewards/margins": 1.165971040725708, + "rewards/rejected": -1.4238587617874146, + "step": 5558 + }, + { + "epoch": 0.64, + "learning_rate": 1.0938780287955051e-07, + "logits/chosen": -3.352724552154541, + "logits/rejected": -3.3108954429626465, + "logps/chosen": -390.0657958984375, + "logps/rejected": -295.61553955078125, + "loss": 0.1099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5092910528182983, + "rewards/margins": 3.520625352859497, + "rewards/rejected": -3.011334180831909, + "step": 5559 + }, + { + "epoch": 0.64, + "learning_rate": 1.0935268640992625e-07, + "logits/chosen": -3.1597836017608643, + "logits/rejected": -3.287008285522461, + "logps/chosen": -120.779296875, + "logps/rejected": -241.67713928222656, + "loss": 0.3208, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09977317601442337, + "rewards/margins": 2.0206854343414307, + "rewards/rejected": -1.9209123849868774, + "step": 5560 + }, + { + "epoch": 0.64, + "learning_rate": 1.0931756994030199e-07, + "logits/chosen": -2.970947504043579, + "logits/rejected": -2.5185866355895996, + "logps/chosen": -386.9381103515625, + "logps/rejected": -235.0163116455078, + "loss": 0.1764, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9169564247131348, + "rewards/margins": 2.645935297012329, + "rewards/rejected": -1.7289788722991943, + "step": 5561 + }, + { + "epoch": 0.64, + "learning_rate": 1.0928245347067773e-07, + "logits/chosen": -3.4842095375061035, + "logits/rejected": -3.3589704036712646, + "logps/chosen": -341.6241149902344, + "logps/rejected": -378.51959228515625, + "loss": 0.3594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03753170371055603, + "rewards/margins": 1.4058305025100708, + "rewards/rejected": -1.4433623552322388, + "step": 5562 + }, + { + "epoch": 0.64, + "learning_rate": 1.092473370010535e-07, + "logits/chosen": -3.228346109390259, + "logits/rejected": -3.049185276031494, + "logps/chosen": -271.5245361328125, + "logps/rejected": -276.07611083984375, + "loss": 0.5067, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.019838716834783554, + "rewards/margins": 1.2017756700515747, + "rewards/rejected": -1.1819369792938232, + "step": 5563 + }, + { + "epoch": 0.64, + "learning_rate": 1.0921222053142924e-07, + "logits/chosen": -3.4402101039886475, + "logits/rejected": -3.1711840629577637, + "logps/chosen": -384.1895751953125, + "logps/rejected": -361.6351013183594, + "loss": 0.9413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.775615930557251, + "rewards/margins": 0.20605912804603577, + "rewards/rejected": -0.9816750884056091, + "step": 5564 + }, + { + "epoch": 0.64, + "learning_rate": 1.0917710406180498e-07, + "logits/chosen": -2.81465744972229, + "logits/rejected": -2.9696295261383057, + "logps/chosen": -302.9178771972656, + "logps/rejected": -244.16351318359375, + "loss": 0.4386, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8099825382232666, + "rewards/margins": 1.150705337524414, + "rewards/rejected": -1.9606878757476807, + "step": 5565 + }, + { + "epoch": 0.64, + "learning_rate": 1.0914198759218072e-07, + "logits/chosen": -3.4526453018188477, + "logits/rejected": -3.2858376502990723, + "logps/chosen": -252.64450073242188, + "logps/rejected": -304.3641052246094, + "loss": 0.5275, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2996978759765625, + "rewards/margins": 0.9118590950965881, + "rewards/rejected": -2.211556911468506, + "step": 5566 + }, + { + "epoch": 0.64, + "learning_rate": 1.0910687112255648e-07, + "logits/chosen": -3.6942102909088135, + "logits/rejected": -3.684047222137451, + "logps/chosen": -263.5755615234375, + "logps/rejected": -296.6995544433594, + "loss": 0.5639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05185417830944061, + "rewards/margins": 0.8827250003814697, + "rewards/rejected": -0.9345791935920715, + "step": 5567 + }, + { + "epoch": 0.64, + "learning_rate": 1.0907175465293223e-07, + "logits/chosen": -4.03670597076416, + "logits/rejected": -3.77561354637146, + "logps/chosen": -146.99713134765625, + "logps/rejected": -147.85215759277344, + "loss": 0.5525, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2682439684867859, + "rewards/margins": 1.5591005086898804, + "rewards/rejected": -1.827344536781311, + "step": 5568 + }, + { + "epoch": 0.64, + "learning_rate": 1.0903663818330797e-07, + "logits/chosen": -3.3505730628967285, + "logits/rejected": -3.831346035003662, + "logps/chosen": -109.84162139892578, + "logps/rejected": -224.5396728515625, + "loss": 0.3552, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17383822798728943, + "rewards/margins": 2.12558913230896, + "rewards/rejected": -2.299427032470703, + "step": 5569 + }, + { + "epoch": 0.64, + "learning_rate": 1.0900152171368371e-07, + "logits/chosen": -3.6277406215667725, + "logits/rejected": -4.161089897155762, + "logps/chosen": -261.3705139160156, + "logps/rejected": -379.6358642578125, + "loss": 0.4551, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12518097460269928, + "rewards/margins": 1.8030238151550293, + "rewards/rejected": -1.9282046556472778, + "step": 5570 + }, + { + "epoch": 0.64, + "learning_rate": 1.0896640524405947e-07, + "logits/chosen": -3.1875648498535156, + "logits/rejected": -2.9674971103668213, + "logps/chosen": -204.2544708251953, + "logps/rejected": -239.86083984375, + "loss": 0.5764, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5869861841201782, + "rewards/margins": 1.5737985372543335, + "rewards/rejected": -2.1607847213745117, + "step": 5571 + }, + { + "epoch": 0.64, + "learning_rate": 1.089312887744352e-07, + "logits/chosen": -3.281020402908325, + "logits/rejected": -3.0123519897460938, + "logps/chosen": -242.91847229003906, + "logps/rejected": -158.44375610351562, + "loss": 0.223, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06888549029827118, + "rewards/margins": 1.9734975099563599, + "rewards/rejected": -1.9046120643615723, + "step": 5572 + }, + { + "epoch": 0.64, + "learning_rate": 1.0889617230481095e-07, + "logits/chosen": -2.486341714859009, + "logits/rejected": -2.53477144241333, + "logps/chosen": -383.22998046875, + "logps/rejected": -254.41238403320312, + "loss": 0.5796, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2264101207256317, + "rewards/margins": 0.6975858211517334, + "rewards/rejected": -0.9239959120750427, + "step": 5573 + }, + { + "epoch": 0.64, + "learning_rate": 1.0886105583518669e-07, + "logits/chosen": -3.2129435539245605, + "logits/rejected": -3.1654787063598633, + "logps/chosen": -281.7437744140625, + "logps/rejected": -430.1336975097656, + "loss": 0.309, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05344715714454651, + "rewards/margins": 1.4276573657989502, + "rewards/rejected": -1.3742101192474365, + "step": 5574 + }, + { + "epoch": 0.64, + "learning_rate": 1.0882593936556244e-07, + "logits/chosen": -3.0444867610931396, + "logits/rejected": -2.699658155441284, + "logps/chosen": -110.93022155761719, + "logps/rejected": -239.550537109375, + "loss": 0.2859, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5824467539787292, + "rewards/margins": 2.487058162689209, + "rewards/rejected": -1.9046114683151245, + "step": 5575 + }, + { + "epoch": 0.64, + "learning_rate": 1.087908228959382e-07, + "logits/chosen": -3.68730092048645, + "logits/rejected": -3.498046636581421, + "logps/chosen": -401.29443359375, + "logps/rejected": -299.57666015625, + "loss": 0.7253, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08355779200792313, + "rewards/margins": 0.5526366233825684, + "rewards/rejected": -0.46907883882522583, + "step": 5576 + }, + { + "epoch": 0.64, + "learning_rate": 1.0875570642631394e-07, + "logits/chosen": -3.2425222396850586, + "logits/rejected": -3.1226930618286133, + "logps/chosen": -125.64039611816406, + "logps/rejected": -172.56283569335938, + "loss": 0.4, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8648499846458435, + "rewards/margins": 2.569629669189453, + "rewards/rejected": -1.704779863357544, + "step": 5577 + }, + { + "epoch": 0.64, + "learning_rate": 1.0872058995668968e-07, + "logits/chosen": -2.093470573425293, + "logits/rejected": -2.1296279430389404, + "logps/chosen": -211.73025512695312, + "logps/rejected": -227.899169921875, + "loss": 0.4287, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2163473665714264, + "rewards/margins": 1.1292474269866943, + "rewards/rejected": -1.3455947637557983, + "step": 5578 + }, + { + "epoch": 0.64, + "learning_rate": 1.0868547348706542e-07, + "logits/chosen": -3.247188091278076, + "logits/rejected": -3.2769057750701904, + "logps/chosen": -306.28924560546875, + "logps/rejected": -282.15765380859375, + "loss": 0.2376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06101532280445099, + "rewards/margins": 1.7930705547332764, + "rewards/rejected": -1.854085922241211, + "step": 5579 + }, + { + "epoch": 0.64, + "learning_rate": 1.0865035701744118e-07, + "logits/chosen": -3.0781497955322266, + "logits/rejected": -3.3715529441833496, + "logps/chosen": -260.44757080078125, + "logps/rejected": -310.0091247558594, + "loss": 0.4823, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13576748967170715, + "rewards/margins": 1.4480490684509277, + "rewards/rejected": -1.5838165283203125, + "step": 5580 + }, + { + "epoch": 0.64, + "learning_rate": 1.0861524054781692e-07, + "logits/chosen": -2.9371581077575684, + "logits/rejected": -2.5432183742523193, + "logps/chosen": -304.43212890625, + "logps/rejected": -186.59397888183594, + "loss": 0.3481, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14453229308128357, + "rewards/margins": 1.2859280109405518, + "rewards/rejected": -1.4304603338241577, + "step": 5581 + }, + { + "epoch": 0.64, + "learning_rate": 1.0858012407819267e-07, + "logits/chosen": -2.3198423385620117, + "logits/rejected": -2.144890069961548, + "logps/chosen": -308.636962890625, + "logps/rejected": -346.78814697265625, + "loss": 0.3071, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2574445307254791, + "rewards/margins": 2.143583059310913, + "rewards/rejected": -1.8861385583877563, + "step": 5582 + }, + { + "epoch": 0.64, + "learning_rate": 1.085450076085684e-07, + "logits/chosen": -3.5493483543395996, + "logits/rejected": -3.2361743450164795, + "logps/chosen": -193.277099609375, + "logps/rejected": -210.62564086914062, + "loss": 0.5259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2196657508611679, + "rewards/margins": 1.386404275894165, + "rewards/rejected": -1.6060700416564941, + "step": 5583 + }, + { + "epoch": 0.64, + "learning_rate": 1.0850989113894416e-07, + "logits/chosen": -3.3553285598754883, + "logits/rejected": -3.612657308578491, + "logps/chosen": -310.71856689453125, + "logps/rejected": -184.47161865234375, + "loss": 0.301, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09465652704238892, + "rewards/margins": 1.9318721294403076, + "rewards/rejected": -1.8372156620025635, + "step": 5584 + }, + { + "epoch": 0.64, + "learning_rate": 1.0847477466931991e-07, + "logits/chosen": -3.2950055599212646, + "logits/rejected": -3.2470502853393555, + "logps/chosen": -208.34292602539062, + "logps/rejected": -193.65957641601562, + "loss": 0.258, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22717347741127014, + "rewards/margins": 1.8450474739074707, + "rewards/rejected": -2.072220802307129, + "step": 5585 + }, + { + "epoch": 0.64, + "learning_rate": 1.0843965819969565e-07, + "logits/chosen": -3.690366268157959, + "logits/rejected": -3.8127546310424805, + "logps/chosen": -257.842529296875, + "logps/rejected": -262.7574462890625, + "loss": 0.593, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33239611983299255, + "rewards/margins": 1.1095035076141357, + "rewards/rejected": -1.4418996572494507, + "step": 5586 + }, + { + "epoch": 0.64, + "learning_rate": 1.084045417300714e-07, + "logits/chosen": -3.2315633296966553, + "logits/rejected": -3.470195770263672, + "logps/chosen": -190.51394653320312, + "logps/rejected": -255.328857421875, + "loss": 0.3484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6538544297218323, + "rewards/margins": 1.645661473274231, + "rewards/rejected": -2.299515724182129, + "step": 5587 + }, + { + "epoch": 0.64, + "learning_rate": 1.0836942526044715e-07, + "logits/chosen": -2.4882209300994873, + "logits/rejected": -2.4565813541412354, + "logps/chosen": -328.62774658203125, + "logps/rejected": -271.49755859375, + "loss": 0.6423, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2788046598434448, + "rewards/margins": 0.7455707788467407, + "rewards/rejected": -1.024375557899475, + "step": 5588 + }, + { + "epoch": 0.64, + "learning_rate": 1.0833430879082289e-07, + "logits/chosen": -2.9080185890197754, + "logits/rejected": -3.0100901126861572, + "logps/chosen": -210.41696166992188, + "logps/rejected": -238.1894989013672, + "loss": 0.2806, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.039103079587221146, + "rewards/margins": 2.4273297786712646, + "rewards/rejected": -2.466432809829712, + "step": 5589 + }, + { + "epoch": 0.64, + "learning_rate": 1.0829919232119863e-07, + "logits/chosen": -2.591337203979492, + "logits/rejected": -3.032055377960205, + "logps/chosen": -244.86566162109375, + "logps/rejected": -195.8192138671875, + "loss": 0.5524, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.485135555267334, + "rewards/margins": 0.6468579769134521, + "rewards/rejected": -1.1319935321807861, + "step": 5590 + }, + { + "epoch": 0.64, + "learning_rate": 1.0826407585157437e-07, + "logits/chosen": -3.267516851425171, + "logits/rejected": -2.9798011779785156, + "logps/chosen": -268.37872314453125, + "logps/rejected": -336.0381774902344, + "loss": 0.148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5347472429275513, + "rewards/margins": 2.6066441535949707, + "rewards/rejected": -2.071897029876709, + "step": 5591 + }, + { + "epoch": 0.64, + "learning_rate": 1.0822895938195014e-07, + "logits/chosen": -3.5827057361602783, + "logits/rejected": -2.9033217430114746, + "logps/chosen": -340.9233093261719, + "logps/rejected": -256.255859375, + "loss": 0.1634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10278189927339554, + "rewards/margins": 2.7689175605773926, + "rewards/rejected": -2.871699333190918, + "step": 5592 + }, + { + "epoch": 0.64, + "learning_rate": 1.0819384291232588e-07, + "logits/chosen": -3.2179789543151855, + "logits/rejected": -3.315683603286743, + "logps/chosen": -256.3691711425781, + "logps/rejected": -159.93289184570312, + "loss": 0.2699, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7480844259262085, + "rewards/margins": 1.8089112043380737, + "rewards/rejected": -1.0608270168304443, + "step": 5593 + }, + { + "epoch": 0.64, + "learning_rate": 1.0815872644270162e-07, + "logits/chosen": -3.0654518604278564, + "logits/rejected": -3.1645560264587402, + "logps/chosen": -225.15603637695312, + "logps/rejected": -164.1244659423828, + "loss": 0.3489, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.181911900639534, + "rewards/margins": 1.6123943328857422, + "rewards/rejected": -1.7943061590194702, + "step": 5594 + }, + { + "epoch": 0.64, + "learning_rate": 1.0812360997307736e-07, + "logits/chosen": -2.8024024963378906, + "logits/rejected": -2.9924545288085938, + "logps/chosen": -343.4334716796875, + "logps/rejected": -222.19732666015625, + "loss": 0.4127, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10730773210525513, + "rewards/margins": 1.383644938468933, + "rewards/rejected": -1.2763371467590332, + "step": 5595 + }, + { + "epoch": 0.65, + "learning_rate": 1.0808849350345313e-07, + "logits/chosen": -2.8340485095977783, + "logits/rejected": -2.9356205463409424, + "logps/chosen": -210.2417755126953, + "logps/rejected": -352.7631530761719, + "loss": 0.4619, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25834548473358154, + "rewards/margins": 1.1179499626159668, + "rewards/rejected": -1.3762953281402588, + "step": 5596 + }, + { + "epoch": 0.65, + "learning_rate": 1.0805337703382887e-07, + "logits/chosen": -2.8124303817749023, + "logits/rejected": -2.8547677993774414, + "logps/chosen": -212.1126251220703, + "logps/rejected": -216.63223266601562, + "loss": 0.2948, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010674461722373962, + "rewards/margins": 1.3432278633117676, + "rewards/rejected": -1.3539023399353027, + "step": 5597 + }, + { + "epoch": 0.65, + "learning_rate": 1.0801826056420461e-07, + "logits/chosen": -3.4194626808166504, + "logits/rejected": -3.1343626976013184, + "logps/chosen": -295.12030029296875, + "logps/rejected": -317.6312255859375, + "loss": 0.6399, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6855341196060181, + "rewards/margins": 1.664825439453125, + "rewards/rejected": -2.3503594398498535, + "step": 5598 + }, + { + "epoch": 0.65, + "learning_rate": 1.0798314409458035e-07, + "logits/chosen": -3.280465602874756, + "logits/rejected": -3.305845260620117, + "logps/chosen": -276.3464050292969, + "logps/rejected": -231.20907592773438, + "loss": 0.3184, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6885541677474976, + "rewards/margins": 1.5935680866241455, + "rewards/rejected": -0.905013918876648, + "step": 5599 + }, + { + "epoch": 0.65, + "learning_rate": 1.079480276249561e-07, + "logits/chosen": -2.777763605117798, + "logits/rejected": -2.6619744300842285, + "logps/chosen": -211.89588928222656, + "logps/rejected": -220.45114135742188, + "loss": 0.4555, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20355479419231415, + "rewards/margins": 1.22539222240448, + "rewards/rejected": -1.0218374729156494, + "step": 5600 + }, + { + "epoch": 0.65, + "learning_rate": 1.0791291115533184e-07, + "logits/chosen": -2.2357311248779297, + "logits/rejected": -2.553327798843384, + "logps/chosen": -427.9742736816406, + "logps/rejected": -294.8554382324219, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0345647931098938, + "rewards/margins": 2.150009870529175, + "rewards/rejected": -2.115445137023926, + "step": 5601 + }, + { + "epoch": 0.65, + "learning_rate": 1.078777946857076e-07, + "logits/chosen": -3.3509554862976074, + "logits/rejected": -3.012974500656128, + "logps/chosen": -267.797607421875, + "logps/rejected": -290.2547912597656, + "loss": 0.4646, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0950521230697632, + "rewards/margins": 0.997070848941803, + "rewards/rejected": -2.092123031616211, + "step": 5602 + }, + { + "epoch": 0.65, + "learning_rate": 1.0784267821608334e-07, + "logits/chosen": -2.8489294052124023, + "logits/rejected": -2.9960145950317383, + "logps/chosen": -188.86260986328125, + "logps/rejected": -252.52825927734375, + "loss": 0.1979, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38133570551872253, + "rewards/margins": 2.8531885147094727, + "rewards/rejected": -2.4718527793884277, + "step": 5603 + }, + { + "epoch": 0.65, + "learning_rate": 1.0780756174645909e-07, + "logits/chosen": -3.2402541637420654, + "logits/rejected": -3.009042263031006, + "logps/chosen": -397.97711181640625, + "logps/rejected": -321.26220703125, + "loss": 0.401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10091283917427063, + "rewards/margins": 1.5222771167755127, + "rewards/rejected": -1.6231898069381714, + "step": 5604 + }, + { + "epoch": 0.65, + "learning_rate": 1.0777244527683483e-07, + "logits/chosen": -3.549013137817383, + "logits/rejected": -3.3744044303894043, + "logps/chosen": -224.06903076171875, + "logps/rejected": -120.34078979492188, + "loss": 0.4514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8129531741142273, + "rewards/margins": 1.4392356872558594, + "rewards/rejected": -2.2521889209747314, + "step": 5605 + }, + { + "epoch": 0.65, + "learning_rate": 1.0773732880721057e-07, + "logits/chosen": -3.1628262996673584, + "logits/rejected": -3.16489577293396, + "logps/chosen": -386.4945373535156, + "logps/rejected": -245.6687469482422, + "loss": 0.3294, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35679465532302856, + "rewards/margins": 1.87200129032135, + "rewards/rejected": -1.5152066946029663, + "step": 5606 + }, + { + "epoch": 0.65, + "learning_rate": 1.0770221233758631e-07, + "logits/chosen": -3.3239593505859375, + "logits/rejected": -3.4630179405212402, + "logps/chosen": -363.105712890625, + "logps/rejected": -328.94775390625, + "loss": 0.4745, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11541348695755005, + "rewards/margins": 2.149893283843994, + "rewards/rejected": -2.2653069496154785, + "step": 5607 + }, + { + "epoch": 0.65, + "learning_rate": 1.0766709586796208e-07, + "logits/chosen": -3.5925865173339844, + "logits/rejected": -3.72599720954895, + "logps/chosen": -365.674072265625, + "logps/rejected": -188.48912048339844, + "loss": 0.7316, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8029249310493469, + "rewards/margins": 1.0370036363601685, + "rewards/rejected": -1.8399286270141602, + "step": 5608 + }, + { + "epoch": 0.65, + "learning_rate": 1.0763197939833782e-07, + "logits/chosen": -2.567650318145752, + "logits/rejected": -2.655116558074951, + "logps/chosen": -380.6853942871094, + "logps/rejected": -231.78065490722656, + "loss": 0.1854, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7246997356414795, + "rewards/margins": 2.237149238586426, + "rewards/rejected": -1.5124496221542358, + "step": 5609 + }, + { + "epoch": 0.65, + "learning_rate": 1.0759686292871356e-07, + "logits/chosen": -3.4586851596832275, + "logits/rejected": -3.451690673828125, + "logps/chosen": -261.8019714355469, + "logps/rejected": -297.08782958984375, + "loss": 0.5052, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46848416328430176, + "rewards/margins": 1.188066005706787, + "rewards/rejected": -1.6565501689910889, + "step": 5610 + }, + { + "epoch": 0.65, + "learning_rate": 1.075617464590893e-07, + "logits/chosen": -3.149951934814453, + "logits/rejected": -3.1391890048980713, + "logps/chosen": -207.39163208007812, + "logps/rejected": -298.48443603515625, + "loss": 0.3342, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24646739661693573, + "rewards/margins": 2.761017322540283, + "rewards/rejected": -2.514549970626831, + "step": 5611 + }, + { + "epoch": 0.65, + "learning_rate": 1.0752662998946506e-07, + "logits/chosen": -2.6833510398864746, + "logits/rejected": -2.3555638790130615, + "logps/chosen": -84.63641357421875, + "logps/rejected": -248.94668579101562, + "loss": 0.4146, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06846611201763153, + "rewards/margins": 1.8839759826660156, + "rewards/rejected": -1.9524421691894531, + "step": 5612 + }, + { + "epoch": 0.65, + "learning_rate": 1.0749151351984081e-07, + "logits/chosen": -3.7128190994262695, + "logits/rejected": -3.487022638320923, + "logps/chosen": -144.5499267578125, + "logps/rejected": -131.38104248046875, + "loss": 0.2716, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2076992392539978, + "rewards/margins": 1.8572113513946533, + "rewards/rejected": -2.064910650253296, + "step": 5613 + }, + { + "epoch": 0.65, + "learning_rate": 1.0745639705021655e-07, + "logits/chosen": -2.8327040672302246, + "logits/rejected": -2.8158223628997803, + "logps/chosen": -300.1782531738281, + "logps/rejected": -169.82550048828125, + "loss": 0.3294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11419486254453659, + "rewards/margins": 2.0326669216156006, + "rewards/rejected": -2.1468617916107178, + "step": 5614 + }, + { + "epoch": 0.65, + "learning_rate": 1.0742128058059229e-07, + "logits/chosen": -3.380664825439453, + "logits/rejected": -3.621087074279785, + "logps/chosen": -108.83056640625, + "logps/rejected": -237.49118041992188, + "loss": 0.3475, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.362771213054657, + "rewards/margins": 2.1301302909851074, + "rewards/rejected": -2.492901563644409, + "step": 5615 + }, + { + "epoch": 0.65, + "learning_rate": 1.0738616411096804e-07, + "logits/chosen": -3.5697433948516846, + "logits/rejected": -3.1161513328552246, + "logps/chosen": -248.85910034179688, + "logps/rejected": -304.90283203125, + "loss": 0.1887, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30594247579574585, + "rewards/margins": 2.8118603229522705, + "rewards/rejected": -2.505917549133301, + "step": 5616 + }, + { + "epoch": 0.65, + "learning_rate": 1.0735104764134379e-07, + "logits/chosen": -3.0564703941345215, + "logits/rejected": -2.877027988433838, + "logps/chosen": -361.870361328125, + "logps/rejected": -270.618408203125, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3858398497104645, + "rewards/margins": 2.238760471343994, + "rewards/rejected": -1.8529205322265625, + "step": 5617 + }, + { + "epoch": 0.65, + "learning_rate": 1.0731593117171953e-07, + "logits/chosen": -4.0963873863220215, + "logits/rejected": -4.082793712615967, + "logps/chosen": -290.6500549316406, + "logps/rejected": -279.9942932128906, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0034888237714767456, + "rewards/margins": 3.2841386795043945, + "rewards/rejected": -3.287627696990967, + "step": 5618 + }, + { + "epoch": 0.65, + "learning_rate": 1.0728081470209528e-07, + "logits/chosen": -3.0906553268432617, + "logits/rejected": -2.7943954467773438, + "logps/chosen": -425.14691162109375, + "logps/rejected": -275.3019104003906, + "loss": 0.6221, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5881611108779907, + "rewards/margins": 1.158515214920044, + "rewards/rejected": -1.7466763257980347, + "step": 5619 + }, + { + "epoch": 0.65, + "learning_rate": 1.0724569823247102e-07, + "logits/chosen": -3.1494131088256836, + "logits/rejected": -3.066164016723633, + "logps/chosen": -320.2164611816406, + "logps/rejected": -295.4013977050781, + "loss": 0.4188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3534978926181793, + "rewards/margins": 2.1844289302825928, + "rewards/rejected": -2.5379269123077393, + "step": 5620 + }, + { + "epoch": 0.65, + "learning_rate": 1.0721058176284677e-07, + "logits/chosen": -3.0796542167663574, + "logits/rejected": -2.746368885040283, + "logps/chosen": -246.46578979492188, + "logps/rejected": -277.140380859375, + "loss": 0.2437, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15386377274990082, + "rewards/margins": 2.5156631469726562, + "rewards/rejected": -2.6695268154144287, + "step": 5621 + }, + { + "epoch": 0.65, + "learning_rate": 1.0717546529322252e-07, + "logits/chosen": -3.2416837215423584, + "logits/rejected": -3.155458450317383, + "logps/chosen": -312.3352355957031, + "logps/rejected": -269.7440490722656, + "loss": 0.3523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3387162983417511, + "rewards/margins": 1.5783562660217285, + "rewards/rejected": -1.9170725345611572, + "step": 5622 + }, + { + "epoch": 0.65, + "learning_rate": 1.0714034882359826e-07, + "logits/chosen": -3.30324125289917, + "logits/rejected": -3.296976089477539, + "logps/chosen": -161.32293701171875, + "logps/rejected": -416.4250183105469, + "loss": 0.484, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3976821303367615, + "rewards/margins": 1.7413086891174316, + "rewards/rejected": -2.138990879058838, + "step": 5623 + }, + { + "epoch": 0.65, + "learning_rate": 1.07105232353974e-07, + "logits/chosen": -3.6000545024871826, + "logits/rejected": -3.2748231887817383, + "logps/chosen": -282.6176452636719, + "logps/rejected": -345.45068359375, + "loss": 0.4239, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42987963557243347, + "rewards/margins": 1.4026055335998535, + "rewards/rejected": -0.9727257490158081, + "step": 5624 + }, + { + "epoch": 0.65, + "learning_rate": 1.0707011588434976e-07, + "logits/chosen": -2.7481307983398438, + "logits/rejected": -2.9967517852783203, + "logps/chosen": -440.8220520019531, + "logps/rejected": -239.9080810546875, + "loss": 0.4028, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16004885733127594, + "rewards/margins": 1.4254229068756104, + "rewards/rejected": -1.585471749305725, + "step": 5625 + }, + { + "epoch": 0.65, + "learning_rate": 1.070349994147255e-07, + "logits/chosen": -2.6817069053649902, + "logits/rejected": -2.652165412902832, + "logps/chosen": -268.5359191894531, + "logps/rejected": -302.2108154296875, + "loss": 0.5902, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.042094022035598755, + "rewards/margins": 0.8159489631652832, + "rewards/rejected": -0.8580430150032043, + "step": 5626 + }, + { + "epoch": 0.65, + "learning_rate": 1.0699988294510124e-07, + "logits/chosen": -3.9612483978271484, + "logits/rejected": -3.791958808898926, + "logps/chosen": -124.39517211914062, + "logps/rejected": -113.29580688476562, + "loss": 0.5354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43146008253097534, + "rewards/margins": 0.8343965411186218, + "rewards/rejected": -1.2658566236495972, + "step": 5627 + }, + { + "epoch": 0.65, + "learning_rate": 1.0696476647547699e-07, + "logits/chosen": -3.05950665473938, + "logits/rejected": -3.152378559112549, + "logps/chosen": -153.36898803710938, + "logps/rejected": -193.42507934570312, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5413777828216553, + "rewards/margins": 2.2741198539733887, + "rewards/rejected": -1.732742190361023, + "step": 5628 + }, + { + "epoch": 0.65, + "learning_rate": 1.0692965000585274e-07, + "logits/chosen": -2.5198967456817627, + "logits/rejected": -2.5737547874450684, + "logps/chosen": -244.37142944335938, + "logps/rejected": -240.4387664794922, + "loss": 0.1569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01401326060295105, + "rewards/margins": 2.80594539642334, + "rewards/rejected": -2.8199586868286133, + "step": 5629 + }, + { + "epoch": 0.65, + "learning_rate": 1.0689453353622849e-07, + "logits/chosen": -2.3849613666534424, + "logits/rejected": -2.583472967147827, + "logps/chosen": -263.3632507324219, + "logps/rejected": -233.72970581054688, + "loss": 0.3461, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38611477613449097, + "rewards/margins": 1.4344377517700195, + "rewards/rejected": -1.8205524682998657, + "step": 5630 + }, + { + "epoch": 0.65, + "learning_rate": 1.0685941706660423e-07, + "logits/chosen": -3.03352952003479, + "logits/rejected": -3.284853219985962, + "logps/chosen": -275.52435302734375, + "logps/rejected": -236.57351684570312, + "loss": 0.3627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27701035141944885, + "rewards/margins": 1.753282070159912, + "rewards/rejected": -2.030292510986328, + "step": 5631 + }, + { + "epoch": 0.65, + "learning_rate": 1.0682430059697997e-07, + "logits/chosen": -2.5650057792663574, + "logits/rejected": -2.698627471923828, + "logps/chosen": -169.54940795898438, + "logps/rejected": -238.5128173828125, + "loss": 0.5999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41020867228507996, + "rewards/margins": 1.0185129642486572, + "rewards/rejected": -1.42872154712677, + "step": 5632 + }, + { + "epoch": 0.65, + "learning_rate": 1.0678918412735573e-07, + "logits/chosen": -3.33251690864563, + "logits/rejected": -3.1733808517456055, + "logps/chosen": -271.3424987792969, + "logps/rejected": -320.1819152832031, + "loss": 0.4933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43721261620521545, + "rewards/margins": 0.9759700298309326, + "rewards/rejected": -1.4131826162338257, + "step": 5633 + }, + { + "epoch": 0.65, + "learning_rate": 1.0675406765773147e-07, + "logits/chosen": -3.4792590141296387, + "logits/rejected": -3.5824410915374756, + "logps/chosen": -277.6376037597656, + "logps/rejected": -299.1560363769531, + "loss": 0.1636, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018851593136787415, + "rewards/margins": 3.747931957244873, + "rewards/rejected": -3.7290802001953125, + "step": 5634 + }, + { + "epoch": 0.65, + "learning_rate": 1.0671895118810721e-07, + "logits/chosen": -2.170626640319824, + "logits/rejected": -2.1167328357696533, + "logps/chosen": -118.99354553222656, + "logps/rejected": -225.41033935546875, + "loss": 0.3474, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.061789363622665405, + "rewards/margins": 1.9839577674865723, + "rewards/rejected": -1.922168254852295, + "step": 5635 + }, + { + "epoch": 0.65, + "learning_rate": 1.0668383471848296e-07, + "logits/chosen": -3.1704862117767334, + "logits/rejected": -2.985698938369751, + "logps/chosen": -293.5659484863281, + "logps/rejected": -258.3111267089844, + "loss": 0.1343, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7479022145271301, + "rewards/margins": 2.3396358489990234, + "rewards/rejected": -1.5917335748672485, + "step": 5636 + }, + { + "epoch": 0.65, + "learning_rate": 1.0664871824885872e-07, + "logits/chosen": -3.299772262573242, + "logits/rejected": -3.2477047443389893, + "logps/chosen": -133.55397033691406, + "logps/rejected": -224.42312622070312, + "loss": 0.2976, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8161277770996094, + "rewards/margins": 2.388042449951172, + "rewards/rejected": -3.2041702270507812, + "step": 5637 + }, + { + "epoch": 0.65, + "learning_rate": 1.0661360177923446e-07, + "logits/chosen": -3.3448681831359863, + "logits/rejected": -3.331289291381836, + "logps/chosen": -389.712890625, + "logps/rejected": -331.9437255859375, + "loss": 0.1127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0990728810429573, + "rewards/margins": 2.878101348876953, + "rewards/rejected": -2.9771742820739746, + "step": 5638 + }, + { + "epoch": 0.65, + "learning_rate": 1.065784853096102e-07, + "logits/chosen": -2.780048370361328, + "logits/rejected": -2.751544237136841, + "logps/chosen": -244.10482788085938, + "logps/rejected": -404.24127197265625, + "loss": 0.3062, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2737772762775421, + "rewards/margins": 1.9931374788284302, + "rewards/rejected": -1.7193602323532104, + "step": 5639 + }, + { + "epoch": 0.65, + "learning_rate": 1.0654336883998594e-07, + "logits/chosen": -3.902237892150879, + "logits/rejected": -3.9272589683532715, + "logps/chosen": -316.08642578125, + "logps/rejected": -238.65634155273438, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2896531820297241, + "rewards/margins": 1.7634758949279785, + "rewards/rejected": -2.053129196166992, + "step": 5640 + }, + { + "epoch": 0.65, + "learning_rate": 1.065082523703617e-07, + "logits/chosen": -2.7935855388641357, + "logits/rejected": -3.0234084129333496, + "logps/chosen": -237.99130249023438, + "logps/rejected": -180.79141235351562, + "loss": 0.3619, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2731037735939026, + "rewards/margins": 1.7001707553863525, + "rewards/rejected": -1.4270669221878052, + "step": 5641 + }, + { + "epoch": 0.65, + "learning_rate": 1.0647313590073745e-07, + "logits/chosen": -2.949439764022827, + "logits/rejected": -2.7813570499420166, + "logps/chosen": -310.2611999511719, + "logps/rejected": -259.5787048339844, + "loss": 0.1304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35151568055152893, + "rewards/margins": 3.0993924140930176, + "rewards/rejected": -2.7478768825531006, + "step": 5642 + }, + { + "epoch": 0.65, + "learning_rate": 1.0643801943111319e-07, + "logits/chosen": -2.239032745361328, + "logits/rejected": -2.5805978775024414, + "logps/chosen": -351.34051513671875, + "logps/rejected": -339.5082092285156, + "loss": 0.2077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7640773057937622, + "rewards/margins": 2.4078924655914307, + "rewards/rejected": -1.6438151597976685, + "step": 5643 + }, + { + "epoch": 0.65, + "learning_rate": 1.0640290296148893e-07, + "logits/chosen": -3.7529730796813965, + "logits/rejected": -3.305634021759033, + "logps/chosen": -203.53274536132812, + "logps/rejected": -117.1561050415039, + "loss": 0.7392, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.030054934322834015, + "rewards/margins": 0.595278799533844, + "rewards/rejected": -0.5652239322662354, + "step": 5644 + }, + { + "epoch": 0.65, + "learning_rate": 1.0636778649186468e-07, + "logits/chosen": -3.780831813812256, + "logits/rejected": -3.5922584533691406, + "logps/chosen": -258.83477783203125, + "logps/rejected": -280.8088684082031, + "loss": 0.7261, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5713047981262207, + "rewards/margins": 1.3562636375427246, + "rewards/rejected": -1.9275684356689453, + "step": 5645 + }, + { + "epoch": 0.65, + "learning_rate": 1.0633267002224042e-07, + "logits/chosen": -3.3099231719970703, + "logits/rejected": -3.187441349029541, + "logps/chosen": -235.12351989746094, + "logps/rejected": -298.9351501464844, + "loss": 0.29, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0494069866836071, + "rewards/margins": 2.3913867473602295, + "rewards/rejected": -2.440793752670288, + "step": 5646 + }, + { + "epoch": 0.65, + "learning_rate": 1.0629755355261618e-07, + "logits/chosen": -4.01441764831543, + "logits/rejected": -3.8159525394439697, + "logps/chosen": -139.42787170410156, + "logps/rejected": -159.613525390625, + "loss": 0.5203, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14440304040908813, + "rewards/margins": 1.1954987049102783, + "rewards/rejected": -1.3399016857147217, + "step": 5647 + }, + { + "epoch": 0.65, + "learning_rate": 1.0626243708299192e-07, + "logits/chosen": -2.5943546295166016, + "logits/rejected": -2.703242778778076, + "logps/chosen": -214.1205291748047, + "logps/rejected": -234.7897186279297, + "loss": 0.585, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1280806064605713, + "rewards/margins": 1.4257488250732422, + "rewards/rejected": -1.553829312324524, + "step": 5648 + }, + { + "epoch": 0.65, + "learning_rate": 1.0622732061336767e-07, + "logits/chosen": -3.1391568183898926, + "logits/rejected": -3.265798807144165, + "logps/chosen": -401.6448974609375, + "logps/rejected": -390.2916259765625, + "loss": 0.4921, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4184234142303467, + "rewards/margins": 1.970555305480957, + "rewards/rejected": -1.5521318912506104, + "step": 5649 + }, + { + "epoch": 0.65, + "learning_rate": 1.0619220414374341e-07, + "logits/chosen": -3.4210269451141357, + "logits/rejected": -3.4223904609680176, + "logps/chosen": -240.19163513183594, + "logps/rejected": -282.5373229980469, + "loss": 0.649, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4499392509460449, + "rewards/margins": 1.1601508855819702, + "rewards/rejected": -1.6100900173187256, + "step": 5650 + }, + { + "epoch": 0.65, + "learning_rate": 1.0615708767411915e-07, + "logits/chosen": -3.1556801795959473, + "logits/rejected": -2.656970977783203, + "logps/chosen": -272.407470703125, + "logps/rejected": -272.4090576171875, + "loss": 0.3913, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09946460276842117, + "rewards/margins": 2.429797649383545, + "rewards/rejected": -2.3303327560424805, + "step": 5651 + }, + { + "epoch": 0.65, + "learning_rate": 1.0612197120449489e-07, + "logits/chosen": -2.8886561393737793, + "logits/rejected": -2.7998502254486084, + "logps/chosen": -196.05117797851562, + "logps/rejected": -508.733154296875, + "loss": 0.1769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40366441011428833, + "rewards/margins": 3.828294277191162, + "rewards/rejected": -3.4246296882629395, + "step": 5652 + }, + { + "epoch": 0.65, + "learning_rate": 1.0608685473487066e-07, + "logits/chosen": -3.5574193000793457, + "logits/rejected": -3.5740721225738525, + "logps/chosen": -400.5965881347656, + "logps/rejected": -244.47166442871094, + "loss": 0.4949, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05485007166862488, + "rewards/margins": 1.314725637435913, + "rewards/rejected": -1.2598754167556763, + "step": 5653 + }, + { + "epoch": 0.65, + "learning_rate": 1.060517382652464e-07, + "logits/chosen": -2.7622063159942627, + "logits/rejected": -3.0911450386047363, + "logps/chosen": -371.63128662109375, + "logps/rejected": -325.51800537109375, + "loss": 0.1615, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2772482633590698, + "rewards/margins": 3.532459020614624, + "rewards/rejected": -3.2552106380462646, + "step": 5654 + }, + { + "epoch": 0.65, + "learning_rate": 1.0601662179562214e-07, + "logits/chosen": -2.849799156188965, + "logits/rejected": -2.7424912452697754, + "logps/chosen": -258.6644592285156, + "logps/rejected": -427.9648132324219, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02935117483139038, + "rewards/margins": 2.8424487113952637, + "rewards/rejected": -2.8130974769592285, + "step": 5655 + }, + { + "epoch": 0.65, + "learning_rate": 1.0598150532599788e-07, + "logits/chosen": -2.6591079235076904, + "logits/rejected": -2.7387495040893555, + "logps/chosen": -286.99951171875, + "logps/rejected": -226.531494140625, + "loss": 0.3834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43943923711776733, + "rewards/margins": 1.1658662557601929, + "rewards/rejected": -1.605305552482605, + "step": 5656 + }, + { + "epoch": 0.65, + "learning_rate": 1.0594638885637364e-07, + "logits/chosen": -3.119502067565918, + "logits/rejected": -2.653235912322998, + "logps/chosen": -251.0325927734375, + "logps/rejected": -270.8792724609375, + "loss": 0.3374, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2780793011188507, + "rewards/margins": 2.130380153656006, + "rewards/rejected": -2.408459424972534, + "step": 5657 + }, + { + "epoch": 0.65, + "learning_rate": 1.0591127238674939e-07, + "logits/chosen": -2.950901985168457, + "logits/rejected": -3.069645404815674, + "logps/chosen": -145.67153930664062, + "logps/rejected": -144.027587890625, + "loss": 0.4273, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3278617560863495, + "rewards/margins": 1.1225440502166748, + "rewards/rejected": -0.7946822643280029, + "step": 5658 + }, + { + "epoch": 0.65, + "learning_rate": 1.0587615591712513e-07, + "logits/chosen": -3.4326488971710205, + "logits/rejected": -3.471013307571411, + "logps/chosen": -124.53093719482422, + "logps/rejected": -223.44517517089844, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16564786434173584, + "rewards/margins": 2.682650566101074, + "rewards/rejected": -2.517002820968628, + "step": 5659 + }, + { + "epoch": 0.65, + "learning_rate": 1.0584103944750087e-07, + "logits/chosen": -3.178682804107666, + "logits/rejected": -3.2476272583007812, + "logps/chosen": -321.0895080566406, + "logps/rejected": -253.02748107910156, + "loss": 0.2562, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14146476984024048, + "rewards/margins": 2.842315912246704, + "rewards/rejected": -2.7008509635925293, + "step": 5660 + }, + { + "epoch": 0.65, + "learning_rate": 1.0580592297787662e-07, + "logits/chosen": -3.033768653869629, + "logits/rejected": -2.892059564590454, + "logps/chosen": -222.59097290039062, + "logps/rejected": -168.8028564453125, + "loss": 0.4481, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06116996705532074, + "rewards/margins": 0.9721100926399231, + "rewards/rejected": -0.9109401702880859, + "step": 5661 + }, + { + "epoch": 0.65, + "learning_rate": 1.0577080650825236e-07, + "logits/chosen": -2.9500136375427246, + "logits/rejected": -2.8467724323272705, + "logps/chosen": -255.1674041748047, + "logps/rejected": -290.8746337890625, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28930673003196716, + "rewards/margins": 2.3265459537506104, + "rewards/rejected": -2.0372393131256104, + "step": 5662 + }, + { + "epoch": 0.65, + "learning_rate": 1.057356900386281e-07, + "logits/chosen": -2.848926067352295, + "logits/rejected": -2.918388843536377, + "logps/chosen": -402.5325927734375, + "logps/rejected": -264.5010681152344, + "loss": 0.2247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16099755465984344, + "rewards/margins": 2.3789467811584473, + "rewards/rejected": -2.217949390411377, + "step": 5663 + }, + { + "epoch": 0.65, + "learning_rate": 1.0570057356900386e-07, + "logits/chosen": -3.0099682807922363, + "logits/rejected": -2.885167121887207, + "logps/chosen": -236.37689208984375, + "logps/rejected": -362.9037170410156, + "loss": 0.4405, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3657780885696411, + "rewards/margins": 1.1091043949127197, + "rewards/rejected": -1.4748824834823608, + "step": 5664 + }, + { + "epoch": 0.65, + "learning_rate": 1.0566545709937961e-07, + "logits/chosen": -2.9213805198669434, + "logits/rejected": -3.078748941421509, + "logps/chosen": -183.796875, + "logps/rejected": -229.20748901367188, + "loss": 0.2711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5323242545127869, + "rewards/margins": 1.8985296487808228, + "rewards/rejected": -2.430853843688965, + "step": 5665 + }, + { + "epoch": 0.65, + "learning_rate": 1.0563034062975535e-07, + "logits/chosen": -3.6523122787475586, + "logits/rejected": -3.3650810718536377, + "logps/chosen": -192.48849487304688, + "logps/rejected": -154.14117431640625, + "loss": 0.456, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1224488765001297, + "rewards/margins": 1.600853443145752, + "rewards/rejected": -1.7233024835586548, + "step": 5666 + }, + { + "epoch": 0.65, + "learning_rate": 1.055952241601311e-07, + "logits/chosen": -3.1325795650482178, + "logits/rejected": -2.974757671356201, + "logps/chosen": -222.03225708007812, + "logps/rejected": -437.45953369140625, + "loss": 0.2539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08758679032325745, + "rewards/margins": 2.350001811981201, + "rewards/rejected": -2.4375884532928467, + "step": 5667 + }, + { + "epoch": 0.65, + "learning_rate": 1.0556010769050684e-07, + "logits/chosen": -2.751160144805908, + "logits/rejected": -2.456346035003662, + "logps/chosen": -323.395751953125, + "logps/rejected": -310.3280029296875, + "loss": 0.5966, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2822277843952179, + "rewards/margins": 0.33945244550704956, + "rewards/rejected": -0.6216802597045898, + "step": 5668 + }, + { + "epoch": 0.65, + "learning_rate": 1.0552499122088258e-07, + "logits/chosen": -2.8875417709350586, + "logits/rejected": -2.777507781982422, + "logps/chosen": -214.64027404785156, + "logps/rejected": -267.35870361328125, + "loss": 0.6261, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4231797456741333, + "rewards/margins": 0.5762326717376709, + "rewards/rejected": -0.9994123578071594, + "step": 5669 + }, + { + "epoch": 0.65, + "learning_rate": 1.0548987475125834e-07, + "logits/chosen": -3.3265933990478516, + "logits/rejected": -3.226207971572876, + "logps/chosen": -543.3410034179688, + "logps/rejected": -664.228271484375, + "loss": 0.3178, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28617334365844727, + "rewards/margins": 1.7684576511383057, + "rewards/rejected": -1.4822843074798584, + "step": 5670 + }, + { + "epoch": 0.65, + "learning_rate": 1.0545475828163408e-07, + "logits/chosen": -3.2650251388549805, + "logits/rejected": -3.645474433898926, + "logps/chosen": -249.3783416748047, + "logps/rejected": -189.82521057128906, + "loss": 0.2603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2167520523071289, + "rewards/margins": 1.931557536125183, + "rewards/rejected": -1.7148054838180542, + "step": 5671 + }, + { + "epoch": 0.65, + "learning_rate": 1.0541964181200982e-07, + "logits/chosen": -3.2366621494293213, + "logits/rejected": -3.22756290435791, + "logps/chosen": -224.30191040039062, + "logps/rejected": -176.5642547607422, + "loss": 0.228, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3034258484840393, + "rewards/margins": 3.0575666427612305, + "rewards/rejected": -2.754140853881836, + "step": 5672 + }, + { + "epoch": 0.65, + "learning_rate": 1.0538452534238556e-07, + "logits/chosen": -3.0748085975646973, + "logits/rejected": -3.197624921798706, + "logps/chosen": -197.33633422851562, + "logps/rejected": -222.533447265625, + "loss": 0.2329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00991755723953247, + "rewards/margins": 2.039431095123291, + "rewards/rejected": -2.0493483543395996, + "step": 5673 + }, + { + "epoch": 0.65, + "learning_rate": 1.0534940887276132e-07, + "logits/chosen": -2.5382986068725586, + "logits/rejected": -2.4201886653900146, + "logps/chosen": -224.68167114257812, + "logps/rejected": -251.83700561523438, + "loss": 0.3632, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2533620595932007, + "rewards/margins": 1.2374414205551147, + "rewards/rejected": -1.4908034801483154, + "step": 5674 + }, + { + "epoch": 0.65, + "learning_rate": 1.0531429240313707e-07, + "logits/chosen": -3.029167652130127, + "logits/rejected": -2.9935715198516846, + "logps/chosen": -452.415771484375, + "logps/rejected": -330.2823486328125, + "loss": 0.3319, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1379069685935974, + "rewards/margins": 1.765679955482483, + "rewards/rejected": -1.9035868644714355, + "step": 5675 + }, + { + "epoch": 0.65, + "learning_rate": 1.0527917593351281e-07, + "logits/chosen": -2.7963755130767822, + "logits/rejected": -2.789581537246704, + "logps/chosen": -275.1109924316406, + "logps/rejected": -298.79852294921875, + "loss": 0.2468, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16254812479019165, + "rewards/margins": 2.0897903442382812, + "rewards/rejected": -1.9272422790527344, + "step": 5676 + }, + { + "epoch": 0.65, + "learning_rate": 1.0524405946388855e-07, + "logits/chosen": -3.1262917518615723, + "logits/rejected": -3.1405839920043945, + "logps/chosen": -148.9027557373047, + "logps/rejected": -147.8314208984375, + "loss": 0.5529, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8513997793197632, + "rewards/margins": 0.7240864634513855, + "rewards/rejected": -1.575486183166504, + "step": 5677 + }, + { + "epoch": 0.65, + "learning_rate": 1.0520894299426431e-07, + "logits/chosen": -2.790769338607788, + "logits/rejected": -2.8445677757263184, + "logps/chosen": -239.002197265625, + "logps/rejected": -241.6929168701172, + "loss": 0.5074, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5344011187553406, + "rewards/margins": 1.1009798049926758, + "rewards/rejected": -1.6353809833526611, + "step": 5678 + }, + { + "epoch": 0.65, + "learning_rate": 1.0517382652464005e-07, + "logits/chosen": -3.7280218601226807, + "logits/rejected": -3.8529036045074463, + "logps/chosen": -210.79376220703125, + "logps/rejected": -205.57794189453125, + "loss": 0.4616, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2839612662792206, + "rewards/margins": 1.8908631801605225, + "rewards/rejected": -1.6069018840789795, + "step": 5679 + }, + { + "epoch": 0.65, + "learning_rate": 1.0513871005501579e-07, + "logits/chosen": -3.0787436962127686, + "logits/rejected": -3.214360237121582, + "logps/chosen": -266.3089904785156, + "logps/rejected": -291.978759765625, + "loss": 0.2913, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1352439820766449, + "rewards/margins": 3.3252556324005127, + "rewards/rejected": -3.190011739730835, + "step": 5680 + }, + { + "epoch": 0.65, + "learning_rate": 1.0510359358539154e-07, + "logits/chosen": -3.4405646324157715, + "logits/rejected": -3.6403346061706543, + "logps/chosen": -78.33030700683594, + "logps/rejected": -171.14923095703125, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08642993867397308, + "rewards/margins": 2.9304428100585938, + "rewards/rejected": -2.84401273727417, + "step": 5681 + }, + { + "epoch": 0.66, + "learning_rate": 1.050684771157673e-07, + "logits/chosen": -3.1489312648773193, + "logits/rejected": -3.263981342315674, + "logps/chosen": -166.10752868652344, + "logps/rejected": -216.5714569091797, + "loss": 0.5901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11411190032958984, + "rewards/margins": 0.6056146621704102, + "rewards/rejected": -0.7197266221046448, + "step": 5682 + }, + { + "epoch": 0.66, + "learning_rate": 1.0503336064614304e-07, + "logits/chosen": -3.284754991531372, + "logits/rejected": -3.1118085384368896, + "logps/chosen": -246.32818603515625, + "logps/rejected": -215.7868194580078, + "loss": 0.5536, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16461071372032166, + "rewards/margins": 0.9271101951599121, + "rewards/rejected": -1.0917208194732666, + "step": 5683 + }, + { + "epoch": 0.66, + "learning_rate": 1.0499824417651878e-07, + "logits/chosen": -2.8899500370025635, + "logits/rejected": -2.9293642044067383, + "logps/chosen": -97.51370239257812, + "logps/rejected": -173.61080932617188, + "loss": 0.2343, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18274614214897156, + "rewards/margins": 2.240851640701294, + "rewards/rejected": -2.05810546875, + "step": 5684 + }, + { + "epoch": 0.66, + "learning_rate": 1.0496312770689452e-07, + "logits/chosen": -3.6179728507995605, + "logits/rejected": -3.8891453742980957, + "logps/chosen": -321.82598876953125, + "logps/rejected": -278.4671325683594, + "loss": 0.5094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6125072240829468, + "rewards/margins": 1.3180747032165527, + "rewards/rejected": -1.9305819272994995, + "step": 5685 + }, + { + "epoch": 0.66, + "learning_rate": 1.0492801123727029e-07, + "logits/chosen": -3.957486867904663, + "logits/rejected": -3.761627435684204, + "logps/chosen": -147.50839233398438, + "logps/rejected": -119.27775573730469, + "loss": 0.5428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18673869967460632, + "rewards/margins": 1.2834882736206055, + "rewards/rejected": -1.4702270030975342, + "step": 5686 + }, + { + "epoch": 0.66, + "learning_rate": 1.0489289476764603e-07, + "logits/chosen": -2.319512128829956, + "logits/rejected": -2.418222427368164, + "logps/chosen": -142.8349609375, + "logps/rejected": -245.02413940429688, + "loss": 0.1976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24598731100559235, + "rewards/margins": 2.870877504348755, + "rewards/rejected": -3.1168649196624756, + "step": 5687 + }, + { + "epoch": 0.66, + "learning_rate": 1.0485777829802177e-07, + "logits/chosen": -3.803910970687866, + "logits/rejected": -3.607776403427124, + "logps/chosen": -163.85333251953125, + "logps/rejected": -266.318115234375, + "loss": 0.2832, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5149109959602356, + "rewards/margins": 2.472567558288574, + "rewards/rejected": -1.9576565027236938, + "step": 5688 + }, + { + "epoch": 0.66, + "learning_rate": 1.0482266182839751e-07, + "logits/chosen": -2.8760786056518555, + "logits/rejected": -2.9410862922668457, + "logps/chosen": -212.697509765625, + "logps/rejected": -320.9857177734375, + "loss": 0.2641, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1762629747390747, + "rewards/margins": 3.213963508605957, + "rewards/rejected": -3.037700653076172, + "step": 5689 + }, + { + "epoch": 0.66, + "learning_rate": 1.0478754535877326e-07, + "logits/chosen": -3.1751739978790283, + "logits/rejected": -3.032536506652832, + "logps/chosen": -267.23199462890625, + "logps/rejected": -279.4087829589844, + "loss": 0.5959, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37167856097221375, + "rewards/margins": 1.4079164266586304, + "rewards/rejected": -1.7795950174331665, + "step": 5690 + }, + { + "epoch": 0.66, + "learning_rate": 1.04752428889149e-07, + "logits/chosen": -3.249744415283203, + "logits/rejected": -3.4339518547058105, + "logps/chosen": -183.49972534179688, + "logps/rejected": -215.670166015625, + "loss": 0.6862, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08495617657899857, + "rewards/margins": 1.819348692893982, + "rewards/rejected": -1.9043047428131104, + "step": 5691 + }, + { + "epoch": 0.66, + "learning_rate": 1.0471731241952476e-07, + "logits/chosen": -3.664407253265381, + "logits/rejected": -3.8349153995513916, + "logps/chosen": -217.32522583007812, + "logps/rejected": -290.8777770996094, + "loss": 0.3375, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3529944121837616, + "rewards/margins": 2.1300225257873535, + "rewards/rejected": -1.77702796459198, + "step": 5692 + }, + { + "epoch": 0.66, + "learning_rate": 1.046821959499005e-07, + "logits/chosen": -2.8151869773864746, + "logits/rejected": -2.9243216514587402, + "logps/chosen": -362.2946472167969, + "logps/rejected": -243.14352416992188, + "loss": 0.2589, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7825701236724854, + "rewards/margins": 2.1367433071136475, + "rewards/rejected": -1.354173183441162, + "step": 5693 + }, + { + "epoch": 0.66, + "learning_rate": 1.0464707948027625e-07, + "logits/chosen": -3.183488130569458, + "logits/rejected": -3.603588342666626, + "logps/chosen": -138.3988800048828, + "logps/rejected": -226.831787109375, + "loss": 0.3695, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1802050769329071, + "rewards/margins": 2.0392398834228516, + "rewards/rejected": -1.859034776687622, + "step": 5694 + }, + { + "epoch": 0.66, + "learning_rate": 1.0461196301065199e-07, + "logits/chosen": -3.4468328952789307, + "logits/rejected": -3.568474054336548, + "logps/chosen": -124.92695617675781, + "logps/rejected": -355.1290283203125, + "loss": 0.4872, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12237244844436646, + "rewards/margins": 1.5683296918869019, + "rewards/rejected": -1.4459571838378906, + "step": 5695 + }, + { + "epoch": 0.66, + "learning_rate": 1.0457684654102773e-07, + "logits/chosen": -3.7337703704833984, + "logits/rejected": -3.6179959774017334, + "logps/chosen": -181.03802490234375, + "logps/rejected": -199.02984619140625, + "loss": 0.7119, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32055598497390747, + "rewards/margins": 0.9162235856056213, + "rewards/rejected": -1.2367795705795288, + "step": 5696 + }, + { + "epoch": 0.66, + "learning_rate": 1.0454173007140347e-07, + "logits/chosen": -3.6756057739257812, + "logits/rejected": -3.4971208572387695, + "logps/chosen": -206.87847900390625, + "logps/rejected": -284.82073974609375, + "loss": 0.3417, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45266881585121155, + "rewards/margins": 3.3560338020324707, + "rewards/rejected": -3.8087027072906494, + "step": 5697 + }, + { + "epoch": 0.66, + "learning_rate": 1.0450661360177924e-07, + "logits/chosen": -2.911289691925049, + "logits/rejected": -3.0027456283569336, + "logps/chosen": -320.0033874511719, + "logps/rejected": -307.98486328125, + "loss": 0.4677, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20468509197235107, + "rewards/margins": 1.2846497297286987, + "rewards/rejected": -1.0799646377563477, + "step": 5698 + }, + { + "epoch": 0.66, + "learning_rate": 1.0447149713215498e-07, + "logits/chosen": -3.840421676635742, + "logits/rejected": -3.6942310333251953, + "logps/chosen": -109.56204223632812, + "logps/rejected": -93.37203979492188, + "loss": 0.2908, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.763930082321167, + "rewards/margins": 1.803572654724121, + "rewards/rejected": -1.039642572402954, + "step": 5699 + }, + { + "epoch": 0.66, + "learning_rate": 1.0443638066253072e-07, + "logits/chosen": -3.4806346893310547, + "logits/rejected": -3.6597394943237305, + "logps/chosen": -249.08120727539062, + "logps/rejected": -237.81280517578125, + "loss": 0.5392, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32755815982818604, + "rewards/margins": 0.9470750093460083, + "rewards/rejected": -1.2746331691741943, + "step": 5700 + }, + { + "epoch": 0.66, + "learning_rate": 1.0440126419290646e-07, + "logits/chosen": -3.540438652038574, + "logits/rejected": -3.0810225009918213, + "logps/chosen": -143.7138214111328, + "logps/rejected": -236.2102508544922, + "loss": 0.5851, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.46429991722106934, + "rewards/margins": 0.959042489528656, + "rewards/rejected": -1.4233425855636597, + "step": 5701 + }, + { + "epoch": 0.66, + "learning_rate": 1.0436614772328223e-07, + "logits/chosen": -2.752253293991089, + "logits/rejected": -2.399976968765259, + "logps/chosen": -270.9918212890625, + "logps/rejected": -289.14703369140625, + "loss": 0.2854, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03849659487605095, + "rewards/margins": 2.6767969131469727, + "rewards/rejected": -2.6383001804351807, + "step": 5702 + }, + { + "epoch": 0.66, + "learning_rate": 1.0433103125365797e-07, + "logits/chosen": -3.2046937942504883, + "logits/rejected": -3.295044422149658, + "logps/chosen": -200.97665405273438, + "logps/rejected": -226.44586181640625, + "loss": 0.4062, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12999536097049713, + "rewards/margins": 1.1068569421768188, + "rewards/rejected": -0.9768615961074829, + "step": 5703 + }, + { + "epoch": 0.66, + "learning_rate": 1.0429591478403371e-07, + "logits/chosen": -2.9703588485717773, + "logits/rejected": -3.274094343185425, + "logps/chosen": -156.4149627685547, + "logps/rejected": -224.1671142578125, + "loss": 0.1751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.027678310871124268, + "rewards/margins": 3.3839643001556396, + "rewards/rejected": -3.411642551422119, + "step": 5704 + }, + { + "epoch": 0.66, + "learning_rate": 1.0426079831440945e-07, + "logits/chosen": -3.10577392578125, + "logits/rejected": -3.619148015975952, + "logps/chosen": -131.41033935546875, + "logps/rejected": -298.48724365234375, + "loss": 0.165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0007128790020942688, + "rewards/margins": 2.637416362762451, + "rewards/rejected": -2.638129234313965, + "step": 5705 + }, + { + "epoch": 0.66, + "learning_rate": 1.042256818447852e-07, + "logits/chosen": -2.6301238536834717, + "logits/rejected": -2.424525260925293, + "logps/chosen": -158.76478576660156, + "logps/rejected": -118.92388153076172, + "loss": 0.6512, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5498378276824951, + "rewards/margins": 0.1957525759935379, + "rewards/rejected": -0.7455903887748718, + "step": 5706 + }, + { + "epoch": 0.66, + "learning_rate": 1.0419056537516094e-07, + "logits/chosen": -3.317917823791504, + "logits/rejected": -3.3548166751861572, + "logps/chosen": -269.99188232421875, + "logps/rejected": -327.50982666015625, + "loss": 1.0147, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3903298377990723, + "rewards/margins": 0.8020012378692627, + "rewards/rejected": -2.192330837249756, + "step": 5707 + }, + { + "epoch": 0.66, + "learning_rate": 1.0415544890553668e-07, + "logits/chosen": -2.92586612701416, + "logits/rejected": -2.777235984802246, + "logps/chosen": -215.6224365234375, + "logps/rejected": -191.28253173828125, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4073292315006256, + "rewards/margins": 1.915848731994629, + "rewards/rejected": -1.5085195302963257, + "step": 5708 + }, + { + "epoch": 0.66, + "learning_rate": 1.0412033243591244e-07, + "logits/chosen": -3.2123873233795166, + "logits/rejected": -3.033339262008667, + "logps/chosen": -309.721923828125, + "logps/rejected": -360.3021240234375, + "loss": 0.1267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.015027925372123718, + "rewards/margins": 4.467125415802002, + "rewards/rejected": -4.482152938842773, + "step": 5709 + }, + { + "epoch": 0.66, + "learning_rate": 1.0408521596628819e-07, + "logits/chosen": -3.307359218597412, + "logits/rejected": -3.4250526428222656, + "logps/chosen": -224.22018432617188, + "logps/rejected": -299.73651123046875, + "loss": 0.5033, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.806451141834259, + "rewards/margins": 0.9775727391242981, + "rewards/rejected": -1.7840238809585571, + "step": 5710 + }, + { + "epoch": 0.66, + "learning_rate": 1.0405009949666393e-07, + "logits/chosen": -2.572415828704834, + "logits/rejected": -2.288235664367676, + "logps/chosen": -390.760986328125, + "logps/rejected": -326.1639099121094, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.280534029006958, + "rewards/margins": 1.3565744161605835, + "rewards/rejected": -1.076040506362915, + "step": 5711 + }, + { + "epoch": 0.66, + "learning_rate": 1.0401498302703967e-07, + "logits/chosen": -3.079155445098877, + "logits/rejected": -3.2752201557159424, + "logps/chosen": -274.352294921875, + "logps/rejected": -248.8653564453125, + "loss": 0.339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24281111359596252, + "rewards/margins": 1.1365067958831787, + "rewards/rejected": -1.3793179988861084, + "step": 5712 + }, + { + "epoch": 0.66, + "learning_rate": 1.0397986655741541e-07, + "logits/chosen": -3.3110251426696777, + "logits/rejected": -2.9535210132598877, + "logps/chosen": -249.51333618164062, + "logps/rejected": -331.1379699707031, + "loss": 0.1868, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5496994853019714, + "rewards/margins": 2.435807943344116, + "rewards/rejected": -1.8861083984375, + "step": 5713 + }, + { + "epoch": 0.66, + "learning_rate": 1.0394475008779118e-07, + "logits/chosen": -3.366668224334717, + "logits/rejected": -3.3062572479248047, + "logps/chosen": -343.701416015625, + "logps/rejected": -550.7233276367188, + "loss": 0.357, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25125181674957275, + "rewards/margins": 2.227639675140381, + "rewards/rejected": -2.478891372680664, + "step": 5714 + }, + { + "epoch": 0.66, + "learning_rate": 1.0390963361816692e-07, + "logits/chosen": -3.6016037464141846, + "logits/rejected": -3.4645016193389893, + "logps/chosen": -107.71371459960938, + "logps/rejected": -210.82025146484375, + "loss": 0.2657, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13403849303722382, + "rewards/margins": 3.5827460289001465, + "rewards/rejected": -3.4487078189849854, + "step": 5715 + }, + { + "epoch": 0.66, + "learning_rate": 1.0387451714854266e-07, + "logits/chosen": -2.790761947631836, + "logits/rejected": -2.9193239212036133, + "logps/chosen": -258.6531982421875, + "logps/rejected": -279.71697998046875, + "loss": 0.5587, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4381749629974365, + "rewards/margins": 0.34310364723205566, + "rewards/rejected": -0.7812786102294922, + "step": 5716 + }, + { + "epoch": 0.66, + "learning_rate": 1.038394006789184e-07, + "logits/chosen": -3.4202685356140137, + "logits/rejected": -3.3117189407348633, + "logps/chosen": -444.21319580078125, + "logps/rejected": -345.298095703125, + "loss": 0.2987, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19175538420677185, + "rewards/margins": 1.8954626321792603, + "rewards/rejected": -1.703707218170166, + "step": 5717 + }, + { + "epoch": 0.66, + "learning_rate": 1.0380428420929414e-07, + "logits/chosen": -3.545107841491699, + "logits/rejected": -3.9487690925598145, + "logps/chosen": -216.664306640625, + "logps/rejected": -296.3105163574219, + "loss": 0.4502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7283449769020081, + "rewards/margins": 1.1700036525726318, + "rewards/rejected": -1.8983485698699951, + "step": 5718 + }, + { + "epoch": 0.66, + "learning_rate": 1.0376916773966991e-07, + "logits/chosen": -2.916353940963745, + "logits/rejected": -2.8376336097717285, + "logps/chosen": -313.30224609375, + "logps/rejected": -294.0783996582031, + "loss": 0.2634, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07945895195007324, + "rewards/margins": 2.3460144996643066, + "rewards/rejected": -2.2665555477142334, + "step": 5719 + }, + { + "epoch": 0.66, + "learning_rate": 1.0373405127004565e-07, + "logits/chosen": -3.2879676818847656, + "logits/rejected": -3.1487197875976562, + "logps/chosen": -329.46575927734375, + "logps/rejected": -263.84033203125, + "loss": 0.2686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35222020745277405, + "rewards/margins": 2.444688320159912, + "rewards/rejected": -2.092468023300171, + "step": 5720 + }, + { + "epoch": 0.66, + "learning_rate": 1.0369893480042139e-07, + "logits/chosen": -2.9500763416290283, + "logits/rejected": -2.991264581680298, + "logps/chosen": -292.0182189941406, + "logps/rejected": -346.2338562011719, + "loss": 0.1991, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6156041026115417, + "rewards/margins": 1.9652278423309326, + "rewards/rejected": -1.349623680114746, + "step": 5721 + }, + { + "epoch": 0.66, + "learning_rate": 1.0366381833079713e-07, + "logits/chosen": -2.7860605716705322, + "logits/rejected": -2.4905943870544434, + "logps/chosen": -212.73974609375, + "logps/rejected": -225.8954620361328, + "loss": 0.3071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24492380023002625, + "rewards/margins": 1.484731674194336, + "rewards/rejected": -1.7296556234359741, + "step": 5722 + }, + { + "epoch": 0.66, + "learning_rate": 1.0362870186117289e-07, + "logits/chosen": -3.043379306793213, + "logits/rejected": -2.696592330932617, + "logps/chosen": -283.6926574707031, + "logps/rejected": -224.9112548828125, + "loss": 0.4477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.66765296459198, + "rewards/margins": 1.1680107116699219, + "rewards/rejected": -1.8356637954711914, + "step": 5723 + }, + { + "epoch": 0.66, + "learning_rate": 1.0359358539154863e-07, + "logits/chosen": -2.4576287269592285, + "logits/rejected": -2.4469122886657715, + "logps/chosen": -262.71685791015625, + "logps/rejected": -321.0290222167969, + "loss": 0.371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11369910836219788, + "rewards/margins": 1.3149696588516235, + "rewards/rejected": -1.428668737411499, + "step": 5724 + }, + { + "epoch": 0.66, + "learning_rate": 1.0355846892192437e-07, + "logits/chosen": -2.110001802444458, + "logits/rejected": -2.3591601848602295, + "logps/chosen": -270.73016357421875, + "logps/rejected": -211.98812866210938, + "loss": 0.3144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1545550376176834, + "rewards/margins": 1.9593926668167114, + "rewards/rejected": -2.113947629928589, + "step": 5725 + }, + { + "epoch": 0.66, + "learning_rate": 1.0352335245230012e-07, + "logits/chosen": -2.622436285018921, + "logits/rejected": -2.674302577972412, + "logps/chosen": -103.29092407226562, + "logps/rejected": -248.2713623046875, + "loss": 0.2216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6769444942474365, + "rewards/margins": 2.1094658374786377, + "rewards/rejected": -1.4325213432312012, + "step": 5726 + }, + { + "epoch": 0.66, + "learning_rate": 1.0348823598267588e-07, + "logits/chosen": -2.843531847000122, + "logits/rejected": -2.6668548583984375, + "logps/chosen": -278.90863037109375, + "logps/rejected": -343.7578430175781, + "loss": 0.7383, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2215561419725418, + "rewards/margins": 0.4181056320667267, + "rewards/rejected": -0.6396617889404297, + "step": 5727 + }, + { + "epoch": 0.66, + "learning_rate": 1.0345311951305162e-07, + "logits/chosen": -2.365753650665283, + "logits/rejected": -2.570936441421509, + "logps/chosen": -289.2744140625, + "logps/rejected": -277.8552551269531, + "loss": 0.3276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05299463868141174, + "rewards/margins": 1.836580753326416, + "rewards/rejected": -1.8895753622055054, + "step": 5728 + }, + { + "epoch": 0.66, + "learning_rate": 1.0341800304342736e-07, + "logits/chosen": -3.182713031768799, + "logits/rejected": -2.8020620346069336, + "logps/chosen": -382.60546875, + "logps/rejected": -385.7548522949219, + "loss": 0.1858, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2580437660217285, + "rewards/margins": 2.751758575439453, + "rewards/rejected": -2.4937148094177246, + "step": 5729 + }, + { + "epoch": 0.66, + "learning_rate": 1.033828865738031e-07, + "logits/chosen": -3.3183655738830566, + "logits/rejected": -3.046914577484131, + "logps/chosen": -214.63174438476562, + "logps/rejected": -295.0330810546875, + "loss": 0.3521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3338734209537506, + "rewards/margins": 2.296858310699463, + "rewards/rejected": -2.6307315826416016, + "step": 5730 + }, + { + "epoch": 0.66, + "learning_rate": 1.0334777010417886e-07, + "logits/chosen": -3.3060357570648193, + "logits/rejected": -3.224677085876465, + "logps/chosen": -366.2154541015625, + "logps/rejected": -312.2091064453125, + "loss": 0.4923, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46116968989372253, + "rewards/margins": 1.7030208110809326, + "rewards/rejected": -2.1641905307769775, + "step": 5731 + }, + { + "epoch": 0.66, + "learning_rate": 1.033126536345546e-07, + "logits/chosen": -3.242741584777832, + "logits/rejected": -3.4385271072387695, + "logps/chosen": -167.60096740722656, + "logps/rejected": -212.5272674560547, + "loss": 0.3071, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06767319142818451, + "rewards/margins": 2.017521381378174, + "rewards/rejected": -1.9498481750488281, + "step": 5732 + }, + { + "epoch": 0.66, + "learning_rate": 1.0327753716493035e-07, + "logits/chosen": -3.8079960346221924, + "logits/rejected": -3.7925806045532227, + "logps/chosen": -138.5676727294922, + "logps/rejected": -171.40826416015625, + "loss": 0.5716, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23291721940040588, + "rewards/margins": 0.6927387118339539, + "rewards/rejected": -0.9256559610366821, + "step": 5733 + }, + { + "epoch": 0.66, + "learning_rate": 1.0324242069530609e-07, + "logits/chosen": -2.6706252098083496, + "logits/rejected": -2.9224562644958496, + "logps/chosen": -383.078857421875, + "logps/rejected": -248.27459716796875, + "loss": 0.426, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1918918490409851, + "rewards/margins": 1.5502405166625977, + "rewards/rejected": -1.3583484888076782, + "step": 5734 + }, + { + "epoch": 0.66, + "learning_rate": 1.0320730422568184e-07, + "logits/chosen": -2.640475034713745, + "logits/rejected": -2.404812812805176, + "logps/chosen": -190.0913543701172, + "logps/rejected": -200.3535614013672, + "loss": 0.4559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49830782413482666, + "rewards/margins": 0.7523486018180847, + "rewards/rejected": -1.2506563663482666, + "step": 5735 + }, + { + "epoch": 0.66, + "learning_rate": 1.031721877560576e-07, + "logits/chosen": -3.041409730911255, + "logits/rejected": -3.019402027130127, + "logps/chosen": -264.6996154785156, + "logps/rejected": -371.77130126953125, + "loss": 0.3476, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015005752444267273, + "rewards/margins": 3.2795512676239014, + "rewards/rejected": -3.26454496383667, + "step": 5736 + }, + { + "epoch": 0.66, + "learning_rate": 1.0313707128643333e-07, + "logits/chosen": -2.8524675369262695, + "logits/rejected": -2.885733127593994, + "logps/chosen": -387.8680419921875, + "logps/rejected": -312.30218505859375, + "loss": 0.5176, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44686976075172424, + "rewards/margins": 1.2730282545089722, + "rewards/rejected": -0.8261585235595703, + "step": 5737 + }, + { + "epoch": 0.66, + "learning_rate": 1.0310195481680908e-07, + "logits/chosen": -2.345773935317993, + "logits/rejected": -2.759247303009033, + "logps/chosen": -321.7190246582031, + "logps/rejected": -215.51544189453125, + "loss": 0.1919, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06312372535467148, + "rewards/margins": 2.3416717052459717, + "rewards/rejected": -2.278548002243042, + "step": 5738 + }, + { + "epoch": 0.66, + "learning_rate": 1.0306683834718483e-07, + "logits/chosen": -3.4460511207580566, + "logits/rejected": -3.118619680404663, + "logps/chosen": -286.6305847167969, + "logps/rejected": -173.76528930664062, + "loss": 0.3881, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5135477185249329, + "rewards/margins": 1.3169912099838257, + "rewards/rejected": -1.8305387496948242, + "step": 5739 + }, + { + "epoch": 0.66, + "learning_rate": 1.0303172187756057e-07, + "logits/chosen": -3.0794808864593506, + "logits/rejected": -2.8718647956848145, + "logps/chosen": -271.12091064453125, + "logps/rejected": -255.77081298828125, + "loss": 0.2727, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26195651292800903, + "rewards/margins": 2.050210475921631, + "rewards/rejected": -1.7882541418075562, + "step": 5740 + }, + { + "epoch": 0.66, + "learning_rate": 1.0299660540793631e-07, + "logits/chosen": -3.616921901702881, + "logits/rejected": -3.5958313941955566, + "logps/chosen": -240.07421875, + "logps/rejected": -308.73089599609375, + "loss": 0.3026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3943845331668854, + "rewards/margins": 1.7851823568344116, + "rewards/rejected": -2.1795668601989746, + "step": 5741 + }, + { + "epoch": 0.66, + "learning_rate": 1.0296148893831205e-07, + "logits/chosen": -3.210824489593506, + "logits/rejected": -2.970659017562866, + "logps/chosen": -419.97796630859375, + "logps/rejected": -358.19085693359375, + "loss": 0.9998, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6633117198944092, + "rewards/margins": 0.7042019963264465, + "rewards/rejected": -1.367513656616211, + "step": 5742 + }, + { + "epoch": 0.66, + "learning_rate": 1.0292637246868782e-07, + "logits/chosen": -2.979198694229126, + "logits/rejected": -2.5398201942443848, + "logps/chosen": -293.94439697265625, + "logps/rejected": -277.1271667480469, + "loss": 0.3335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5395147204399109, + "rewards/margins": 1.6915919780731201, + "rewards/rejected": -2.231106758117676, + "step": 5743 + }, + { + "epoch": 0.66, + "learning_rate": 1.0289125599906356e-07, + "logits/chosen": -3.6044628620147705, + "logits/rejected": -3.719559669494629, + "logps/chosen": -289.176025390625, + "logps/rejected": -311.2850036621094, + "loss": 0.5243, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.058010444045066833, + "rewards/margins": 0.541130006313324, + "rewards/rejected": -0.5991405248641968, + "step": 5744 + }, + { + "epoch": 0.66, + "learning_rate": 1.028561395294393e-07, + "logits/chosen": -2.6133618354797363, + "logits/rejected": -2.87786602973938, + "logps/chosen": -216.2079315185547, + "logps/rejected": -193.7664337158203, + "loss": 0.5019, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05857527628540993, + "rewards/margins": 1.7002768516540527, + "rewards/rejected": -1.6417016983032227, + "step": 5745 + }, + { + "epoch": 0.66, + "learning_rate": 1.0282102305981504e-07, + "logits/chosen": -3.280165195465088, + "logits/rejected": -3.311551332473755, + "logps/chosen": -190.90087890625, + "logps/rejected": -228.99082946777344, + "loss": 0.2795, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09299305081367493, + "rewards/margins": 1.8984345197677612, + "rewards/rejected": -1.8054416179656982, + "step": 5746 + }, + { + "epoch": 0.66, + "learning_rate": 1.0278590659019081e-07, + "logits/chosen": -2.6901144981384277, + "logits/rejected": -2.9623804092407227, + "logps/chosen": -314.64324951171875, + "logps/rejected": -197.41644287109375, + "loss": 0.4093, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12403127551078796, + "rewards/margins": 2.2017598152160645, + "rewards/rejected": -2.077728509902954, + "step": 5747 + }, + { + "epoch": 0.66, + "learning_rate": 1.0275079012056655e-07, + "logits/chosen": -2.7973833084106445, + "logits/rejected": -3.1017847061157227, + "logps/chosen": -255.87982177734375, + "logps/rejected": -200.33206176757812, + "loss": 0.1903, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39685922861099243, + "rewards/margins": 2.0026543140411377, + "rewards/rejected": -1.6057950258255005, + "step": 5748 + }, + { + "epoch": 0.66, + "learning_rate": 1.0271567365094229e-07, + "logits/chosen": -2.4292118549346924, + "logits/rejected": -2.435119867324829, + "logps/chosen": -325.36627197265625, + "logps/rejected": -277.3565368652344, + "loss": 0.3013, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29526379704475403, + "rewards/margins": 1.527868390083313, + "rewards/rejected": -1.8231322765350342, + "step": 5749 + }, + { + "epoch": 0.66, + "learning_rate": 1.0268055718131803e-07, + "logits/chosen": -3.1930861473083496, + "logits/rejected": -3.105412006378174, + "logps/chosen": -158.46270751953125, + "logps/rejected": -219.99899291992188, + "loss": 0.8084, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3112417757511139, + "rewards/margins": 0.7355090379714966, + "rewards/rejected": -1.046750783920288, + "step": 5750 + }, + { + "epoch": 0.66, + "learning_rate": 1.0264544071169378e-07, + "logits/chosen": -2.847608804702759, + "logits/rejected": -2.8641858100891113, + "logps/chosen": -241.14773559570312, + "logps/rejected": -227.56137084960938, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14421606063842773, + "rewards/margins": 1.8780659437179565, + "rewards/rejected": -2.0222818851470947, + "step": 5751 + }, + { + "epoch": 0.66, + "learning_rate": 1.0261032424206952e-07, + "logits/chosen": -3.65132999420166, + "logits/rejected": -3.207498073577881, + "logps/chosen": -269.8680725097656, + "logps/rejected": -225.82614135742188, + "loss": 0.3324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14805744588375092, + "rewards/margins": 1.2387300729751587, + "rewards/rejected": -1.3867875337600708, + "step": 5752 + }, + { + "epoch": 0.66, + "learning_rate": 1.0257520777244528e-07, + "logits/chosen": -2.9970133304595947, + "logits/rejected": -3.2680580615997314, + "logps/chosen": -279.75506591796875, + "logps/rejected": -259.4080810546875, + "loss": 0.3932, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7207176089286804, + "rewards/margins": 1.026503324508667, + "rewards/rejected": -0.3057858347892761, + "step": 5753 + }, + { + "epoch": 0.66, + "learning_rate": 1.0254009130282102e-07, + "logits/chosen": -3.44746994972229, + "logits/rejected": -3.162877321243286, + "logps/chosen": -197.7921142578125, + "logps/rejected": -163.2577667236328, + "loss": 0.3743, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02987562119960785, + "rewards/margins": 1.3482426404953003, + "rewards/rejected": -1.3183670043945312, + "step": 5754 + }, + { + "epoch": 0.66, + "learning_rate": 1.0250497483319677e-07, + "logits/chosen": -2.5681235790252686, + "logits/rejected": -2.328019380569458, + "logps/chosen": -297.33062744140625, + "logps/rejected": -267.73529052734375, + "loss": 0.3946, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03810843825340271, + "rewards/margins": 1.5640288591384888, + "rewards/rejected": -1.5259203910827637, + "step": 5755 + }, + { + "epoch": 0.66, + "learning_rate": 1.0246985836357251e-07, + "logits/chosen": -3.223646402359009, + "logits/rejected": -3.1563305854797363, + "logps/chosen": -416.85455322265625, + "logps/rejected": -366.9803161621094, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29481378197669983, + "rewards/margins": 2.5134780406951904, + "rewards/rejected": -2.2186641693115234, + "step": 5756 + }, + { + "epoch": 0.66, + "learning_rate": 1.0243474189394825e-07, + "logits/chosen": -3.3645846843719482, + "logits/rejected": -3.0482821464538574, + "logps/chosen": -250.76380920410156, + "logps/rejected": -205.61764526367188, + "loss": 0.5448, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0019525587558746338, + "rewards/margins": 0.9233155250549316, + "rewards/rejected": -0.9252680540084839, + "step": 5757 + }, + { + "epoch": 0.66, + "learning_rate": 1.02399625424324e-07, + "logits/chosen": -3.7089920043945312, + "logits/rejected": -3.4363558292388916, + "logps/chosen": -211.56361389160156, + "logps/rejected": -268.3355712890625, + "loss": 0.4072, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9486469030380249, + "rewards/margins": 2.0298609733581543, + "rewards/rejected": -2.9785079956054688, + "step": 5758 + }, + { + "epoch": 0.66, + "learning_rate": 1.0236450895469976e-07, + "logits/chosen": -3.1310739517211914, + "logits/rejected": -2.936893939971924, + "logps/chosen": -269.208740234375, + "logps/rejected": -186.3160858154297, + "loss": 0.6258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13150577247142792, + "rewards/margins": 0.5692039132118225, + "rewards/rejected": -0.7007097005844116, + "step": 5759 + }, + { + "epoch": 0.66, + "learning_rate": 1.023293924850755e-07, + "logits/chosen": -3.5869882106781006, + "logits/rejected": -3.253415107727051, + "logps/chosen": -326.01904296875, + "logps/rejected": -340.59417724609375, + "loss": 0.5657, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8326348066329956, + "rewards/margins": 2.3703763484954834, + "rewards/rejected": -3.2030110359191895, + "step": 5760 + }, + { + "epoch": 0.66, + "learning_rate": 1.0229427601545124e-07, + "logits/chosen": -2.6967151165008545, + "logits/rejected": -2.81656551361084, + "logps/chosen": -316.7264709472656, + "logps/rejected": -256.78936767578125, + "loss": 0.2689, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03522692248225212, + "rewards/margins": 1.8587560653686523, + "rewards/rejected": -1.8235292434692383, + "step": 5761 + }, + { + "epoch": 0.66, + "learning_rate": 1.0225915954582698e-07, + "logits/chosen": -2.6511945724487305, + "logits/rejected": -2.6449978351593018, + "logps/chosen": -281.9930725097656, + "logps/rejected": -209.66607666015625, + "loss": 0.5767, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04240059107542038, + "rewards/margins": 0.6381044387817383, + "rewards/rejected": -0.6805050373077393, + "step": 5762 + }, + { + "epoch": 0.66, + "learning_rate": 1.0222404307620272e-07, + "logits/chosen": -3.1640877723693848, + "logits/rejected": -2.7555370330810547, + "logps/chosen": -438.7602233886719, + "logps/rejected": -346.1592102050781, + "loss": 0.3816, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01563824713230133, + "rewards/margins": 1.8843624591827393, + "rewards/rejected": -1.900000810623169, + "step": 5763 + }, + { + "epoch": 0.66, + "learning_rate": 1.0218892660657849e-07, + "logits/chosen": -2.4433419704437256, + "logits/rejected": -2.7226004600524902, + "logps/chosen": -170.5955810546875, + "logps/rejected": -267.266357421875, + "loss": 0.6218, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9170100092887878, + "rewards/margins": 1.0372779369354248, + "rewards/rejected": -1.9542880058288574, + "step": 5764 + }, + { + "epoch": 0.66, + "learning_rate": 1.0215381013695423e-07, + "logits/chosen": -2.6751208305358887, + "logits/rejected": -2.867249011993408, + "logps/chosen": -154.9772186279297, + "logps/rejected": -199.23590087890625, + "loss": 1.0693, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2327595949172974, + "rewards/margins": -0.4381016492843628, + "rewards/rejected": -0.7946579456329346, + "step": 5765 + }, + { + "epoch": 0.66, + "learning_rate": 1.0211869366732997e-07, + "logits/chosen": -4.129702568054199, + "logits/rejected": -3.8299977779388428, + "logps/chosen": -363.24261474609375, + "logps/rejected": -212.00680541992188, + "loss": 0.2667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21536211669445038, + "rewards/margins": 1.8848932981491089, + "rewards/rejected": -2.100255250930786, + "step": 5766 + }, + { + "epoch": 0.66, + "learning_rate": 1.0208357719770571e-07, + "logits/chosen": -2.9163601398468018, + "logits/rejected": -2.844752550125122, + "logps/chosen": -247.35093688964844, + "logps/rejected": -271.97711181640625, + "loss": 0.197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.115823894739151, + "rewards/margins": 2.4005630016326904, + "rewards/rejected": -2.5163869857788086, + "step": 5767 + }, + { + "epoch": 0.66, + "learning_rate": 1.0204846072808147e-07, + "logits/chosen": -1.7753016948699951, + "logits/rejected": -2.1553795337677, + "logps/chosen": -428.62896728515625, + "logps/rejected": -381.05859375, + "loss": 0.6075, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.516228973865509, + "rewards/margins": 1.1322059631347656, + "rewards/rejected": -0.6159769296646118, + "step": 5768 + }, + { + "epoch": 0.67, + "learning_rate": 1.0201334425845721e-07, + "logits/chosen": -2.284559726715088, + "logits/rejected": -2.2727506160736084, + "logps/chosen": -301.8162841796875, + "logps/rejected": -253.64382934570312, + "loss": 1.1537, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6056938767433167, + "rewards/margins": 0.03116026520729065, + "rewards/rejected": -0.6368541121482849, + "step": 5769 + }, + { + "epoch": 0.67, + "learning_rate": 1.0197822778883296e-07, + "logits/chosen": -3.067047595977783, + "logits/rejected": -3.5173614025115967, + "logps/chosen": -176.16964721679688, + "logps/rejected": -190.1708526611328, + "loss": 0.4539, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0789172500371933, + "rewards/margins": 2.098031520843506, + "rewards/rejected": -2.1769487857818604, + "step": 5770 + }, + { + "epoch": 0.67, + "learning_rate": 1.019431113192087e-07, + "logits/chosen": -3.1975488662719727, + "logits/rejected": -3.459874153137207, + "logps/chosen": -113.69181823730469, + "logps/rejected": -200.95675659179688, + "loss": 0.2564, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33539271354675293, + "rewards/margins": 2.6684658527374268, + "rewards/rejected": -2.333073139190674, + "step": 5771 + }, + { + "epoch": 0.67, + "learning_rate": 1.0190799484958446e-07, + "logits/chosen": -2.847388505935669, + "logits/rejected": -2.923521041870117, + "logps/chosen": -150.17897033691406, + "logps/rejected": -170.14898681640625, + "loss": 0.4886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09042772650718689, + "rewards/margins": 1.5247735977172852, + "rewards/rejected": -1.4343459606170654, + "step": 5772 + }, + { + "epoch": 0.67, + "learning_rate": 1.018728783799602e-07, + "logits/chosen": -3.1550066471099854, + "logits/rejected": -3.094484806060791, + "logps/chosen": -209.30865478515625, + "logps/rejected": -138.1820068359375, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10885527729988098, + "rewards/margins": 1.2718846797943115, + "rewards/rejected": -1.3807399272918701, + "step": 5773 + }, + { + "epoch": 0.67, + "learning_rate": 1.0183776191033594e-07, + "logits/chosen": -3.1068968772888184, + "logits/rejected": -3.2536115646362305, + "logps/chosen": -129.399169921875, + "logps/rejected": -144.71841430664062, + "loss": 0.4908, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03554859757423401, + "rewards/margins": 1.4550303220748901, + "rewards/rejected": -1.4905788898468018, + "step": 5774 + }, + { + "epoch": 0.67, + "learning_rate": 1.0180264544071168e-07, + "logits/chosen": -3.316539764404297, + "logits/rejected": -3.354823589324951, + "logps/chosen": -164.30441284179688, + "logps/rejected": -258.5980224609375, + "loss": 0.3985, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26064959168434143, + "rewards/margins": 1.0162330865859985, + "rewards/rejected": -1.2768826484680176, + "step": 5775 + }, + { + "epoch": 0.67, + "learning_rate": 1.0176752897108744e-07, + "logits/chosen": -3.994906425476074, + "logits/rejected": -4.15238618850708, + "logps/chosen": -226.6694793701172, + "logps/rejected": -328.47900390625, + "loss": 0.4483, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10846003144979477, + "rewards/margins": 1.8222503662109375, + "rewards/rejected": -1.7137904167175293, + "step": 5776 + }, + { + "epoch": 0.67, + "learning_rate": 1.0173241250146318e-07, + "logits/chosen": -3.3136508464813232, + "logits/rejected": -3.368170976638794, + "logps/chosen": -262.1461181640625, + "logps/rejected": -326.2315673828125, + "loss": 0.3512, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2825905680656433, + "rewards/margins": 2.5727834701538086, + "rewards/rejected": -2.855374336242676, + "step": 5777 + }, + { + "epoch": 0.67, + "learning_rate": 1.0169729603183893e-07, + "logits/chosen": -3.6554059982299805, + "logits/rejected": -3.600076675415039, + "logps/chosen": -392.1745300292969, + "logps/rejected": -472.44342041015625, + "loss": 0.3589, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7090254426002502, + "rewards/margins": 2.098874092102051, + "rewards/rejected": -2.8078997135162354, + "step": 5778 + }, + { + "epoch": 0.67, + "learning_rate": 1.0166217956221467e-07, + "logits/chosen": -3.3757266998291016, + "logits/rejected": -3.0989155769348145, + "logps/chosen": -266.2609558105469, + "logps/rejected": -206.15481567382812, + "loss": 0.6698, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17097771167755127, + "rewards/margins": 0.9181965589523315, + "rewards/rejected": -1.0891743898391724, + "step": 5779 + }, + { + "epoch": 0.67, + "learning_rate": 1.0162706309259042e-07, + "logits/chosen": -2.482785701751709, + "logits/rejected": -2.464113235473633, + "logps/chosen": -479.3082275390625, + "logps/rejected": -307.0700378417969, + "loss": 0.3014, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3413582146167755, + "rewards/margins": 1.5213143825531006, + "rewards/rejected": -1.1799561977386475, + "step": 5780 + }, + { + "epoch": 0.67, + "learning_rate": 1.0159194662296617e-07, + "logits/chosen": -3.010310649871826, + "logits/rejected": -2.9848668575286865, + "logps/chosen": -378.9236755371094, + "logps/rejected": -376.0784912109375, + "loss": 0.4985, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.055248845368623734, + "rewards/margins": 1.4566857814788818, + "rewards/rejected": -1.5119346380233765, + "step": 5781 + }, + { + "epoch": 0.67, + "learning_rate": 1.0155683015334191e-07, + "logits/chosen": -3.836245059967041, + "logits/rejected": -3.853060245513916, + "logps/chosen": -178.43960571289062, + "logps/rejected": -184.9104461669922, + "loss": 0.2176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21229977905750275, + "rewards/margins": 2.320981979370117, + "rewards/rejected": -2.108682155609131, + "step": 5782 + }, + { + "epoch": 0.67, + "learning_rate": 1.0152171368371765e-07, + "logits/chosen": -3.346668243408203, + "logits/rejected": -3.21187162399292, + "logps/chosen": -113.52845764160156, + "logps/rejected": -176.13787841796875, + "loss": 0.3956, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26484766602516174, + "rewards/margins": 1.6654938459396362, + "rewards/rejected": -1.4006460905075073, + "step": 5783 + }, + { + "epoch": 0.67, + "learning_rate": 1.0148659721409341e-07, + "logits/chosen": -2.703242301940918, + "logits/rejected": -2.7914578914642334, + "logps/chosen": -253.92556762695312, + "logps/rejected": -263.5450439453125, + "loss": 0.3074, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2728787064552307, + "rewards/margins": 1.9885873794555664, + "rewards/rejected": -1.7157087326049805, + "step": 5784 + }, + { + "epoch": 0.67, + "learning_rate": 1.0145148074446915e-07, + "logits/chosen": -3.3134214878082275, + "logits/rejected": -3.026581287384033, + "logps/chosen": -169.9081573486328, + "logps/rejected": -152.12149047851562, + "loss": 0.4661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5954524874687195, + "rewards/margins": 0.9023603200912476, + "rewards/rejected": -1.4978127479553223, + "step": 5785 + }, + { + "epoch": 0.67, + "learning_rate": 1.0141636427484489e-07, + "logits/chosen": -2.6906309127807617, + "logits/rejected": -2.81508731842041, + "logps/chosen": -269.5164794921875, + "logps/rejected": -344.8387756347656, + "loss": 0.2401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07378996908664703, + "rewards/margins": 2.0490143299102783, + "rewards/rejected": -1.975224256515503, + "step": 5786 + }, + { + "epoch": 0.67, + "learning_rate": 1.0138124780522064e-07, + "logits/chosen": -3.3714749813079834, + "logits/rejected": -3.3474299907684326, + "logps/chosen": -175.0192108154297, + "logps/rejected": -184.05038452148438, + "loss": 0.5079, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38579031825065613, + "rewards/margins": 1.7025578022003174, + "rewards/rejected": -2.088348150253296, + "step": 5787 + }, + { + "epoch": 0.67, + "learning_rate": 1.013461313355964e-07, + "logits/chosen": -2.9011149406433105, + "logits/rejected": -2.7671215534210205, + "logps/chosen": -434.40509033203125, + "logps/rejected": -357.1767578125, + "loss": 0.4471, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.033111900091171265, + "rewards/margins": 1.1273092031478882, + "rewards/rejected": -1.0941972732543945, + "step": 5788 + }, + { + "epoch": 0.67, + "learning_rate": 1.0131101486597214e-07, + "logits/chosen": -3.8368546962738037, + "logits/rejected": -3.303015947341919, + "logps/chosen": -274.2537536621094, + "logps/rejected": -195.52362060546875, + "loss": 0.3825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5419259071350098, + "rewards/margins": 1.8584747314453125, + "rewards/rejected": -2.4004006385803223, + "step": 5789 + }, + { + "epoch": 0.67, + "learning_rate": 1.0127589839634788e-07, + "logits/chosen": -3.756796360015869, + "logits/rejected": -3.6050214767456055, + "logps/chosen": -202.0507354736328, + "logps/rejected": -193.2487335205078, + "loss": 0.3237, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2241852879524231, + "rewards/margins": 2.0515527725219727, + "rewards/rejected": -1.8273674249649048, + "step": 5790 + }, + { + "epoch": 0.67, + "learning_rate": 1.0124078192672362e-07, + "logits/chosen": -3.883694648742676, + "logits/rejected": -3.527913808822632, + "logps/chosen": -294.76654052734375, + "logps/rejected": -195.85238647460938, + "loss": 0.3692, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24595633149147034, + "rewards/margins": 1.5026288032531738, + "rewards/rejected": -1.7485851049423218, + "step": 5791 + }, + { + "epoch": 0.67, + "learning_rate": 1.0120566545709939e-07, + "logits/chosen": -2.7484538555145264, + "logits/rejected": -2.834972858428955, + "logps/chosen": -379.41717529296875, + "logps/rejected": -254.1068878173828, + "loss": 0.7085, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1076100766658783, + "rewards/margins": 1.8199703693389893, + "rewards/rejected": -1.9275805950164795, + "step": 5792 + }, + { + "epoch": 0.67, + "learning_rate": 1.0117054898747513e-07, + "logits/chosen": -2.676919937133789, + "logits/rejected": -2.620404005050659, + "logps/chosen": -400.7017517089844, + "logps/rejected": -336.02569580078125, + "loss": 0.1799, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5211951732635498, + "rewards/margins": 3.4666409492492676, + "rewards/rejected": -2.9454457759857178, + "step": 5793 + }, + { + "epoch": 0.67, + "learning_rate": 1.0113543251785087e-07, + "logits/chosen": -2.9586222171783447, + "logits/rejected": -3.1434831619262695, + "logps/chosen": -177.74632263183594, + "logps/rejected": -176.98435974121094, + "loss": 0.3928, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.003329724073410034, + "rewards/margins": 1.0225564241409302, + "rewards/rejected": -1.025886058807373, + "step": 5794 + }, + { + "epoch": 0.67, + "learning_rate": 1.0110031604822661e-07, + "logits/chosen": -3.4130098819732666, + "logits/rejected": -3.1802778244018555, + "logps/chosen": -303.45318603515625, + "logps/rejected": -250.43319702148438, + "loss": 0.2006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0031204447150230408, + "rewards/margins": 2.5881195068359375, + "rewards/rejected": -2.5912399291992188, + "step": 5795 + }, + { + "epoch": 0.67, + "learning_rate": 1.0106519957860236e-07, + "logits/chosen": -2.7161779403686523, + "logits/rejected": -2.7206342220306396, + "logps/chosen": -333.5968017578125, + "logps/rejected": -363.04791259765625, + "loss": 0.4413, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08945446461439133, + "rewards/margins": 1.795944333076477, + "rewards/rejected": -1.8853987455368042, + "step": 5796 + }, + { + "epoch": 0.67, + "learning_rate": 1.010300831089781e-07, + "logits/chosen": -3.0436596870422363, + "logits/rejected": -2.7899258136749268, + "logps/chosen": -246.21946716308594, + "logps/rejected": -312.4344482421875, + "loss": 0.5526, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3712711036205292, + "rewards/margins": 2.3040151596069336, + "rewards/rejected": -1.9327441453933716, + "step": 5797 + }, + { + "epoch": 0.67, + "learning_rate": 1.0099496663935386e-07, + "logits/chosen": -3.3282346725463867, + "logits/rejected": -3.342134475708008, + "logps/chosen": -229.2913818359375, + "logps/rejected": -163.58731079101562, + "loss": 0.2578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3030242919921875, + "rewards/margins": 1.673163890838623, + "rewards/rejected": -1.370139718055725, + "step": 5798 + }, + { + "epoch": 0.67, + "learning_rate": 1.009598501697296e-07, + "logits/chosen": -2.896559476852417, + "logits/rejected": -2.893857002258301, + "logps/chosen": -312.3309326171875, + "logps/rejected": -307.76947021484375, + "loss": 0.3169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14569054543972015, + "rewards/margins": 2.316917896270752, + "rewards/rejected": -2.462608575820923, + "step": 5799 + }, + { + "epoch": 0.67, + "learning_rate": 1.0092473370010535e-07, + "logits/chosen": -3.0085935592651367, + "logits/rejected": -2.9898650646209717, + "logps/chosen": -324.7966613769531, + "logps/rejected": -336.0631103515625, + "loss": 0.5702, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1573454141616821, + "rewards/margins": 0.8149951100349426, + "rewards/rejected": -1.9723405838012695, + "step": 5800 + }, + { + "epoch": 0.67, + "learning_rate": 1.0088961723048109e-07, + "logits/chosen": -2.9146485328674316, + "logits/rejected": -2.890150308609009, + "logps/chosen": -235.1514892578125, + "logps/rejected": -434.43377685546875, + "loss": 0.3705, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07833784818649292, + "rewards/margins": 1.8482413291931152, + "rewards/rejected": -1.769903540611267, + "step": 5801 + }, + { + "epoch": 0.67, + "learning_rate": 1.0085450076085683e-07, + "logits/chosen": -3.4724907875061035, + "logits/rejected": -3.607898473739624, + "logps/chosen": -204.73153686523438, + "logps/rejected": -247.36355590820312, + "loss": 0.2157, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05382850766181946, + "rewards/margins": 2.374692916870117, + "rewards/rejected": -2.428521156311035, + "step": 5802 + }, + { + "epoch": 0.67, + "learning_rate": 1.0081938429123257e-07, + "logits/chosen": -3.213408946990967, + "logits/rejected": -3.3134405612945557, + "logps/chosen": -309.4417724609375, + "logps/rejected": -383.5044250488281, + "loss": 0.2582, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5420499444007874, + "rewards/margins": 2.840010166168213, + "rewards/rejected": -2.2979602813720703, + "step": 5803 + }, + { + "epoch": 0.67, + "learning_rate": 1.0078426782160834e-07, + "logits/chosen": -2.6683239936828613, + "logits/rejected": -2.630152463912964, + "logps/chosen": -331.8743896484375, + "logps/rejected": -396.02044677734375, + "loss": 0.5073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10291262716054916, + "rewards/margins": 1.9289417266845703, + "rewards/rejected": -2.0318543910980225, + "step": 5804 + }, + { + "epoch": 0.67, + "learning_rate": 1.0074915135198408e-07, + "logits/chosen": -3.5045104026794434, + "logits/rejected": -3.447598695755005, + "logps/chosen": -286.88800048828125, + "logps/rejected": -204.11068725585938, + "loss": 0.5644, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7152926325798035, + "rewards/margins": 0.641181230545044, + "rewards/rejected": -1.3564739227294922, + "step": 5805 + }, + { + "epoch": 0.67, + "learning_rate": 1.0071403488235982e-07, + "logits/chosen": -3.722761631011963, + "logits/rejected": -3.300753355026245, + "logps/chosen": -176.16175842285156, + "logps/rejected": -153.8076171875, + "loss": 0.5166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6921444535255432, + "rewards/margins": 1.6760385036468506, + "rewards/rejected": -2.368182897567749, + "step": 5806 + }, + { + "epoch": 0.67, + "learning_rate": 1.0067891841273556e-07, + "logits/chosen": -3.275182008743286, + "logits/rejected": -3.2051639556884766, + "logps/chosen": -378.00909423828125, + "logps/rejected": -288.52630615234375, + "loss": 0.3137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5098221898078918, + "rewards/margins": 2.122068405151367, + "rewards/rejected": -2.6318905353546143, + "step": 5807 + }, + { + "epoch": 0.67, + "learning_rate": 1.0064380194311133e-07, + "logits/chosen": -3.3149807453155518, + "logits/rejected": -3.4779679775238037, + "logps/chosen": -187.40206909179688, + "logps/rejected": -222.67373657226562, + "loss": 0.7981, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7969968318939209, + "rewards/margins": 0.6386675238609314, + "rewards/rejected": -1.4356642961502075, + "step": 5808 + }, + { + "epoch": 0.67, + "learning_rate": 1.0060868547348707e-07, + "logits/chosen": -2.9451088905334473, + "logits/rejected": -3.122645378112793, + "logps/chosen": -316.54620361328125, + "logps/rejected": -229.63111877441406, + "loss": 0.3251, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2457353174686432, + "rewards/margins": 2.0575978755950928, + "rewards/rejected": -1.811862587928772, + "step": 5809 + }, + { + "epoch": 0.67, + "learning_rate": 1.0057356900386281e-07, + "logits/chosen": -2.496589422225952, + "logits/rejected": -2.5081653594970703, + "logps/chosen": -238.273193359375, + "logps/rejected": -157.47410583496094, + "loss": 0.7627, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49845245480537415, + "rewards/margins": 0.20113077759742737, + "rewards/rejected": -0.6995831727981567, + "step": 5810 + }, + { + "epoch": 0.67, + "learning_rate": 1.0053845253423855e-07, + "logits/chosen": -3.516080379486084, + "logits/rejected": -3.7569658756256104, + "logps/chosen": -322.4468078613281, + "logps/rejected": -329.191650390625, + "loss": 0.2489, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5715285539627075, + "rewards/margins": 2.620281457901001, + "rewards/rejected": -2.048752784729004, + "step": 5811 + }, + { + "epoch": 0.67, + "learning_rate": 1.0050333606461429e-07, + "logits/chosen": -3.1130435466766357, + "logits/rejected": -3.6663031578063965, + "logps/chosen": -202.66000366210938, + "logps/rejected": -224.72091674804688, + "loss": 0.1538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6141217947006226, + "rewards/margins": 3.2038567066192627, + "rewards/rejected": -2.5897347927093506, + "step": 5812 + }, + { + "epoch": 0.67, + "learning_rate": 1.0046821959499005e-07, + "logits/chosen": -3.4487826824188232, + "logits/rejected": -3.2191720008850098, + "logps/chosen": -283.79180908203125, + "logps/rejected": -156.31439208984375, + "loss": 0.3469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12212395668029785, + "rewards/margins": 1.3463075160980225, + "rewards/rejected": -1.4684314727783203, + "step": 5813 + }, + { + "epoch": 0.67, + "learning_rate": 1.0043310312536579e-07, + "logits/chosen": -3.839623212814331, + "logits/rejected": -3.9483420848846436, + "logps/chosen": -172.6593017578125, + "logps/rejected": -189.46749877929688, + "loss": 0.4066, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2584480345249176, + "rewards/margins": 1.363820195198059, + "rewards/rejected": -1.6222681999206543, + "step": 5814 + }, + { + "epoch": 0.67, + "learning_rate": 1.0039798665574154e-07, + "logits/chosen": -3.5140485763549805, + "logits/rejected": -3.27122163772583, + "logps/chosen": -383.28204345703125, + "logps/rejected": -358.60211181640625, + "loss": 1.6386, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0435818433761597, + "rewards/margins": -0.3432643413543701, + "rewards/rejected": -0.7003175020217896, + "step": 5815 + }, + { + "epoch": 0.67, + "learning_rate": 1.0036287018611728e-07, + "logits/chosen": -2.905308723449707, + "logits/rejected": -3.0740103721618652, + "logps/chosen": -236.3300018310547, + "logps/rejected": -297.1593017578125, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23686744272708893, + "rewards/margins": 3.691052198410034, + "rewards/rejected": -3.4541850090026855, + "step": 5816 + }, + { + "epoch": 0.67, + "learning_rate": 1.0032775371649303e-07, + "logits/chosen": -3.3402485847473145, + "logits/rejected": -2.9279489517211914, + "logps/chosen": -253.5897216796875, + "logps/rejected": -228.26712036132812, + "loss": 0.2099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32365041971206665, + "rewards/margins": 1.967529058456421, + "rewards/rejected": -1.643878698348999, + "step": 5817 + }, + { + "epoch": 0.67, + "learning_rate": 1.0029263724686878e-07, + "logits/chosen": -2.558081865310669, + "logits/rejected": -2.5557870864868164, + "logps/chosen": -350.565185546875, + "logps/rejected": -196.56488037109375, + "loss": 0.8529, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26384812593460083, + "rewards/margins": 0.38916119933128357, + "rewards/rejected": -0.12531307339668274, + "step": 5818 + }, + { + "epoch": 0.67, + "learning_rate": 1.0025752077724452e-07, + "logits/chosen": -2.8891103267669678, + "logits/rejected": -2.659825325012207, + "logps/chosen": -187.74624633789062, + "logps/rejected": -224.08233642578125, + "loss": 0.3796, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11772335320711136, + "rewards/margins": 1.525943636894226, + "rewards/rejected": -1.4082202911376953, + "step": 5819 + }, + { + "epoch": 0.67, + "learning_rate": 1.0022240430762026e-07, + "logits/chosen": -3.249738931655884, + "logits/rejected": -3.594846248626709, + "logps/chosen": -132.2567138671875, + "logps/rejected": -190.0453643798828, + "loss": 0.4037, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.552525520324707, + "rewards/margins": 1.4455583095550537, + "rewards/rejected": -0.8930327892303467, + "step": 5820 + }, + { + "epoch": 0.67, + "learning_rate": 1.0018728783799602e-07, + "logits/chosen": -2.0396735668182373, + "logits/rejected": -2.428861379623413, + "logps/chosen": -349.7457275390625, + "logps/rejected": -219.21897888183594, + "loss": 0.5149, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5093855857849121, + "rewards/margins": 1.0106496810913086, + "rewards/rejected": -1.5200352668762207, + "step": 5821 + }, + { + "epoch": 0.67, + "learning_rate": 1.0015217136837176e-07, + "logits/chosen": -3.482848644256592, + "logits/rejected": -3.140950918197632, + "logps/chosen": -273.78887939453125, + "logps/rejected": -226.2015380859375, + "loss": 0.2856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4961393475532532, + "rewards/margins": 1.8562145233154297, + "rewards/rejected": -2.352353811264038, + "step": 5822 + }, + { + "epoch": 0.67, + "learning_rate": 1.001170548987475e-07, + "logits/chosen": -3.4346351623535156, + "logits/rejected": -3.2384328842163086, + "logps/chosen": -275.3573913574219, + "logps/rejected": -308.7955627441406, + "loss": 0.7892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33671483397483826, + "rewards/margins": 0.2786064147949219, + "rewards/rejected": -0.615321159362793, + "step": 5823 + }, + { + "epoch": 0.67, + "learning_rate": 1.0008193842912325e-07, + "logits/chosen": -3.296865463256836, + "logits/rejected": -3.3891334533691406, + "logps/chosen": -262.0081787109375, + "logps/rejected": -299.94989013671875, + "loss": 0.349, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2126980721950531, + "rewards/margins": 2.41021990776062, + "rewards/rejected": -2.622917890548706, + "step": 5824 + }, + { + "epoch": 0.67, + "learning_rate": 1.0004682195949901e-07, + "logits/chosen": -2.6350975036621094, + "logits/rejected": -2.404244899749756, + "logps/chosen": -293.57904052734375, + "logps/rejected": -280.9120178222656, + "loss": 0.5418, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21584530174732208, + "rewards/margins": 1.2097697257995605, + "rewards/rejected": -1.425614833831787, + "step": 5825 + }, + { + "epoch": 0.67, + "learning_rate": 1.0001170548987475e-07, + "logits/chosen": -2.778956651687622, + "logits/rejected": -2.7061169147491455, + "logps/chosen": -351.6490173339844, + "logps/rejected": -287.24847412109375, + "loss": 0.2405, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3793846666812897, + "rewards/margins": 2.3204641342163086, + "rewards/rejected": -1.9410793781280518, + "step": 5826 + }, + { + "epoch": 0.67, + "learning_rate": 9.99765890202505e-08, + "logits/chosen": -2.9550323486328125, + "logits/rejected": -3.0358657836914062, + "logps/chosen": -347.96710205078125, + "logps/rejected": -211.03768920898438, + "loss": 0.1004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3780539333820343, + "rewards/margins": 2.7084391117095947, + "rewards/rejected": -2.3303849697113037, + "step": 5827 + }, + { + "epoch": 0.67, + "learning_rate": 9.994147255062623e-08, + "logits/chosen": -2.8979969024658203, + "logits/rejected": -3.1647567749023438, + "logps/chosen": -284.60687255859375, + "logps/rejected": -175.18212890625, + "loss": 0.3387, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06215950846672058, + "rewards/margins": 1.5969964265823364, + "rewards/rejected": -1.534836769104004, + "step": 5828 + }, + { + "epoch": 0.67, + "learning_rate": 9.990635608100199e-08, + "logits/chosen": -2.881619453430176, + "logits/rejected": -2.7755208015441895, + "logps/chosen": -359.5775146484375, + "logps/rejected": -415.3125305175781, + "loss": 1.0902, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6240990161895752, + "rewards/margins": -0.045827507972717285, + "rewards/rejected": -0.5782715082168579, + "step": 5829 + }, + { + "epoch": 0.67, + "learning_rate": 9.987123961137773e-08, + "logits/chosen": -2.772982120513916, + "logits/rejected": -2.6127946376800537, + "logps/chosen": -256.42718505859375, + "logps/rejected": -275.9485168457031, + "loss": 0.1645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07040619850158691, + "rewards/margins": 1.9554518461227417, + "rewards/rejected": -2.025857925415039, + "step": 5830 + }, + { + "epoch": 0.67, + "learning_rate": 9.983612314175347e-08, + "logits/chosen": -3.1122207641601562, + "logits/rejected": -3.303518772125244, + "logps/chosen": -558.4710693359375, + "logps/rejected": -337.11639404296875, + "loss": 0.1548, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26669684052467346, + "rewards/margins": 2.3005690574645996, + "rewards/rejected": -2.033872365951538, + "step": 5831 + }, + { + "epoch": 0.67, + "learning_rate": 9.980100667212922e-08, + "logits/chosen": -2.8682050704956055, + "logits/rejected": -2.988020896911621, + "logps/chosen": -139.43836975097656, + "logps/rejected": -275.3034362792969, + "loss": 0.2016, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24663792550563812, + "rewards/margins": 3.2882392406463623, + "rewards/rejected": -3.0416011810302734, + "step": 5832 + }, + { + "epoch": 0.67, + "learning_rate": 9.976589020250498e-08, + "logits/chosen": -2.9198756217956543, + "logits/rejected": -2.9667680263519287, + "logps/chosen": -336.41717529296875, + "logps/rejected": -218.00355529785156, + "loss": 0.1731, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21801556646823883, + "rewards/margins": 2.861636161804199, + "rewards/rejected": -2.643620729446411, + "step": 5833 + }, + { + "epoch": 0.67, + "learning_rate": 9.973077373288072e-08, + "logits/chosen": -2.7886998653411865, + "logits/rejected": -2.79207181930542, + "logps/chosen": -182.76968383789062, + "logps/rejected": -388.615966796875, + "loss": 0.6572, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33223211765289307, + "rewards/margins": 1.6258575916290283, + "rewards/rejected": -1.9580897092819214, + "step": 5834 + }, + { + "epoch": 0.67, + "learning_rate": 9.969565726325646e-08, + "logits/chosen": -3.1411495208740234, + "logits/rejected": -3.2230286598205566, + "logps/chosen": -289.51177978515625, + "logps/rejected": -275.09686279296875, + "loss": 0.2119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13136039674282074, + "rewards/margins": 2.24552059173584, + "rewards/rejected": -2.1141600608825684, + "step": 5835 + }, + { + "epoch": 0.67, + "learning_rate": 9.96605407936322e-08, + "logits/chosen": -3.1126928329467773, + "logits/rejected": -3.1807703971862793, + "logps/chosen": -135.57492065429688, + "logps/rejected": -220.7835235595703, + "loss": 0.2185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007068857550621033, + "rewards/margins": 2.2056119441986084, + "rewards/rejected": -2.198542833328247, + "step": 5836 + }, + { + "epoch": 0.67, + "learning_rate": 9.962542432400797e-08, + "logits/chosen": -3.4664762020111084, + "logits/rejected": -3.3600733280181885, + "logps/chosen": -318.40386962890625, + "logps/rejected": -272.092041015625, + "loss": 0.1527, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3521682024002075, + "rewards/margins": 3.011784553527832, + "rewards/rejected": -2.659616470336914, + "step": 5837 + }, + { + "epoch": 0.67, + "learning_rate": 9.959030785438371e-08, + "logits/chosen": -3.5358829498291016, + "logits/rejected": -3.1728873252868652, + "logps/chosen": -244.89422607421875, + "logps/rejected": -220.75164794921875, + "loss": 0.4375, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07542509585618973, + "rewards/margins": 1.5355592966079712, + "rewards/rejected": -1.4601341485977173, + "step": 5838 + }, + { + "epoch": 0.67, + "learning_rate": 9.955519138475945e-08, + "logits/chosen": -3.33901309967041, + "logits/rejected": -3.286229133605957, + "logps/chosen": -165.34078979492188, + "logps/rejected": -240.930419921875, + "loss": 0.3763, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5709723830223083, + "rewards/margins": 2.2782716751098633, + "rewards/rejected": -2.8492441177368164, + "step": 5839 + }, + { + "epoch": 0.67, + "learning_rate": 9.952007491513519e-08, + "logits/chosen": -3.3268375396728516, + "logits/rejected": -3.4367496967315674, + "logps/chosen": -312.5725402832031, + "logps/rejected": -274.41900634765625, + "loss": 0.7637, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45186296105384827, + "rewards/margins": 0.8509145379066467, + "rewards/rejected": -1.3027775287628174, + "step": 5840 + }, + { + "epoch": 0.67, + "learning_rate": 9.948495844551094e-08, + "logits/chosen": -2.9128224849700928, + "logits/rejected": -2.834986686706543, + "logps/chosen": -131.3606719970703, + "logps/rejected": -135.44131469726562, + "loss": 0.6533, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6300539374351501, + "rewards/margins": 0.58622807264328, + "rewards/rejected": -1.2162820100784302, + "step": 5841 + }, + { + "epoch": 0.67, + "learning_rate": 9.94498419758867e-08, + "logits/chosen": -3.0866498947143555, + "logits/rejected": -3.1177048683166504, + "logps/chosen": -266.5037841796875, + "logps/rejected": -334.56903076171875, + "loss": 0.2839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21335217356681824, + "rewards/margins": 2.0195071697235107, + "rewards/rejected": -2.2328596115112305, + "step": 5842 + }, + { + "epoch": 0.67, + "learning_rate": 9.941472550626244e-08, + "logits/chosen": -3.249605178833008, + "logits/rejected": -3.2229976654052734, + "logps/chosen": -241.50848388671875, + "logps/rejected": -155.09881591796875, + "loss": 0.3776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33636894822120667, + "rewards/margins": 1.6979063749313354, + "rewards/rejected": -2.0342752933502197, + "step": 5843 + }, + { + "epoch": 0.67, + "learning_rate": 9.937960903663818e-08, + "logits/chosen": -3.678542375564575, + "logits/rejected": -3.6734166145324707, + "logps/chosen": -142.36630249023438, + "logps/rejected": -144.0872802734375, + "loss": 0.1375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08605018258094788, + "rewards/margins": 2.555245876312256, + "rewards/rejected": -2.46919584274292, + "step": 5844 + }, + { + "epoch": 0.67, + "learning_rate": 9.934449256701393e-08, + "logits/chosen": -3.554851531982422, + "logits/rejected": -3.091127634048462, + "logps/chosen": -310.6380310058594, + "logps/rejected": -228.78302001953125, + "loss": 0.1554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4400203227996826, + "rewards/margins": 2.2957780361175537, + "rewards/rejected": -1.855757713317871, + "step": 5845 + }, + { + "epoch": 0.67, + "learning_rate": 9.930937609738967e-08, + "logits/chosen": -3.0980947017669678, + "logits/rejected": -3.210541009902954, + "logps/chosen": -234.4436492919922, + "logps/rejected": -188.8125, + "loss": 0.4845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31106436252593994, + "rewards/margins": 1.0344637632369995, + "rewards/rejected": -1.34552800655365, + "step": 5846 + }, + { + "epoch": 0.67, + "learning_rate": 9.927425962776541e-08, + "logits/chosen": -2.6038763523101807, + "logits/rejected": -2.6116533279418945, + "logps/chosen": -255.38092041015625, + "logps/rejected": -221.74240112304688, + "loss": 0.5846, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03336520493030548, + "rewards/margins": 0.9973223805427551, + "rewards/rejected": -0.9639571905136108, + "step": 5847 + }, + { + "epoch": 0.67, + "learning_rate": 9.923914315814115e-08, + "logits/chosen": -2.514009952545166, + "logits/rejected": -2.7022643089294434, + "logps/chosen": -274.4974365234375, + "logps/rejected": -263.0378723144531, + "loss": 0.3977, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.025754515081644058, + "rewards/margins": 1.8148938417434692, + "rewards/rejected": -1.8406481742858887, + "step": 5848 + }, + { + "epoch": 0.67, + "learning_rate": 9.920402668851692e-08, + "logits/chosen": -3.582098960876465, + "logits/rejected": -3.658900260925293, + "logps/chosen": -267.30126953125, + "logps/rejected": -174.888671875, + "loss": 0.5852, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5774483680725098, + "rewards/margins": 0.8964422941207886, + "rewards/rejected": -1.4738906621932983, + "step": 5849 + }, + { + "epoch": 0.67, + "learning_rate": 9.916891021889266e-08, + "logits/chosen": -2.7487282752990723, + "logits/rejected": -2.9112915992736816, + "logps/chosen": -320.3329772949219, + "logps/rejected": -348.9295349121094, + "loss": 0.7899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4242067039012909, + "rewards/margins": 1.186872959136963, + "rewards/rejected": -1.6110796928405762, + "step": 5850 + }, + { + "epoch": 0.67, + "learning_rate": 9.91337937492684e-08, + "logits/chosen": -2.5861692428588867, + "logits/rejected": -2.83107328414917, + "logps/chosen": -208.30081176757812, + "logps/rejected": -234.38783264160156, + "loss": 0.5015, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027596376836299896, + "rewards/margins": 1.1610729694366455, + "rewards/rejected": -1.1334764957427979, + "step": 5851 + }, + { + "epoch": 0.67, + "learning_rate": 9.909867727964414e-08, + "logits/chosen": -2.796337604522705, + "logits/rejected": -2.8866446018218994, + "logps/chosen": -405.505126953125, + "logps/rejected": -327.7308349609375, + "loss": 0.1697, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6273916363716125, + "rewards/margins": 2.630035400390625, + "rewards/rejected": -2.0026438236236572, + "step": 5852 + }, + { + "epoch": 0.67, + "learning_rate": 9.906356081001991e-08, + "logits/chosen": -2.748167037963867, + "logits/rejected": -2.9140172004699707, + "logps/chosen": -173.40005493164062, + "logps/rejected": -191.3587188720703, + "loss": 0.4278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17432591319084167, + "rewards/margins": 1.5175037384033203, + "rewards/rejected": -1.6918295621871948, + "step": 5853 + }, + { + "epoch": 0.67, + "learning_rate": 9.902844434039565e-08, + "logits/chosen": -3.1110167503356934, + "logits/rejected": -3.220393657684326, + "logps/chosen": -254.31382751464844, + "logps/rejected": -276.19476318359375, + "loss": 0.3705, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00897166132926941, + "rewards/margins": 3.100571632385254, + "rewards/rejected": -3.091599702835083, + "step": 5854 + }, + { + "epoch": 0.67, + "learning_rate": 9.899332787077139e-08, + "logits/chosen": -3.512065887451172, + "logits/rejected": -3.6136631965637207, + "logps/chosen": -142.45318603515625, + "logps/rejected": -98.55839538574219, + "loss": 0.5356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3754810094833374, + "rewards/margins": 0.5947369337081909, + "rewards/rejected": -0.9702179431915283, + "step": 5855 + }, + { + "epoch": 0.68, + "learning_rate": 9.895821140114713e-08, + "logits/chosen": -3.166360378265381, + "logits/rejected": -3.441213369369507, + "logps/chosen": -207.2499542236328, + "logps/rejected": -232.8955841064453, + "loss": 0.3732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.435608834028244, + "rewards/margins": 2.831185817718506, + "rewards/rejected": -3.2667946815490723, + "step": 5856 + }, + { + "epoch": 0.68, + "learning_rate": 9.892309493152288e-08, + "logits/chosen": -3.580611228942871, + "logits/rejected": -3.495077610015869, + "logps/chosen": -190.08535766601562, + "logps/rejected": -230.56906127929688, + "loss": 0.4348, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06988823413848877, + "rewards/margins": 1.7075531482696533, + "rewards/rejected": -1.777441382408142, + "step": 5857 + }, + { + "epoch": 0.68, + "learning_rate": 9.888797846189862e-08, + "logits/chosen": -2.831543207168579, + "logits/rejected": -3.042236804962158, + "logps/chosen": -231.40057373046875, + "logps/rejected": -228.1633758544922, + "loss": 0.28, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12768608331680298, + "rewards/margins": 1.7721436023712158, + "rewards/rejected": -1.899829626083374, + "step": 5858 + }, + { + "epoch": 0.68, + "learning_rate": 9.885286199227438e-08, + "logits/chosen": -2.8729584217071533, + "logits/rejected": -2.7097322940826416, + "logps/chosen": -310.5925598144531, + "logps/rejected": -218.64349365234375, + "loss": 1.1161, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7986485362052917, + "rewards/margins": 1.927675724029541, + "rewards/rejected": -2.7263240814208984, + "step": 5859 + }, + { + "epoch": 0.68, + "learning_rate": 9.881774552265012e-08, + "logits/chosen": -3.352140426635742, + "logits/rejected": -3.01410174369812, + "logps/chosen": -231.48455810546875, + "logps/rejected": -175.70481872558594, + "loss": 0.3894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28056812286376953, + "rewards/margins": 0.9903408885002136, + "rewards/rejected": -1.2709089517593384, + "step": 5860 + }, + { + "epoch": 0.68, + "learning_rate": 9.878262905302586e-08, + "logits/chosen": -3.160825490951538, + "logits/rejected": -3.3330297470092773, + "logps/chosen": -164.91323852539062, + "logps/rejected": -251.02366638183594, + "loss": 0.3042, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27820488810539246, + "rewards/margins": 1.5246586799621582, + "rewards/rejected": -1.2464537620544434, + "step": 5861 + }, + { + "epoch": 0.68, + "learning_rate": 9.874751258340161e-08, + "logits/chosen": -3.110218048095703, + "logits/rejected": -2.913384437561035, + "logps/chosen": -247.629150390625, + "logps/rejected": -216.33143615722656, + "loss": 0.4153, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3600773513317108, + "rewards/margins": 1.2763354778289795, + "rewards/rejected": -1.6364127397537231, + "step": 5862 + }, + { + "epoch": 0.68, + "learning_rate": 9.871239611377735e-08, + "logits/chosen": -3.811727523803711, + "logits/rejected": -3.5451178550720215, + "logps/chosen": -240.30506896972656, + "logps/rejected": -277.1090087890625, + "loss": 0.3306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07399120926856995, + "rewards/margins": 2.857959032058716, + "rewards/rejected": -2.931950092315674, + "step": 5863 + }, + { + "epoch": 0.68, + "learning_rate": 9.86772796441531e-08, + "logits/chosen": -2.823202610015869, + "logits/rejected": -2.8664822578430176, + "logps/chosen": -332.12860107421875, + "logps/rejected": -318.6343994140625, + "loss": 0.3264, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35973140597343445, + "rewards/margins": 1.6432604789733887, + "rewards/rejected": -1.2835290431976318, + "step": 5864 + }, + { + "epoch": 0.68, + "learning_rate": 9.864216317452884e-08, + "logits/chosen": -3.41549015045166, + "logits/rejected": -3.5235023498535156, + "logps/chosen": -178.75234985351562, + "logps/rejected": -202.8977813720703, + "loss": 0.2819, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41884109377861023, + "rewards/margins": 1.746932029724121, + "rewards/rejected": -1.328090786933899, + "step": 5865 + }, + { + "epoch": 0.68, + "learning_rate": 9.86070467049046e-08, + "logits/chosen": -2.6146955490112305, + "logits/rejected": -2.742666721343994, + "logps/chosen": -146.34173583984375, + "logps/rejected": -267.6336669921875, + "loss": 0.1726, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12419556826353073, + "rewards/margins": 2.6517715454101562, + "rewards/rejected": -2.7759671211242676, + "step": 5866 + }, + { + "epoch": 0.68, + "learning_rate": 9.857193023528034e-08, + "logits/chosen": -3.233966112136841, + "logits/rejected": -3.4024786949157715, + "logps/chosen": -274.607421875, + "logps/rejected": -193.4553680419922, + "loss": 0.5457, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.260707825422287, + "rewards/margins": 0.7444044947624207, + "rewards/rejected": -1.0051122903823853, + "step": 5867 + }, + { + "epoch": 0.68, + "learning_rate": 9.853681376565608e-08, + "logits/chosen": -3.29962158203125, + "logits/rejected": -3.138601541519165, + "logps/chosen": -252.79920959472656, + "logps/rejected": -332.0163269042969, + "loss": 0.3241, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3704460561275482, + "rewards/margins": 1.6241704225540161, + "rewards/rejected": -1.9946165084838867, + "step": 5868 + }, + { + "epoch": 0.68, + "learning_rate": 9.850169729603182e-08, + "logits/chosen": -3.7388415336608887, + "logits/rejected": -3.3806042671203613, + "logps/chosen": -282.8140869140625, + "logps/rejected": -235.76132202148438, + "loss": 0.2733, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1902931183576584, + "rewards/margins": 2.6075892448425293, + "rewards/rejected": -2.4172964096069336, + "step": 5869 + }, + { + "epoch": 0.68, + "learning_rate": 9.846658082640759e-08, + "logits/chosen": -3.194521903991699, + "logits/rejected": -2.8490991592407227, + "logps/chosen": -261.7351989746094, + "logps/rejected": -92.43534851074219, + "loss": 0.6156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6942052841186523, + "rewards/margins": 0.3979751169681549, + "rewards/rejected": -1.0921803712844849, + "step": 5870 + }, + { + "epoch": 0.68, + "learning_rate": 9.843146435678333e-08, + "logits/chosen": -2.9638099670410156, + "logits/rejected": -3.0124082565307617, + "logps/chosen": -246.3319091796875, + "logps/rejected": -188.26318359375, + "loss": 0.2102, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0751928985118866, + "rewards/margins": 2.242140531539917, + "rewards/rejected": -2.317333459854126, + "step": 5871 + }, + { + "epoch": 0.68, + "learning_rate": 9.839634788715907e-08, + "logits/chosen": -2.9471092224121094, + "logits/rejected": -2.945946455001831, + "logps/chosen": -389.36529541015625, + "logps/rejected": -247.71109008789062, + "loss": 0.6811, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3161923587322235, + "rewards/margins": 1.4977154731750488, + "rewards/rejected": -1.8139076232910156, + "step": 5872 + }, + { + "epoch": 0.68, + "learning_rate": 9.836123141753481e-08, + "logits/chosen": -3.081059455871582, + "logits/rejected": -3.056532382965088, + "logps/chosen": -264.02947998046875, + "logps/rejected": -176.89459228515625, + "loss": 0.5698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17406967282295227, + "rewards/margins": 1.549403190612793, + "rewards/rejected": -1.7234728336334229, + "step": 5873 + }, + { + "epoch": 0.68, + "learning_rate": 9.832611494791057e-08, + "logits/chosen": -2.8679544925689697, + "logits/rejected": -2.7538418769836426, + "logps/chosen": -216.33370971679688, + "logps/rejected": -189.82852172851562, + "loss": 0.5979, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6232277750968933, + "rewards/margins": 1.1574459075927734, + "rewards/rejected": -1.7806737422943115, + "step": 5874 + }, + { + "epoch": 0.68, + "learning_rate": 9.829099847828631e-08, + "logits/chosen": -2.838540554046631, + "logits/rejected": -2.8779385089874268, + "logps/chosen": -164.7882080078125, + "logps/rejected": -194.6588134765625, + "loss": 0.2844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.013573884963989258, + "rewards/margins": 1.6807608604431152, + "rewards/rejected": -1.694334626197815, + "step": 5875 + }, + { + "epoch": 0.68, + "learning_rate": 9.825588200866206e-08, + "logits/chosen": -3.4519646167755127, + "logits/rejected": -3.3657784461975098, + "logps/chosen": -296.53765869140625, + "logps/rejected": -245.7030487060547, + "loss": 0.5944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.043736085295677185, + "rewards/margins": 1.9400216341018677, + "rewards/rejected": -1.9837578535079956, + "step": 5876 + }, + { + "epoch": 0.68, + "learning_rate": 9.82207655390378e-08, + "logits/chosen": -3.288424491882324, + "logits/rejected": -3.8043079376220703, + "logps/chosen": -58.185569763183594, + "logps/rejected": -236.12933349609375, + "loss": 0.1753, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18835115432739258, + "rewards/margins": 2.941967487335205, + "rewards/rejected": -2.7536165714263916, + "step": 5877 + }, + { + "epoch": 0.68, + "learning_rate": 9.818564906941356e-08, + "logits/chosen": -2.6230850219726562, + "logits/rejected": -2.8278255462646484, + "logps/chosen": -364.2283020019531, + "logps/rejected": -321.23175048828125, + "loss": 0.5218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5488657355308533, + "rewards/margins": 1.4745985269546509, + "rewards/rejected": -2.0234644412994385, + "step": 5878 + }, + { + "epoch": 0.68, + "learning_rate": 9.81505325997893e-08, + "logits/chosen": -3.81003737449646, + "logits/rejected": -3.7499918937683105, + "logps/chosen": -259.1098937988281, + "logps/rejected": -265.09661865234375, + "loss": 0.2907, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02219712734222412, + "rewards/margins": 1.85244882106781, + "rewards/rejected": -1.8302515745162964, + "step": 5879 + }, + { + "epoch": 0.68, + "learning_rate": 9.811541613016504e-08, + "logits/chosen": -3.1402573585510254, + "logits/rejected": -3.1011619567871094, + "logps/chosen": -165.13050842285156, + "logps/rejected": -231.5508270263672, + "loss": 0.5163, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.003266632556915283, + "rewards/margins": 1.9961786270141602, + "rewards/rejected": -1.9929120540618896, + "step": 5880 + }, + { + "epoch": 0.68, + "learning_rate": 9.808029966054078e-08, + "logits/chosen": -3.083706855773926, + "logits/rejected": -3.110898494720459, + "logps/chosen": -310.09210205078125, + "logps/rejected": -293.1856689453125, + "loss": 0.553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8207219839096069, + "rewards/margins": 1.0629709959030151, + "rewards/rejected": -1.883692979812622, + "step": 5881 + }, + { + "epoch": 0.68, + "learning_rate": 9.804518319091655e-08, + "logits/chosen": -3.1197102069854736, + "logits/rejected": -3.37087345123291, + "logps/chosen": -334.97894287109375, + "logps/rejected": -343.4999694824219, + "loss": 0.4576, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5193545818328857, + "rewards/margins": 2.6248574256896973, + "rewards/rejected": -2.1055030822753906, + "step": 5882 + }, + { + "epoch": 0.68, + "learning_rate": 9.801006672129229e-08, + "logits/chosen": -2.979750156402588, + "logits/rejected": -3.1595311164855957, + "logps/chosen": -272.1949768066406, + "logps/rejected": -189.91856384277344, + "loss": 0.1269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16188693046569824, + "rewards/margins": 2.8442046642303467, + "rewards/rejected": -2.6823177337646484, + "step": 5883 + }, + { + "epoch": 0.68, + "learning_rate": 9.797495025166803e-08, + "logits/chosen": -3.3060319423675537, + "logits/rejected": -3.2739510536193848, + "logps/chosen": -199.9860076904297, + "logps/rejected": -338.5311584472656, + "loss": 0.1203, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10825134068727493, + "rewards/margins": 3.75469970703125, + "rewards/rejected": -3.6464486122131348, + "step": 5884 + }, + { + "epoch": 0.68, + "learning_rate": 9.793983378204377e-08, + "logits/chosen": -2.81729793548584, + "logits/rejected": -2.5747714042663574, + "logps/chosen": -218.89434814453125, + "logps/rejected": -279.6658935546875, + "loss": 0.9164, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4768143892288208, + "rewards/margins": -0.10084934532642365, + "rewards/rejected": -0.37596502900123596, + "step": 5885 + }, + { + "epoch": 0.68, + "learning_rate": 9.790471731241952e-08, + "logits/chosen": -4.174584865570068, + "logits/rejected": -3.3945436477661133, + "logps/chosen": -282.98876953125, + "logps/rejected": -174.14517211914062, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09973639249801636, + "rewards/margins": 1.7683066129684448, + "rewards/rejected": -1.8680429458618164, + "step": 5886 + }, + { + "epoch": 0.68, + "learning_rate": 9.786960084279527e-08, + "logits/chosen": -3.1110966205596924, + "logits/rejected": -3.1979238986968994, + "logps/chosen": -328.7284851074219, + "logps/rejected": -319.1300048828125, + "loss": 0.3117, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10484620928764343, + "rewards/margins": 2.649855136871338, + "rewards/rejected": -2.7547013759613037, + "step": 5887 + }, + { + "epoch": 0.68, + "learning_rate": 9.783448437317102e-08, + "logits/chosen": -2.5969231128692627, + "logits/rejected": -2.585523843765259, + "logps/chosen": -252.16845703125, + "logps/rejected": -286.13079833984375, + "loss": 0.8252, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6280567646026611, + "rewards/margins": 0.405137836933136, + "rewards/rejected": -1.0331945419311523, + "step": 5888 + }, + { + "epoch": 0.68, + "learning_rate": 9.779936790354676e-08, + "logits/chosen": -3.793468952178955, + "logits/rejected": -3.461768627166748, + "logps/chosen": -265.3413391113281, + "logps/rejected": -174.39877319335938, + "loss": 0.4242, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4071347415447235, + "rewards/margins": 1.0258357524871826, + "rewards/rejected": -1.432970404624939, + "step": 5889 + }, + { + "epoch": 0.68, + "learning_rate": 9.776425143392251e-08, + "logits/chosen": -2.9926743507385254, + "logits/rejected": -3.1489932537078857, + "logps/chosen": -352.81988525390625, + "logps/rejected": -244.9191436767578, + "loss": 0.3261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.397050142288208, + "rewards/margins": 1.7152795791625977, + "rewards/rejected": -2.1123297214508057, + "step": 5890 + }, + { + "epoch": 0.68, + "learning_rate": 9.772913496429825e-08, + "logits/chosen": -2.9910154342651367, + "logits/rejected": -2.745288133621216, + "logps/chosen": -396.885986328125, + "logps/rejected": -287.4437561035156, + "loss": 0.7386, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9280754327774048, + "rewards/margins": 0.4229375123977661, + "rewards/rejected": -1.351012945175171, + "step": 5891 + }, + { + "epoch": 0.68, + "learning_rate": 9.769401849467399e-08, + "logits/chosen": -3.3607473373413086, + "logits/rejected": -3.5347373485565186, + "logps/chosen": -193.89651489257812, + "logps/rejected": -204.18453979492188, + "loss": 0.5605, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05268669128417969, + "rewards/margins": 1.80087411403656, + "rewards/rejected": -1.8535608053207397, + "step": 5892 + }, + { + "epoch": 0.68, + "learning_rate": 9.765890202504975e-08, + "logits/chosen": -2.92319655418396, + "logits/rejected": -3.138502359390259, + "logps/chosen": -191.7445831298828, + "logps/rejected": -191.63958740234375, + "loss": 0.3898, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09467813372612, + "rewards/margins": 1.2776422500610352, + "rewards/rejected": -1.1829640865325928, + "step": 5893 + }, + { + "epoch": 0.68, + "learning_rate": 9.76237855554255e-08, + "logits/chosen": -2.9740335941314697, + "logits/rejected": -2.9942469596862793, + "logps/chosen": -205.74679565429688, + "logps/rejected": -326.3966369628906, + "loss": 0.5167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3622058033943176, + "rewards/margins": 1.6815252304077148, + "rewards/rejected": -2.0437309741973877, + "step": 5894 + }, + { + "epoch": 0.68, + "learning_rate": 9.758866908580124e-08, + "logits/chosen": -2.7646241188049316, + "logits/rejected": -2.7888848781585693, + "logps/chosen": -281.84906005859375, + "logps/rejected": -407.03778076171875, + "loss": 0.5642, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45783287286758423, + "rewards/margins": 0.778406023979187, + "rewards/rejected": -1.236238956451416, + "step": 5895 + }, + { + "epoch": 0.68, + "learning_rate": 9.755355261617698e-08, + "logits/chosen": -3.1143622398376465, + "logits/rejected": -2.984952449798584, + "logps/chosen": -193.03182983398438, + "logps/rejected": -262.170166015625, + "loss": 0.2048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013126235455274582, + "rewards/margins": 3.5639405250549316, + "rewards/rejected": -3.5770668983459473, + "step": 5896 + }, + { + "epoch": 0.68, + "learning_rate": 9.751843614655272e-08, + "logits/chosen": -3.590216875076294, + "logits/rejected": -3.7463057041168213, + "logps/chosen": -311.6917724609375, + "logps/rejected": -259.8072509765625, + "loss": 0.4042, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34893396496772766, + "rewards/margins": 2.4251718521118164, + "rewards/rejected": -2.77410626411438, + "step": 5897 + }, + { + "epoch": 0.68, + "learning_rate": 9.748331967692849e-08, + "logits/chosen": -3.2604527473449707, + "logits/rejected": -3.5903191566467285, + "logps/chosen": -225.63352966308594, + "logps/rejected": -242.15225219726562, + "loss": 0.1739, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10515525192022324, + "rewards/margins": 2.736217737197876, + "rewards/rejected": -2.8413729667663574, + "step": 5898 + }, + { + "epoch": 0.68, + "learning_rate": 9.744820320730423e-08, + "logits/chosen": -2.968627691268921, + "logits/rejected": -2.811392307281494, + "logps/chosen": -239.76162719726562, + "logps/rejected": -243.31930541992188, + "loss": 0.2341, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10404235124588013, + "rewards/margins": 1.9953150749206543, + "rewards/rejected": -1.8912725448608398, + "step": 5899 + }, + { + "epoch": 0.68, + "learning_rate": 9.741308673767997e-08, + "logits/chosen": -3.719634532928467, + "logits/rejected": -3.480433464050293, + "logps/chosen": -260.4949035644531, + "logps/rejected": -154.40673828125, + "loss": 0.4187, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.094922736287117, + "rewards/margins": 1.4717769622802734, + "rewards/rejected": -1.3768543004989624, + "step": 5900 + }, + { + "epoch": 0.68, + "learning_rate": 9.737797026805571e-08, + "logits/chosen": -3.0189483165740967, + "logits/rejected": -2.7639288902282715, + "logps/chosen": -237.5421600341797, + "logps/rejected": -316.7267150878906, + "loss": 0.3731, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15398219227790833, + "rewards/margins": 1.8559207916259766, + "rewards/rejected": -2.0099029541015625, + "step": 5901 + }, + { + "epoch": 0.68, + "learning_rate": 9.734285379843146e-08, + "logits/chosen": -2.857999801635742, + "logits/rejected": -2.9296715259552, + "logps/chosen": -382.718994140625, + "logps/rejected": -329.0953369140625, + "loss": 0.3721, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6838896870613098, + "rewards/margins": 2.614898204803467, + "rewards/rejected": -1.9310085773468018, + "step": 5902 + }, + { + "epoch": 0.68, + "learning_rate": 9.73077373288072e-08, + "logits/chosen": -2.695199966430664, + "logits/rejected": -2.6645305156707764, + "logps/chosen": -514.1402587890625, + "logps/rejected": -298.4873046875, + "loss": 0.6889, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36836111545562744, + "rewards/margins": 0.9491144418716431, + "rewards/rejected": -1.3174755573272705, + "step": 5903 + }, + { + "epoch": 0.68, + "learning_rate": 9.727262085918296e-08, + "logits/chosen": -3.167637825012207, + "logits/rejected": -2.9751222133636475, + "logps/chosen": -236.99647521972656, + "logps/rejected": -172.13987731933594, + "loss": 0.3138, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1070348471403122, + "rewards/margins": 1.6412081718444824, + "rewards/rejected": -1.5341730117797852, + "step": 5904 + }, + { + "epoch": 0.68, + "learning_rate": 9.72375043895587e-08, + "logits/chosen": -2.3025355339050293, + "logits/rejected": -2.2213094234466553, + "logps/chosen": -254.81289672851562, + "logps/rejected": -166.6031036376953, + "loss": 0.7376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6056960225105286, + "rewards/margins": 0.6519997119903564, + "rewards/rejected": -1.2576956748962402, + "step": 5905 + }, + { + "epoch": 0.68, + "learning_rate": 9.720238791993444e-08, + "logits/chosen": -3.153719902038574, + "logits/rejected": -3.16300368309021, + "logps/chosen": -246.39857482910156, + "logps/rejected": -277.1581726074219, + "loss": 0.7555, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5568031072616577, + "rewards/margins": 0.0857199877500534, + "rewards/rejected": -0.6425230503082275, + "step": 5906 + }, + { + "epoch": 0.68, + "learning_rate": 9.716727145031019e-08, + "logits/chosen": -3.4165456295013428, + "logits/rejected": -3.545555353164673, + "logps/chosen": -106.32081604003906, + "logps/rejected": -189.77041625976562, + "loss": 0.3703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07883264869451523, + "rewards/margins": 1.639039397239685, + "rewards/rejected": -1.717872142791748, + "step": 5907 + }, + { + "epoch": 0.68, + "learning_rate": 9.713215498068593e-08, + "logits/chosen": -3.312527656555176, + "logits/rejected": -3.2539167404174805, + "logps/chosen": -244.40625, + "logps/rejected": -202.13272094726562, + "loss": 0.3236, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0968976616859436, + "rewards/margins": 1.4981966018676758, + "rewards/rejected": -1.401298999786377, + "step": 5908 + }, + { + "epoch": 0.68, + "learning_rate": 9.709703851106167e-08, + "logits/chosen": -2.7766778469085693, + "logits/rejected": -2.850224494934082, + "logps/chosen": -241.97938537597656, + "logps/rejected": -367.9206848144531, + "loss": 0.2865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.004072621464729309, + "rewards/margins": 2.3508472442626953, + "rewards/rejected": -2.354919672012329, + "step": 5909 + }, + { + "epoch": 0.68, + "learning_rate": 9.706192204143743e-08, + "logits/chosen": -3.6115927696228027, + "logits/rejected": -3.5794646739959717, + "logps/chosen": -254.45274353027344, + "logps/rejected": -235.61143493652344, + "loss": 0.2871, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.029958456754684448, + "rewards/margins": 1.6145838499069214, + "rewards/rejected": -1.584625482559204, + "step": 5910 + }, + { + "epoch": 0.68, + "learning_rate": 9.702680557181318e-08, + "logits/chosen": -3.144989013671875, + "logits/rejected": -3.3395748138427734, + "logps/chosen": -350.95794677734375, + "logps/rejected": -336.82403564453125, + "loss": 0.342, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2829788625240326, + "rewards/margins": 1.2018797397613525, + "rewards/rejected": -0.9189009070396423, + "step": 5911 + }, + { + "epoch": 0.68, + "learning_rate": 9.699168910218892e-08, + "logits/chosen": -3.132169246673584, + "logits/rejected": -2.8998970985412598, + "logps/chosen": -265.65838623046875, + "logps/rejected": -232.97451782226562, + "loss": 0.61, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9042503237724304, + "rewards/margins": 1.3849542140960693, + "rewards/rejected": -2.2892045974731445, + "step": 5912 + }, + { + "epoch": 0.68, + "learning_rate": 9.695657263256466e-08, + "logits/chosen": -3.017947196960449, + "logits/rejected": -3.2881298065185547, + "logps/chosen": -297.63946533203125, + "logps/rejected": -264.85906982421875, + "loss": 0.4331, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10356324911117554, + "rewards/margins": 1.4964240789413452, + "rewards/rejected": -1.3928608894348145, + "step": 5913 + }, + { + "epoch": 0.68, + "learning_rate": 9.69214561629404e-08, + "logits/chosen": -2.6173129081726074, + "logits/rejected": -2.616830587387085, + "logps/chosen": -345.4765930175781, + "logps/rejected": -373.86322021484375, + "loss": 0.2847, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18101102113723755, + "rewards/margins": 2.104949951171875, + "rewards/rejected": -1.9239389896392822, + "step": 5914 + }, + { + "epoch": 0.68, + "learning_rate": 9.688633969331617e-08, + "logits/chosen": -3.4853265285491943, + "logits/rejected": -3.9362049102783203, + "logps/chosen": -316.37457275390625, + "logps/rejected": -245.63131713867188, + "loss": 0.8502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6739528775215149, + "rewards/margins": 1.7395710945129395, + "rewards/rejected": -2.4135239124298096, + "step": 5915 + }, + { + "epoch": 0.68, + "learning_rate": 9.685122322369191e-08, + "logits/chosen": -3.5124106407165527, + "logits/rejected": -4.004014492034912, + "logps/chosen": -287.0138854980469, + "logps/rejected": -290.85931396484375, + "loss": 0.273, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.505718469619751, + "rewards/margins": 3.757826566696167, + "rewards/rejected": -4.263545036315918, + "step": 5916 + }, + { + "epoch": 0.68, + "learning_rate": 9.681610675406765e-08, + "logits/chosen": -2.930224895477295, + "logits/rejected": -2.7579970359802246, + "logps/chosen": -176.7631072998047, + "logps/rejected": -294.4903564453125, + "loss": 0.246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5332675576210022, + "rewards/margins": 1.7678860425949097, + "rewards/rejected": -2.3011536598205566, + "step": 5917 + }, + { + "epoch": 0.68, + "learning_rate": 9.678099028444339e-08, + "logits/chosen": -3.0618443489074707, + "logits/rejected": -2.790306329727173, + "logps/chosen": -350.218994140625, + "logps/rejected": -458.3529052734375, + "loss": 0.2139, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3290901780128479, + "rewards/margins": 2.8720502853393555, + "rewards/rejected": -2.5429601669311523, + "step": 5918 + }, + { + "epoch": 0.68, + "learning_rate": 9.674587381481915e-08, + "logits/chosen": -3.8332104682922363, + "logits/rejected": -3.7381396293640137, + "logps/chosen": -475.32342529296875, + "logps/rejected": -338.2590637207031, + "loss": 0.1894, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4330303966999054, + "rewards/margins": 2.584691047668457, + "rewards/rejected": -2.151660919189453, + "step": 5919 + }, + { + "epoch": 0.68, + "learning_rate": 9.671075734519489e-08, + "logits/chosen": -2.9704089164733887, + "logits/rejected": -3.250443935394287, + "logps/chosen": -267.57647705078125, + "logps/rejected": -297.0458984375, + "loss": 0.4947, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08477458357810974, + "rewards/margins": 3.1051831245422363, + "rewards/rejected": -3.0204086303710938, + "step": 5920 + }, + { + "epoch": 0.68, + "learning_rate": 9.667564087557064e-08, + "logits/chosen": -3.5071747303009033, + "logits/rejected": -2.9166362285614014, + "logps/chosen": -332.71337890625, + "logps/rejected": -240.1144561767578, + "loss": 0.2389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17036943137645721, + "rewards/margins": 2.0181031227111816, + "rewards/rejected": -2.1884727478027344, + "step": 5921 + }, + { + "epoch": 0.68, + "learning_rate": 9.664052440594638e-08, + "logits/chosen": -2.5795063972473145, + "logits/rejected": -2.89300537109375, + "logps/chosen": -218.0670928955078, + "logps/rejected": -286.3052673339844, + "loss": 0.2606, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3743932843208313, + "rewards/margins": 2.2185218334198, + "rewards/rejected": -1.8441284894943237, + "step": 5922 + }, + { + "epoch": 0.68, + "learning_rate": 9.660540793632214e-08, + "logits/chosen": -2.8650870323181152, + "logits/rejected": -2.709016799926758, + "logps/chosen": -167.69749450683594, + "logps/rejected": -190.02243041992188, + "loss": 0.2947, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17263279855251312, + "rewards/margins": 1.8255832195281982, + "rewards/rejected": -1.6529505252838135, + "step": 5923 + }, + { + "epoch": 0.68, + "learning_rate": 9.657029146669788e-08, + "logits/chosen": -3.473989963531494, + "logits/rejected": -3.223184823989868, + "logps/chosen": -302.7011413574219, + "logps/rejected": -251.4551239013672, + "loss": 0.1799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08136235177516937, + "rewards/margins": 2.0978527069091797, + "rewards/rejected": -2.0164904594421387, + "step": 5924 + }, + { + "epoch": 0.68, + "learning_rate": 9.653517499707362e-08, + "logits/chosen": -2.3540353775024414, + "logits/rejected": -2.541011333465576, + "logps/chosen": -427.01348876953125, + "logps/rejected": -277.15411376953125, + "loss": 0.2653, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01970088481903076, + "rewards/margins": 1.7999213933944702, + "rewards/rejected": -1.780220627784729, + "step": 5925 + }, + { + "epoch": 0.68, + "learning_rate": 9.650005852744936e-08, + "logits/chosen": -3.4029996395111084, + "logits/rejected": -3.131751537322998, + "logps/chosen": -162.26498413085938, + "logps/rejected": -198.1903076171875, + "loss": 0.4036, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13549277186393738, + "rewards/margins": 1.0555747747421265, + "rewards/rejected": -1.1910674571990967, + "step": 5926 + }, + { + "epoch": 0.68, + "learning_rate": 9.646494205782512e-08, + "logits/chosen": -3.43620228767395, + "logits/rejected": -3.667235851287842, + "logps/chosen": -188.52598571777344, + "logps/rejected": -162.92849731445312, + "loss": 0.3915, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1242705583572388, + "rewards/margins": 1.9364968538284302, + "rewards/rejected": -3.060767412185669, + "step": 5927 + }, + { + "epoch": 0.68, + "learning_rate": 9.642982558820087e-08, + "logits/chosen": -3.048177719116211, + "logits/rejected": -3.1151814460754395, + "logps/chosen": -174.2582550048828, + "logps/rejected": -237.03492736816406, + "loss": 0.574, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04447430372238159, + "rewards/margins": 1.0580216646194458, + "rewards/rejected": -1.1024960279464722, + "step": 5928 + }, + { + "epoch": 0.68, + "learning_rate": 9.63947091185766e-08, + "logits/chosen": -2.8805556297302246, + "logits/rejected": -2.976654529571533, + "logps/chosen": -243.81417846679688, + "logps/rejected": -308.7523193359375, + "loss": 0.6357, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6192861795425415, + "rewards/margins": 2.32954740524292, + "rewards/rejected": -2.948833465576172, + "step": 5929 + }, + { + "epoch": 0.68, + "learning_rate": 9.635959264895235e-08, + "logits/chosen": -3.084739923477173, + "logits/rejected": -2.819614887237549, + "logps/chosen": -211.7991943359375, + "logps/rejected": -267.7151184082031, + "loss": 0.4732, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30180293321609497, + "rewards/margins": 0.6858354806900024, + "rewards/rejected": -0.9876383543014526, + "step": 5930 + }, + { + "epoch": 0.68, + "learning_rate": 9.63244761793281e-08, + "logits/chosen": -3.4549076557159424, + "logits/rejected": -2.938642978668213, + "logps/chosen": -284.46844482421875, + "logps/rejected": -223.9385986328125, + "loss": 0.6408, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7684244513511658, + "rewards/margins": 1.2583074569702148, + "rewards/rejected": -2.0267319679260254, + "step": 5931 + }, + { + "epoch": 0.68, + "learning_rate": 9.628935970970385e-08, + "logits/chosen": -2.3775734901428223, + "logits/rejected": -2.565157651901245, + "logps/chosen": -247.88426208496094, + "logps/rejected": -168.5254364013672, + "loss": 0.4352, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12840840220451355, + "rewards/margins": 2.082530975341797, + "rewards/rejected": -2.210939407348633, + "step": 5932 + }, + { + "epoch": 0.68, + "learning_rate": 9.62542432400796e-08, + "logits/chosen": -3.575594425201416, + "logits/rejected": -3.0210154056549072, + "logps/chosen": -633.0364990234375, + "logps/rejected": -372.78143310546875, + "loss": 0.498, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11086101830005646, + "rewards/margins": 0.6086575984954834, + "rewards/rejected": -0.7195186614990234, + "step": 5933 + }, + { + "epoch": 0.68, + "learning_rate": 9.621912677045534e-08, + "logits/chosen": -3.1979904174804688, + "logits/rejected": -3.3008577823638916, + "logps/chosen": -161.5148162841797, + "logps/rejected": -230.66336059570312, + "loss": 0.5103, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4200167655944824, + "rewards/margins": 1.2811305522918701, + "rewards/rejected": -0.8611137270927429, + "step": 5934 + }, + { + "epoch": 0.68, + "learning_rate": 9.618401030083109e-08, + "logits/chosen": -2.8864078521728516, + "logits/rejected": -3.017204761505127, + "logps/chosen": -291.59283447265625, + "logps/rejected": -246.93670654296875, + "loss": 0.377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02375376597046852, + "rewards/margins": 3.2055587768554688, + "rewards/rejected": -3.2293124198913574, + "step": 5935 + }, + { + "epoch": 0.68, + "learning_rate": 9.614889383120683e-08, + "logits/chosen": -3.1528825759887695, + "logits/rejected": -3.2759273052215576, + "logps/chosen": -377.1658935546875, + "logps/rejected": -342.70843505859375, + "loss": 0.5658, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10226978361606598, + "rewards/margins": 1.3240816593170166, + "rewards/rejected": -1.2218118906021118, + "step": 5936 + }, + { + "epoch": 0.68, + "learning_rate": 9.611377736158257e-08, + "logits/chosen": -2.653878688812256, + "logits/rejected": -2.8724582195281982, + "logps/chosen": -331.8513488769531, + "logps/rejected": -279.42864990234375, + "loss": 0.1838, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49364957213401794, + "rewards/margins": 3.144731044769287, + "rewards/rejected": -2.6510813236236572, + "step": 5937 + }, + { + "epoch": 0.68, + "learning_rate": 9.607866089195832e-08, + "logits/chosen": -3.3288676738739014, + "logits/rejected": -3.400179147720337, + "logps/chosen": -328.9051208496094, + "logps/rejected": -266.5824279785156, + "loss": 0.363, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11398360878229141, + "rewards/margins": 1.3210585117340088, + "rewards/rejected": -1.2070749998092651, + "step": 5938 + }, + { + "epoch": 0.68, + "learning_rate": 9.604354442233408e-08, + "logits/chosen": -3.4445133209228516, + "logits/rejected": -3.1126811504364014, + "logps/chosen": -277.7670593261719, + "logps/rejected": -266.5044250488281, + "loss": 0.1978, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6869149208068848, + "rewards/margins": 3.1926522254943848, + "rewards/rejected": -2.5057373046875, + "step": 5939 + }, + { + "epoch": 0.68, + "learning_rate": 9.600842795270982e-08, + "logits/chosen": -3.3059921264648438, + "logits/rejected": -3.241529941558838, + "logps/chosen": -264.0367126464844, + "logps/rejected": -278.5604248046875, + "loss": 0.7722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6335735321044922, + "rewards/margins": 0.4071235656738281, + "rewards/rejected": -1.0406970977783203, + "step": 5940 + }, + { + "epoch": 0.68, + "learning_rate": 9.597331148308556e-08, + "logits/chosen": -2.3128836154937744, + "logits/rejected": -2.287142753601074, + "logps/chosen": -320.8565673828125, + "logps/rejected": -315.33746337890625, + "loss": 0.4496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10860156267881393, + "rewards/margins": 1.3183673620224, + "rewards/rejected": -1.426969051361084, + "step": 5941 + }, + { + "epoch": 0.68, + "learning_rate": 9.59381950134613e-08, + "logits/chosen": -2.8769760131835938, + "logits/rejected": -2.582190752029419, + "logps/chosen": -285.3788757324219, + "logps/rejected": -337.0663757324219, + "loss": 0.3276, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08364415913820267, + "rewards/margins": 3.229707717895508, + "rewards/rejected": -3.313352108001709, + "step": 5942 + }, + { + "epoch": 0.69, + "learning_rate": 9.590307854383707e-08, + "logits/chosen": -2.9695239067077637, + "logits/rejected": -2.7692606449127197, + "logps/chosen": -88.38555145263672, + "logps/rejected": -218.48121643066406, + "loss": 0.3564, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17113028466701508, + "rewards/margins": 1.4706065654754639, + "rewards/rejected": -1.6417369842529297, + "step": 5943 + }, + { + "epoch": 0.69, + "learning_rate": 9.586796207421281e-08, + "logits/chosen": -2.291576385498047, + "logits/rejected": -2.209322690963745, + "logps/chosen": -343.2523498535156, + "logps/rejected": -192.08343505859375, + "loss": 0.3576, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.022579893469810486, + "rewards/margins": 1.2723634243011475, + "rewards/rejected": -1.2497835159301758, + "step": 5944 + }, + { + "epoch": 0.69, + "learning_rate": 9.583284560458855e-08, + "logits/chosen": -4.107910633087158, + "logits/rejected": -3.79162859916687, + "logps/chosen": -476.7855529785156, + "logps/rejected": -220.15277099609375, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.906489372253418, + "rewards/margins": 2.443168878555298, + "rewards/rejected": -3.349658489227295, + "step": 5945 + }, + { + "epoch": 0.69, + "learning_rate": 9.579772913496429e-08, + "logits/chosen": -4.056990623474121, + "logits/rejected": -3.42629337310791, + "logps/chosen": -441.8648681640625, + "logps/rejected": -270.5398254394531, + "loss": 0.5003, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1516563445329666, + "rewards/margins": 1.85740327835083, + "rewards/rejected": -2.0090596675872803, + "step": 5946 + }, + { + "epoch": 0.69, + "learning_rate": 9.576261266534004e-08, + "logits/chosen": -3.0301432609558105, + "logits/rejected": -2.650777578353882, + "logps/chosen": -135.35862731933594, + "logps/rejected": -297.5038146972656, + "loss": 0.4611, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45772993564605713, + "rewards/margins": 1.2519938945770264, + "rewards/rejected": -1.7097238302230835, + "step": 5947 + }, + { + "epoch": 0.69, + "learning_rate": 9.572749619571578e-08, + "logits/chosen": -3.1922833919525146, + "logits/rejected": -3.2663650512695312, + "logps/chosen": -252.80966186523438, + "logps/rejected": -242.44398498535156, + "loss": 0.3175, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09758926182985306, + "rewards/margins": 2.984285831451416, + "rewards/rejected": -3.0818748474121094, + "step": 5948 + }, + { + "epoch": 0.69, + "learning_rate": 9.569237972609154e-08, + "logits/chosen": -3.6628801822662354, + "logits/rejected": -3.0616745948791504, + "logps/chosen": -433.5910339355469, + "logps/rejected": -333.8158874511719, + "loss": 0.4609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.653103232383728, + "rewards/margins": 1.9750224351882935, + "rewards/rejected": -2.6281256675720215, + "step": 5949 + }, + { + "epoch": 0.69, + "learning_rate": 9.565726325646728e-08, + "logits/chosen": -3.2926175594329834, + "logits/rejected": -3.673074722290039, + "logps/chosen": -427.3772277832031, + "logps/rejected": -219.54449462890625, + "loss": 0.8796, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.043566107749939, + "rewards/margins": 0.2650068402290344, + "rewards/rejected": -1.308572769165039, + "step": 5950 + }, + { + "epoch": 0.69, + "learning_rate": 9.562214678684303e-08, + "logits/chosen": -2.7589492797851562, + "logits/rejected": -2.8663904666900635, + "logps/chosen": -93.24149322509766, + "logps/rejected": -184.89462280273438, + "loss": 0.3335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10645896196365356, + "rewards/margins": 1.707930326461792, + "rewards/rejected": -1.8143892288208008, + "step": 5951 + }, + { + "epoch": 0.69, + "learning_rate": 9.558703031721877e-08, + "logits/chosen": -2.928055763244629, + "logits/rejected": -2.813666343688965, + "logps/chosen": -427.10235595703125, + "logps/rejected": -371.71026611328125, + "loss": 0.8606, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28058332204818726, + "rewards/margins": -0.011145040392875671, + "rewards/rejected": -0.26943832635879517, + "step": 5952 + }, + { + "epoch": 0.69, + "learning_rate": 9.555191384759451e-08, + "logits/chosen": -2.3540968894958496, + "logits/rejected": -2.4005236625671387, + "logps/chosen": -241.87937927246094, + "logps/rejected": -217.00381469726562, + "loss": 0.5694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3503603935241699, + "rewards/margins": 0.824764609336853, + "rewards/rejected": -1.1751251220703125, + "step": 5953 + }, + { + "epoch": 0.69, + "learning_rate": 9.551679737797025e-08, + "logits/chosen": -3.3043580055236816, + "logits/rejected": -3.236867666244507, + "logps/chosen": -265.4062805175781, + "logps/rejected": -286.9897766113281, + "loss": 0.4298, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17976826429367065, + "rewards/margins": 1.8546411991119385, + "rewards/rejected": -1.6748731136322021, + "step": 5954 + }, + { + "epoch": 0.69, + "learning_rate": 9.548168090834601e-08, + "logits/chosen": -3.4081180095672607, + "logits/rejected": -3.2437658309936523, + "logps/chosen": -112.73483276367188, + "logps/rejected": -170.04676818847656, + "loss": 0.5379, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4003189206123352, + "rewards/margins": 0.4541509747505188, + "rewards/rejected": -0.8544698357582092, + "step": 5955 + }, + { + "epoch": 0.69, + "learning_rate": 9.544656443872176e-08, + "logits/chosen": -2.455901622772217, + "logits/rejected": -2.83292818069458, + "logps/chosen": -356.8276062011719, + "logps/rejected": -307.1011047363281, + "loss": 0.1457, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13973775506019592, + "rewards/margins": 3.1099796295166016, + "rewards/rejected": -2.9702420234680176, + "step": 5956 + }, + { + "epoch": 0.69, + "learning_rate": 9.54114479690975e-08, + "logits/chosen": -3.1703572273254395, + "logits/rejected": -3.234567880630493, + "logps/chosen": -237.1409149169922, + "logps/rejected": -226.9910888671875, + "loss": 0.2667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.007460739463567734, + "rewards/margins": 3.1464643478393555, + "rewards/rejected": -3.1539249420166016, + "step": 5957 + }, + { + "epoch": 0.69, + "learning_rate": 9.537633149947324e-08, + "logits/chosen": -3.0692288875579834, + "logits/rejected": -3.1069350242614746, + "logps/chosen": -327.1402282714844, + "logps/rejected": -210.0907745361328, + "loss": 0.1348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10889193415641785, + "rewards/margins": 2.741856336593628, + "rewards/rejected": -2.8507485389709473, + "step": 5958 + }, + { + "epoch": 0.69, + "learning_rate": 9.534121502984898e-08, + "logits/chosen": -3.273867607116699, + "logits/rejected": -3.134152412414551, + "logps/chosen": -186.67947387695312, + "logps/rejected": -208.53604125976562, + "loss": 0.3716, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01019556075334549, + "rewards/margins": 1.7677321434020996, + "rewards/rejected": -1.75753653049469, + "step": 5959 + }, + { + "epoch": 0.69, + "learning_rate": 9.530609856022475e-08, + "logits/chosen": -3.6476054191589355, + "logits/rejected": -3.098454475402832, + "logps/chosen": -367.0373840332031, + "logps/rejected": -192.65408325195312, + "loss": 0.3221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07294809818267822, + "rewards/margins": 1.5632667541503906, + "rewards/rejected": -1.6362148523330688, + "step": 5960 + }, + { + "epoch": 0.69, + "learning_rate": 9.527098209060049e-08, + "logits/chosen": -3.627934455871582, + "logits/rejected": -3.026698112487793, + "logps/chosen": -364.27777099609375, + "logps/rejected": -220.4020233154297, + "loss": 0.0882, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1829012632369995, + "rewards/margins": 2.846214532852173, + "rewards/rejected": -2.6633129119873047, + "step": 5961 + }, + { + "epoch": 0.69, + "learning_rate": 9.523586562097623e-08, + "logits/chosen": -2.890294075012207, + "logits/rejected": -3.3801653385162354, + "logps/chosen": -393.558837890625, + "logps/rejected": -366.5351257324219, + "loss": 0.3105, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11497075855731964, + "rewards/margins": 2.3215389251708984, + "rewards/rejected": -2.436509609222412, + "step": 5962 + }, + { + "epoch": 0.69, + "learning_rate": 9.520074915135197e-08, + "logits/chosen": -3.2424323558807373, + "logits/rejected": -3.2124691009521484, + "logps/chosen": -414.4670104980469, + "logps/rejected": -296.4359436035156, + "loss": 0.4639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6098196506500244, + "rewards/margins": 0.8998351097106934, + "rewards/rejected": -1.5096547603607178, + "step": 5963 + }, + { + "epoch": 0.69, + "learning_rate": 9.516563268172773e-08, + "logits/chosen": -2.8502423763275146, + "logits/rejected": -2.6561310291290283, + "logps/chosen": -245.5434112548828, + "logps/rejected": -205.0363006591797, + "loss": 0.8091, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0279731750488281, + "rewards/margins": 0.11572159826755524, + "rewards/rejected": -1.1436948776245117, + "step": 5964 + }, + { + "epoch": 0.69, + "learning_rate": 9.513051621210347e-08, + "logits/chosen": -3.727949857711792, + "logits/rejected": -3.889094829559326, + "logps/chosen": -218.58102416992188, + "logps/rejected": -190.48703002929688, + "loss": 0.4329, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2271902710199356, + "rewards/margins": 1.7468241453170776, + "rewards/rejected": -1.5196338891983032, + "step": 5965 + }, + { + "epoch": 0.69, + "learning_rate": 9.509539974247922e-08, + "logits/chosen": -2.908378839492798, + "logits/rejected": -2.687326431274414, + "logps/chosen": -341.0576477050781, + "logps/rejected": -289.0843811035156, + "loss": 0.2906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11070290207862854, + "rewards/margins": 1.4641095399856567, + "rewards/rejected": -1.574812412261963, + "step": 5966 + }, + { + "epoch": 0.69, + "learning_rate": 9.506028327285496e-08, + "logits/chosen": -3.1482057571411133, + "logits/rejected": -3.180713176727295, + "logps/chosen": -320.7928161621094, + "logps/rejected": -304.8851623535156, + "loss": 0.6697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4279455542564392, + "rewards/margins": 0.6166769862174988, + "rewards/rejected": -1.044622540473938, + "step": 5967 + }, + { + "epoch": 0.69, + "learning_rate": 9.502516680323072e-08, + "logits/chosen": -3.1852614879608154, + "logits/rejected": -3.2287983894348145, + "logps/chosen": -199.5655059814453, + "logps/rejected": -288.56402587890625, + "loss": 0.318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46303778886795044, + "rewards/margins": 3.264240264892578, + "rewards/rejected": -3.727278232574463, + "step": 5968 + }, + { + "epoch": 0.69, + "learning_rate": 9.499005033360646e-08, + "logits/chosen": -3.3747456073760986, + "logits/rejected": -3.229358196258545, + "logps/chosen": -199.97312927246094, + "logps/rejected": -149.7859344482422, + "loss": 0.4828, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027496159076690674, + "rewards/margins": 0.8731014132499695, + "rewards/rejected": -0.9005975127220154, + "step": 5969 + }, + { + "epoch": 0.69, + "learning_rate": 9.49549338639822e-08, + "logits/chosen": -3.018251419067383, + "logits/rejected": -2.964137554168701, + "logps/chosen": -137.13800048828125, + "logps/rejected": -291.6165771484375, + "loss": 0.1982, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05087370052933693, + "rewards/margins": 2.2845890522003174, + "rewards/rejected": -2.233715534210205, + "step": 5970 + }, + { + "epoch": 0.69, + "learning_rate": 9.491981739435794e-08, + "logits/chosen": -2.632338285446167, + "logits/rejected": -2.4255783557891846, + "logps/chosen": -287.4134826660156, + "logps/rejected": -372.23565673828125, + "loss": 0.384, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16477300226688385, + "rewards/margins": 2.271850109100342, + "rewards/rejected": -2.10707688331604, + "step": 5971 + }, + { + "epoch": 0.69, + "learning_rate": 9.48847009247337e-08, + "logits/chosen": -3.5910515785217285, + "logits/rejected": -3.3889665603637695, + "logps/chosen": -286.16497802734375, + "logps/rejected": -273.6270751953125, + "loss": 0.1225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4751468002796173, + "rewards/margins": 2.9975359439849854, + "rewards/rejected": -2.5223889350891113, + "step": 5972 + }, + { + "epoch": 0.69, + "learning_rate": 9.484958445510944e-08, + "logits/chosen": -3.203512668609619, + "logits/rejected": -3.0651135444641113, + "logps/chosen": -239.86280822753906, + "logps/rejected": -188.34681701660156, + "loss": 0.3033, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19208219647407532, + "rewards/margins": 2.7583422660827637, + "rewards/rejected": -2.5662600994110107, + "step": 5973 + }, + { + "epoch": 0.69, + "learning_rate": 9.481446798548519e-08, + "logits/chosen": -3.206904172897339, + "logits/rejected": -3.29807710647583, + "logps/chosen": -195.0992431640625, + "logps/rejected": -221.79190063476562, + "loss": 0.3215, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14370226860046387, + "rewards/margins": 1.8223820924758911, + "rewards/rejected": -1.966084361076355, + "step": 5974 + }, + { + "epoch": 0.69, + "learning_rate": 9.477935151586093e-08, + "logits/chosen": -2.550497531890869, + "logits/rejected": -2.4239213466644287, + "logps/chosen": -283.4840393066406, + "logps/rejected": -208.97262573242188, + "loss": 0.4547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0545407235622406, + "rewards/margins": 3.0212268829345703, + "rewards/rejected": -3.0757675170898438, + "step": 5975 + }, + { + "epoch": 0.69, + "learning_rate": 9.474423504623669e-08, + "logits/chosen": -2.2942302227020264, + "logits/rejected": -2.590351104736328, + "logps/chosen": -208.0811004638672, + "logps/rejected": -284.2514953613281, + "loss": 0.2812, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30359911918640137, + "rewards/margins": 1.8282805681228638, + "rewards/rejected": -2.1318798065185547, + "step": 5976 + }, + { + "epoch": 0.69, + "learning_rate": 9.470911857661243e-08, + "logits/chosen": -3.417524814605713, + "logits/rejected": -3.3184328079223633, + "logps/chosen": -181.62054443359375, + "logps/rejected": -167.8997802734375, + "loss": 0.1773, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2917332947254181, + "rewards/margins": 3.0698933601379395, + "rewards/rejected": -2.7781600952148438, + "step": 5977 + }, + { + "epoch": 0.69, + "learning_rate": 9.467400210698817e-08, + "logits/chosen": -2.996770143508911, + "logits/rejected": -3.0613954067230225, + "logps/chosen": -221.32005310058594, + "logps/rejected": -211.82766723632812, + "loss": 0.1512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3448389768600464, + "rewards/margins": 3.677300453186035, + "rewards/rejected": -3.3324618339538574, + "step": 5978 + }, + { + "epoch": 0.69, + "learning_rate": 9.463888563736391e-08, + "logits/chosen": -2.9890193939208984, + "logits/rejected": -3.0928499698638916, + "logps/chosen": -219.18643188476562, + "logps/rejected": -272.91766357421875, + "loss": 0.2358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2659527063369751, + "rewards/margins": 1.997556209564209, + "rewards/rejected": -1.7316036224365234, + "step": 5979 + }, + { + "epoch": 0.69, + "learning_rate": 9.460376916773967e-08, + "logits/chosen": -2.972804546356201, + "logits/rejected": -3.0036230087280273, + "logps/chosen": -215.41224670410156, + "logps/rejected": -234.08358764648438, + "loss": 0.3777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34834349155426025, + "rewards/margins": 2.4757518768310547, + "rewards/rejected": -2.127408266067505, + "step": 5980 + }, + { + "epoch": 0.69, + "learning_rate": 9.456865269811541e-08, + "logits/chosen": -2.8477730751037598, + "logits/rejected": -2.952570676803589, + "logps/chosen": -217.8678741455078, + "logps/rejected": -234.38729858398438, + "loss": 0.2196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3277289569377899, + "rewards/margins": 2.2701382637023926, + "rewards/rejected": -1.9424091577529907, + "step": 5981 + }, + { + "epoch": 0.69, + "learning_rate": 9.453353622849115e-08, + "logits/chosen": -3.1859121322631836, + "logits/rejected": -2.6044540405273438, + "logps/chosen": -168.84429931640625, + "logps/rejected": -102.67850494384766, + "loss": 0.7698, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5869159698486328, + "rewards/margins": 0.3254069685935974, + "rewards/rejected": -0.912322998046875, + "step": 5982 + }, + { + "epoch": 0.69, + "learning_rate": 9.44984197588669e-08, + "logits/chosen": -3.081364154815674, + "logits/rejected": -3.1458065509796143, + "logps/chosen": -505.6863098144531, + "logps/rejected": -444.40240478515625, + "loss": 0.3087, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24603641033172607, + "rewards/margins": 1.670236349105835, + "rewards/rejected": -1.4241998195648193, + "step": 5983 + }, + { + "epoch": 0.69, + "learning_rate": 9.446330328924266e-08, + "logits/chosen": -2.8908116817474365, + "logits/rejected": -3.1113739013671875, + "logps/chosen": -227.50222778320312, + "logps/rejected": -291.6514892578125, + "loss": 0.2941, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023221462965011597, + "rewards/margins": 2.510592460632324, + "rewards/rejected": -2.4873709678649902, + "step": 5984 + }, + { + "epoch": 0.69, + "learning_rate": 9.44281868196184e-08, + "logits/chosen": -3.5340750217437744, + "logits/rejected": -3.1018457412719727, + "logps/chosen": -283.7449645996094, + "logps/rejected": -168.56517028808594, + "loss": 0.5044, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09705743938684464, + "rewards/margins": 0.7430834770202637, + "rewards/rejected": -0.8401408791542053, + "step": 5985 + }, + { + "epoch": 0.69, + "learning_rate": 9.439307034999414e-08, + "logits/chosen": -3.405971050262451, + "logits/rejected": -3.4541094303131104, + "logps/chosen": -216.0741424560547, + "logps/rejected": -233.92881774902344, + "loss": 0.2123, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2612106502056122, + "rewards/margins": 2.4985809326171875, + "rewards/rejected": -2.237370252609253, + "step": 5986 + }, + { + "epoch": 0.69, + "learning_rate": 9.435795388036988e-08, + "logits/chosen": -2.7367215156555176, + "logits/rejected": -2.8680882453918457, + "logps/chosen": -359.45538330078125, + "logps/rejected": -428.4845275878906, + "loss": 0.3563, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36730462312698364, + "rewards/margins": 1.0211259126663208, + "rewards/rejected": -0.6538212895393372, + "step": 5987 + }, + { + "epoch": 0.69, + "learning_rate": 9.432283741074565e-08, + "logits/chosen": -3.370603561401367, + "logits/rejected": -3.1870555877685547, + "logps/chosen": -267.0539245605469, + "logps/rejected": -204.13748168945312, + "loss": 1.4681, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4085843563079834, + "rewards/margins": 0.45882683992385864, + "rewards/rejected": -1.8674112558364868, + "step": 5988 + }, + { + "epoch": 0.69, + "learning_rate": 9.428772094112139e-08, + "logits/chosen": -3.0258140563964844, + "logits/rejected": -3.222902297973633, + "logps/chosen": -188.02671813964844, + "logps/rejected": -228.3857879638672, + "loss": 0.2306, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15379658341407776, + "rewards/margins": 2.515307903289795, + "rewards/rejected": -2.361511468887329, + "step": 5989 + }, + { + "epoch": 0.69, + "learning_rate": 9.425260447149713e-08, + "logits/chosen": -2.7538259029388428, + "logits/rejected": -3.0795135498046875, + "logps/chosen": -170.61883544921875, + "logps/rejected": -269.06756591796875, + "loss": 0.3611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43979334831237793, + "rewards/margins": 2.4787349700927734, + "rewards/rejected": -2.9185280799865723, + "step": 5990 + }, + { + "epoch": 0.69, + "learning_rate": 9.421748800187287e-08, + "logits/chosen": -3.9834625720977783, + "logits/rejected": -3.8164830207824707, + "logps/chosen": -85.78785705566406, + "logps/rejected": -95.08564758300781, + "loss": 0.7604, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2367699146270752, + "rewards/margins": 0.8676185011863708, + "rewards/rejected": -1.1043884754180908, + "step": 5991 + }, + { + "epoch": 0.69, + "learning_rate": 9.418237153224862e-08, + "logits/chosen": -3.880265712738037, + "logits/rejected": -3.884253740310669, + "logps/chosen": -207.01966857910156, + "logps/rejected": -230.61669921875, + "loss": 0.6254, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.684291422367096, + "rewards/margins": 0.5095913410186768, + "rewards/rejected": -1.193882703781128, + "step": 5992 + }, + { + "epoch": 0.69, + "learning_rate": 9.414725506262438e-08, + "logits/chosen": -2.5358402729034424, + "logits/rejected": -2.477332592010498, + "logps/chosen": -380.142333984375, + "logps/rejected": -328.9883117675781, + "loss": 0.3467, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27235597372055054, + "rewards/margins": 1.8779215812683105, + "rewards/rejected": -1.6055656671524048, + "step": 5993 + }, + { + "epoch": 0.69, + "learning_rate": 9.411213859300012e-08, + "logits/chosen": -2.611980438232422, + "logits/rejected": -2.4021310806274414, + "logps/chosen": -197.52935791015625, + "logps/rejected": -399.71771240234375, + "loss": 0.5582, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35566315054893494, + "rewards/margins": 1.4032495021820068, + "rewards/rejected": -1.7589126825332642, + "step": 5994 + }, + { + "epoch": 0.69, + "learning_rate": 9.407702212337586e-08, + "logits/chosen": -2.7740426063537598, + "logits/rejected": -3.051816463470459, + "logps/chosen": -207.21588134765625, + "logps/rejected": -198.9026336669922, + "loss": 0.2614, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09621091187000275, + "rewards/margins": 2.38224720954895, + "rewards/rejected": -2.286036252975464, + "step": 5995 + }, + { + "epoch": 0.69, + "learning_rate": 9.404190565375161e-08, + "logits/chosen": -3.5111777782440186, + "logits/rejected": -3.4815497398376465, + "logps/chosen": -124.25013732910156, + "logps/rejected": -173.6617889404297, + "loss": 0.2727, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45289158821105957, + "rewards/margins": 1.5268901586532593, + "rewards/rejected": -1.0739984512329102, + "step": 5996 + }, + { + "epoch": 0.69, + "learning_rate": 9.400678918412735e-08, + "logits/chosen": -2.5677411556243896, + "logits/rejected": -2.632093667984009, + "logps/chosen": -275.3363037109375, + "logps/rejected": -223.22439575195312, + "loss": 0.6761, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9366806745529175, + "rewards/margins": 0.979651927947998, + "rewards/rejected": -1.9163326025009155, + "step": 5997 + }, + { + "epoch": 0.69, + "learning_rate": 9.397167271450309e-08, + "logits/chosen": -3.3663253784179688, + "logits/rejected": -3.0877747535705566, + "logps/chosen": -350.88836669921875, + "logps/rejected": -194.8474578857422, + "loss": 0.3425, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17219853401184082, + "rewards/margins": 1.412427544593811, + "rewards/rejected": -1.2402288913726807, + "step": 5998 + }, + { + "epoch": 0.69, + "learning_rate": 9.393655624487883e-08, + "logits/chosen": -3.1187140941619873, + "logits/rejected": -2.877419948577881, + "logps/chosen": -169.95843505859375, + "logps/rejected": -208.35560607910156, + "loss": 0.2511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22736942768096924, + "rewards/margins": 2.379943609237671, + "rewards/rejected": -2.6073129177093506, + "step": 5999 + }, + { + "epoch": 0.69, + "learning_rate": 9.39014397752546e-08, + "logits/chosen": -3.1150925159454346, + "logits/rejected": -2.6974105834960938, + "logps/chosen": -219.62974548339844, + "logps/rejected": -166.36656188964844, + "loss": 0.3898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12611934542655945, + "rewards/margins": 1.7597343921661377, + "rewards/rejected": -1.88585364818573, + "step": 6000 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -2.840113639831543, + "eval_logits/rejected": -2.8027396202087402, + "eval_logps/chosen": -293.7511291503906, + "eval_logps/rejected": -237.29771423339844, + "eval_loss": 0.43171972036361694, + "eval_rewards/accuracies": 0.8142856955528259, + "eval_rewards/chosen": 0.030380776152014732, + "eval_rewards/margins": 1.3258912563323975, + "eval_rewards/rejected": -1.2955104112625122, + "eval_runtime": 32.523, + "eval_samples_per_second": 2.152, + "eval_steps_per_second": 1.076, + "step": 6000 + }, + { + "epoch": 0.69, + "learning_rate": 9.386632330563034e-08, + "logits/chosen": -3.4166324138641357, + "logits/rejected": -3.652947425842285, + "logps/chosen": -166.6906280517578, + "logps/rejected": -265.2337341308594, + "loss": 0.2666, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3159804940223694, + "rewards/margins": 2.301502227783203, + "rewards/rejected": -1.985521674156189, + "step": 6001 + }, + { + "epoch": 0.69, + "learning_rate": 9.383120683600608e-08, + "logits/chosen": -2.8372294902801514, + "logits/rejected": -2.4821035861968994, + "logps/chosen": -299.708251953125, + "logps/rejected": -344.3417663574219, + "loss": 0.9895, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5646206140518188, + "rewards/margins": -0.008068457245826721, + "rewards/rejected": -0.5565521717071533, + "step": 6002 + }, + { + "epoch": 0.69, + "learning_rate": 9.379609036638182e-08, + "logits/chosen": -3.6709907054901123, + "logits/rejected": -3.3763673305511475, + "logps/chosen": -364.5740051269531, + "logps/rejected": -223.08412170410156, + "loss": 0.6741, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13678961992263794, + "rewards/margins": 1.1738712787628174, + "rewards/rejected": -1.3106608390808105, + "step": 6003 + }, + { + "epoch": 0.69, + "learning_rate": 9.376097389675756e-08, + "logits/chosen": -3.5174202919006348, + "logits/rejected": -3.6510329246520996, + "logps/chosen": -261.3431396484375, + "logps/rejected": -204.43133544921875, + "loss": 0.4096, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08903439342975616, + "rewards/margins": 1.520330786705017, + "rewards/rejected": -1.4312963485717773, + "step": 6004 + }, + { + "epoch": 0.69, + "learning_rate": 9.372585742713333e-08, + "logits/chosen": -2.7782506942749023, + "logits/rejected": -3.3736166954040527, + "logps/chosen": -232.96768188476562, + "logps/rejected": -163.9185791015625, + "loss": 0.4041, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02696201205253601, + "rewards/margins": 2.0504517555236816, + "rewards/rejected": -2.0234897136688232, + "step": 6005 + }, + { + "epoch": 0.69, + "learning_rate": 9.369074095750907e-08, + "logits/chosen": -2.974097490310669, + "logits/rejected": -3.1402993202209473, + "logps/chosen": -145.9790802001953, + "logps/rejected": -258.89520263671875, + "loss": 0.3086, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6514614820480347, + "rewards/margins": 2.5739972591400146, + "rewards/rejected": -3.2254586219787598, + "step": 6006 + }, + { + "epoch": 0.69, + "learning_rate": 9.365562448788481e-08, + "logits/chosen": -3.1007261276245117, + "logits/rejected": -2.7689337730407715, + "logps/chosen": -201.1221466064453, + "logps/rejected": -228.416259765625, + "loss": 0.3246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2954845428466797, + "rewards/margins": 1.5864882469177246, + "rewards/rejected": -1.8819726705551147, + "step": 6007 + }, + { + "epoch": 0.69, + "learning_rate": 9.362050801826055e-08, + "logits/chosen": -3.7400693893432617, + "logits/rejected": -3.3195958137512207, + "logps/chosen": -357.97705078125, + "logps/rejected": -224.19398498535156, + "loss": 0.3045, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14970919489860535, + "rewards/margins": 1.6260571479797363, + "rewards/rejected": -1.476347804069519, + "step": 6008 + }, + { + "epoch": 0.69, + "learning_rate": 9.35853915486363e-08, + "logits/chosen": -3.2363712787628174, + "logits/rejected": -3.261361598968506, + "logps/chosen": -323.5926818847656, + "logps/rejected": -309.9913024902344, + "loss": 0.5962, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6718429327011108, + "rewards/margins": 1.074379563331604, + "rewards/rejected": -1.7462223768234253, + "step": 6009 + }, + { + "epoch": 0.69, + "learning_rate": 9.355027507901206e-08, + "logits/chosen": -3.067127227783203, + "logits/rejected": -2.933277130126953, + "logps/chosen": -349.05303955078125, + "logps/rejected": -326.5354919433594, + "loss": 0.2584, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11776372045278549, + "rewards/margins": 2.3943281173706055, + "rewards/rejected": -2.276564359664917, + "step": 6010 + }, + { + "epoch": 0.69, + "learning_rate": 9.35151586093878e-08, + "logits/chosen": -2.8958113193511963, + "logits/rejected": -2.870753526687622, + "logps/chosen": -325.14794921875, + "logps/rejected": -208.59576416015625, + "loss": 0.5354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3097807765007019, + "rewards/margins": 1.3124016523361206, + "rewards/rejected": -1.6221823692321777, + "step": 6011 + }, + { + "epoch": 0.69, + "learning_rate": 9.348004213976354e-08, + "logits/chosen": -3.792510509490967, + "logits/rejected": -3.6828527450561523, + "logps/chosen": -228.98545837402344, + "logps/rejected": -272.0115051269531, + "loss": 0.3872, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.047828566282987595, + "rewards/margins": 2.398746967315674, + "rewards/rejected": -2.3509182929992676, + "step": 6012 + }, + { + "epoch": 0.69, + "learning_rate": 9.34449256701393e-08, + "logits/chosen": -4.043623447418213, + "logits/rejected": -3.8414411544799805, + "logps/chosen": -168.51573181152344, + "logps/rejected": -235.5950927734375, + "loss": 0.37, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3390128016471863, + "rewards/margins": 1.6986690759658813, + "rewards/rejected": -1.3596562147140503, + "step": 6013 + }, + { + "epoch": 0.69, + "learning_rate": 9.340980920051504e-08, + "logits/chosen": -2.820889472961426, + "logits/rejected": -2.903125524520874, + "logps/chosen": -301.9021911621094, + "logps/rejected": -285.06964111328125, + "loss": 0.5357, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04051404446363449, + "rewards/margins": 1.0360759496688843, + "rewards/rejected": -1.0765900611877441, + "step": 6014 + }, + { + "epoch": 0.69, + "learning_rate": 9.337469273089078e-08, + "logits/chosen": -2.2991232872009277, + "logits/rejected": -2.7903902530670166, + "logps/chosen": -427.9427185058594, + "logps/rejected": -405.8951110839844, + "loss": 0.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4940497875213623, + "rewards/margins": 3.3739187717437744, + "rewards/rejected": -2.879868984222412, + "step": 6015 + }, + { + "epoch": 0.69, + "learning_rate": 9.333957626126652e-08, + "logits/chosen": -3.739346981048584, + "logits/rejected": -3.2264034748077393, + "logps/chosen": -268.407470703125, + "logps/rejected": -183.58482360839844, + "loss": 0.3869, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18969745934009552, + "rewards/margins": 1.2031750679016113, + "rewards/rejected": -1.0134776830673218, + "step": 6016 + }, + { + "epoch": 0.69, + "learning_rate": 9.330445979164228e-08, + "logits/chosen": -2.258355140686035, + "logits/rejected": -2.4010250568389893, + "logps/chosen": -382.1170349121094, + "logps/rejected": -228.71554565429688, + "loss": 0.4252, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05198746919631958, + "rewards/margins": 1.3811957836151123, + "rewards/rejected": -1.3292083740234375, + "step": 6017 + }, + { + "epoch": 0.69, + "learning_rate": 9.326934332201802e-08, + "logits/chosen": -2.17256498336792, + "logits/rejected": -2.321810245513916, + "logps/chosen": -181.27728271484375, + "logps/rejected": -142.69943237304688, + "loss": 0.4074, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021023541688919067, + "rewards/margins": 1.199907898902893, + "rewards/rejected": -1.2209315299987793, + "step": 6018 + }, + { + "epoch": 0.69, + "learning_rate": 9.323422685239376e-08, + "logits/chosen": -3.2929561138153076, + "logits/rejected": -3.318106174468994, + "logps/chosen": -106.81647491455078, + "logps/rejected": -252.80523681640625, + "loss": 0.428, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5831893682479858, + "rewards/margins": 2.2657158374786377, + "rewards/rejected": -1.6825265884399414, + "step": 6019 + }, + { + "epoch": 0.69, + "learning_rate": 9.31991103827695e-08, + "logits/chosen": -2.964184284210205, + "logits/rejected": -2.848526954650879, + "logps/chosen": -291.1473388671875, + "logps/rejected": -413.64776611328125, + "loss": 0.219, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6281455159187317, + "rewards/margins": 2.9774327278137207, + "rewards/rejected": -2.349287271499634, + "step": 6020 + }, + { + "epoch": 0.69, + "learning_rate": 9.316399391314527e-08, + "logits/chosen": -3.2635743618011475, + "logits/rejected": -2.6985838413238525, + "logps/chosen": -340.9681091308594, + "logps/rejected": -289.9105529785156, + "loss": 0.4368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42382657527923584, + "rewards/margins": 1.9723930358886719, + "rewards/rejected": -2.3962197303771973, + "step": 6021 + }, + { + "epoch": 0.69, + "learning_rate": 9.312887744352101e-08, + "logits/chosen": -2.9303948879241943, + "logits/rejected": -2.678149700164795, + "logps/chosen": -366.67620849609375, + "logps/rejected": -285.82080078125, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39868390560150146, + "rewards/margins": 1.9952020645141602, + "rewards/rejected": -2.393886089324951, + "step": 6022 + }, + { + "epoch": 0.69, + "learning_rate": 9.309376097389675e-08, + "logits/chosen": -3.860288619995117, + "logits/rejected": -3.5785651206970215, + "logps/chosen": -299.2729187011719, + "logps/rejected": -249.14332580566406, + "loss": 0.3979, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3219054937362671, + "rewards/margins": 1.4750010967254639, + "rewards/rejected": -1.1530954837799072, + "step": 6023 + }, + { + "epoch": 0.69, + "learning_rate": 9.30586445042725e-08, + "logits/chosen": -3.0868611335754395, + "logits/rejected": -3.164363145828247, + "logps/chosen": -173.87757873535156, + "logps/rejected": -183.06130981445312, + "loss": 0.5535, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.037305861711502075, + "rewards/margins": 2.171280860900879, + "rewards/rejected": -2.13397479057312, + "step": 6024 + }, + { + "epoch": 0.69, + "learning_rate": 9.302352803464825e-08, + "logits/chosen": -3.4106926918029785, + "logits/rejected": -3.047288417816162, + "logps/chosen": -320.242919921875, + "logps/rejected": -192.03756713867188, + "loss": 0.4767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.007163047790527344, + "rewards/margins": 1.138852834701538, + "rewards/rejected": -1.1460158824920654, + "step": 6025 + }, + { + "epoch": 0.69, + "learning_rate": 9.298841156502399e-08, + "logits/chosen": -3.3021469116210938, + "logits/rejected": -3.368246078491211, + "logps/chosen": -201.25120544433594, + "logps/rejected": -191.1783447265625, + "loss": 0.469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06330351531505585, + "rewards/margins": 1.2624703645706177, + "rewards/rejected": -1.325774073600769, + "step": 6026 + }, + { + "epoch": 0.69, + "learning_rate": 9.295329509539974e-08, + "logits/chosen": -2.4582736492156982, + "logits/rejected": -2.7870218753814697, + "logps/chosen": -289.0015563964844, + "logps/rejected": -260.88232421875, + "loss": 0.2412, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42041948437690735, + "rewards/margins": 2.8942625522613525, + "rewards/rejected": -2.4738430976867676, + "step": 6027 + }, + { + "epoch": 0.69, + "learning_rate": 9.291817862577548e-08, + "logits/chosen": -2.6935644149780273, + "logits/rejected": -2.7103610038757324, + "logps/chosen": -236.2628173828125, + "logps/rejected": -231.2149658203125, + "loss": 0.4417, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11239853501319885, + "rewards/margins": 1.3540091514587402, + "rewards/rejected": -1.2416106462478638, + "step": 6028 + }, + { + "epoch": 0.7, + "learning_rate": 9.288306215615124e-08, + "logits/chosen": -3.088174819946289, + "logits/rejected": -3.1170337200164795, + "logps/chosen": -257.7061462402344, + "logps/rejected": -415.2008056640625, + "loss": 0.3826, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.474758505821228, + "rewards/margins": 1.8754202127456665, + "rewards/rejected": -1.400661587715149, + "step": 6029 + }, + { + "epoch": 0.7, + "learning_rate": 9.284794568652698e-08, + "logits/chosen": -3.3040850162506104, + "logits/rejected": -3.5361647605895996, + "logps/chosen": -289.5772399902344, + "logps/rejected": -251.05218505859375, + "loss": 0.6128, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43226030468940735, + "rewards/margins": 0.992901086807251, + "rewards/rejected": -1.425161361694336, + "step": 6030 + }, + { + "epoch": 0.7, + "learning_rate": 9.281282921690272e-08, + "logits/chosen": -2.7515249252319336, + "logits/rejected": -2.770698308944702, + "logps/chosen": -187.07476806640625, + "logps/rejected": -202.35443115234375, + "loss": 0.6322, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01254485547542572, + "rewards/margins": 0.9121779203414917, + "rewards/rejected": -0.8996330499649048, + "step": 6031 + }, + { + "epoch": 0.7, + "learning_rate": 9.277771274727846e-08, + "logits/chosen": -3.8820619583129883, + "logits/rejected": -3.919194221496582, + "logps/chosen": -151.1553955078125, + "logps/rejected": -144.10935974121094, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7183789610862732, + "rewards/margins": 1.645146369934082, + "rewards/rejected": -0.9267674088478088, + "step": 6032 + }, + { + "epoch": 0.7, + "learning_rate": 9.274259627765423e-08, + "logits/chosen": -3.0267701148986816, + "logits/rejected": -2.7622506618499756, + "logps/chosen": -343.67755126953125, + "logps/rejected": -242.3269805908203, + "loss": 0.169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05399667099118233, + "rewards/margins": 2.347836494445801, + "rewards/rejected": -2.4018332958221436, + "step": 6033 + }, + { + "epoch": 0.7, + "learning_rate": 9.270747980802997e-08, + "logits/chosen": -2.440619468688965, + "logits/rejected": -2.7685794830322266, + "logps/chosen": -203.4610595703125, + "logps/rejected": -275.51824951171875, + "loss": 0.1753, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26984837651252747, + "rewards/margins": 2.978186845779419, + "rewards/rejected": -2.708338737487793, + "step": 6034 + }, + { + "epoch": 0.7, + "learning_rate": 9.267236333840571e-08, + "logits/chosen": -3.2240753173828125, + "logits/rejected": -3.0250988006591797, + "logps/chosen": -224.53692626953125, + "logps/rejected": -279.63531494140625, + "loss": 0.2316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5075360536575317, + "rewards/margins": 2.391237258911133, + "rewards/rejected": -1.8837010860443115, + "step": 6035 + }, + { + "epoch": 0.7, + "learning_rate": 9.263724686878145e-08, + "logits/chosen": -3.7428970336914062, + "logits/rejected": -3.705693483352661, + "logps/chosen": -223.6096954345703, + "logps/rejected": -271.5885009765625, + "loss": 0.6361, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6892812252044678, + "rewards/margins": 1.140865445137024, + "rewards/rejected": -1.8301466703414917, + "step": 6036 + }, + { + "epoch": 0.7, + "learning_rate": 9.26021303991572e-08, + "logits/chosen": -3.426088571548462, + "logits/rejected": -3.5995864868164062, + "logps/chosen": -239.39895629882812, + "logps/rejected": -218.89151000976562, + "loss": 0.1863, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4651263952255249, + "rewards/margins": 2.5338783264160156, + "rewards/rejected": -2.0687522888183594, + "step": 6037 + }, + { + "epoch": 0.7, + "learning_rate": 9.256701392953296e-08, + "logits/chosen": -2.875925064086914, + "logits/rejected": -3.063441276550293, + "logps/chosen": -449.74029541015625, + "logps/rejected": -288.1749572753906, + "loss": 0.743, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09408295154571533, + "rewards/margins": 1.5425708293914795, + "rewards/rejected": -1.4484879970550537, + "step": 6038 + }, + { + "epoch": 0.7, + "learning_rate": 9.25318974599087e-08, + "logits/chosen": -2.52923321723938, + "logits/rejected": -2.668139696121216, + "logps/chosen": -254.533447265625, + "logps/rejected": -269.9871520996094, + "loss": 0.4486, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17552728950977325, + "rewards/margins": 2.4032933712005615, + "rewards/rejected": -2.5788207054138184, + "step": 6039 + }, + { + "epoch": 0.7, + "learning_rate": 9.249678099028444e-08, + "logits/chosen": -2.237473487854004, + "logits/rejected": -2.6885571479797363, + "logps/chosen": -649.3207397460938, + "logps/rejected": -366.9460144042969, + "loss": 0.1993, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1478407084941864, + "rewards/margins": 2.377479076385498, + "rewards/rejected": -2.2296383380889893, + "step": 6040 + }, + { + "epoch": 0.7, + "learning_rate": 9.246166452066019e-08, + "logits/chosen": -2.454895496368408, + "logits/rejected": -2.4586822986602783, + "logps/chosen": -404.7857666015625, + "logps/rejected": -488.2228698730469, + "loss": 0.1728, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38262176513671875, + "rewards/margins": 3.266193389892578, + "rewards/rejected": -2.8835718631744385, + "step": 6041 + }, + { + "epoch": 0.7, + "learning_rate": 9.242654805103593e-08, + "logits/chosen": -3.1079211235046387, + "logits/rejected": -3.2922182083129883, + "logps/chosen": -280.726806640625, + "logps/rejected": -317.96234130859375, + "loss": 0.0584, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3380165100097656, + "rewards/margins": 3.5078577995300293, + "rewards/rejected": -3.1698412895202637, + "step": 6042 + }, + { + "epoch": 0.7, + "learning_rate": 9.239143158141167e-08, + "logits/chosen": -3.528200626373291, + "logits/rejected": -3.416769504547119, + "logps/chosen": -217.13360595703125, + "logps/rejected": -288.1899719238281, + "loss": 0.2145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19072790443897247, + "rewards/margins": 2.006965160369873, + "rewards/rejected": -2.197693109512329, + "step": 6043 + }, + { + "epoch": 0.7, + "learning_rate": 9.235631511178743e-08, + "logits/chosen": -2.812445640563965, + "logits/rejected": -2.6292426586151123, + "logps/chosen": -414.078369140625, + "logps/rejected": -199.8181610107422, + "loss": 0.2985, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10433898866176605, + "rewards/margins": 1.5319812297821045, + "rewards/rejected": -1.6363203525543213, + "step": 6044 + }, + { + "epoch": 0.7, + "learning_rate": 9.232119864216318e-08, + "logits/chosen": -3.715461015701294, + "logits/rejected": -3.743572235107422, + "logps/chosen": -143.80255126953125, + "logps/rejected": -159.6412811279297, + "loss": 0.5846, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.314277708530426, + "rewards/margins": 0.7908638715744019, + "rewards/rejected": -1.1051416397094727, + "step": 6045 + }, + { + "epoch": 0.7, + "learning_rate": 9.228608217253892e-08, + "logits/chosen": -2.667994499206543, + "logits/rejected": -2.778740882873535, + "logps/chosen": -418.58172607421875, + "logps/rejected": -291.07830810546875, + "loss": 0.3009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04889379441738129, + "rewards/margins": 1.8454307317733765, + "rewards/rejected": -1.7965368032455444, + "step": 6046 + }, + { + "epoch": 0.7, + "learning_rate": 9.225096570291466e-08, + "logits/chosen": -3.1289799213409424, + "logits/rejected": -3.312389850616455, + "logps/chosen": -448.3828125, + "logps/rejected": -415.6953125, + "loss": 0.1706, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3021773099899292, + "rewards/margins": 4.476931571960449, + "rewards/rejected": -4.1747541427612305, + "step": 6047 + }, + { + "epoch": 0.7, + "learning_rate": 9.22158492332904e-08, + "logits/chosen": -2.8957228660583496, + "logits/rejected": -2.8582935333251953, + "logps/chosen": -239.32534790039062, + "logps/rejected": -188.1544952392578, + "loss": 0.5161, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14111265540122986, + "rewards/margins": 0.7452707886695862, + "rewards/rejected": -0.8863834142684937, + "step": 6048 + }, + { + "epoch": 0.7, + "learning_rate": 9.218073276366617e-08, + "logits/chosen": -3.3925716876983643, + "logits/rejected": -3.2689249515533447, + "logps/chosen": -142.191162109375, + "logps/rejected": -148.77618408203125, + "loss": 0.5908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3154623508453369, + "rewards/margins": 1.7119698524475098, + "rewards/rejected": -2.0274322032928467, + "step": 6049 + }, + { + "epoch": 0.7, + "learning_rate": 9.214561629404191e-08, + "logits/chosen": -3.174285888671875, + "logits/rejected": -3.196530818939209, + "logps/chosen": -198.98968505859375, + "logps/rejected": -266.7240295410156, + "loss": 0.0691, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3481619358062744, + "rewards/margins": 3.261443614959717, + "rewards/rejected": -2.9132819175720215, + "step": 6050 + }, + { + "epoch": 0.7, + "learning_rate": 9.211049982441765e-08, + "logits/chosen": -2.906209945678711, + "logits/rejected": -2.814199209213257, + "logps/chosen": -437.9550476074219, + "logps/rejected": -383.6017761230469, + "loss": 0.1689, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.000168532133102417, + "rewards/margins": 2.364654302597046, + "rewards/rejected": -2.364485740661621, + "step": 6051 + }, + { + "epoch": 0.7, + "learning_rate": 9.207538335479339e-08, + "logits/chosen": -2.0102009773254395, + "logits/rejected": -2.229428768157959, + "logps/chosen": -523.545166015625, + "logps/rejected": -334.9476013183594, + "loss": 0.1173, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5726315379142761, + "rewards/margins": 3.4219021797180176, + "rewards/rejected": -2.8492705821990967, + "step": 6052 + }, + { + "epoch": 0.7, + "learning_rate": 9.204026688516913e-08, + "logits/chosen": -3.15023136138916, + "logits/rejected": -2.8303704261779785, + "logps/chosen": -315.3748779296875, + "logps/rejected": -290.06939697265625, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26177504658699036, + "rewards/margins": 4.159036159515381, + "rewards/rejected": -3.897260904312134, + "step": 6053 + }, + { + "epoch": 0.7, + "learning_rate": 9.200515041554488e-08, + "logits/chosen": -3.0002810955047607, + "logits/rejected": -2.9708199501037598, + "logps/chosen": -136.59017944335938, + "logps/rejected": -192.55068969726562, + "loss": 0.3525, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26622360944747925, + "rewards/margins": 2.1436069011688232, + "rewards/rejected": -2.4098305702209473, + "step": 6054 + }, + { + "epoch": 0.7, + "learning_rate": 9.197003394592064e-08, + "logits/chosen": -2.811819553375244, + "logits/rejected": -2.999253749847412, + "logps/chosen": -246.32791137695312, + "logps/rejected": -222.65985107421875, + "loss": 0.3041, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22088705003261566, + "rewards/margins": 1.7704548835754395, + "rewards/rejected": -1.549567699432373, + "step": 6055 + }, + { + "epoch": 0.7, + "learning_rate": 9.193491747629638e-08, + "logits/chosen": -3.4330711364746094, + "logits/rejected": -3.3735861778259277, + "logps/chosen": -492.11517333984375, + "logps/rejected": -312.5006103515625, + "loss": 0.8913, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19686943292617798, + "rewards/margins": 1.9728426933288574, + "rewards/rejected": -2.1697120666503906, + "step": 6056 + }, + { + "epoch": 0.7, + "learning_rate": 9.189980100667212e-08, + "logits/chosen": -3.257622241973877, + "logits/rejected": -3.226264476776123, + "logps/chosen": -218.746337890625, + "logps/rejected": -230.16815185546875, + "loss": 0.2996, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0930292159318924, + "rewards/margins": 1.5349969863891602, + "rewards/rejected": -1.6280262470245361, + "step": 6057 + }, + { + "epoch": 0.7, + "learning_rate": 9.186468453704787e-08, + "logits/chosen": -2.836535930633545, + "logits/rejected": -2.8195290565490723, + "logps/chosen": -477.6898193359375, + "logps/rejected": -306.9068603515625, + "loss": 0.4027, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15680019557476044, + "rewards/margins": 1.6322978734970093, + "rewards/rejected": -1.4754977226257324, + "step": 6058 + }, + { + "epoch": 0.7, + "learning_rate": 9.182956806742361e-08, + "logits/chosen": -2.360788583755493, + "logits/rejected": -2.4111762046813965, + "logps/chosen": -238.1256103515625, + "logps/rejected": -242.71107482910156, + "loss": 0.7039, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5982565879821777, + "rewards/margins": 1.0684376955032349, + "rewards/rejected": -1.6666942834854126, + "step": 6059 + }, + { + "epoch": 0.7, + "learning_rate": 9.179445159779936e-08, + "logits/chosen": -3.192493438720703, + "logits/rejected": -3.2156810760498047, + "logps/chosen": -440.06573486328125, + "logps/rejected": -320.5229797363281, + "loss": 0.3079, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04629439115524292, + "rewards/margins": 1.4159642457962036, + "rewards/rejected": -1.4622585773468018, + "step": 6060 + }, + { + "epoch": 0.7, + "learning_rate": 9.175933512817511e-08, + "logits/chosen": -3.0336451530456543, + "logits/rejected": -3.309845209121704, + "logps/chosen": -248.6119384765625, + "logps/rejected": -323.28802490234375, + "loss": 0.2537, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6368128061294556, + "rewards/margins": 2.2356953620910645, + "rewards/rejected": -2.8725080490112305, + "step": 6061 + }, + { + "epoch": 0.7, + "learning_rate": 9.172421865855086e-08, + "logits/chosen": -3.4810075759887695, + "logits/rejected": -3.614983558654785, + "logps/chosen": -301.15960693359375, + "logps/rejected": -296.7435302734375, + "loss": 0.3215, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03429059684276581, + "rewards/margins": 2.6086015701293945, + "rewards/rejected": -2.6428921222686768, + "step": 6062 + }, + { + "epoch": 0.7, + "learning_rate": 9.16891021889266e-08, + "logits/chosen": -3.002271890640259, + "logits/rejected": -3.244307041168213, + "logps/chosen": -350.5118713378906, + "logps/rejected": -315.4681091308594, + "loss": 0.4497, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02590184658765793, + "rewards/margins": 1.4249475002288818, + "rewards/rejected": -1.4508495330810547, + "step": 6063 + }, + { + "epoch": 0.7, + "learning_rate": 9.165398571930234e-08, + "logits/chosen": -2.8498380184173584, + "logits/rejected": -2.5940990447998047, + "logps/chosen": -258.8536376953125, + "logps/rejected": -524.382080078125, + "loss": 0.0899, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41133415699005127, + "rewards/margins": 3.3640987873077393, + "rewards/rejected": -2.9527647495269775, + "step": 6064 + }, + { + "epoch": 0.7, + "learning_rate": 9.161886924967808e-08, + "logits/chosen": -2.536240339279175, + "logits/rejected": -2.5002944469451904, + "logps/chosen": -170.204345703125, + "logps/rejected": -216.60829162597656, + "loss": 0.3338, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0811450332403183, + "rewards/margins": 1.2604748010635376, + "rewards/rejected": -1.3416199684143066, + "step": 6065 + }, + { + "epoch": 0.7, + "learning_rate": 9.158375278005385e-08, + "logits/chosen": -3.03999400138855, + "logits/rejected": -3.0646538734436035, + "logps/chosen": -271.26519775390625, + "logps/rejected": -309.0666198730469, + "loss": 0.37, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2324630618095398, + "rewards/margins": 1.3562512397766113, + "rewards/rejected": -1.5887142419815063, + "step": 6066 + }, + { + "epoch": 0.7, + "learning_rate": 9.154863631042959e-08, + "logits/chosen": -2.418137788772583, + "logits/rejected": -2.694409132003784, + "logps/chosen": -330.63702392578125, + "logps/rejected": -254.02572631835938, + "loss": 0.3293, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.019152171909809113, + "rewards/margins": 1.9603618383407593, + "rewards/rejected": -1.9412097930908203, + "step": 6067 + }, + { + "epoch": 0.7, + "learning_rate": 9.151351984080533e-08, + "logits/chosen": -3.2673165798187256, + "logits/rejected": -3.0121593475341797, + "logps/chosen": -356.43536376953125, + "logps/rejected": -230.07467651367188, + "loss": 0.2976, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21578852832317352, + "rewards/margins": 2.2881555557250977, + "rewards/rejected": -2.503943920135498, + "step": 6068 + }, + { + "epoch": 0.7, + "learning_rate": 9.147840337118107e-08, + "logits/chosen": -3.1592843532562256, + "logits/rejected": -2.9387900829315186, + "logps/chosen": -302.62225341796875, + "logps/rejected": -250.63424682617188, + "loss": 0.7148, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47849375009536743, + "rewards/margins": 1.190877914428711, + "rewards/rejected": -1.6693716049194336, + "step": 6069 + }, + { + "epoch": 0.7, + "learning_rate": 9.144328690155683e-08, + "logits/chosen": -2.6925301551818848, + "logits/rejected": -2.781449317932129, + "logps/chosen": -105.93038940429688, + "logps/rejected": -203.29666137695312, + "loss": 0.5998, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4699668288230896, + "rewards/margins": 1.1532864570617676, + "rewards/rejected": -1.6232531070709229, + "step": 6070 + }, + { + "epoch": 0.7, + "learning_rate": 9.140817043193257e-08, + "logits/chosen": -2.662412405014038, + "logits/rejected": -2.913966417312622, + "logps/chosen": -347.3652038574219, + "logps/rejected": -334.6928405761719, + "loss": 0.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2717040777206421, + "rewards/margins": 3.7210068702697754, + "rewards/rejected": -3.449303150177002, + "step": 6071 + }, + { + "epoch": 0.7, + "learning_rate": 9.137305396230832e-08, + "logits/chosen": -3.0360212326049805, + "logits/rejected": -2.959798812866211, + "logps/chosen": -173.12088012695312, + "logps/rejected": -232.25379943847656, + "loss": 0.3678, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19053027033805847, + "rewards/margins": 1.7906609773635864, + "rewards/rejected": -1.6001307964324951, + "step": 6072 + }, + { + "epoch": 0.7, + "learning_rate": 9.133793749268406e-08, + "logits/chosen": -3.1001136302948, + "logits/rejected": -2.922250270843506, + "logps/chosen": -106.47547912597656, + "logps/rejected": -194.431884765625, + "loss": 0.3556, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3614736795425415, + "rewards/margins": 1.6043440103530884, + "rewards/rejected": -1.2428703308105469, + "step": 6073 + }, + { + "epoch": 0.7, + "learning_rate": 9.130282102305982e-08, + "logits/chosen": -3.559642791748047, + "logits/rejected": -3.758878231048584, + "logps/chosen": -331.7237854003906, + "logps/rejected": -278.133544921875, + "loss": 0.1923, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4249231219291687, + "rewards/margins": 2.4996120929718018, + "rewards/rejected": -2.0746891498565674, + "step": 6074 + }, + { + "epoch": 0.7, + "learning_rate": 9.126770455343556e-08, + "logits/chosen": -2.680734634399414, + "logits/rejected": -2.8921382427215576, + "logps/chosen": -414.3327941894531, + "logps/rejected": -391.3150634765625, + "loss": 0.354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03231450915336609, + "rewards/margins": 2.0791215896606445, + "rewards/rejected": -2.111435890197754, + "step": 6075 + }, + { + "epoch": 0.7, + "learning_rate": 9.12325880838113e-08, + "logits/chosen": -2.6257877349853516, + "logits/rejected": -2.8078393936157227, + "logps/chosen": -281.26312255859375, + "logps/rejected": -294.7520751953125, + "loss": 0.367, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10890139639377594, + "rewards/margins": 1.608031153678894, + "rewards/rejected": -1.4991297721862793, + "step": 6076 + }, + { + "epoch": 0.7, + "learning_rate": 9.119747161418704e-08, + "logits/chosen": -2.4372124671936035, + "logits/rejected": -2.6279287338256836, + "logps/chosen": -238.24945068359375, + "logps/rejected": -251.92041015625, + "loss": 0.2267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18097522854804993, + "rewards/margins": 2.335618734359741, + "rewards/rejected": -2.1546435356140137, + "step": 6077 + }, + { + "epoch": 0.7, + "learning_rate": 9.11623551445628e-08, + "logits/chosen": -3.894831657409668, + "logits/rejected": -3.7625513076782227, + "logps/chosen": -311.07421875, + "logps/rejected": -251.59275817871094, + "loss": 0.163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18871673941612244, + "rewards/margins": 2.9271490573883057, + "rewards/rejected": -2.7384324073791504, + "step": 6078 + }, + { + "epoch": 0.7, + "learning_rate": 9.112723867493855e-08, + "logits/chosen": -3.4402084350585938, + "logits/rejected": -3.331296682357788, + "logps/chosen": -435.89031982421875, + "logps/rejected": -538.884033203125, + "loss": 0.8706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12662279605865479, + "rewards/margins": 1.9657783508300781, + "rewards/rejected": -2.0924010276794434, + "step": 6079 + }, + { + "epoch": 0.7, + "learning_rate": 9.109212220531429e-08, + "logits/chosen": -2.9836983680725098, + "logits/rejected": -2.8109748363494873, + "logps/chosen": -320.979736328125, + "logps/rejected": -370.1586608886719, + "loss": 0.4485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4274188280105591, + "rewards/margins": 1.5109072923660278, + "rewards/rejected": -1.938326120376587, + "step": 6080 + }, + { + "epoch": 0.7, + "learning_rate": 9.105700573569003e-08, + "logits/chosen": -2.84674072265625, + "logits/rejected": -3.2173678874969482, + "logps/chosen": -213.95053100585938, + "logps/rejected": -224.9061279296875, + "loss": 0.4211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8070627450942993, + "rewards/margins": 1.4396345615386963, + "rewards/rejected": -2.246697425842285, + "step": 6081 + }, + { + "epoch": 0.7, + "learning_rate": 9.10218892660658e-08, + "logits/chosen": -2.9770922660827637, + "logits/rejected": -3.2050983905792236, + "logps/chosen": -288.4244384765625, + "logps/rejected": -356.51092529296875, + "loss": 0.5225, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3454131484031677, + "rewards/margins": 1.7057697772979736, + "rewards/rejected": -2.051182985305786, + "step": 6082 + }, + { + "epoch": 0.7, + "learning_rate": 9.098677279644153e-08, + "logits/chosen": -2.998551368713379, + "logits/rejected": -3.08260440826416, + "logps/chosen": -398.1041259765625, + "logps/rejected": -208.4000701904297, + "loss": 0.4757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3431411385536194, + "rewards/margins": 0.9253236651420593, + "rewards/rejected": -1.2684648036956787, + "step": 6083 + }, + { + "epoch": 0.7, + "learning_rate": 9.095165632681728e-08, + "logits/chosen": -3.5488781929016113, + "logits/rejected": -3.564073085784912, + "logps/chosen": -314.515869140625, + "logps/rejected": -269.0707702636719, + "loss": 0.1887, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31776294112205505, + "rewards/margins": 3.636610984802246, + "rewards/rejected": -3.318848133087158, + "step": 6084 + }, + { + "epoch": 0.7, + "learning_rate": 9.091653985719302e-08, + "logits/chosen": -3.2480010986328125, + "logits/rejected": -3.6437478065490723, + "logps/chosen": -228.5526885986328, + "logps/rejected": -302.8901062011719, + "loss": 0.2915, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.004967305809259415, + "rewards/margins": 2.253704786300659, + "rewards/rejected": -2.2586722373962402, + "step": 6085 + }, + { + "epoch": 0.7, + "learning_rate": 9.088142338756877e-08, + "logits/chosen": -3.7899441719055176, + "logits/rejected": -3.668755531311035, + "logps/chosen": -149.9021453857422, + "logps/rejected": -230.98294067382812, + "loss": 0.3714, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3661889433860779, + "rewards/margins": 1.84990656375885, + "rewards/rejected": -1.483717679977417, + "step": 6086 + }, + { + "epoch": 0.7, + "learning_rate": 9.084630691794451e-08, + "logits/chosen": -3.3977017402648926, + "logits/rejected": -3.2625198364257812, + "logps/chosen": -262.5534362792969, + "logps/rejected": -176.13699340820312, + "loss": 0.4267, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4516794979572296, + "rewards/margins": 1.8254063129425049, + "rewards/rejected": -2.277086019515991, + "step": 6087 + }, + { + "epoch": 0.7, + "learning_rate": 9.081119044832025e-08, + "logits/chosen": -2.60337495803833, + "logits/rejected": -2.8215227127075195, + "logps/chosen": -313.906494140625, + "logps/rejected": -337.74822998046875, + "loss": 0.135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2593235373497009, + "rewards/margins": 2.7231969833374023, + "rewards/rejected": -2.982520580291748, + "step": 6088 + }, + { + "epoch": 0.7, + "learning_rate": 9.0776073978696e-08, + "logits/chosen": -3.139949083328247, + "logits/rejected": -2.6737875938415527, + "logps/chosen": -291.984130859375, + "logps/rejected": -229.77215576171875, + "loss": 0.2828, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3035607635974884, + "rewards/margins": 2.116612434387207, + "rewards/rejected": -1.813051462173462, + "step": 6089 + }, + { + "epoch": 0.7, + "learning_rate": 9.074095750907176e-08, + "logits/chosen": -2.9340176582336426, + "logits/rejected": -2.796393871307373, + "logps/chosen": -282.9796142578125, + "logps/rejected": -174.66146850585938, + "loss": 0.232, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9376305341720581, + "rewards/margins": 2.3397605419158936, + "rewards/rejected": -1.4021300077438354, + "step": 6090 + }, + { + "epoch": 0.7, + "learning_rate": 9.07058410394475e-08, + "logits/chosen": -2.482470989227295, + "logits/rejected": -2.5537121295928955, + "logps/chosen": -347.8477478027344, + "logps/rejected": -305.0164794921875, + "loss": 0.4984, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04678768664598465, + "rewards/margins": 0.8753702640533447, + "rewards/rejected": -0.9221579432487488, + "step": 6091 + }, + { + "epoch": 0.7, + "learning_rate": 9.067072456982324e-08, + "logits/chosen": -3.0897862911224365, + "logits/rejected": -2.987215518951416, + "logps/chosen": -185.39601135253906, + "logps/rejected": -264.1693115234375, + "loss": 0.1485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3489857017993927, + "rewards/margins": 1.9779059886932373, + "rewards/rejected": -1.6289201974868774, + "step": 6092 + }, + { + "epoch": 0.7, + "learning_rate": 9.063560810019898e-08, + "logits/chosen": -3.1046433448791504, + "logits/rejected": -3.0982935428619385, + "logps/chosen": -248.7295684814453, + "logps/rejected": -208.10768127441406, + "loss": 0.2503, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17266197502613068, + "rewards/margins": 2.203451633453369, + "rewards/rejected": -2.030789852142334, + "step": 6093 + }, + { + "epoch": 0.7, + "learning_rate": 9.060049163057475e-08, + "logits/chosen": -2.681684970855713, + "logits/rejected": -2.47979474067688, + "logps/chosen": -153.69595336914062, + "logps/rejected": -297.994140625, + "loss": 0.3043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3144238591194153, + "rewards/margins": 1.8866074085235596, + "rewards/rejected": -2.20103120803833, + "step": 6094 + }, + { + "epoch": 0.7, + "learning_rate": 9.056537516095049e-08, + "logits/chosen": -3.5225517749786377, + "logits/rejected": -3.409306526184082, + "logps/chosen": -227.333740234375, + "logps/rejected": -201.8513946533203, + "loss": 0.4026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4450843334197998, + "rewards/margins": 1.2728307247161865, + "rewards/rejected": -1.7179150581359863, + "step": 6095 + }, + { + "epoch": 0.7, + "learning_rate": 9.053025869132623e-08, + "logits/chosen": -3.0217068195343018, + "logits/rejected": -2.8974878787994385, + "logps/chosen": -369.67828369140625, + "logps/rejected": -266.4863586425781, + "loss": 0.6088, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5019599795341492, + "rewards/margins": 2.0528693199157715, + "rewards/rejected": -2.5548291206359863, + "step": 6096 + }, + { + "epoch": 0.7, + "learning_rate": 9.049514222170197e-08, + "logits/chosen": -2.9865059852600098, + "logits/rejected": -3.0960254669189453, + "logps/chosen": -209.69374084472656, + "logps/rejected": -306.7557678222656, + "loss": 0.2512, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30875837802886963, + "rewards/margins": 1.9658623933792114, + "rewards/rejected": -2.274620771408081, + "step": 6097 + }, + { + "epoch": 0.7, + "learning_rate": 9.046002575207771e-08, + "logits/chosen": -2.866668939590454, + "logits/rejected": -3.0981502532958984, + "logps/chosen": -104.46270751953125, + "logps/rejected": -130.6959686279297, + "loss": 0.2893, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45351600646972656, + "rewards/margins": 1.7967004776000977, + "rewards/rejected": -1.343184471130371, + "step": 6098 + }, + { + "epoch": 0.7, + "learning_rate": 9.042490928245348e-08, + "logits/chosen": -2.996884822845459, + "logits/rejected": -3.12359619140625, + "logps/chosen": -371.37310791015625, + "logps/rejected": -319.3813171386719, + "loss": 0.6709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07408209145069122, + "rewards/margins": 0.9064451456069946, + "rewards/rejected": -0.8323631286621094, + "step": 6099 + }, + { + "epoch": 0.7, + "learning_rate": 9.038979281282922e-08, + "logits/chosen": -2.2885138988494873, + "logits/rejected": -2.1659529209136963, + "logps/chosen": -405.9176940917969, + "logps/rejected": -312.5859375, + "loss": 0.6116, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21587128937244415, + "rewards/margins": 1.4609031677246094, + "rewards/rejected": -1.2450318336486816, + "step": 6100 + }, + { + "epoch": 0.7, + "learning_rate": 9.035467634320496e-08, + "logits/chosen": -3.3562283515930176, + "logits/rejected": -3.496826410293579, + "logps/chosen": -114.05194854736328, + "logps/rejected": -212.5638427734375, + "loss": 0.3496, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04160606861114502, + "rewards/margins": 2.619828939437866, + "rewards/rejected": -2.5782227516174316, + "step": 6101 + }, + { + "epoch": 0.7, + "learning_rate": 9.03195598735807e-08, + "logits/chosen": -2.2846920490264893, + "logits/rejected": -2.4456565380096436, + "logps/chosen": -237.18800354003906, + "logps/rejected": -292.17706298828125, + "loss": 0.3169, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7717079520225525, + "rewards/margins": 1.784788727760315, + "rewards/rejected": -1.0130809545516968, + "step": 6102 + }, + { + "epoch": 0.7, + "learning_rate": 9.028444340395645e-08, + "logits/chosen": -3.280986785888672, + "logits/rejected": -3.1609320640563965, + "logps/chosen": -249.81687927246094, + "logps/rejected": -246.35687255859375, + "loss": 0.4159, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1320053339004517, + "rewards/margins": 1.181520700454712, + "rewards/rejected": -2.313525915145874, + "step": 6103 + }, + { + "epoch": 0.7, + "learning_rate": 9.02493269343322e-08, + "logits/chosen": -3.284933090209961, + "logits/rejected": -3.030592918395996, + "logps/chosen": -164.506103515625, + "logps/rejected": -236.7853546142578, + "loss": 0.6622, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5435166358947754, + "rewards/margins": 0.7093006372451782, + "rewards/rejected": -1.2528172731399536, + "step": 6104 + }, + { + "epoch": 0.7, + "learning_rate": 9.021421046470793e-08, + "logits/chosen": -3.058279037475586, + "logits/rejected": -2.8571083545684814, + "logps/chosen": -240.7840118408203, + "logps/rejected": -259.4144592285156, + "loss": 0.2783, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41717565059661865, + "rewards/margins": 1.896039605140686, + "rewards/rejected": -1.4788639545440674, + "step": 6105 + }, + { + "epoch": 0.7, + "learning_rate": 9.017909399508369e-08, + "logits/chosen": -3.3358988761901855, + "logits/rejected": -3.0481863021850586, + "logps/chosen": -160.75454711914062, + "logps/rejected": -254.9085235595703, + "loss": 0.6707, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39141929149627686, + "rewards/margins": 0.4302743077278137, + "rewards/rejected": -0.8216936588287354, + "step": 6106 + }, + { + "epoch": 0.7, + "learning_rate": 9.014397752545944e-08, + "logits/chosen": -2.9087164402008057, + "logits/rejected": -3.0266458988189697, + "logps/chosen": -302.02911376953125, + "logps/rejected": -325.68829345703125, + "loss": 0.6052, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3046901226043701, + "rewards/margins": 0.6246770620346069, + "rewards/rejected": -0.929367184638977, + "step": 6107 + }, + { + "epoch": 0.7, + "learning_rate": 9.010886105583518e-08, + "logits/chosen": -3.644380569458008, + "logits/rejected": -3.3995068073272705, + "logps/chosen": -371.10052490234375, + "logps/rejected": -199.57525634765625, + "loss": 0.3762, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4111597239971161, + "rewards/margins": 1.5755419731140137, + "rewards/rejected": -1.1643824577331543, + "step": 6108 + }, + { + "epoch": 0.7, + "learning_rate": 9.007374458621092e-08, + "logits/chosen": -2.690711498260498, + "logits/rejected": -2.697741746902466, + "logps/chosen": -398.8992919921875, + "logps/rejected": -377.29412841796875, + "loss": 0.3093, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3005514144897461, + "rewards/margins": 2.216306686401367, + "rewards/rejected": -1.915755033493042, + "step": 6109 + }, + { + "epoch": 0.7, + "learning_rate": 9.003862811658666e-08, + "logits/chosen": -3.6344897747039795, + "logits/rejected": -3.5506889820098877, + "logps/chosen": -255.87742614746094, + "logps/rejected": -295.5720520019531, + "loss": 0.487, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04624398052692413, + "rewards/margins": 1.099831461906433, + "rewards/rejected": -1.0535873174667358, + "step": 6110 + }, + { + "epoch": 0.7, + "learning_rate": 9.000351164696243e-08, + "logits/chosen": -2.8294291496276855, + "logits/rejected": -3.3157291412353516, + "logps/chosen": -252.84255981445312, + "logps/rejected": -184.09959411621094, + "loss": 0.3571, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3211233913898468, + "rewards/margins": 1.4139246940612793, + "rewards/rejected": -1.7350480556488037, + "step": 6111 + }, + { + "epoch": 0.7, + "learning_rate": 8.996839517733817e-08, + "logits/chosen": -3.0712387561798096, + "logits/rejected": -2.8537580966949463, + "logps/chosen": -310.1214904785156, + "logps/rejected": -265.24652099609375, + "loss": 0.5207, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20905832946300507, + "rewards/margins": 1.4217853546142578, + "rewards/rejected": -1.6308436393737793, + "step": 6112 + }, + { + "epoch": 0.7, + "learning_rate": 8.993327870771391e-08, + "logits/chosen": -2.9889256954193115, + "logits/rejected": -3.0674004554748535, + "logps/chosen": -374.1104431152344, + "logps/rejected": -400.1053466796875, + "loss": 0.3966, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31699153780937195, + "rewards/margins": 1.2058945894241333, + "rewards/rejected": -0.8889029622077942, + "step": 6113 + }, + { + "epoch": 0.7, + "learning_rate": 8.989816223808965e-08, + "logits/chosen": -2.2791781425476074, + "logits/rejected": -2.2096753120422363, + "logps/chosen": -345.59149169921875, + "logps/rejected": -301.6259765625, + "loss": 0.3407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10151234269142151, + "rewards/margins": 2.1730175018310547, + "rewards/rejected": -2.2745299339294434, + "step": 6114 + }, + { + "epoch": 0.7, + "learning_rate": 8.986304576846541e-08, + "logits/chosen": -2.6969971656799316, + "logits/rejected": -3.0195586681365967, + "logps/chosen": -463.4600830078125, + "logps/rejected": -379.70391845703125, + "loss": 0.1347, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5338241457939148, + "rewards/margins": 3.588310480117798, + "rewards/rejected": -3.0544862747192383, + "step": 6115 + }, + { + "epoch": 0.71, + "learning_rate": 8.982792929884116e-08, + "logits/chosen": -3.080277681350708, + "logits/rejected": -3.210265874862671, + "logps/chosen": -147.17086791992188, + "logps/rejected": -365.3971862792969, + "loss": 0.6134, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26703941822052, + "rewards/margins": 1.7488662004470825, + "rewards/rejected": -1.4818267822265625, + "step": 6116 + }, + { + "epoch": 0.71, + "learning_rate": 8.97928128292169e-08, + "logits/chosen": -2.134443759918213, + "logits/rejected": -2.0830495357513428, + "logps/chosen": -232.095458984375, + "logps/rejected": -318.3380126953125, + "loss": 0.2995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18115587532520294, + "rewards/margins": 1.5603512525558472, + "rewards/rejected": -1.7415071725845337, + "step": 6117 + }, + { + "epoch": 0.71, + "learning_rate": 8.975769635959264e-08, + "logits/chosen": -3.3733468055725098, + "logits/rejected": -3.2877206802368164, + "logps/chosen": -128.2617645263672, + "logps/rejected": -211.00587463378906, + "loss": 0.6468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27508026361465454, + "rewards/margins": 0.8277932405471802, + "rewards/rejected": -1.1028735637664795, + "step": 6118 + }, + { + "epoch": 0.71, + "learning_rate": 8.97225798899684e-08, + "logits/chosen": -2.5232133865356445, + "logits/rejected": -2.5417237281799316, + "logps/chosen": -419.806884765625, + "logps/rejected": -297.09686279296875, + "loss": 0.44, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4827737808227539, + "rewards/margins": 1.9735772609710693, + "rewards/rejected": -1.4908034801483154, + "step": 6119 + }, + { + "epoch": 0.71, + "learning_rate": 8.968746342034414e-08, + "logits/chosen": -3.286860942840576, + "logits/rejected": -3.2591984272003174, + "logps/chosen": -239.66928100585938, + "logps/rejected": -289.18499755859375, + "loss": 0.4889, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2852742373943329, + "rewards/margins": 2.460202217102051, + "rewards/rejected": -2.745476484298706, + "step": 6120 + }, + { + "epoch": 0.71, + "learning_rate": 8.965234695071988e-08, + "logits/chosen": -2.5937063694000244, + "logits/rejected": -2.5896732807159424, + "logps/chosen": -244.20794677734375, + "logps/rejected": -212.518798828125, + "loss": 0.2006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7949826717376709, + "rewards/margins": 2.0723118782043457, + "rewards/rejected": -1.2773292064666748, + "step": 6121 + }, + { + "epoch": 0.71, + "learning_rate": 8.961723048109562e-08, + "logits/chosen": -2.773634195327759, + "logits/rejected": -2.57669734954834, + "logps/chosen": -459.1122131347656, + "logps/rejected": -309.68609619140625, + "loss": 0.2538, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8535553216934204, + "rewards/margins": 2.4536514282226562, + "rewards/rejected": -1.6000959873199463, + "step": 6122 + }, + { + "epoch": 0.71, + "learning_rate": 8.958211401147138e-08, + "logits/chosen": -2.6778433322906494, + "logits/rejected": -2.903172016143799, + "logps/chosen": -240.40826416015625, + "logps/rejected": -169.02305603027344, + "loss": 0.4505, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2002800554037094, + "rewards/margins": 1.076744556427002, + "rewards/rejected": -1.277024507522583, + "step": 6123 + }, + { + "epoch": 0.71, + "learning_rate": 8.954699754184713e-08, + "logits/chosen": -2.9962313175201416, + "logits/rejected": -3.462285041809082, + "logps/chosen": -188.261962890625, + "logps/rejected": -393.4789733886719, + "loss": 0.4389, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011936947703361511, + "rewards/margins": 1.7323399782180786, + "rewards/rejected": -1.7442768812179565, + "step": 6124 + }, + { + "epoch": 0.71, + "learning_rate": 8.951188107222287e-08, + "logits/chosen": -3.1101598739624023, + "logits/rejected": -2.8698458671569824, + "logps/chosen": -201.03079223632812, + "logps/rejected": -229.35275268554688, + "loss": 0.4257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5676238536834717, + "rewards/margins": 1.1767336130142212, + "rewards/rejected": -1.7443575859069824, + "step": 6125 + }, + { + "epoch": 0.71, + "learning_rate": 8.94767646025986e-08, + "logits/chosen": -2.6355433464050293, + "logits/rejected": -2.564321517944336, + "logps/chosen": -308.01080322265625, + "logps/rejected": -357.89263916015625, + "loss": 0.4064, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2250383496284485, + "rewards/margins": 1.7554481029510498, + "rewards/rejected": -1.9804866313934326, + "step": 6126 + }, + { + "epoch": 0.71, + "learning_rate": 8.944164813297437e-08, + "logits/chosen": -2.6624813079833984, + "logits/rejected": -2.73696231842041, + "logps/chosen": -335.05767822265625, + "logps/rejected": -298.4790954589844, + "loss": 0.3402, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28701743483543396, + "rewards/margins": 1.7627724409103394, + "rewards/rejected": -1.475754976272583, + "step": 6127 + }, + { + "epoch": 0.71, + "learning_rate": 8.940653166335011e-08, + "logits/chosen": -3.1313982009887695, + "logits/rejected": -3.1614720821380615, + "logps/chosen": -204.92051696777344, + "logps/rejected": -254.1090850830078, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3639419674873352, + "rewards/margins": 0.6081781387329102, + "rewards/rejected": -0.9721200466156006, + "step": 6128 + }, + { + "epoch": 0.71, + "learning_rate": 8.937141519372585e-08, + "logits/chosen": -3.8979644775390625, + "logits/rejected": -3.6847269535064697, + "logps/chosen": -150.14810180664062, + "logps/rejected": -215.3777618408203, + "loss": 0.3198, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19504497945308685, + "rewards/margins": 2.20904278755188, + "rewards/rejected": -2.404087543487549, + "step": 6129 + }, + { + "epoch": 0.71, + "learning_rate": 8.93362987241016e-08, + "logits/chosen": -3.201587677001953, + "logits/rejected": -3.0247626304626465, + "logps/chosen": -218.52076721191406, + "logps/rejected": -232.52304077148438, + "loss": 0.216, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.252737820148468, + "rewards/margins": 2.2079148292541504, + "rewards/rejected": -1.9551771879196167, + "step": 6130 + }, + { + "epoch": 0.71, + "learning_rate": 8.930118225447735e-08, + "logits/chosen": -3.1223573684692383, + "logits/rejected": -2.919534683227539, + "logps/chosen": -295.696044921875, + "logps/rejected": -327.59649658203125, + "loss": 0.35, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22886496782302856, + "rewards/margins": 1.7905467748641968, + "rewards/rejected": -1.5616817474365234, + "step": 6131 + }, + { + "epoch": 0.71, + "learning_rate": 8.926606578485309e-08, + "logits/chosen": -3.1395223140716553, + "logits/rejected": -3.190736770629883, + "logps/chosen": -326.79840087890625, + "logps/rejected": -292.4430236816406, + "loss": 0.4236, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0670311450958252, + "rewards/margins": 1.5083199739456177, + "rewards/rejected": -1.5753509998321533, + "step": 6132 + }, + { + "epoch": 0.71, + "learning_rate": 8.923094931522884e-08, + "logits/chosen": -2.655749559402466, + "logits/rejected": -2.873871088027954, + "logps/chosen": -337.5138244628906, + "logps/rejected": -431.7476806640625, + "loss": 0.4331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21541815996170044, + "rewards/margins": 2.878141164779663, + "rewards/rejected": -3.093559503555298, + "step": 6133 + }, + { + "epoch": 0.71, + "learning_rate": 8.919583284560458e-08, + "logits/chosen": -2.981325149536133, + "logits/rejected": -2.8795838356018066, + "logps/chosen": -158.6732940673828, + "logps/rejected": -234.12741088867188, + "loss": 0.533, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003499850630760193, + "rewards/margins": 1.4250478744506836, + "rewards/rejected": -1.4215480089187622, + "step": 6134 + }, + { + "epoch": 0.71, + "learning_rate": 8.916071637598034e-08, + "logits/chosen": -3.0649571418762207, + "logits/rejected": -3.1563284397125244, + "logps/chosen": -283.21856689453125, + "logps/rejected": -261.4825439453125, + "loss": 0.1963, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4392419755458832, + "rewards/margins": 3.4560437202453613, + "rewards/rejected": -3.0168018341064453, + "step": 6135 + }, + { + "epoch": 0.71, + "learning_rate": 8.912559990635608e-08, + "logits/chosen": -2.7408676147460938, + "logits/rejected": -2.8689637184143066, + "logps/chosen": -133.58380126953125, + "logps/rejected": -320.807861328125, + "loss": 0.3892, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.054868340492248535, + "rewards/margins": 1.4887454509735107, + "rewards/rejected": -1.4338772296905518, + "step": 6136 + }, + { + "epoch": 0.71, + "learning_rate": 8.909048343673182e-08, + "logits/chosen": -2.889843463897705, + "logits/rejected": -2.9238405227661133, + "logps/chosen": -379.4001159667969, + "logps/rejected": -417.7017517089844, + "loss": 0.3091, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08037938177585602, + "rewards/margins": 1.4220572710037231, + "rewards/rejected": -1.5024367570877075, + "step": 6137 + }, + { + "epoch": 0.71, + "learning_rate": 8.905536696710756e-08, + "logits/chosen": -2.607058048248291, + "logits/rejected": -2.584132194519043, + "logps/chosen": -264.89532470703125, + "logps/rejected": -255.6737518310547, + "loss": 0.7552, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.275716632604599, + "rewards/margins": 0.8577706813812256, + "rewards/rejected": -1.1334872245788574, + "step": 6138 + }, + { + "epoch": 0.71, + "learning_rate": 8.902025049748333e-08, + "logits/chosen": -2.8230292797088623, + "logits/rejected": -2.7770655155181885, + "logps/chosen": -413.58306884765625, + "logps/rejected": -348.7840576171875, + "loss": 0.1465, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17325459420681, + "rewards/margins": 3.4127581119537354, + "rewards/rejected": -3.2395036220550537, + "step": 6139 + }, + { + "epoch": 0.71, + "learning_rate": 8.898513402785907e-08, + "logits/chosen": -3.1502623558044434, + "logits/rejected": -3.4130406379699707, + "logps/chosen": -227.9558868408203, + "logps/rejected": -237.7585906982422, + "loss": 0.2468, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32965323328971863, + "rewards/margins": 1.9935916662216187, + "rewards/rejected": -1.6639385223388672, + "step": 6140 + }, + { + "epoch": 0.71, + "learning_rate": 8.895001755823481e-08, + "logits/chosen": -3.0306992530822754, + "logits/rejected": -3.3390250205993652, + "logps/chosen": -302.54498291015625, + "logps/rejected": -367.52447509765625, + "loss": 0.1801, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0363527312874794, + "rewards/margins": 2.8550732135772705, + "rewards/rejected": -2.8914263248443604, + "step": 6141 + }, + { + "epoch": 0.71, + "learning_rate": 8.891490108861055e-08, + "logits/chosen": -2.9788901805877686, + "logits/rejected": -2.905273199081421, + "logps/chosen": -196.69944763183594, + "logps/rejected": -218.47891235351562, + "loss": 0.4705, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17515115439891815, + "rewards/margins": 1.062846302986145, + "rewards/rejected": -0.8876950740814209, + "step": 6142 + }, + { + "epoch": 0.71, + "learning_rate": 8.88797846189863e-08, + "logits/chosen": -2.2676568031311035, + "logits/rejected": -2.453977108001709, + "logps/chosen": -240.12477111816406, + "logps/rejected": -150.34104919433594, + "loss": 0.9122, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6776915788650513, + "rewards/margins": 0.30193406343460083, + "rewards/rejected": -0.9796257019042969, + "step": 6143 + }, + { + "epoch": 0.71, + "learning_rate": 8.884466814936206e-08, + "logits/chosen": -3.252197742462158, + "logits/rejected": -3.0953197479248047, + "logps/chosen": -277.00811767578125, + "logps/rejected": -140.44696044921875, + "loss": 0.4484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5606539845466614, + "rewards/margins": 0.7531798481941223, + "rewards/rejected": -1.3138338327407837, + "step": 6144 + }, + { + "epoch": 0.71, + "learning_rate": 8.88095516797378e-08, + "logits/chosen": -3.2424378395080566, + "logits/rejected": -3.0616142749786377, + "logps/chosen": -223.42874145507812, + "logps/rejected": -232.1288604736328, + "loss": 0.8824, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6102076768875122, + "rewards/margins": 0.01983983814716339, + "rewards/rejected": -0.6300475597381592, + "step": 6145 + }, + { + "epoch": 0.71, + "learning_rate": 8.877443521011354e-08, + "logits/chosen": -2.6053247451782227, + "logits/rejected": -2.7176549434661865, + "logps/chosen": -290.4808654785156, + "logps/rejected": -174.80909729003906, + "loss": 0.3841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18050625920295715, + "rewards/margins": 1.30686616897583, + "rewards/rejected": -1.4873723983764648, + "step": 6146 + }, + { + "epoch": 0.71, + "learning_rate": 8.873931874048928e-08, + "logits/chosen": -3.060335636138916, + "logits/rejected": -3.24824857711792, + "logps/chosen": -217.21548461914062, + "logps/rejected": -203.3995361328125, + "loss": 0.4352, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35658836364746094, + "rewards/margins": 1.695101022720337, + "rewards/rejected": -2.051689386367798, + "step": 6147 + }, + { + "epoch": 0.71, + "learning_rate": 8.870420227086503e-08, + "logits/chosen": -3.42417573928833, + "logits/rejected": -3.034993886947632, + "logps/chosen": -342.2288513183594, + "logps/rejected": -305.8694152832031, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4222422242164612, + "rewards/margins": 3.2604455947875977, + "rewards/rejected": -2.8382034301757812, + "step": 6148 + }, + { + "epoch": 0.71, + "learning_rate": 8.866908580124077e-08, + "logits/chosen": -3.7327561378479004, + "logits/rejected": -3.2240078449249268, + "logps/chosen": -448.0987243652344, + "logps/rejected": -254.05499267578125, + "loss": 0.2335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0336490124464035, + "rewards/margins": 3.5415101051330566, + "rewards/rejected": -3.5751593112945557, + "step": 6149 + }, + { + "epoch": 0.71, + "learning_rate": 8.863396933161653e-08, + "logits/chosen": -3.5748047828674316, + "logits/rejected": -3.8332791328430176, + "logps/chosen": -251.20376586914062, + "logps/rejected": -276.0435791015625, + "loss": 0.6824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.024360299110412598, + "rewards/margins": 2.2638607025146484, + "rewards/rejected": -2.2882208824157715, + "step": 6150 + }, + { + "epoch": 0.71, + "learning_rate": 8.859885286199227e-08, + "logits/chosen": -2.514810562133789, + "logits/rejected": -2.6988935470581055, + "logps/chosen": -204.5736083984375, + "logps/rejected": -319.25244140625, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4320363998413086, + "rewards/margins": 3.1014516353607178, + "rewards/rejected": -2.66941499710083, + "step": 6151 + }, + { + "epoch": 0.71, + "learning_rate": 8.856373639236802e-08, + "logits/chosen": -3.5355031490325928, + "logits/rejected": -3.7244834899902344, + "logps/chosen": -255.11334228515625, + "logps/rejected": -233.78982543945312, + "loss": 0.3693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09391672164201736, + "rewards/margins": 1.077737808227539, + "rewards/rejected": -0.9838209748268127, + "step": 6152 + }, + { + "epoch": 0.71, + "learning_rate": 8.852861992274376e-08, + "logits/chosen": -3.375528335571289, + "logits/rejected": -3.2704696655273438, + "logps/chosen": -264.0948181152344, + "logps/rejected": -181.7700653076172, + "loss": 0.3765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17755772173404694, + "rewards/margins": 1.0674458742141724, + "rewards/rejected": -1.2450034618377686, + "step": 6153 + }, + { + "epoch": 0.71, + "learning_rate": 8.84935034531195e-08, + "logits/chosen": -3.856771945953369, + "logits/rejected": -3.809086322784424, + "logps/chosen": -290.18695068359375, + "logps/rejected": -214.68707275390625, + "loss": 0.443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7056518793106079, + "rewards/margins": 2.2653257846832275, + "rewards/rejected": -2.970977783203125, + "step": 6154 + }, + { + "epoch": 0.71, + "learning_rate": 8.845838698349524e-08, + "logits/chosen": -2.712367057800293, + "logits/rejected": -2.7184433937072754, + "logps/chosen": -290.42474365234375, + "logps/rejected": -255.64605712890625, + "loss": 0.5434, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6923716068267822, + "rewards/margins": 0.5961600542068481, + "rewards/rejected": -1.2885316610336304, + "step": 6155 + }, + { + "epoch": 0.71, + "learning_rate": 8.842327051387101e-08, + "logits/chosen": -2.879347324371338, + "logits/rejected": -2.7579498291015625, + "logps/chosen": -283.51519775390625, + "logps/rejected": -303.186767578125, + "loss": 0.374, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2077643871307373, + "rewards/margins": 1.8576922416687012, + "rewards/rejected": -2.0654566287994385, + "step": 6156 + }, + { + "epoch": 0.71, + "learning_rate": 8.838815404424675e-08, + "logits/chosen": -3.4113080501556396, + "logits/rejected": -3.4759488105773926, + "logps/chosen": -229.78366088867188, + "logps/rejected": -269.4625549316406, + "loss": 0.2575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05036620795726776, + "rewards/margins": 2.6533043384552, + "rewards/rejected": -2.7036707401275635, + "step": 6157 + }, + { + "epoch": 0.71, + "learning_rate": 8.835303757462249e-08, + "logits/chosen": -3.954777956008911, + "logits/rejected": -3.623379707336426, + "logps/chosen": -254.99591064453125, + "logps/rejected": -221.50735473632812, + "loss": 0.3809, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2972577214241028, + "rewards/margins": 2.0790953636169434, + "rewards/rejected": -1.7818377017974854, + "step": 6158 + }, + { + "epoch": 0.71, + "learning_rate": 8.831792110499823e-08, + "logits/chosen": -3.025874376296997, + "logits/rejected": -2.8948991298675537, + "logps/chosen": -143.67657470703125, + "logps/rejected": -206.8068084716797, + "loss": 0.4815, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4232417345046997, + "rewards/margins": 1.232729434967041, + "rewards/rejected": -1.6559711694717407, + "step": 6159 + }, + { + "epoch": 0.71, + "learning_rate": 8.828280463537399e-08, + "logits/chosen": -3.6685056686401367, + "logits/rejected": -3.7329514026641846, + "logps/chosen": -230.48435974121094, + "logps/rejected": -164.97317504882812, + "loss": 0.6512, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1573733538389206, + "rewards/margins": 0.9867604970932007, + "rewards/rejected": -0.8293873071670532, + "step": 6160 + }, + { + "epoch": 0.71, + "learning_rate": 8.824768816574974e-08, + "logits/chosen": -3.0399792194366455, + "logits/rejected": -2.8784451484680176, + "logps/chosen": -157.69497680664062, + "logps/rejected": -330.81689453125, + "loss": 0.5363, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2155860811471939, + "rewards/margins": 0.6355610489845276, + "rewards/rejected": -0.8511471152305603, + "step": 6161 + }, + { + "epoch": 0.71, + "learning_rate": 8.821257169612548e-08, + "logits/chosen": -2.9349751472473145, + "logits/rejected": -2.7918920516967773, + "logps/chosen": -83.40182495117188, + "logps/rejected": -258.4324951171875, + "loss": 0.2391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02097063511610031, + "rewards/margins": 2.0818381309509277, + "rewards/rejected": -2.102808952331543, + "step": 6162 + }, + { + "epoch": 0.71, + "learning_rate": 8.817745522650122e-08, + "logits/chosen": -3.175665855407715, + "logits/rejected": -3.029724597930908, + "logps/chosen": -115.06820678710938, + "logps/rejected": -189.20867919921875, + "loss": 0.5028, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6200711727142334, + "rewards/margins": 1.3493478298187256, + "rewards/rejected": -1.9694191217422485, + "step": 6163 + }, + { + "epoch": 0.71, + "learning_rate": 8.814233875687698e-08, + "logits/chosen": -3.2401554584503174, + "logits/rejected": -3.0954720973968506, + "logps/chosen": -217.81976318359375, + "logps/rejected": -209.472900390625, + "loss": 0.3087, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5007895231246948, + "rewards/margins": 1.5985934734344482, + "rewards/rejected": -1.0978039503097534, + "step": 6164 + }, + { + "epoch": 0.71, + "learning_rate": 8.810722228725272e-08, + "logits/chosen": -3.7518508434295654, + "logits/rejected": -3.8579142093658447, + "logps/chosen": -126.11347961425781, + "logps/rejected": -195.9139862060547, + "loss": 0.3565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012776359915733337, + "rewards/margins": 1.644818902015686, + "rewards/rejected": -1.657595157623291, + "step": 6165 + }, + { + "epoch": 0.71, + "learning_rate": 8.807210581762846e-08, + "logits/chosen": -3.57435941696167, + "logits/rejected": -3.378826141357422, + "logps/chosen": -199.06790161132812, + "logps/rejected": -157.1637725830078, + "loss": 0.4637, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.030479609966278076, + "rewards/margins": 1.3187211751937866, + "rewards/rejected": -1.2882415056228638, + "step": 6166 + }, + { + "epoch": 0.71, + "learning_rate": 8.80369893480042e-08, + "logits/chosen": -2.6487364768981934, + "logits/rejected": -2.5413601398468018, + "logps/chosen": -336.662353515625, + "logps/rejected": -294.89691162109375, + "loss": 0.6296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33858636021614075, + "rewards/margins": 0.7050436735153198, + "rewards/rejected": -1.0436300039291382, + "step": 6167 + }, + { + "epoch": 0.71, + "learning_rate": 8.800187287837996e-08, + "logits/chosen": -2.376983642578125, + "logits/rejected": -2.4024558067321777, + "logps/chosen": -262.71685791015625, + "logps/rejected": -295.6997985839844, + "loss": 0.2968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2541394531726837, + "rewards/margins": 2.597189426422119, + "rewards/rejected": -2.8513288497924805, + "step": 6168 + }, + { + "epoch": 0.71, + "learning_rate": 8.79667564087557e-08, + "logits/chosen": -3.5652055740356445, + "logits/rejected": -3.234449863433838, + "logps/chosen": -258.85235595703125, + "logps/rejected": -250.91078186035156, + "loss": 0.1674, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15751639008522034, + "rewards/margins": 2.698093891143799, + "rewards/rejected": -2.5405774116516113, + "step": 6169 + }, + { + "epoch": 0.71, + "learning_rate": 8.793163993913145e-08, + "logits/chosen": -2.5158329010009766, + "logits/rejected": -3.010761022567749, + "logps/chosen": -324.042724609375, + "logps/rejected": -222.24356079101562, + "loss": 0.5056, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28060489892959595, + "rewards/margins": 1.1833617687225342, + "rewards/rejected": -1.463966727256775, + "step": 6170 + }, + { + "epoch": 0.71, + "learning_rate": 8.789652346950719e-08, + "logits/chosen": -3.052309513092041, + "logits/rejected": -2.790369987487793, + "logps/chosen": -180.92266845703125, + "logps/rejected": -339.5602722167969, + "loss": 0.4108, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01766054332256317, + "rewards/margins": 1.7705764770507812, + "rewards/rejected": -1.7529160976409912, + "step": 6171 + }, + { + "epoch": 0.71, + "learning_rate": 8.786140699988295e-08, + "logits/chosen": -2.7410056591033936, + "logits/rejected": -2.5396568775177, + "logps/chosen": -240.76095581054688, + "logps/rejected": -261.06524658203125, + "loss": 0.2695, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.43170684576034546, + "rewards/margins": 1.4314970970153809, + "rewards/rejected": -0.9997903108596802, + "step": 6172 + }, + { + "epoch": 0.71, + "learning_rate": 8.78262905302587e-08, + "logits/chosen": -2.5219969749450684, + "logits/rejected": -2.393662691116333, + "logps/chosen": -194.76675415039062, + "logps/rejected": -259.1099853515625, + "loss": 0.5426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5503987669944763, + "rewards/margins": 1.2846217155456543, + "rewards/rejected": -1.8350204229354858, + "step": 6173 + }, + { + "epoch": 0.71, + "learning_rate": 8.779117406063443e-08, + "logits/chosen": -2.708181381225586, + "logits/rejected": -3.340092658996582, + "logps/chosen": -207.26126098632812, + "logps/rejected": -194.3479461669922, + "loss": 0.2872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014198929071426392, + "rewards/margins": 1.8059428930282593, + "rewards/rejected": -1.8201419115066528, + "step": 6174 + }, + { + "epoch": 0.71, + "learning_rate": 8.775605759101017e-08, + "logits/chosen": -3.085787773132324, + "logits/rejected": -3.1958682537078857, + "logps/chosen": -252.85394287109375, + "logps/rejected": -295.9222717285156, + "loss": 0.2525, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19341155886650085, + "rewards/margins": 2.4780240058898926, + "rewards/rejected": -2.6714353561401367, + "step": 6175 + }, + { + "epoch": 0.71, + "learning_rate": 8.772094112138593e-08, + "logits/chosen": -2.4998817443847656, + "logits/rejected": -2.8688364028930664, + "logps/chosen": -333.1115417480469, + "logps/rejected": -270.273681640625, + "loss": 0.2764, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4241678714752197, + "rewards/margins": 2.3558409214019775, + "rewards/rejected": -1.9316731691360474, + "step": 6176 + }, + { + "epoch": 0.71, + "learning_rate": 8.768582465176167e-08, + "logits/chosen": -3.5257928371429443, + "logits/rejected": -3.725550413131714, + "logps/chosen": -217.21205139160156, + "logps/rejected": -339.29620361328125, + "loss": 0.4566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6350345015525818, + "rewards/margins": 2.7671358585357666, + "rewards/rejected": -3.402170181274414, + "step": 6177 + }, + { + "epoch": 0.71, + "learning_rate": 8.765070818213742e-08, + "logits/chosen": -3.4178061485290527, + "logits/rejected": -3.393282175064087, + "logps/chosen": -178.123779296875, + "logps/rejected": -173.11480712890625, + "loss": 0.3419, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18634553253650665, + "rewards/margins": 1.390342354774475, + "rewards/rejected": -1.2039968967437744, + "step": 6178 + }, + { + "epoch": 0.71, + "learning_rate": 8.761559171251316e-08, + "logits/chosen": -3.253429889678955, + "logits/rejected": -3.0460476875305176, + "logps/chosen": -241.54005432128906, + "logps/rejected": -206.118896484375, + "loss": 0.3027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08854848891496658, + "rewards/margins": 1.6797508001327515, + "rewards/rejected": -1.7682993412017822, + "step": 6179 + }, + { + "epoch": 0.71, + "learning_rate": 8.758047524288892e-08, + "logits/chosen": -3.504655361175537, + "logits/rejected": -3.0748355388641357, + "logps/chosen": -419.376220703125, + "logps/rejected": -340.6866455078125, + "loss": 0.2004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36702775955200195, + "rewards/margins": 2.025324583053589, + "rewards/rejected": -1.658296823501587, + "step": 6180 + }, + { + "epoch": 0.71, + "learning_rate": 8.754535877326466e-08, + "logits/chosen": -3.2814135551452637, + "logits/rejected": -3.247514247894287, + "logps/chosen": -380.63897705078125, + "logps/rejected": -357.9363098144531, + "loss": 0.2018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3790084719657898, + "rewards/margins": 2.9755733013153076, + "rewards/rejected": -3.354581832885742, + "step": 6181 + }, + { + "epoch": 0.71, + "learning_rate": 8.75102423036404e-08, + "logits/chosen": -3.279662847518921, + "logits/rejected": -3.3672127723693848, + "logps/chosen": -290.95379638671875, + "logps/rejected": -213.62118530273438, + "loss": 0.4995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24473357200622559, + "rewards/margins": 1.0393134355545044, + "rewards/rejected": -1.28404700756073, + "step": 6182 + }, + { + "epoch": 0.71, + "learning_rate": 8.747512583401614e-08, + "logits/chosen": -3.0773234367370605, + "logits/rejected": -2.968860149383545, + "logps/chosen": -222.77908325195312, + "logps/rejected": -196.96795654296875, + "loss": 0.8737, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6918673515319824, + "rewards/margins": 0.35349413752555847, + "rewards/rejected": -1.0453613996505737, + "step": 6183 + }, + { + "epoch": 0.71, + "learning_rate": 8.744000936439191e-08, + "logits/chosen": -2.9693663120269775, + "logits/rejected": -2.7879042625427246, + "logps/chosen": -165.81851196289062, + "logps/rejected": -255.0758819580078, + "loss": 0.4798, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4908677339553833, + "rewards/margins": 1.8091825246810913, + "rewards/rejected": -1.318314790725708, + "step": 6184 + }, + { + "epoch": 0.71, + "learning_rate": 8.740489289476765e-08, + "logits/chosen": -3.6850407123565674, + "logits/rejected": -3.1730594635009766, + "logps/chosen": -253.9943084716797, + "logps/rejected": -173.28248596191406, + "loss": 0.4166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36148929595947266, + "rewards/margins": 1.256143569946289, + "rewards/rejected": -1.6176327466964722, + "step": 6185 + }, + { + "epoch": 0.71, + "learning_rate": 8.736977642514339e-08, + "logits/chosen": -2.9665849208831787, + "logits/rejected": -2.857121706008911, + "logps/chosen": -361.24261474609375, + "logps/rejected": -325.5003662109375, + "loss": 0.2992, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011166885495185852, + "rewards/margins": 2.113274574279785, + "rewards/rejected": -2.1021080017089844, + "step": 6186 + }, + { + "epoch": 0.71, + "learning_rate": 8.733465995551913e-08, + "logits/chosen": -3.1119096279144287, + "logits/rejected": -2.852790594100952, + "logps/chosen": -334.0766296386719, + "logps/rejected": -266.5248718261719, + "loss": 0.3015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.016907820478081703, + "rewards/margins": 1.9130665063858032, + "rewards/rejected": -1.9299744367599487, + "step": 6187 + }, + { + "epoch": 0.71, + "learning_rate": 8.729954348589488e-08, + "logits/chosen": -3.106278419494629, + "logits/rejected": -3.137249708175659, + "logps/chosen": -245.6387481689453, + "logps/rejected": -270.65606689453125, + "loss": 0.6403, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34573179483413696, + "rewards/margins": 0.9581091403961182, + "rewards/rejected": -1.3038409948349, + "step": 6188 + }, + { + "epoch": 0.71, + "learning_rate": 8.726442701627064e-08, + "logits/chosen": -2.757159948348999, + "logits/rejected": -2.5926108360290527, + "logps/chosen": -253.1839599609375, + "logps/rejected": -244.2945556640625, + "loss": 0.2897, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5047574639320374, + "rewards/margins": 2.1502790451049805, + "rewards/rejected": -1.645521640777588, + "step": 6189 + }, + { + "epoch": 0.71, + "learning_rate": 8.722931054664638e-08, + "logits/chosen": -2.6531240940093994, + "logits/rejected": -2.907480001449585, + "logps/chosen": -288.8594970703125, + "logps/rejected": -189.79974365234375, + "loss": 0.5795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09042418748140335, + "rewards/margins": 0.909171462059021, + "rewards/rejected": -0.999595582485199, + "step": 6190 + }, + { + "epoch": 0.71, + "learning_rate": 8.719419407702212e-08, + "logits/chosen": -2.966796398162842, + "logits/rejected": -2.977308988571167, + "logps/chosen": -190.50076293945312, + "logps/rejected": -304.655517578125, + "loss": 0.2148, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.287906289100647, + "rewards/margins": 3.5255918502807617, + "rewards/rejected": -3.2376856803894043, + "step": 6191 + }, + { + "epoch": 0.71, + "learning_rate": 8.715907760739787e-08, + "logits/chosen": -3.077587842941284, + "logits/rejected": -2.901366710662842, + "logps/chosen": -322.4462585449219, + "logps/rejected": -269.46905517578125, + "loss": 0.1379, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16929081082344055, + "rewards/margins": 2.348426103591919, + "rewards/rejected": -2.179135322570801, + "step": 6192 + }, + { + "epoch": 0.71, + "learning_rate": 8.712396113777361e-08, + "logits/chosen": -3.0393290519714355, + "logits/rejected": -3.0820765495300293, + "logps/chosen": -203.51788330078125, + "logps/rejected": -298.24884033203125, + "loss": 0.3793, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2176513969898224, + "rewards/margins": 1.2252179384231567, + "rewards/rejected": -1.4428694248199463, + "step": 6193 + }, + { + "epoch": 0.71, + "learning_rate": 8.708884466814935e-08, + "logits/chosen": -3.2148985862731934, + "logits/rejected": -2.8627443313598633, + "logps/chosen": -303.32464599609375, + "logps/rejected": -160.66232299804688, + "loss": 0.2389, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2496528923511505, + "rewards/margins": 1.8236502408981323, + "rewards/rejected": -1.5739972591400146, + "step": 6194 + }, + { + "epoch": 0.71, + "learning_rate": 8.70537281985251e-08, + "logits/chosen": -3.4498565196990967, + "logits/rejected": -3.2906782627105713, + "logps/chosen": -71.97164916992188, + "logps/rejected": -152.67689514160156, + "loss": 0.2659, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24974283576011658, + "rewards/margins": 2.064680576324463, + "rewards/rejected": -1.8149378299713135, + "step": 6195 + }, + { + "epoch": 0.71, + "learning_rate": 8.701861172890085e-08, + "logits/chosen": -2.5266380310058594, + "logits/rejected": -2.8225021362304688, + "logps/chosen": -277.5642395019531, + "logps/rejected": -407.37811279296875, + "loss": 0.374, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12147662043571472, + "rewards/margins": 1.9361413717269897, + "rewards/rejected": -2.0576179027557373, + "step": 6196 + }, + { + "epoch": 0.71, + "learning_rate": 8.69834952592766e-08, + "logits/chosen": -3.1101553440093994, + "logits/rejected": -2.8499412536621094, + "logps/chosen": -276.3446044921875, + "logps/rejected": -226.96554565429688, + "loss": 0.6328, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02519528567790985, + "rewards/margins": 0.7795652151107788, + "rewards/rejected": -0.754369854927063, + "step": 6197 + }, + { + "epoch": 0.71, + "learning_rate": 8.694837878965234e-08, + "logits/chosen": -3.1416380405426025, + "logits/rejected": -3.5125246047973633, + "logps/chosen": -163.97061157226562, + "logps/rejected": -210.718994140625, + "loss": 0.2979, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3095620274543762, + "rewards/margins": 1.8749065399169922, + "rewards/rejected": -2.1844682693481445, + "step": 6198 + }, + { + "epoch": 0.71, + "learning_rate": 8.691326232002808e-08, + "logits/chosen": -3.170940399169922, + "logits/rejected": -3.0370469093322754, + "logps/chosen": -160.14724731445312, + "logps/rejected": -255.4941864013672, + "loss": 0.2016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1996043473482132, + "rewards/margins": 2.291872978210449, + "rewards/rejected": -2.092268466949463, + "step": 6199 + }, + { + "epoch": 0.71, + "learning_rate": 8.687814585040382e-08, + "logits/chosen": -2.6889164447784424, + "logits/rejected": -2.549431800842285, + "logps/chosen": -342.87017822265625, + "logps/rejected": -300.8272399902344, + "loss": 0.7314, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8079954385757446, + "rewards/margins": 0.828865647315979, + "rewards/rejected": -1.6368610858917236, + "step": 6200 + }, + { + "epoch": 0.71, + "learning_rate": 8.684302938077959e-08, + "logits/chosen": -2.9095349311828613, + "logits/rejected": -3.0594093799591064, + "logps/chosen": -171.49676513671875, + "logps/rejected": -196.35467529296875, + "loss": 0.3069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0025969967246055603, + "rewards/margins": 1.8880863189697266, + "rewards/rejected": -1.8906834125518799, + "step": 6201 + }, + { + "epoch": 0.71, + "learning_rate": 8.680791291115533e-08, + "logits/chosen": -2.5593883991241455, + "logits/rejected": -2.491218090057373, + "logps/chosen": -253.13067626953125, + "logps/rejected": -210.60482788085938, + "loss": 0.5266, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33479252457618713, + "rewards/margins": 1.5333582162857056, + "rewards/rejected": -1.8681507110595703, + "step": 6202 + }, + { + "epoch": 0.72, + "learning_rate": 8.677279644153107e-08, + "logits/chosen": -2.3442609310150146, + "logits/rejected": -2.632598876953125, + "logps/chosen": -216.65208435058594, + "logps/rejected": -233.930908203125, + "loss": 0.2363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16358736157417297, + "rewards/margins": 2.069136142730713, + "rewards/rejected": -2.2327234745025635, + "step": 6203 + }, + { + "epoch": 0.72, + "learning_rate": 8.673767997190681e-08, + "logits/chosen": -2.4515955448150635, + "logits/rejected": -2.681119441986084, + "logps/chosen": -201.57717895507812, + "logps/rejected": -142.54989624023438, + "loss": 0.2814, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3503835201263428, + "rewards/margins": 2.1500415802001953, + "rewards/rejected": -1.7996578216552734, + "step": 6204 + }, + { + "epoch": 0.72, + "learning_rate": 8.670256350228257e-08, + "logits/chosen": -2.764801502227783, + "logits/rejected": -2.8507094383239746, + "logps/chosen": -218.12623596191406, + "logps/rejected": -267.4840087890625, + "loss": 0.1816, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018078655004501343, + "rewards/margins": 2.997767210006714, + "rewards/rejected": -3.015845775604248, + "step": 6205 + }, + { + "epoch": 0.72, + "learning_rate": 8.666744703265832e-08, + "logits/chosen": -3.1756718158721924, + "logits/rejected": -3.290731430053711, + "logps/chosen": -148.37005615234375, + "logps/rejected": -216.20755004882812, + "loss": 0.4638, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6528363823890686, + "rewards/margins": 1.7571951150894165, + "rewards/rejected": -1.1043587923049927, + "step": 6206 + }, + { + "epoch": 0.72, + "learning_rate": 8.663233056303406e-08, + "logits/chosen": -3.293508529663086, + "logits/rejected": -3.318122148513794, + "logps/chosen": -280.07598876953125, + "logps/rejected": -296.20123291015625, + "loss": 0.5995, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6083521842956543, + "rewards/margins": 1.3540289402008057, + "rewards/rejected": -1.96238112449646, + "step": 6207 + }, + { + "epoch": 0.72, + "learning_rate": 8.65972140934098e-08, + "logits/chosen": -2.844909906387329, + "logits/rejected": -2.9814465045928955, + "logps/chosen": -339.649169921875, + "logps/rejected": -375.01373291015625, + "loss": 0.1177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.77976393699646, + "rewards/margins": 3.383126974105835, + "rewards/rejected": -2.603363037109375, + "step": 6208 + }, + { + "epoch": 0.72, + "learning_rate": 8.656209762378555e-08, + "logits/chosen": -3.177168369293213, + "logits/rejected": -3.4636151790618896, + "logps/chosen": -422.7376403808594, + "logps/rejected": -260.71148681640625, + "loss": 0.3494, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3571590185165405, + "rewards/margins": 2.2169976234436035, + "rewards/rejected": -2.5741565227508545, + "step": 6209 + }, + { + "epoch": 0.72, + "learning_rate": 8.65269811541613e-08, + "logits/chosen": -3.0544073581695557, + "logits/rejected": -3.4044296741485596, + "logps/chosen": -250.47122192382812, + "logps/rejected": -245.9281768798828, + "loss": 0.3694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.056384071707725525, + "rewards/margins": 1.635916829109192, + "rewards/rejected": -1.692300796508789, + "step": 6210 + }, + { + "epoch": 0.72, + "learning_rate": 8.649186468453704e-08, + "logits/chosen": -3.673053741455078, + "logits/rejected": -3.769381284713745, + "logps/chosen": -255.92332458496094, + "logps/rejected": -278.6451721191406, + "loss": 0.1643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1006774753332138, + "rewards/margins": 3.181764841079712, + "rewards/rejected": -3.282442331314087, + "step": 6211 + }, + { + "epoch": 0.72, + "learning_rate": 8.645674821491279e-08, + "logits/chosen": -3.628016948699951, + "logits/rejected": -3.6785573959350586, + "logps/chosen": -179.7212371826172, + "logps/rejected": -187.4581756591797, + "loss": 0.2539, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2319963425397873, + "rewards/margins": 2.410912036895752, + "rewards/rejected": -2.1789159774780273, + "step": 6212 + }, + { + "epoch": 0.72, + "learning_rate": 8.642163174528854e-08, + "logits/chosen": -3.040497303009033, + "logits/rejected": -2.7998287677764893, + "logps/chosen": -325.7772216796875, + "logps/rejected": -210.19358825683594, + "loss": 0.3179, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4744609594345093, + "rewards/margins": 2.0446083545684814, + "rewards/rejected": -1.5701473951339722, + "step": 6213 + }, + { + "epoch": 0.72, + "learning_rate": 8.638651527566428e-08, + "logits/chosen": -3.5246849060058594, + "logits/rejected": -3.673649311065674, + "logps/chosen": -190.5153350830078, + "logps/rejected": -194.64244079589844, + "loss": 0.4543, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3522980809211731, + "rewards/margins": 1.897334098815918, + "rewards/rejected": -2.2496321201324463, + "step": 6214 + }, + { + "epoch": 0.72, + "learning_rate": 8.635139880604002e-08, + "logits/chosen": -3.084000587463379, + "logits/rejected": -3.049778938293457, + "logps/chosen": -175.01821899414062, + "logps/rejected": -207.11251831054688, + "loss": 0.3954, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.013409394770860672, + "rewards/margins": 2.30769944190979, + "rewards/rejected": -2.321108818054199, + "step": 6215 + }, + { + "epoch": 0.72, + "learning_rate": 8.631628233641577e-08, + "logits/chosen": -3.260829448699951, + "logits/rejected": -3.3587207794189453, + "logps/chosen": -308.1179504394531, + "logps/rejected": -208.18878173828125, + "loss": 0.3247, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2245536744594574, + "rewards/margins": 2.6164565086364746, + "rewards/rejected": -2.841010093688965, + "step": 6216 + }, + { + "epoch": 0.72, + "learning_rate": 8.628116586679153e-08, + "logits/chosen": -2.3655714988708496, + "logits/rejected": -2.9127345085144043, + "logps/chosen": -112.45896911621094, + "logps/rejected": -214.22207641601562, + "loss": 0.3966, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3266565799713135, + "rewards/margins": 2.041274070739746, + "rewards/rejected": -2.3679306507110596, + "step": 6217 + }, + { + "epoch": 0.72, + "learning_rate": 8.624604939716727e-08, + "logits/chosen": -2.460184335708618, + "logits/rejected": -2.378178119659424, + "logps/chosen": -132.47740173339844, + "logps/rejected": -231.49209594726562, + "loss": 0.3952, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3082536458969116, + "rewards/margins": 2.820432186126709, + "rewards/rejected": -3.128685712814331, + "step": 6218 + }, + { + "epoch": 0.72, + "learning_rate": 8.621093292754301e-08, + "logits/chosen": -2.659320116043091, + "logits/rejected": -2.5503013134002686, + "logps/chosen": -162.3297882080078, + "logps/rejected": -280.72161865234375, + "loss": 0.1946, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7030532360076904, + "rewards/margins": 3.3834805488586426, + "rewards/rejected": -2.680427074432373, + "step": 6219 + }, + { + "epoch": 0.72, + "learning_rate": 8.617581645791875e-08, + "logits/chosen": -3.464320182800293, + "logits/rejected": -3.3932673931121826, + "logps/chosen": -212.67864990234375, + "logps/rejected": -233.3140869140625, + "loss": 0.2289, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5519707202911377, + "rewards/margins": 2.6302335262298584, + "rewards/rejected": -2.0782630443573, + "step": 6220 + }, + { + "epoch": 0.72, + "learning_rate": 8.614069998829451e-08, + "logits/chosen": -3.3898746967315674, + "logits/rejected": -3.865929365158081, + "logps/chosen": -79.1580810546875, + "logps/rejected": -156.52774047851562, + "loss": 0.2322, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6686815023422241, + "rewards/margins": 3.0764858722686768, + "rewards/rejected": -2.407804489135742, + "step": 6221 + }, + { + "epoch": 0.72, + "learning_rate": 8.610558351867025e-08, + "logits/chosen": -3.5092577934265137, + "logits/rejected": -3.650083303451538, + "logps/chosen": -309.0154724121094, + "logps/rejected": -272.9697265625, + "loss": 0.4819, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23302705585956573, + "rewards/margins": 0.9344514608383179, + "rewards/rejected": -1.1674785614013672, + "step": 6222 + }, + { + "epoch": 0.72, + "learning_rate": 8.6070467049046e-08, + "logits/chosen": -2.97107195854187, + "logits/rejected": -2.852827548980713, + "logps/chosen": -193.140625, + "logps/rejected": -249.558349609375, + "loss": 0.6357, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23762360215187073, + "rewards/margins": 1.0629180669784546, + "rewards/rejected": -1.300541639328003, + "step": 6223 + }, + { + "epoch": 0.72, + "learning_rate": 8.603535057942174e-08, + "logits/chosen": -2.468775749206543, + "logits/rejected": -2.7047958374023438, + "logps/chosen": -182.60696411132812, + "logps/rejected": -300.49652099609375, + "loss": 0.3337, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1447678506374359, + "rewards/margins": 2.4765188694000244, + "rewards/rejected": -2.621286630630493, + "step": 6224 + }, + { + "epoch": 0.72, + "learning_rate": 8.60002341097975e-08, + "logits/chosen": -2.9897100925445557, + "logits/rejected": -3.1734156608581543, + "logps/chosen": -371.2322998046875, + "logps/rejected": -410.45184326171875, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14682847261428833, + "rewards/margins": 1.5548166036605835, + "rewards/rejected": -1.4079880714416504, + "step": 6225 + }, + { + "epoch": 0.72, + "learning_rate": 8.596511764017324e-08, + "logits/chosen": -2.342602252960205, + "logits/rejected": -2.0993754863739014, + "logps/chosen": -198.84552001953125, + "logps/rejected": -278.88336181640625, + "loss": 0.8826, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6174329519271851, + "rewards/margins": 1.1297760009765625, + "rewards/rejected": -1.747209072113037, + "step": 6226 + }, + { + "epoch": 0.72, + "learning_rate": 8.593000117054898e-08, + "logits/chosen": -2.5627217292785645, + "logits/rejected": -2.472756862640381, + "logps/chosen": -344.158447265625, + "logps/rejected": -376.7000732421875, + "loss": 0.329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2511926591396332, + "rewards/margins": 2.1316771507263184, + "rewards/rejected": -2.3828694820404053, + "step": 6227 + }, + { + "epoch": 0.72, + "learning_rate": 8.589488470092472e-08, + "logits/chosen": -2.4449191093444824, + "logits/rejected": -2.8248982429504395, + "logps/chosen": -350.2446594238281, + "logps/rejected": -337.0544738769531, + "loss": 0.2587, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3733539879322052, + "rewards/margins": 2.280017852783203, + "rewards/rejected": -1.9066637754440308, + "step": 6228 + }, + { + "epoch": 0.72, + "learning_rate": 8.585976823130049e-08, + "logits/chosen": -2.95918607711792, + "logits/rejected": -2.9900777339935303, + "logps/chosen": -155.2025146484375, + "logps/rejected": -153.85064697265625, + "loss": 0.5351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2898723781108856, + "rewards/margins": 0.5406811833381653, + "rewards/rejected": -0.8305535316467285, + "step": 6229 + }, + { + "epoch": 0.72, + "learning_rate": 8.582465176167623e-08, + "logits/chosen": -3.568181037902832, + "logits/rejected": -3.4104301929473877, + "logps/chosen": -270.2089538574219, + "logps/rejected": -212.7761993408203, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2646157741546631, + "rewards/margins": 2.255439519882202, + "rewards/rejected": -1.990823745727539, + "step": 6230 + }, + { + "epoch": 0.72, + "learning_rate": 8.578953529205197e-08, + "logits/chosen": -2.601757526397705, + "logits/rejected": -2.986440896987915, + "logps/chosen": -356.9673767089844, + "logps/rejected": -308.0239562988281, + "loss": 0.3803, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10056573152542114, + "rewards/margins": 1.7811245918273926, + "rewards/rejected": -1.881690263748169, + "step": 6231 + }, + { + "epoch": 0.72, + "learning_rate": 8.575441882242771e-08, + "logits/chosen": -2.46207594871521, + "logits/rejected": -2.6854052543640137, + "logps/chosen": -285.23443603515625, + "logps/rejected": -170.55178833007812, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4156138300895691, + "rewards/margins": 0.7287877202033997, + "rewards/rejected": -1.1444015502929688, + "step": 6232 + }, + { + "epoch": 0.72, + "learning_rate": 8.571930235280347e-08, + "logits/chosen": -2.7972702980041504, + "logits/rejected": -2.8190526962280273, + "logps/chosen": -220.203125, + "logps/rejected": -214.31539916992188, + "loss": 0.2653, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015666097402572632, + "rewards/margins": 2.0984582901000977, + "rewards/rejected": -2.082792043685913, + "step": 6233 + }, + { + "epoch": 0.72, + "learning_rate": 8.568418588317922e-08, + "logits/chosen": -3.277355670928955, + "logits/rejected": -3.033801794052124, + "logps/chosen": -137.0720977783203, + "logps/rejected": -139.5054931640625, + "loss": 0.6073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11476925760507584, + "rewards/margins": 0.42781922221183777, + "rewards/rejected": -0.542588472366333, + "step": 6234 + }, + { + "epoch": 0.72, + "learning_rate": 8.564906941355496e-08, + "logits/chosen": -2.894564151763916, + "logits/rejected": -2.8224639892578125, + "logps/chosen": -317.6152648925781, + "logps/rejected": -224.19076538085938, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41659748554229736, + "rewards/margins": 0.7525492310523987, + "rewards/rejected": -1.1691466569900513, + "step": 6235 + }, + { + "epoch": 0.72, + "learning_rate": 8.56139529439307e-08, + "logits/chosen": -3.1319923400878906, + "logits/rejected": -2.718477487564087, + "logps/chosen": -168.4676055908203, + "logps/rejected": -177.952880859375, + "loss": 0.2083, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10882195830345154, + "rewards/margins": 2.1111302375793457, + "rewards/rejected": -2.0023083686828613, + "step": 6236 + }, + { + "epoch": 0.72, + "learning_rate": 8.557883647430645e-08, + "logits/chosen": -2.865548849105835, + "logits/rejected": -2.949894428253174, + "logps/chosen": -264.7803039550781, + "logps/rejected": -140.4404754638672, + "loss": 0.8792, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5142704844474792, + "rewards/margins": 0.2607874274253845, + "rewards/rejected": -0.7750579118728638, + "step": 6237 + }, + { + "epoch": 0.72, + "learning_rate": 8.554372000468219e-08, + "logits/chosen": -2.6610703468322754, + "logits/rejected": -2.7421576976776123, + "logps/chosen": -293.1398620605469, + "logps/rejected": -278.7167663574219, + "loss": 0.2158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18916448950767517, + "rewards/margins": 2.3520398139953613, + "rewards/rejected": -2.1628754138946533, + "step": 6238 + }, + { + "epoch": 0.72, + "learning_rate": 8.550860353505793e-08, + "logits/chosen": -2.848931312561035, + "logits/rejected": -2.8345346450805664, + "logps/chosen": -306.6490783691406, + "logps/rejected": -135.3790283203125, + "loss": 0.6323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6681374311447144, + "rewards/margins": 0.3910433053970337, + "rewards/rejected": -1.059180736541748, + "step": 6239 + }, + { + "epoch": 0.72, + "learning_rate": 8.547348706543369e-08, + "logits/chosen": -3.6734566688537598, + "logits/rejected": -3.3634095191955566, + "logps/chosen": -133.25161743164062, + "logps/rejected": -145.0244903564453, + "loss": 0.3959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.255374014377594, + "rewards/margins": 1.3699357509613037, + "rewards/rejected": -1.6253098249435425, + "step": 6240 + }, + { + "epoch": 0.72, + "learning_rate": 8.543837059580943e-08, + "logits/chosen": -3.1436634063720703, + "logits/rejected": -3.343234062194824, + "logps/chosen": -146.49276733398438, + "logps/rejected": -278.5816650390625, + "loss": 0.3269, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22890326380729675, + "rewards/margins": 1.8236750364303589, + "rewards/rejected": -1.5947718620300293, + "step": 6241 + }, + { + "epoch": 0.72, + "learning_rate": 8.540325412618518e-08, + "logits/chosen": -3.0953752994537354, + "logits/rejected": -3.009274482727051, + "logps/chosen": -274.2269287109375, + "logps/rejected": -221.4766082763672, + "loss": 0.9573, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9080827832221985, + "rewards/margins": 0.6636852025985718, + "rewards/rejected": -1.5717679262161255, + "step": 6242 + }, + { + "epoch": 0.72, + "learning_rate": 8.536813765656092e-08, + "logits/chosen": -2.9422683715820312, + "logits/rejected": -2.982396125793457, + "logps/chosen": -155.53924560546875, + "logps/rejected": -217.11761474609375, + "loss": 0.3882, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11280552297830582, + "rewards/margins": 3.145951271057129, + "rewards/rejected": -3.0331454277038574, + "step": 6243 + }, + { + "epoch": 0.72, + "learning_rate": 8.533302118693666e-08, + "logits/chosen": -3.546570062637329, + "logits/rejected": -2.983919858932495, + "logps/chosen": -242.14044189453125, + "logps/rejected": -262.06158447265625, + "loss": 0.3041, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21760404109954834, + "rewards/margins": 2.1296586990356445, + "rewards/rejected": -2.3472626209259033, + "step": 6244 + }, + { + "epoch": 0.72, + "learning_rate": 8.52979047173124e-08, + "logits/chosen": -2.847583293914795, + "logits/rejected": -2.9833896160125732, + "logps/chosen": -316.98907470703125, + "logps/rejected": -331.26763916015625, + "loss": 0.2573, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10652070492506027, + "rewards/margins": 2.216336250305176, + "rewards/rejected": -2.1098155975341797, + "step": 6245 + }, + { + "epoch": 0.72, + "learning_rate": 8.526278824768817e-08, + "logits/chosen": -3.1121582984924316, + "logits/rejected": -2.956580400466919, + "logps/chosen": -402.4575500488281, + "logps/rejected": -356.32891845703125, + "loss": 0.1775, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1179531067609787, + "rewards/margins": 2.0528955459594727, + "rewards/rejected": -1.9349424839019775, + "step": 6246 + }, + { + "epoch": 0.72, + "learning_rate": 8.522767177806391e-08, + "logits/chosen": -3.5689611434936523, + "logits/rejected": -3.6692147254943848, + "logps/chosen": -242.9729461669922, + "logps/rejected": -307.6684265136719, + "loss": 0.8477, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.5586351156234741, + "rewards/margins": 0.29153257608413696, + "rewards/rejected": -1.8501675128936768, + "step": 6247 + }, + { + "epoch": 0.72, + "learning_rate": 8.519255530843965e-08, + "logits/chosen": -3.2566511631011963, + "logits/rejected": -3.240741729736328, + "logps/chosen": -196.38543701171875, + "logps/rejected": -269.51617431640625, + "loss": 0.6098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011288568377494812, + "rewards/margins": 1.4440367221832275, + "rewards/rejected": -1.4553251266479492, + "step": 6248 + }, + { + "epoch": 0.72, + "learning_rate": 8.515743883881539e-08, + "logits/chosen": -3.088057518005371, + "logits/rejected": -3.1470632553100586, + "logps/chosen": -264.2438659667969, + "logps/rejected": -245.93338012695312, + "loss": 0.1388, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.646129846572876, + "rewards/margins": 2.357839822769165, + "rewards/rejected": -1.7117100954055786, + "step": 6249 + }, + { + "epoch": 0.72, + "learning_rate": 8.512232236919116e-08, + "logits/chosen": -2.616626262664795, + "logits/rejected": -2.740483522415161, + "logps/chosen": -254.18821716308594, + "logps/rejected": -361.4554443359375, + "loss": 0.3566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9553130269050598, + "rewards/margins": 2.4873809814453125, + "rewards/rejected": -3.4426939487457275, + "step": 6250 + }, + { + "epoch": 0.72, + "learning_rate": 8.50872058995669e-08, + "logits/chosen": -2.6419901847839355, + "logits/rejected": -2.5201663970947266, + "logps/chosen": -213.8834228515625, + "logps/rejected": -210.63494873046875, + "loss": 0.2782, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10980918258428574, + "rewards/margins": 1.4906436204910278, + "rewards/rejected": -1.3808345794677734, + "step": 6251 + }, + { + "epoch": 0.72, + "learning_rate": 8.505208942994264e-08, + "logits/chosen": -3.526102066040039, + "logits/rejected": -3.274887800216675, + "logps/chosen": -200.23143005371094, + "logps/rejected": -156.93850708007812, + "loss": 0.3908, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04509430378675461, + "rewards/margins": 2.0682435035705566, + "rewards/rejected": -2.023149251937866, + "step": 6252 + }, + { + "epoch": 0.72, + "learning_rate": 8.501697296031838e-08, + "logits/chosen": -2.593017101287842, + "logits/rejected": -2.6330161094665527, + "logps/chosen": -111.45880889892578, + "logps/rejected": -130.27725219726562, + "loss": 0.362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1506037414073944, + "rewards/margins": 1.2883274555206299, + "rewards/rejected": -1.4389312267303467, + "step": 6253 + }, + { + "epoch": 0.72, + "learning_rate": 8.498185649069413e-08, + "logits/chosen": -2.60383939743042, + "logits/rejected": -2.5834503173828125, + "logps/chosen": -183.40931701660156, + "logps/rejected": -337.6494140625, + "loss": 0.2824, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4244502782821655, + "rewards/margins": 3.612639904022217, + "rewards/rejected": -4.037090301513672, + "step": 6254 + }, + { + "epoch": 0.72, + "learning_rate": 8.494674002106987e-08, + "logits/chosen": -3.1588120460510254, + "logits/rejected": -2.947680950164795, + "logps/chosen": -339.39874267578125, + "logps/rejected": -296.22723388671875, + "loss": 0.4758, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31119251251220703, + "rewards/margins": 1.5254976749420166, + "rewards/rejected": -1.8366901874542236, + "step": 6255 + }, + { + "epoch": 0.72, + "learning_rate": 8.491162355144562e-08, + "logits/chosen": -2.847172737121582, + "logits/rejected": -2.747995138168335, + "logps/chosen": -185.7191162109375, + "logps/rejected": -236.264404296875, + "loss": 0.2347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026789769530296326, + "rewards/margins": 1.7865556478500366, + "rewards/rejected": -1.8133454322814941, + "step": 6256 + }, + { + "epoch": 0.72, + "learning_rate": 8.487650708182137e-08, + "logits/chosen": -3.273263692855835, + "logits/rejected": -3.5938992500305176, + "logps/chosen": -218.46710205078125, + "logps/rejected": -206.01931762695312, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45577287673950195, + "rewards/margins": 3.357509136199951, + "rewards/rejected": -2.901736259460449, + "step": 6257 + }, + { + "epoch": 0.72, + "learning_rate": 8.484139061219712e-08, + "logits/chosen": -2.8537702560424805, + "logits/rejected": -2.9218153953552246, + "logps/chosen": -300.13616943359375, + "logps/rejected": -339.1453857421875, + "loss": 0.2422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09189091622829437, + "rewards/margins": 2.1648545265197754, + "rewards/rejected": -2.2567451000213623, + "step": 6258 + }, + { + "epoch": 0.72, + "learning_rate": 8.480627414257286e-08, + "logits/chosen": -3.386720657348633, + "logits/rejected": -3.640533685684204, + "logps/chosen": -138.34902954101562, + "logps/rejected": -137.23846435546875, + "loss": 0.3135, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26158368587493896, + "rewards/margins": 1.739933967590332, + "rewards/rejected": -2.0015177726745605, + "step": 6259 + }, + { + "epoch": 0.72, + "learning_rate": 8.47711576729486e-08, + "logits/chosen": -3.0016279220581055, + "logits/rejected": -3.2234678268432617, + "logps/chosen": -208.80181884765625, + "logps/rejected": -231.61959838867188, + "loss": 0.328, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20624268054962158, + "rewards/margins": 1.494094729423523, + "rewards/rejected": -1.700337290763855, + "step": 6260 + }, + { + "epoch": 0.72, + "learning_rate": 8.473604120332434e-08, + "logits/chosen": -3.2461695671081543, + "logits/rejected": -3.3387646675109863, + "logps/chosen": -218.27008056640625, + "logps/rejected": -276.112548828125, + "loss": 0.3873, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16898766160011292, + "rewards/margins": 3.4332799911499023, + "rewards/rejected": -3.6022677421569824, + "step": 6261 + }, + { + "epoch": 0.72, + "learning_rate": 8.470092473370011e-08, + "logits/chosen": -2.8552987575531006, + "logits/rejected": -3.047182321548462, + "logps/chosen": -136.90223693847656, + "logps/rejected": -314.7531433105469, + "loss": 0.4548, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03014979138970375, + "rewards/margins": 1.6778748035430908, + "rewards/rejected": -1.6477251052856445, + "step": 6262 + }, + { + "epoch": 0.72, + "learning_rate": 8.466580826407585e-08, + "logits/chosen": -2.9430160522460938, + "logits/rejected": -2.8224093914031982, + "logps/chosen": -379.164794921875, + "logps/rejected": -316.68121337890625, + "loss": 0.1365, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.828264057636261, + "rewards/margins": 2.887794017791748, + "rewards/rejected": -2.0595297813415527, + "step": 6263 + }, + { + "epoch": 0.72, + "learning_rate": 8.463069179445159e-08, + "logits/chosen": -2.982023000717163, + "logits/rejected": -2.8268449306488037, + "logps/chosen": -140.92398071289062, + "logps/rejected": -224.25408935546875, + "loss": 0.5799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1352217197418213, + "rewards/margins": 0.8748186826705933, + "rewards/rejected": -1.0100404024124146, + "step": 6264 + }, + { + "epoch": 0.72, + "learning_rate": 8.459557532482733e-08, + "logits/chosen": -3.3811850547790527, + "logits/rejected": -3.5541558265686035, + "logps/chosen": -175.58084106445312, + "logps/rejected": -174.06512451171875, + "loss": 0.3416, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06171835958957672, + "rewards/margins": 1.8693888187408447, + "rewards/rejected": -1.8076703548431396, + "step": 6265 + }, + { + "epoch": 0.72, + "learning_rate": 8.456045885520309e-08, + "logits/chosen": -3.5368447303771973, + "logits/rejected": -3.564699649810791, + "logps/chosen": -347.0350036621094, + "logps/rejected": -306.87115478515625, + "loss": 0.8387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49883148074150085, + "rewards/margins": 1.1292953491210938, + "rewards/rejected": -1.628126621246338, + "step": 6266 + }, + { + "epoch": 0.72, + "learning_rate": 8.452534238557884e-08, + "logits/chosen": -3.1877639293670654, + "logits/rejected": -3.0579466819763184, + "logps/chosen": -249.658935546875, + "logps/rejected": -257.7124938964844, + "loss": 0.4991, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22231227159500122, + "rewards/margins": 0.8941972851753235, + "rewards/rejected": -1.1165095567703247, + "step": 6267 + }, + { + "epoch": 0.72, + "learning_rate": 8.449022591595458e-08, + "logits/chosen": -3.3912158012390137, + "logits/rejected": -3.7296178340911865, + "logps/chosen": -270.26458740234375, + "logps/rejected": -351.25482177734375, + "loss": 0.7967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.731343150138855, + "rewards/margins": 0.803435206413269, + "rewards/rejected": -1.534778356552124, + "step": 6268 + }, + { + "epoch": 0.72, + "learning_rate": 8.445510944633032e-08, + "logits/chosen": -3.0934033393859863, + "logits/rejected": -3.5140411853790283, + "logps/chosen": -162.88919067382812, + "logps/rejected": -191.71502685546875, + "loss": 0.3182, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41422122716903687, + "rewards/margins": 1.5621792078018188, + "rewards/rejected": -1.976400375366211, + "step": 6269 + }, + { + "epoch": 0.72, + "learning_rate": 8.441999297670608e-08, + "logits/chosen": -2.909179925918579, + "logits/rejected": -2.954573631286621, + "logps/chosen": -185.3433837890625, + "logps/rejected": -208.2130584716797, + "loss": 0.4458, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16982579231262207, + "rewards/margins": 1.4464986324310303, + "rewards/rejected": -1.2766729593276978, + "step": 6270 + }, + { + "epoch": 0.72, + "learning_rate": 8.438487650708182e-08, + "logits/chosen": -3.0328311920166016, + "logits/rejected": -2.981339454650879, + "logps/chosen": -162.599609375, + "logps/rejected": -219.60101318359375, + "loss": 0.2512, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17381522059440613, + "rewards/margins": 1.8205586671829224, + "rewards/rejected": -1.9943739175796509, + "step": 6271 + }, + { + "epoch": 0.72, + "learning_rate": 8.434976003745756e-08, + "logits/chosen": -2.5618948936462402, + "logits/rejected": -2.5975489616394043, + "logps/chosen": -98.26287841796875, + "logps/rejected": -192.7917022705078, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.005255177617073059, + "rewards/margins": 1.5950639247894287, + "rewards/rejected": -1.6003190279006958, + "step": 6272 + }, + { + "epoch": 0.72, + "learning_rate": 8.43146435678333e-08, + "logits/chosen": -3.4478249549865723, + "logits/rejected": -3.3231570720672607, + "logps/chosen": -260.5186767578125, + "logps/rejected": -303.4814453125, + "loss": 0.1769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37039822340011597, + "rewards/margins": 2.557544231414795, + "rewards/rejected": -2.1871461868286133, + "step": 6273 + }, + { + "epoch": 0.72, + "learning_rate": 8.427952709820907e-08, + "logits/chosen": -3.2165169715881348, + "logits/rejected": -3.0515427589416504, + "logps/chosen": -150.4286651611328, + "logps/rejected": -163.34841918945312, + "loss": 0.3795, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3325687050819397, + "rewards/margins": 1.626227617263794, + "rewards/rejected": -1.293658971786499, + "step": 6274 + }, + { + "epoch": 0.72, + "learning_rate": 8.42444106285848e-08, + "logits/chosen": -2.867514133453369, + "logits/rejected": -2.816272735595703, + "logps/chosen": -374.82489013671875, + "logps/rejected": -194.81591796875, + "loss": 0.5881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2747151851654053, + "rewards/margins": 0.7732227444648743, + "rewards/rejected": -1.0479379892349243, + "step": 6275 + }, + { + "epoch": 0.72, + "learning_rate": 8.420929415896055e-08, + "logits/chosen": -3.601177215576172, + "logits/rejected": -3.122390031814575, + "logps/chosen": -250.98818969726562, + "logps/rejected": -221.5794677734375, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4452561140060425, + "rewards/margins": 1.8691513538360596, + "rewards/rejected": -1.4238951206207275, + "step": 6276 + }, + { + "epoch": 0.72, + "learning_rate": 8.417417768933629e-08, + "logits/chosen": -3.3579907417297363, + "logits/rejected": -3.6270580291748047, + "logps/chosen": -190.69354248046875, + "logps/rejected": -322.753173828125, + "loss": 0.3783, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39213645458221436, + "rewards/margins": 1.87322199344635, + "rewards/rejected": -1.4810855388641357, + "step": 6277 + }, + { + "epoch": 0.72, + "learning_rate": 8.413906121971205e-08, + "logits/chosen": -3.984138011932373, + "logits/rejected": -3.3709700107574463, + "logps/chosen": -241.1252899169922, + "logps/rejected": -172.7447052001953, + "loss": 0.4307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15482193231582642, + "rewards/margins": 1.2046866416931152, + "rewards/rejected": -1.0498647689819336, + "step": 6278 + }, + { + "epoch": 0.72, + "learning_rate": 8.41039447500878e-08, + "logits/chosen": -3.11362624168396, + "logits/rejected": -2.725102424621582, + "logps/chosen": -443.1846618652344, + "logps/rejected": -326.4090576171875, + "loss": 0.563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9030854105949402, + "rewards/margins": 1.71173095703125, + "rewards/rejected": -2.614816188812256, + "step": 6279 + }, + { + "epoch": 0.72, + "learning_rate": 8.406882828046354e-08, + "logits/chosen": -2.859363079071045, + "logits/rejected": -2.7622170448303223, + "logps/chosen": -267.2108459472656, + "logps/rejected": -248.2975311279297, + "loss": 0.2613, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24156567454338074, + "rewards/margins": 3.2222490310668945, + "rewards/rejected": -3.4638147354125977, + "step": 6280 + }, + { + "epoch": 0.72, + "learning_rate": 8.403371181083928e-08, + "logits/chosen": -3.3904356956481934, + "logits/rejected": -3.4035303592681885, + "logps/chosen": -397.56695556640625, + "logps/rejected": -380.6833190917969, + "loss": 0.159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009605005383491516, + "rewards/margins": 2.7309536933898926, + "rewards/rejected": -2.740558624267578, + "step": 6281 + }, + { + "epoch": 0.72, + "learning_rate": 8.399859534121503e-08, + "logits/chosen": -3.179352283477783, + "logits/rejected": -3.0661356449127197, + "logps/chosen": -291.62567138671875, + "logps/rejected": -203.4659423828125, + "loss": 0.423, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0720459520816803, + "rewards/margins": 1.2864990234375, + "rewards/rejected": -1.2144529819488525, + "step": 6282 + }, + { + "epoch": 0.72, + "learning_rate": 8.396347887159077e-08, + "logits/chosen": -2.5746030807495117, + "logits/rejected": -2.355295181274414, + "logps/chosen": -358.5271301269531, + "logps/rejected": -389.95880126953125, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4041932225227356, + "rewards/margins": 0.38582074642181396, + "rewards/rejected": -0.7900140285491943, + "step": 6283 + }, + { + "epoch": 0.72, + "learning_rate": 8.392836240196652e-08, + "logits/chosen": -2.477261781692505, + "logits/rejected": -2.4821016788482666, + "logps/chosen": -200.21766662597656, + "logps/rejected": -156.7978973388672, + "loss": 0.5477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5151647329330444, + "rewards/margins": 0.8082389235496521, + "rewards/rejected": -1.3234035968780518, + "step": 6284 + }, + { + "epoch": 0.72, + "learning_rate": 8.389324593234227e-08, + "logits/chosen": -2.5965070724487305, + "logits/rejected": -2.547884702682495, + "logps/chosen": -348.1279296875, + "logps/rejected": -263.2189025878906, + "loss": 0.3385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09889373183250427, + "rewards/margins": 1.7601655721664429, + "rewards/rejected": -1.6612718105316162, + "step": 6285 + }, + { + "epoch": 0.72, + "learning_rate": 8.385812946271802e-08, + "logits/chosen": -2.3406271934509277, + "logits/rejected": -2.4149844646453857, + "logps/chosen": -318.87615966796875, + "logps/rejected": -240.38192749023438, + "loss": 0.6432, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5517070889472961, + "rewards/margins": 1.0659584999084473, + "rewards/rejected": -1.6176655292510986, + "step": 6286 + }, + { + "epoch": 0.72, + "learning_rate": 8.382301299309376e-08, + "logits/chosen": -3.260585308074951, + "logits/rejected": -3.2350287437438965, + "logps/chosen": -372.6209411621094, + "logps/rejected": -251.24610900878906, + "loss": 0.3409, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01149781048297882, + "rewards/margins": 1.3019291162490845, + "rewards/rejected": -1.290431261062622, + "step": 6287 + }, + { + "epoch": 0.72, + "learning_rate": 8.37878965234695e-08, + "logits/chosen": -2.6980996131896973, + "logits/rejected": -2.7562761306762695, + "logps/chosen": -205.42996215820312, + "logps/rejected": -253.03916931152344, + "loss": 0.1743, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14497989416122437, + "rewards/margins": 2.63859224319458, + "rewards/rejected": -2.493612289428711, + "step": 6288 + }, + { + "epoch": 0.72, + "learning_rate": 8.375278005384524e-08, + "logits/chosen": -3.0154929161071777, + "logits/rejected": -3.160339117050171, + "logps/chosen": -510.25537109375, + "logps/rejected": -162.481201171875, + "loss": 0.8945, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7747213840484619, + "rewards/margins": 0.2632142901420593, + "rewards/rejected": -1.037935733795166, + "step": 6289 + }, + { + "epoch": 0.73, + "learning_rate": 8.371766358422098e-08, + "logits/chosen": -3.520028591156006, + "logits/rejected": -3.06062650680542, + "logps/chosen": -366.04168701171875, + "logps/rejected": -228.6161651611328, + "loss": 0.3049, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26746052503585815, + "rewards/margins": 2.2751567363739014, + "rewards/rejected": -2.5426173210144043, + "step": 6290 + }, + { + "epoch": 0.73, + "learning_rate": 8.368254711459675e-08, + "logits/chosen": -3.4397199153900146, + "logits/rejected": -3.4305295944213867, + "logps/chosen": -320.2738342285156, + "logps/rejected": -294.5267639160156, + "loss": 0.2737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1618047058582306, + "rewards/margins": 1.9696441888809204, + "rewards/rejected": -2.131448984146118, + "step": 6291 + }, + { + "epoch": 0.73, + "learning_rate": 8.364743064497249e-08, + "logits/chosen": -3.214134454727173, + "logits/rejected": -3.80350661277771, + "logps/chosen": -152.89271545410156, + "logps/rejected": -312.3417053222656, + "loss": 0.3712, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.018940806388855, + "rewards/margins": 1.8186287879943848, + "rewards/rejected": -2.83756947517395, + "step": 6292 + }, + { + "epoch": 0.73, + "learning_rate": 8.361231417534823e-08, + "logits/chosen": -2.613271713256836, + "logits/rejected": -2.9848105907440186, + "logps/chosen": -121.55078887939453, + "logps/rejected": -217.26815795898438, + "loss": 0.7162, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01217576302587986, + "rewards/margins": 0.4635871648788452, + "rewards/rejected": -0.45141148567199707, + "step": 6293 + }, + { + "epoch": 0.73, + "learning_rate": 8.357719770572397e-08, + "logits/chosen": -2.84100341796875, + "logits/rejected": -2.6846184730529785, + "logps/chosen": -156.8587188720703, + "logps/rejected": -273.0836181640625, + "loss": 0.3239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29572510719299316, + "rewards/margins": 1.8496615886688232, + "rewards/rejected": -2.1453864574432373, + "step": 6294 + }, + { + "epoch": 0.73, + "learning_rate": 8.354208123609974e-08, + "logits/chosen": -3.0451793670654297, + "logits/rejected": -3.0388383865356445, + "logps/chosen": -148.17198181152344, + "logps/rejected": -282.2555847167969, + "loss": 0.2625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24630150198936462, + "rewards/margins": 1.7564725875854492, + "rewards/rejected": -2.002774238586426, + "step": 6295 + }, + { + "epoch": 0.73, + "learning_rate": 8.350696476647548e-08, + "logits/chosen": -2.857025623321533, + "logits/rejected": -2.7571797370910645, + "logps/chosen": -296.7508544921875, + "logps/rejected": -302.4924011230469, + "loss": 0.6116, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.731676459312439, + "rewards/margins": 1.7516429424285889, + "rewards/rejected": -2.4833192825317383, + "step": 6296 + }, + { + "epoch": 0.73, + "learning_rate": 8.347184829685122e-08, + "logits/chosen": -2.7498693466186523, + "logits/rejected": -2.867306709289551, + "logps/chosen": -169.7121124267578, + "logps/rejected": -267.9374084472656, + "loss": 0.2856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1465682089328766, + "rewards/margins": 1.8432800769805908, + "rewards/rejected": -1.9898483753204346, + "step": 6297 + }, + { + "epoch": 0.73, + "learning_rate": 8.343673182722696e-08, + "logits/chosen": -2.8049240112304688, + "logits/rejected": -2.8365559577941895, + "logps/chosen": -251.622802734375, + "logps/rejected": -372.61907958984375, + "loss": 0.4332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36610913276672363, + "rewards/margins": 1.4710171222686768, + "rewards/rejected": -1.8371262550354004, + "step": 6298 + }, + { + "epoch": 0.73, + "learning_rate": 8.340161535760271e-08, + "logits/chosen": -3.7430360317230225, + "logits/rejected": -3.947934150695801, + "logps/chosen": -100.45657348632812, + "logps/rejected": -196.48182678222656, + "loss": 0.6446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09701310098171234, + "rewards/margins": 1.3893128633499146, + "rewards/rejected": -1.486325979232788, + "step": 6299 + }, + { + "epoch": 0.73, + "learning_rate": 8.336649888797845e-08, + "logits/chosen": -3.241220235824585, + "logits/rejected": -3.17036771774292, + "logps/chosen": -201.83982849121094, + "logps/rejected": -226.7751922607422, + "loss": 0.1453, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4980280101299286, + "rewards/margins": 2.3650870323181152, + "rewards/rejected": -1.8670592308044434, + "step": 6300 + }, + { + "epoch": 0.73, + "learning_rate": 8.333138241835421e-08, + "logits/chosen": -2.6817245483398438, + "logits/rejected": -2.552454710006714, + "logps/chosen": -243.77114868164062, + "logps/rejected": -377.82427978515625, + "loss": 0.4219, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6259527206420898, + "rewards/margins": 1.9098961353302002, + "rewards/rejected": -1.2839434146881104, + "step": 6301 + }, + { + "epoch": 0.73, + "learning_rate": 8.329626594872995e-08, + "logits/chosen": -3.139981746673584, + "logits/rejected": -3.062119245529175, + "logps/chosen": -150.7879180908203, + "logps/rejected": -217.0288848876953, + "loss": 0.4946, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09455959498882294, + "rewards/margins": 1.3896270990371704, + "rewards/rejected": -1.4841866493225098, + "step": 6302 + }, + { + "epoch": 0.73, + "learning_rate": 8.32611494791057e-08, + "logits/chosen": -2.615550994873047, + "logits/rejected": -2.8647384643554688, + "logps/chosen": -408.47674560546875, + "logps/rejected": -331.8103332519531, + "loss": 0.4335, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6378285884857178, + "rewards/margins": 3.4544098377227783, + "rewards/rejected": -2.8165812492370605, + "step": 6303 + }, + { + "epoch": 0.73, + "learning_rate": 8.322603300948144e-08, + "logits/chosen": -2.3938000202178955, + "logits/rejected": -2.8679771423339844, + "logps/chosen": -379.28179931640625, + "logps/rejected": -276.7356262207031, + "loss": 0.447, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1512470543384552, + "rewards/margins": 1.646498680114746, + "rewards/rejected": -1.797745943069458, + "step": 6304 + }, + { + "epoch": 0.73, + "learning_rate": 8.319091653985718e-08, + "logits/chosen": -4.037156105041504, + "logits/rejected": -3.8509926795959473, + "logps/chosen": -399.1663818359375, + "logps/rejected": -300.700927734375, + "loss": 0.2947, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1439438760280609, + "rewards/margins": 2.057008743286133, + "rewards/rejected": -1.9130650758743286, + "step": 6305 + }, + { + "epoch": 0.73, + "learning_rate": 8.315580007023292e-08, + "logits/chosen": -3.091869354248047, + "logits/rejected": -3.1040849685668945, + "logps/chosen": -341.19952392578125, + "logps/rejected": -230.09078979492188, + "loss": 0.4297, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04455310106277466, + "rewards/margins": 1.0075570344924927, + "rewards/rejected": -0.963003933429718, + "step": 6306 + }, + { + "epoch": 0.73, + "learning_rate": 8.312068360060869e-08, + "logits/chosen": -3.091261625289917, + "logits/rejected": -3.0261282920837402, + "logps/chosen": -275.23687744140625, + "logps/rejected": -283.5963134765625, + "loss": 0.3066, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08226747810840607, + "rewards/margins": 2.639946460723877, + "rewards/rejected": -2.5576791763305664, + "step": 6307 + }, + { + "epoch": 0.73, + "learning_rate": 8.308556713098443e-08, + "logits/chosen": -2.7452588081359863, + "logits/rejected": -2.919250011444092, + "logps/chosen": -174.54354858398438, + "logps/rejected": -182.00390625, + "loss": 0.5934, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23117992281913757, + "rewards/margins": 0.5466278195381165, + "rewards/rejected": -0.3154478669166565, + "step": 6308 + }, + { + "epoch": 0.73, + "learning_rate": 8.305045066136017e-08, + "logits/chosen": -2.735462188720703, + "logits/rejected": -2.7080869674682617, + "logps/chosen": -144.44998168945312, + "logps/rejected": -238.22006225585938, + "loss": 0.6183, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.571324348449707, + "rewards/margins": 0.4916572570800781, + "rewards/rejected": -1.0629817247390747, + "step": 6309 + }, + { + "epoch": 0.73, + "learning_rate": 8.301533419173591e-08, + "logits/chosen": -2.7152786254882812, + "logits/rejected": -2.510056972503662, + "logps/chosen": -174.60498046875, + "logps/rejected": -187.27255249023438, + "loss": 0.3028, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3051367402076721, + "rewards/margins": 2.016900062561035, + "rewards/rejected": -2.3220367431640625, + "step": 6310 + }, + { + "epoch": 0.73, + "learning_rate": 8.298021772211167e-08, + "logits/chosen": -3.484388828277588, + "logits/rejected": -3.6179251670837402, + "logps/chosen": -182.06463623046875, + "logps/rejected": -223.07818603515625, + "loss": 0.2026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5659706592559814, + "rewards/margins": 2.03658390045166, + "rewards/rejected": -1.4706132411956787, + "step": 6311 + }, + { + "epoch": 0.73, + "learning_rate": 8.294510125248742e-08, + "logits/chosen": -2.9861137866973877, + "logits/rejected": -2.924213171005249, + "logps/chosen": -224.51083374023438, + "logps/rejected": -266.17486572265625, + "loss": 0.2769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7859976291656494, + "rewards/margins": 2.972860336303711, + "rewards/rejected": -3.7588582038879395, + "step": 6312 + }, + { + "epoch": 0.73, + "learning_rate": 8.290998478286316e-08, + "logits/chosen": -3.5376572608947754, + "logits/rejected": -3.3463213443756104, + "logps/chosen": -206.10394287109375, + "logps/rejected": -363.5721435546875, + "loss": 0.4037, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26267892122268677, + "rewards/margins": 1.7122085094451904, + "rewards/rejected": -1.4495295286178589, + "step": 6313 + }, + { + "epoch": 0.73, + "learning_rate": 8.28748683132389e-08, + "logits/chosen": -3.677359104156494, + "logits/rejected": -3.882296085357666, + "logps/chosen": -172.92117309570312, + "logps/rejected": -283.5487060546875, + "loss": 0.4488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16247737407684326, + "rewards/margins": 2.753994941711426, + "rewards/rejected": -2.9164721965789795, + "step": 6314 + }, + { + "epoch": 0.73, + "learning_rate": 8.283975184361466e-08, + "logits/chosen": -3.6444387435913086, + "logits/rejected": -3.5155320167541504, + "logps/chosen": -242.06478881835938, + "logps/rejected": -289.468994140625, + "loss": 0.1407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29052865505218506, + "rewards/margins": 3.2564077377319336, + "rewards/rejected": -3.546936511993408, + "step": 6315 + }, + { + "epoch": 0.73, + "learning_rate": 8.28046353739904e-08, + "logits/chosen": -3.368485927581787, + "logits/rejected": -3.2354683876037598, + "logps/chosen": -323.9401550292969, + "logps/rejected": -262.0630798339844, + "loss": 0.4261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1523815095424652, + "rewards/margins": 1.9821691513061523, + "rewards/rejected": -2.1345505714416504, + "step": 6316 + }, + { + "epoch": 0.73, + "learning_rate": 8.276951890436614e-08, + "logits/chosen": -3.4895639419555664, + "logits/rejected": -2.777543306350708, + "logps/chosen": -292.3086242675781, + "logps/rejected": -260.3674621582031, + "loss": 0.2573, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06809408962726593, + "rewards/margins": 2.3456273078918457, + "rewards/rejected": -2.2775332927703857, + "step": 6317 + }, + { + "epoch": 0.73, + "learning_rate": 8.273440243474189e-08, + "logits/chosen": -2.4468207359313965, + "logits/rejected": -2.7551393508911133, + "logps/chosen": -235.4058837890625, + "logps/rejected": -279.052490234375, + "loss": 0.2761, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08247226476669312, + "rewards/margins": 2.1060848236083984, + "rewards/rejected": -2.1885571479797363, + "step": 6318 + }, + { + "epoch": 0.73, + "learning_rate": 8.269928596511764e-08, + "logits/chosen": -2.7532811164855957, + "logits/rejected": -2.7586770057678223, + "logps/chosen": -221.7967071533203, + "logps/rejected": -221.61431884765625, + "loss": 0.2395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11650492250919342, + "rewards/margins": 1.88939368724823, + "rewards/rejected": -2.0058987140655518, + "step": 6319 + }, + { + "epoch": 0.73, + "learning_rate": 8.266416949549339e-08, + "logits/chosen": -2.5041565895080566, + "logits/rejected": -2.70467209815979, + "logps/chosen": -414.3048095703125, + "logps/rejected": -388.85211181640625, + "loss": 0.2927, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2787785828113556, + "rewards/margins": 2.1656136512756348, + "rewards/rejected": -1.886834979057312, + "step": 6320 + }, + { + "epoch": 0.73, + "learning_rate": 8.262905302586913e-08, + "logits/chosen": -3.1583809852600098, + "logits/rejected": -3.023759365081787, + "logps/chosen": -444.9303894042969, + "logps/rejected": -266.6452331542969, + "loss": 0.5707, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24766342341899872, + "rewards/margins": 1.3770396709442139, + "rewards/rejected": -1.624703049659729, + "step": 6321 + }, + { + "epoch": 0.73, + "learning_rate": 8.259393655624487e-08, + "logits/chosen": -3.009894371032715, + "logits/rejected": -3.143695116043091, + "logps/chosen": -466.63433837890625, + "logps/rejected": -356.092529296875, + "loss": 0.4783, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027475692331790924, + "rewards/margins": 1.6950523853302002, + "rewards/rejected": -1.667576789855957, + "step": 6322 + }, + { + "epoch": 0.73, + "learning_rate": 8.255882008662063e-08, + "logits/chosen": -3.135611057281494, + "logits/rejected": -2.9321141242980957, + "logps/chosen": -378.253173828125, + "logps/rejected": -378.6016845703125, + "loss": 0.2113, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06771749258041382, + "rewards/margins": 2.4001057147979736, + "rewards/rejected": -2.332387924194336, + "step": 6323 + }, + { + "epoch": 0.73, + "learning_rate": 8.252370361699637e-08, + "logits/chosen": -2.955157995223999, + "logits/rejected": -3.264991521835327, + "logps/chosen": -292.7845153808594, + "logps/rejected": -361.56597900390625, + "loss": 0.3371, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12923410534858704, + "rewards/margins": 2.9329910278320312, + "rewards/rejected": -3.062225341796875, + "step": 6324 + }, + { + "epoch": 0.73, + "learning_rate": 8.248858714737211e-08, + "logits/chosen": -2.7200727462768555, + "logits/rejected": -2.8574166297912598, + "logps/chosen": -101.61862182617188, + "logps/rejected": -175.15118408203125, + "loss": 0.3844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5044230818748474, + "rewards/margins": 2.0023608207702637, + "rewards/rejected": -2.506783962249756, + "step": 6325 + }, + { + "epoch": 0.73, + "learning_rate": 8.245347067774786e-08, + "logits/chosen": -3.4870667457580566, + "logits/rejected": -3.700751304626465, + "logps/chosen": -288.1723327636719, + "logps/rejected": -211.3855743408203, + "loss": 0.3466, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1870412975549698, + "rewards/margins": 1.7043567895889282, + "rewards/rejected": -1.51731538772583, + "step": 6326 + }, + { + "epoch": 0.73, + "learning_rate": 8.241835420812361e-08, + "logits/chosen": -3.5563127994537354, + "logits/rejected": -3.6597371101379395, + "logps/chosen": -359.447021484375, + "logps/rejected": -316.94757080078125, + "loss": 0.4296, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19766739010810852, + "rewards/margins": 1.4659759998321533, + "rewards/rejected": -1.663643479347229, + "step": 6327 + }, + { + "epoch": 0.73, + "learning_rate": 8.238323773849935e-08, + "logits/chosen": -3.213351249694824, + "logits/rejected": -2.8402633666992188, + "logps/chosen": -240.65390014648438, + "logps/rejected": -363.2618103027344, + "loss": 0.2001, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2318764626979828, + "rewards/margins": 2.739772319793701, + "rewards/rejected": -2.5078957080841064, + "step": 6328 + }, + { + "epoch": 0.73, + "learning_rate": 8.23481212688751e-08, + "logits/chosen": -3.1121230125427246, + "logits/rejected": -3.19346022605896, + "logps/chosen": -306.8558349609375, + "logps/rejected": -318.6468200683594, + "loss": 0.3308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04743701219558716, + "rewards/margins": 2.361138343811035, + "rewards/rejected": -2.4085752964019775, + "step": 6329 + }, + { + "epoch": 0.73, + "learning_rate": 8.231300479925084e-08, + "logits/chosen": -3.440436840057373, + "logits/rejected": -3.5107665061950684, + "logps/chosen": -227.20660400390625, + "logps/rejected": -226.88888549804688, + "loss": 0.2843, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24231500923633575, + "rewards/margins": 1.882246732711792, + "rewards/rejected": -1.6399317979812622, + "step": 6330 + }, + { + "epoch": 0.73, + "learning_rate": 8.22778883296266e-08, + "logits/chosen": -3.0982139110565186, + "logits/rejected": -3.0002055168151855, + "logps/chosen": -198.01141357421875, + "logps/rejected": -179.04078674316406, + "loss": 0.7845, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7550547122955322, + "rewards/margins": 1.0219645500183105, + "rewards/rejected": -1.7770192623138428, + "step": 6331 + }, + { + "epoch": 0.73, + "learning_rate": 8.224277186000234e-08, + "logits/chosen": -2.2251553535461426, + "logits/rejected": -2.4125113487243652, + "logps/chosen": -339.6291198730469, + "logps/rejected": -382.272216796875, + "loss": 0.2449, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20114058256149292, + "rewards/margins": 2.482461452484131, + "rewards/rejected": -2.281320810317993, + "step": 6332 + }, + { + "epoch": 0.73, + "learning_rate": 8.220765539037808e-08, + "logits/chosen": -3.0635986328125, + "logits/rejected": -3.0098931789398193, + "logps/chosen": -131.987548828125, + "logps/rejected": -98.20221710205078, + "loss": 0.4996, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1639045625925064, + "rewards/margins": 0.680739164352417, + "rewards/rejected": -0.844643771648407, + "step": 6333 + }, + { + "epoch": 0.73, + "learning_rate": 8.217253892075382e-08, + "logits/chosen": -3.476184844970703, + "logits/rejected": -3.7759203910827637, + "logps/chosen": -183.18801879882812, + "logps/rejected": -241.10028076171875, + "loss": 0.2953, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4520948529243469, + "rewards/margins": 2.8170266151428223, + "rewards/rejected": -2.364931583404541, + "step": 6334 + }, + { + "epoch": 0.73, + "learning_rate": 8.213742245112959e-08, + "logits/chosen": -2.7072935104370117, + "logits/rejected": -2.904968500137329, + "logps/chosen": -392.99102783203125, + "logps/rejected": -366.7575378417969, + "loss": 0.1756, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5494084358215332, + "rewards/margins": 4.479907989501953, + "rewards/rejected": -3.93049955368042, + "step": 6335 + }, + { + "epoch": 0.73, + "learning_rate": 8.210230598150533e-08, + "logits/chosen": -3.2992005348205566, + "logits/rejected": -3.2719123363494873, + "logps/chosen": -167.09947204589844, + "logps/rejected": -208.82496643066406, + "loss": 0.5886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07910394668579102, + "rewards/margins": 1.9716858863830566, + "rewards/rejected": -2.0507898330688477, + "step": 6336 + }, + { + "epoch": 0.73, + "learning_rate": 8.206718951188107e-08, + "logits/chosen": -3.0594019889831543, + "logits/rejected": -3.0872421264648438, + "logps/chosen": -364.610107421875, + "logps/rejected": -281.11517333984375, + "loss": 0.5836, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2355513572692871, + "rewards/margins": 0.9955876469612122, + "rewards/rejected": -1.2311389446258545, + "step": 6337 + }, + { + "epoch": 0.73, + "learning_rate": 8.203207304225681e-08, + "logits/chosen": -3.19759464263916, + "logits/rejected": -3.4111719131469727, + "logps/chosen": -196.11135864257812, + "logps/rejected": -205.60726928710938, + "loss": 0.2528, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3736079931259155, + "rewards/margins": 1.9612159729003906, + "rewards/rejected": -1.5876080989837646, + "step": 6338 + }, + { + "epoch": 0.73, + "learning_rate": 8.199695657263255e-08, + "logits/chosen": -2.2560067176818848, + "logits/rejected": -2.4120278358459473, + "logps/chosen": -425.8804931640625, + "logps/rejected": -339.73419189453125, + "loss": 0.2247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011721886694431305, + "rewards/margins": 1.8477811813354492, + "rewards/rejected": -1.859503149986267, + "step": 6339 + }, + { + "epoch": 0.73, + "learning_rate": 8.196184010300832e-08, + "logits/chosen": -2.738877534866333, + "logits/rejected": -2.7374746799468994, + "logps/chosen": -391.0930480957031, + "logps/rejected": -288.0426025390625, + "loss": 0.3969, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3270891308784485, + "rewards/margins": 2.420353412628174, + "rewards/rejected": -2.09326434135437, + "step": 6340 + }, + { + "epoch": 0.73, + "learning_rate": 8.192672363338406e-08, + "logits/chosen": -2.9948980808258057, + "logits/rejected": -3.3099937438964844, + "logps/chosen": -325.0084228515625, + "logps/rejected": -379.2665710449219, + "loss": 0.6689, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13226857781410217, + "rewards/margins": 1.3249480724334717, + "rewards/rejected": -1.192679524421692, + "step": 6341 + }, + { + "epoch": 0.73, + "learning_rate": 8.18916071637598e-08, + "logits/chosen": -2.9887402057647705, + "logits/rejected": -2.799689769744873, + "logps/chosen": -491.7427978515625, + "logps/rejected": -280.3680114746094, + "loss": 0.3918, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4385789930820465, + "rewards/margins": 2.0152087211608887, + "rewards/rejected": -1.5766297578811646, + "step": 6342 + }, + { + "epoch": 0.73, + "learning_rate": 8.185649069413554e-08, + "logits/chosen": -3.2631914615631104, + "logits/rejected": -3.0676965713500977, + "logps/chosen": -255.1527099609375, + "logps/rejected": -267.9626770019531, + "loss": 0.1692, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8432037234306335, + "rewards/margins": 2.703885078430176, + "rewards/rejected": -1.8606815338134766, + "step": 6343 + }, + { + "epoch": 0.73, + "learning_rate": 8.182137422451129e-08, + "logits/chosen": -2.6290388107299805, + "logits/rejected": -2.8013126850128174, + "logps/chosen": -267.56170654296875, + "logps/rejected": -168.4673614501953, + "loss": 0.1918, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24051152169704437, + "rewards/margins": 2.336066246032715, + "rewards/rejected": -2.095554828643799, + "step": 6344 + }, + { + "epoch": 0.73, + "learning_rate": 8.178625775488703e-08, + "logits/chosen": -3.4057188034057617, + "logits/rejected": -3.060662269592285, + "logps/chosen": -285.64752197265625, + "logps/rejected": -297.5594482421875, + "loss": 0.6255, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21215638518333435, + "rewards/margins": 1.5438926219940186, + "rewards/rejected": -1.7560489177703857, + "step": 6345 + }, + { + "epoch": 0.73, + "learning_rate": 8.175114128526279e-08, + "logits/chosen": -2.923799991607666, + "logits/rejected": -2.5578091144561768, + "logps/chosen": -213.54385375976562, + "logps/rejected": -278.5721130371094, + "loss": 0.3564, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.034294985234737396, + "rewards/margins": 1.345210075378418, + "rewards/rejected": -1.3109149932861328, + "step": 6346 + }, + { + "epoch": 0.73, + "learning_rate": 8.171602481563853e-08, + "logits/chosen": -2.8355188369750977, + "logits/rejected": -2.515604019165039, + "logps/chosen": -199.46115112304688, + "logps/rejected": -207.05599975585938, + "loss": 0.1932, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18269848823547363, + "rewards/margins": 2.8039393424987793, + "rewards/rejected": -2.6212410926818848, + "step": 6347 + }, + { + "epoch": 0.73, + "learning_rate": 8.168090834601428e-08, + "logits/chosen": -3.6552932262420654, + "logits/rejected": -3.697052001953125, + "logps/chosen": -188.9609375, + "logps/rejected": -122.27813720703125, + "loss": 0.3203, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3424084484577179, + "rewards/margins": 1.938812017440796, + "rewards/rejected": -1.5964034795761108, + "step": 6348 + }, + { + "epoch": 0.73, + "learning_rate": 8.164579187639002e-08, + "logits/chosen": -1.9697914123535156, + "logits/rejected": -2.1959214210510254, + "logps/chosen": -286.42877197265625, + "logps/rejected": -243.05398559570312, + "loss": 0.2584, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06282326579093933, + "rewards/margins": 1.7223454713821411, + "rewards/rejected": -1.7851686477661133, + "step": 6349 + }, + { + "epoch": 0.73, + "learning_rate": 8.161067540676576e-08, + "logits/chosen": -3.0408599376678467, + "logits/rejected": -2.951542377471924, + "logps/chosen": -312.7063293457031, + "logps/rejected": -380.89276123046875, + "loss": 0.2314, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2843957245349884, + "rewards/margins": 2.938615083694458, + "rewards/rejected": -2.654219150543213, + "step": 6350 + }, + { + "epoch": 0.73, + "learning_rate": 8.15755589371415e-08, + "logits/chosen": -3.1710567474365234, + "logits/rejected": -3.0091583728790283, + "logps/chosen": -273.3824462890625, + "logps/rejected": -155.568359375, + "loss": 0.4274, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3847086727619171, + "rewards/margins": 1.5609290599822998, + "rewards/rejected": -1.945637822151184, + "step": 6351 + }, + { + "epoch": 0.73, + "learning_rate": 8.154044246751727e-08, + "logits/chosen": -2.915783405303955, + "logits/rejected": -2.8454856872558594, + "logps/chosen": -251.03933715820312, + "logps/rejected": -370.4572448730469, + "loss": 0.4402, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18023307621479034, + "rewards/margins": 2.408940315246582, + "rewards/rejected": -2.2287070751190186, + "step": 6352 + }, + { + "epoch": 0.73, + "learning_rate": 8.150532599789301e-08, + "logits/chosen": -3.021355390548706, + "logits/rejected": -2.7353310585021973, + "logps/chosen": -351.80126953125, + "logps/rejected": -261.84014892578125, + "loss": 0.1935, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15690863132476807, + "rewards/margins": 3.200167417526245, + "rewards/rejected": -3.0432591438293457, + "step": 6353 + }, + { + "epoch": 0.73, + "learning_rate": 8.147020952826875e-08, + "logits/chosen": -3.6495766639709473, + "logits/rejected": -3.8161864280700684, + "logps/chosen": -198.04107666015625, + "logps/rejected": -185.9938507080078, + "loss": 0.2707, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3888223171234131, + "rewards/margins": 2.704798698425293, + "rewards/rejected": -2.31597638130188, + "step": 6354 + }, + { + "epoch": 0.73, + "learning_rate": 8.143509305864449e-08, + "logits/chosen": -2.3097035884857178, + "logits/rejected": -2.286344528198242, + "logps/chosen": -194.91038513183594, + "logps/rejected": -213.70880126953125, + "loss": 0.4154, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24412240087985992, + "rewards/margins": 1.9403396844863892, + "rewards/rejected": -2.184462070465088, + "step": 6355 + }, + { + "epoch": 0.73, + "learning_rate": 8.139997658902025e-08, + "logits/chosen": -3.6603951454162598, + "logits/rejected": -3.6158432960510254, + "logps/chosen": -137.65428161621094, + "logps/rejected": -138.30923461914062, + "loss": 0.3607, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.055985912680625916, + "rewards/margins": 1.1378557682037354, + "rewards/rejected": -1.0818698406219482, + "step": 6356 + }, + { + "epoch": 0.73, + "learning_rate": 8.1364860119396e-08, + "logits/chosen": -2.837092399597168, + "logits/rejected": -2.816150426864624, + "logps/chosen": -243.66888427734375, + "logps/rejected": -175.4250030517578, + "loss": 0.6395, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14669516682624817, + "rewards/margins": 0.36867502331733704, + "rewards/rejected": -0.5153701901435852, + "step": 6357 + }, + { + "epoch": 0.73, + "learning_rate": 8.132974364977174e-08, + "logits/chosen": -2.4531631469726562, + "logits/rejected": -2.4560277462005615, + "logps/chosen": -509.62567138671875, + "logps/rejected": -264.672607421875, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27339839935302734, + "rewards/margins": 2.5294604301452637, + "rewards/rejected": -2.2560620307922363, + "step": 6358 + }, + { + "epoch": 0.73, + "learning_rate": 8.129462718014748e-08, + "logits/chosen": -2.3350751399993896, + "logits/rejected": -2.4873359203338623, + "logps/chosen": -359.728271484375, + "logps/rejected": -277.0501708984375, + "loss": 0.6514, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.019529804587364197, + "rewards/margins": 0.603407621383667, + "rewards/rejected": -0.5838778614997864, + "step": 6359 + }, + { + "epoch": 0.73, + "learning_rate": 8.125951071052324e-08, + "logits/chosen": -2.404452085494995, + "logits/rejected": -2.1451528072357178, + "logps/chosen": -253.50416564941406, + "logps/rejected": -348.7359313964844, + "loss": 0.1431, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37544316053390503, + "rewards/margins": 2.4515695571899414, + "rewards/rejected": -2.0761263370513916, + "step": 6360 + }, + { + "epoch": 0.73, + "learning_rate": 8.122439424089898e-08, + "logits/chosen": -2.7389893531799316, + "logits/rejected": -2.730325698852539, + "logps/chosen": -398.6631164550781, + "logps/rejected": -269.17828369140625, + "loss": 0.5243, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0642206072807312, + "rewards/margins": 0.8052090406417847, + "rewards/rejected": -0.8694296479225159, + "step": 6361 + }, + { + "epoch": 0.73, + "learning_rate": 8.118927777127472e-08, + "logits/chosen": -3.770435333251953, + "logits/rejected": -3.221694231033325, + "logps/chosen": -413.30889892578125, + "logps/rejected": -252.68704223632812, + "loss": 0.5221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0518014132976532, + "rewards/margins": 1.49700927734375, + "rewards/rejected": -1.548810601234436, + "step": 6362 + }, + { + "epoch": 0.73, + "learning_rate": 8.115416130165047e-08, + "logits/chosen": -3.401925563812256, + "logits/rejected": -3.8937439918518066, + "logps/chosen": -242.93101501464844, + "logps/rejected": -276.4593505859375, + "loss": 0.1997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.004763960838317871, + "rewards/margins": 3.1438703536987305, + "rewards/rejected": -3.148634433746338, + "step": 6363 + }, + { + "epoch": 0.73, + "learning_rate": 8.111904483202622e-08, + "logits/chosen": -2.9693784713745117, + "logits/rejected": -3.167632579803467, + "logps/chosen": -197.57054138183594, + "logps/rejected": -354.4634094238281, + "loss": 0.2437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07068327069282532, + "rewards/margins": 2.051967144012451, + "rewards/rejected": -1.9812836647033691, + "step": 6364 + }, + { + "epoch": 0.73, + "learning_rate": 8.108392836240196e-08, + "logits/chosen": -3.2029991149902344, + "logits/rejected": -3.379465341567993, + "logps/chosen": -256.3448181152344, + "logps/rejected": -309.5802001953125, + "loss": 0.1356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007408337667584419, + "rewards/margins": 2.6076111793518066, + "rewards/rejected": -2.6150195598602295, + "step": 6365 + }, + { + "epoch": 0.73, + "learning_rate": 8.10488118927777e-08, + "logits/chosen": -2.960991859436035, + "logits/rejected": -3.1886990070343018, + "logps/chosen": -253.54054260253906, + "logps/rejected": -251.45574951171875, + "loss": 0.1368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13899201154708862, + "rewards/margins": 2.6622610092163086, + "rewards/rejected": -2.801253080368042, + "step": 6366 + }, + { + "epoch": 0.73, + "learning_rate": 8.101369542315345e-08, + "logits/chosen": -2.7048802375793457, + "logits/rejected": -2.8295600414276123, + "logps/chosen": -174.927978515625, + "logps/rejected": -171.82675170898438, + "loss": 0.4775, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.050238706171512604, + "rewards/margins": 1.6084250211715698, + "rewards/rejected": -1.5581862926483154, + "step": 6367 + }, + { + "epoch": 0.73, + "learning_rate": 8.097857895352921e-08, + "logits/chosen": -3.2393269538879395, + "logits/rejected": -3.596400260925293, + "logps/chosen": -113.79402160644531, + "logps/rejected": -223.12271118164062, + "loss": 0.1165, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3267463743686676, + "rewards/margins": 3.569837808609009, + "rewards/rejected": -3.243091106414795, + "step": 6368 + }, + { + "epoch": 0.73, + "learning_rate": 8.094346248390495e-08, + "logits/chosen": -4.098820209503174, + "logits/rejected": -3.85150146484375, + "logps/chosen": -284.1236877441406, + "logps/rejected": -178.03887939453125, + "loss": 0.3697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18107455968856812, + "rewards/margins": 1.7937403917312622, + "rewards/rejected": -1.9748151302337646, + "step": 6369 + }, + { + "epoch": 0.73, + "learning_rate": 8.09083460142807e-08, + "logits/chosen": -3.7076416015625, + "logits/rejected": -3.507441997528076, + "logps/chosen": -326.6656799316406, + "logps/rejected": -296.0179748535156, + "loss": 0.4952, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3776654601097107, + "rewards/margins": 1.8549624681472778, + "rewards/rejected": -2.2326278686523438, + "step": 6370 + }, + { + "epoch": 0.73, + "learning_rate": 8.087322954465643e-08, + "logits/chosen": -2.939852237701416, + "logits/rejected": -2.79557204246521, + "logps/chosen": -407.7025451660156, + "logps/rejected": -309.4299621582031, + "loss": 0.4675, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17217525839805603, + "rewards/margins": 1.0717875957489014, + "rewards/rejected": -0.899612307548523, + "step": 6371 + }, + { + "epoch": 0.73, + "learning_rate": 8.083811307503219e-08, + "logits/chosen": -2.56084942817688, + "logits/rejected": -2.915778398513794, + "logps/chosen": -479.06134033203125, + "logps/rejected": -342.6046447753906, + "loss": 0.2155, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17251892387866974, + "rewards/margins": 3.286240577697754, + "rewards/rejected": -3.4587600231170654, + "step": 6372 + }, + { + "epoch": 0.73, + "learning_rate": 8.080299660540793e-08, + "logits/chosen": -2.6991729736328125, + "logits/rejected": -2.800914764404297, + "logps/chosen": -164.33297729492188, + "logps/rejected": -225.2251434326172, + "loss": 0.3595, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.022231578826904297, + "rewards/margins": 2.265775203704834, + "rewards/rejected": -2.2435436248779297, + "step": 6373 + }, + { + "epoch": 0.73, + "learning_rate": 8.076788013578368e-08, + "logits/chosen": -3.3584461212158203, + "logits/rejected": -3.167689800262451, + "logps/chosen": -242.91632080078125, + "logps/rejected": -159.3011932373047, + "loss": 0.252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1409316509962082, + "rewards/margins": 2.01949405670166, + "rewards/rejected": -1.8785622119903564, + "step": 6374 + }, + { + "epoch": 0.73, + "learning_rate": 8.073276366615942e-08, + "logits/chosen": -3.6896262168884277, + "logits/rejected": -3.4929118156433105, + "logps/chosen": -234.193359375, + "logps/rejected": -301.0379638671875, + "loss": 0.7213, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.736250638961792, + "rewards/margins": 1.606673002243042, + "rewards/rejected": -2.342923641204834, + "step": 6375 + }, + { + "epoch": 0.74, + "learning_rate": 8.069764719653518e-08, + "logits/chosen": -3.1718246936798096, + "logits/rejected": -3.4381251335144043, + "logps/chosen": -84.1958999633789, + "logps/rejected": -140.40133666992188, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.017517946660518646, + "rewards/margins": 1.9749609231948853, + "rewards/rejected": -1.992478847503662, + "step": 6376 + }, + { + "epoch": 0.74, + "learning_rate": 8.066253072691092e-08, + "logits/chosen": -3.036738872528076, + "logits/rejected": -2.881462574005127, + "logps/chosen": -572.7735595703125, + "logps/rejected": -326.032958984375, + "loss": 0.2442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35994771122932434, + "rewards/margins": 2.322282075881958, + "rewards/rejected": -1.962334394454956, + "step": 6377 + }, + { + "epoch": 0.74, + "learning_rate": 8.062741425728666e-08, + "logits/chosen": -3.245387077331543, + "logits/rejected": -2.988888740539551, + "logps/chosen": -232.01620483398438, + "logps/rejected": -214.65536499023438, + "loss": 0.3768, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5159415006637573, + "rewards/margins": 1.1996166706085205, + "rewards/rejected": -0.6836751103401184, + "step": 6378 + }, + { + "epoch": 0.74, + "learning_rate": 8.05922977876624e-08, + "logits/chosen": -2.6346516609191895, + "logits/rejected": -2.4973678588867188, + "logps/chosen": -314.97100830078125, + "logps/rejected": -259.6939392089844, + "loss": 0.4411, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13622024655342102, + "rewards/margins": 1.2620465755462646, + "rewards/rejected": -1.125826358795166, + "step": 6379 + }, + { + "epoch": 0.74, + "learning_rate": 8.055718131803817e-08, + "logits/chosen": -3.396212100982666, + "logits/rejected": -3.045229196548462, + "logps/chosen": -235.3717803955078, + "logps/rejected": -196.712646484375, + "loss": 0.4741, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03689885139465332, + "rewards/margins": 1.4741908311843872, + "rewards/rejected": -1.4372919797897339, + "step": 6380 + }, + { + "epoch": 0.74, + "learning_rate": 8.052206484841391e-08, + "logits/chosen": -3.6933372020721436, + "logits/rejected": -3.816805601119995, + "logps/chosen": -140.7845458984375, + "logps/rejected": -227.11376953125, + "loss": 0.4645, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6311870813369751, + "rewards/margins": 1.0784900188446045, + "rewards/rejected": -1.7096772193908691, + "step": 6381 + }, + { + "epoch": 0.74, + "learning_rate": 8.048694837878965e-08, + "logits/chosen": -3.1431894302368164, + "logits/rejected": -2.702505588531494, + "logps/chosen": -240.70468139648438, + "logps/rejected": -204.72630310058594, + "loss": 0.2977, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1493338793516159, + "rewards/margins": 1.5567643642425537, + "rewards/rejected": -1.4074304103851318, + "step": 6382 + }, + { + "epoch": 0.74, + "learning_rate": 8.045183190916539e-08, + "logits/chosen": -3.2592625617980957, + "logits/rejected": -2.8733367919921875, + "logps/chosen": -236.16873168945312, + "logps/rejected": -240.5750732421875, + "loss": 0.3622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1355106234550476, + "rewards/margins": 2.313220500946045, + "rewards/rejected": -2.4487311840057373, + "step": 6383 + }, + { + "epoch": 0.74, + "learning_rate": 8.041671543954113e-08, + "logits/chosen": -3.474336862564087, + "logits/rejected": -3.057342290878296, + "logps/chosen": -191.15052795410156, + "logps/rejected": -261.76080322265625, + "loss": 0.2824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41848063468933105, + "rewards/margins": 1.907273292541504, + "rewards/rejected": -2.325753927230835, + "step": 6384 + }, + { + "epoch": 0.74, + "learning_rate": 8.03815989699169e-08, + "logits/chosen": -3.7758278846740723, + "logits/rejected": -3.2835068702697754, + "logps/chosen": -217.2694091796875, + "logps/rejected": -178.60794067382812, + "loss": 0.4247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5022034049034119, + "rewards/margins": 1.6934077739715576, + "rewards/rejected": -2.1956112384796143, + "step": 6385 + }, + { + "epoch": 0.74, + "learning_rate": 8.034648250029264e-08, + "logits/chosen": -3.574932098388672, + "logits/rejected": -4.089390754699707, + "logps/chosen": -182.7436981201172, + "logps/rejected": -238.77569580078125, + "loss": 0.1871, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3667198419570923, + "rewards/margins": 2.95084547996521, + "rewards/rejected": -3.317565441131592, + "step": 6386 + }, + { + "epoch": 0.74, + "learning_rate": 8.031136603066838e-08, + "logits/chosen": -2.1259918212890625, + "logits/rejected": -2.2059555053710938, + "logps/chosen": -347.5075378417969, + "logps/rejected": -248.9064178466797, + "loss": 0.3945, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4463377296924591, + "rewards/margins": 1.2772406339645386, + "rewards/rejected": -0.8309028744697571, + "step": 6387 + }, + { + "epoch": 0.74, + "learning_rate": 8.027624956104412e-08, + "logits/chosen": -2.9886698722839355, + "logits/rejected": -3.0674943923950195, + "logps/chosen": -436.2943115234375, + "logps/rejected": -410.9330749511719, + "loss": 0.4561, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5100390315055847, + "rewards/margins": 1.3337712287902832, + "rewards/rejected": -1.8438104391098022, + "step": 6388 + }, + { + "epoch": 0.74, + "learning_rate": 8.024113309141987e-08, + "logits/chosen": -3.5982842445373535, + "logits/rejected": -3.6389830112457275, + "logps/chosen": -112.19334411621094, + "logps/rejected": -143.26820373535156, + "loss": 0.7685, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.029625654220581, + "rewards/margins": 0.5360532402992249, + "rewards/rejected": -1.5656788349151611, + "step": 6389 + }, + { + "epoch": 0.74, + "learning_rate": 8.020601662179561e-08, + "logits/chosen": -2.8903892040252686, + "logits/rejected": -2.902412176132202, + "logps/chosen": -201.92568969726562, + "logps/rejected": -229.05555725097656, + "loss": 0.1263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49130260944366455, + "rewards/margins": 3.060708522796631, + "rewards/rejected": -2.5694057941436768, + "step": 6390 + }, + { + "epoch": 0.74, + "learning_rate": 8.017090015217137e-08, + "logits/chosen": -3.1453559398651123, + "logits/rejected": -2.9990971088409424, + "logps/chosen": -344.6080322265625, + "logps/rejected": -387.9911193847656, + "loss": 0.5832, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015421777963638306, + "rewards/margins": 1.2975811958312988, + "rewards/rejected": -1.2821593284606934, + "step": 6391 + }, + { + "epoch": 0.74, + "learning_rate": 8.013578368254711e-08, + "logits/chosen": -3.3820130825042725, + "logits/rejected": -3.2194957733154297, + "logps/chosen": -256.8796081542969, + "logps/rejected": -308.796875, + "loss": 0.317, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.024566650390625, + "rewards/margins": 2.9478092193603516, + "rewards/rejected": -2.9232425689697266, + "step": 6392 + }, + { + "epoch": 0.74, + "learning_rate": 8.010066721292286e-08, + "logits/chosen": -3.1914281845092773, + "logits/rejected": -3.6177053451538086, + "logps/chosen": -222.4288330078125, + "logps/rejected": -325.9631042480469, + "loss": 0.2631, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6532437205314636, + "rewards/margins": 2.415058135986328, + "rewards/rejected": -1.7618142366409302, + "step": 6393 + }, + { + "epoch": 0.74, + "learning_rate": 8.00655507432986e-08, + "logits/chosen": -3.3823838233947754, + "logits/rejected": -3.2002811431884766, + "logps/chosen": -358.5682373046875, + "logps/rejected": -311.07305908203125, + "loss": 0.4213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2525482177734375, + "rewards/margins": 1.7154912948608398, + "rewards/rejected": -1.9680395126342773, + "step": 6394 + }, + { + "epoch": 0.74, + "learning_rate": 8.003043427367434e-08, + "logits/chosen": -2.5339107513427734, + "logits/rejected": -2.720278024673462, + "logps/chosen": -160.11904907226562, + "logps/rejected": -142.06956481933594, + "loss": 0.3725, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16557712852954865, + "rewards/margins": 1.1674367189407349, + "rewards/rejected": -1.333013892173767, + "step": 6395 + }, + { + "epoch": 0.74, + "learning_rate": 7.999531780405008e-08, + "logits/chosen": -2.83575439453125, + "logits/rejected": -2.7528390884399414, + "logps/chosen": -145.2420654296875, + "logps/rejected": -200.6960906982422, + "loss": 0.4333, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21539145708084106, + "rewards/margins": 1.1188607215881348, + "rewards/rejected": -1.3342522382736206, + "step": 6396 + }, + { + "epoch": 0.74, + "learning_rate": 7.996020133442585e-08, + "logits/chosen": -2.6903083324432373, + "logits/rejected": -2.676698923110962, + "logps/chosen": -214.57102966308594, + "logps/rejected": -193.89443969726562, + "loss": 0.3035, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12050998210906982, + "rewards/margins": 2.1191534996032715, + "rewards/rejected": -1.998643398284912, + "step": 6397 + }, + { + "epoch": 0.74, + "learning_rate": 7.992508486480159e-08, + "logits/chosen": -3.0630149841308594, + "logits/rejected": -3.0034971237182617, + "logps/chosen": -275.3714904785156, + "logps/rejected": -309.33416748046875, + "loss": 0.3386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0684414803981781, + "rewards/margins": 1.2675408124923706, + "rewards/rejected": -1.335982322692871, + "step": 6398 + }, + { + "epoch": 0.74, + "learning_rate": 7.988996839517733e-08, + "logits/chosen": -2.747711658477783, + "logits/rejected": -2.778618812561035, + "logps/chosen": -254.91064453125, + "logps/rejected": -202.17062377929688, + "loss": 0.6683, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39809221029281616, + "rewards/margins": 1.691719651222229, + "rewards/rejected": -2.0898118019104004, + "step": 6399 + }, + { + "epoch": 0.74, + "learning_rate": 7.985485192555307e-08, + "logits/chosen": -2.94149112701416, + "logits/rejected": -3.0213146209716797, + "logps/chosen": -267.23883056640625, + "logps/rejected": -187.97970581054688, + "loss": 0.3598, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06932765245437622, + "rewards/margins": 0.9977157711982727, + "rewards/rejected": -0.9283881187438965, + "step": 6400 + }, + { + "epoch": 0.74, + "learning_rate": 7.981973545592884e-08, + "logits/chosen": -2.706113815307617, + "logits/rejected": -2.74666690826416, + "logps/chosen": -166.17848205566406, + "logps/rejected": -272.041015625, + "loss": 0.631, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004582107067108154, + "rewards/margins": 1.252347469329834, + "rewards/rejected": -1.247765302658081, + "step": 6401 + }, + { + "epoch": 0.74, + "learning_rate": 7.978461898630458e-08, + "logits/chosen": -3.3254823684692383, + "logits/rejected": -3.224916696548462, + "logps/chosen": -372.8643798828125, + "logps/rejected": -328.0347900390625, + "loss": 0.7962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6515939235687256, + "rewards/margins": 1.082826852798462, + "rewards/rejected": -1.7344207763671875, + "step": 6402 + }, + { + "epoch": 0.74, + "learning_rate": 7.974950251668032e-08, + "logits/chosen": -2.6197896003723145, + "logits/rejected": -2.4741783142089844, + "logps/chosen": -217.13755798339844, + "logps/rejected": -139.45465087890625, + "loss": 0.3904, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27704551815986633, + "rewards/margins": 0.9774760603904724, + "rewards/rejected": -0.7004304528236389, + "step": 6403 + }, + { + "epoch": 0.74, + "learning_rate": 7.971438604705606e-08, + "logits/chosen": -3.1816813945770264, + "logits/rejected": -3.1724586486816406, + "logps/chosen": -219.45481872558594, + "logps/rejected": -236.34559631347656, + "loss": 0.4211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2791505753993988, + "rewards/margins": 2.07902193069458, + "rewards/rejected": -2.3581724166870117, + "step": 6404 + }, + { + "epoch": 0.74, + "learning_rate": 7.967926957743181e-08, + "logits/chosen": -2.437246322631836, + "logits/rejected": -2.4722900390625, + "logps/chosen": -303.12457275390625, + "logps/rejected": -240.4605712890625, + "loss": 0.5546, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0028901025652885437, + "rewards/margins": 1.148919701576233, + "rewards/rejected": -1.1460297107696533, + "step": 6405 + }, + { + "epoch": 0.74, + "learning_rate": 7.964415310780756e-08, + "logits/chosen": -3.187304973602295, + "logits/rejected": -3.1214237213134766, + "logps/chosen": -171.21070861816406, + "logps/rejected": -178.51560974121094, + "loss": 0.3804, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20186583697795868, + "rewards/margins": 1.6068201065063477, + "rewards/rejected": -1.4049540758132935, + "step": 6406 + }, + { + "epoch": 0.74, + "learning_rate": 7.96090366381833e-08, + "logits/chosen": -2.596289873123169, + "logits/rejected": -2.517672538757324, + "logps/chosen": -286.5695495605469, + "logps/rejected": -233.165771484375, + "loss": 0.4037, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4390692710876465, + "rewards/margins": 1.5821290016174316, + "rewards/rejected": -2.021198272705078, + "step": 6407 + }, + { + "epoch": 0.74, + "learning_rate": 7.957392016855905e-08, + "logits/chosen": -3.241090774536133, + "logits/rejected": -3.4115002155303955, + "logps/chosen": -328.1256408691406, + "logps/rejected": -226.6127166748047, + "loss": 0.5113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8321160674095154, + "rewards/margins": 2.0095484256744385, + "rewards/rejected": -2.8416645526885986, + "step": 6408 + }, + { + "epoch": 0.74, + "learning_rate": 7.95388036989348e-08, + "logits/chosen": -3.1972084045410156, + "logits/rejected": -3.244002342224121, + "logps/chosen": -275.59259033203125, + "logps/rejected": -284.1604919433594, + "loss": 0.2857, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18991619348526, + "rewards/margins": 1.8637348413467407, + "rewards/rejected": -1.673818588256836, + "step": 6409 + }, + { + "epoch": 0.74, + "learning_rate": 7.950368722931054e-08, + "logits/chosen": -2.7098658084869385, + "logits/rejected": -2.969816207885742, + "logps/chosen": -268.836669921875, + "logps/rejected": -179.5952911376953, + "loss": 0.4366, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03782980516552925, + "rewards/margins": 1.3893709182739258, + "rewards/rejected": -1.3515410423278809, + "step": 6410 + }, + { + "epoch": 0.74, + "learning_rate": 7.946857075968628e-08, + "logits/chosen": -2.1057214736938477, + "logits/rejected": -2.249715805053711, + "logps/chosen": -159.45518493652344, + "logps/rejected": -113.3492431640625, + "loss": 0.6113, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3513216972351074, + "rewards/margins": 0.49614864587783813, + "rewards/rejected": -0.8474702835083008, + "step": 6411 + }, + { + "epoch": 0.74, + "learning_rate": 7.943345429006203e-08, + "logits/chosen": -3.3374617099761963, + "logits/rejected": -3.3425800800323486, + "logps/chosen": -297.6021423339844, + "logps/rejected": -257.83013916015625, + "loss": 0.3108, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13965582847595215, + "rewards/margins": 2.656076669692993, + "rewards/rejected": -2.7957327365875244, + "step": 6412 + }, + { + "epoch": 0.74, + "learning_rate": 7.939833782043779e-08, + "logits/chosen": -3.2378180027008057, + "logits/rejected": -3.326991081237793, + "logps/chosen": -387.8145751953125, + "logps/rejected": -380.1577453613281, + "loss": 0.1679, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16405567526817322, + "rewards/margins": 2.832596778869629, + "rewards/rejected": -2.6685409545898438, + "step": 6413 + }, + { + "epoch": 0.74, + "learning_rate": 7.936322135081353e-08, + "logits/chosen": -3.245939254760742, + "logits/rejected": -3.8335585594177246, + "logps/chosen": -207.14767456054688, + "logps/rejected": -346.48956298828125, + "loss": 0.1937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.093332439661026, + "rewards/margins": 2.6958625316619873, + "rewards/rejected": -2.7891950607299805, + "step": 6414 + }, + { + "epoch": 0.74, + "learning_rate": 7.932810488118927e-08, + "logits/chosen": -3.906020402908325, + "logits/rejected": -3.6584229469299316, + "logps/chosen": -187.86448669433594, + "logps/rejected": -133.4293670654297, + "loss": 0.4861, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5021368861198425, + "rewards/margins": 0.7564477920532227, + "rewards/rejected": -1.2585844993591309, + "step": 6415 + }, + { + "epoch": 0.74, + "learning_rate": 7.929298841156501e-08, + "logits/chosen": -3.4207942485809326, + "logits/rejected": -3.3908863067626953, + "logps/chosen": -252.47952270507812, + "logps/rejected": -259.82464599609375, + "loss": 0.3399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07614961266517639, + "rewards/margins": 2.7489774227142334, + "rewards/rejected": -2.825127124786377, + "step": 6416 + }, + { + "epoch": 0.74, + "learning_rate": 7.925787194194077e-08, + "logits/chosen": -3.124948501586914, + "logits/rejected": -2.9276134967803955, + "logps/chosen": -292.0642395019531, + "logps/rejected": -355.5625305175781, + "loss": 0.4231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2036464512348175, + "rewards/margins": 1.5435770750045776, + "rewards/rejected": -1.7472234964370728, + "step": 6417 + }, + { + "epoch": 0.74, + "learning_rate": 7.922275547231652e-08, + "logits/chosen": -2.600986957550049, + "logits/rejected": -2.8939731121063232, + "logps/chosen": -308.0492858886719, + "logps/rejected": -184.551513671875, + "loss": 0.5644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4691298007965088, + "rewards/margins": 1.3447052240371704, + "rewards/rejected": -1.8138350248336792, + "step": 6418 + }, + { + "epoch": 0.74, + "learning_rate": 7.918763900269226e-08, + "logits/chosen": -2.6393394470214844, + "logits/rejected": -2.6255719661712646, + "logps/chosen": -412.720947265625, + "logps/rejected": -305.37445068359375, + "loss": 0.4452, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8761891722679138, + "rewards/margins": 1.6873996257781982, + "rewards/rejected": -0.8112105131149292, + "step": 6419 + }, + { + "epoch": 0.74, + "learning_rate": 7.9152522533068e-08, + "logits/chosen": -3.1580066680908203, + "logits/rejected": -3.27726674079895, + "logps/chosen": -212.27728271484375, + "logps/rejected": -212.1817169189453, + "loss": 0.445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.854422390460968, + "rewards/margins": 1.4548239707946777, + "rewards/rejected": -2.30924654006958, + "step": 6420 + }, + { + "epoch": 0.74, + "learning_rate": 7.911740606344376e-08, + "logits/chosen": -3.57234263420105, + "logits/rejected": -3.3071887493133545, + "logps/chosen": -341.68768310546875, + "logps/rejected": -379.4610290527344, + "loss": 0.5278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29412856698036194, + "rewards/margins": 1.1350258588790894, + "rewards/rejected": -1.429154396057129, + "step": 6421 + }, + { + "epoch": 0.74, + "learning_rate": 7.90822895938195e-08, + "logits/chosen": -3.5060548782348633, + "logits/rejected": -3.3839635848999023, + "logps/chosen": -280.2442626953125, + "logps/rejected": -234.03265380859375, + "loss": 0.3762, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43068408966064453, + "rewards/margins": 1.528872013092041, + "rewards/rejected": -1.0981879234313965, + "step": 6422 + }, + { + "epoch": 0.74, + "learning_rate": 7.904717312419524e-08, + "logits/chosen": -3.278953790664673, + "logits/rejected": -3.2084038257598877, + "logps/chosen": -144.41165161132812, + "logps/rejected": -248.35287475585938, + "loss": 0.4286, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3198094367980957, + "rewards/margins": 0.879848301410675, + "rewards/rejected": -0.5600388646125793, + "step": 6423 + }, + { + "epoch": 0.74, + "learning_rate": 7.901205665457098e-08, + "logits/chosen": -3.436493158340454, + "logits/rejected": -2.995820999145508, + "logps/chosen": -600.589111328125, + "logps/rejected": -253.78677368164062, + "loss": 0.3408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5846751928329468, + "rewards/margins": 1.6193628311157227, + "rewards/rejected": -2.204037666320801, + "step": 6424 + }, + { + "epoch": 0.74, + "learning_rate": 7.897694018494675e-08, + "logits/chosen": -3.365835428237915, + "logits/rejected": -3.244309425354004, + "logps/chosen": -102.35816955566406, + "logps/rejected": -141.8848114013672, + "loss": 0.3855, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35344141721725464, + "rewards/margins": 1.7179454565048218, + "rewards/rejected": -2.0713868141174316, + "step": 6425 + }, + { + "epoch": 0.74, + "learning_rate": 7.894182371532249e-08, + "logits/chosen": -2.6167845726013184, + "logits/rejected": -2.8456854820251465, + "logps/chosen": -114.60612487792969, + "logps/rejected": -258.79095458984375, + "loss": 0.2206, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11766412854194641, + "rewards/margins": 2.6881935596466064, + "rewards/rejected": -2.5705294609069824, + "step": 6426 + }, + { + "epoch": 0.74, + "learning_rate": 7.890670724569823e-08, + "logits/chosen": -3.733285427093506, + "logits/rejected": -3.630037546157837, + "logps/chosen": -181.77764892578125, + "logps/rejected": -130.56777954101562, + "loss": 0.3525, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19714099168777466, + "rewards/margins": 1.4447553157806396, + "rewards/rejected": -1.2476142644882202, + "step": 6427 + }, + { + "epoch": 0.74, + "learning_rate": 7.887159077607397e-08, + "logits/chosen": -3.0915865898132324, + "logits/rejected": -3.127490997314453, + "logps/chosen": -189.06521606445312, + "logps/rejected": -222.28404235839844, + "loss": 0.3588, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2025388777256012, + "rewards/margins": 1.9891682863235474, + "rewards/rejected": -2.191707134246826, + "step": 6428 + }, + { + "epoch": 0.74, + "learning_rate": 7.883647430644973e-08, + "logits/chosen": -2.939854621887207, + "logits/rejected": -2.994647979736328, + "logps/chosen": -331.1605224609375, + "logps/rejected": -325.30084228515625, + "loss": 0.4809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39373812079429626, + "rewards/margins": 1.2941648960113525, + "rewards/rejected": -1.6879029273986816, + "step": 6429 + }, + { + "epoch": 0.74, + "learning_rate": 7.880135783682548e-08, + "logits/chosen": -3.0066096782684326, + "logits/rejected": -2.836583137512207, + "logps/chosen": -328.8945617675781, + "logps/rejected": -418.656494140625, + "loss": 0.5088, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0811169296503067, + "rewards/margins": 1.0951902866363525, + "rewards/rejected": -1.176307201385498, + "step": 6430 + }, + { + "epoch": 0.74, + "learning_rate": 7.876624136720122e-08, + "logits/chosen": -2.556809902191162, + "logits/rejected": -2.7570528984069824, + "logps/chosen": -265.0262451171875, + "logps/rejected": -257.71258544921875, + "loss": 0.3842, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25023895502090454, + "rewards/margins": 1.2042670249938965, + "rewards/rejected": -0.9540281891822815, + "step": 6431 + }, + { + "epoch": 0.74, + "learning_rate": 7.873112489757696e-08, + "logits/chosen": -3.328857898712158, + "logits/rejected": -3.3412721157073975, + "logps/chosen": -137.2555389404297, + "logps/rejected": -260.1296081542969, + "loss": 0.1855, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46027398109436035, + "rewards/margins": 3.091370105743408, + "rewards/rejected": -2.631096363067627, + "step": 6432 + }, + { + "epoch": 0.74, + "learning_rate": 7.86960084279527e-08, + "logits/chosen": -3.1151885986328125, + "logits/rejected": -3.8100008964538574, + "logps/chosen": -122.67564392089844, + "logps/rejected": -138.04373168945312, + "loss": 0.5859, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6028680801391602, + "rewards/margins": 0.7388539910316467, + "rewards/rejected": -1.3417221307754517, + "step": 6433 + }, + { + "epoch": 0.74, + "learning_rate": 7.866089195832845e-08, + "logits/chosen": -3.106966972351074, + "logits/rejected": -3.1327462196350098, + "logps/chosen": -303.5438537597656, + "logps/rejected": -318.565185546875, + "loss": 0.5153, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22139447927474976, + "rewards/margins": 1.160252571105957, + "rewards/rejected": -0.938857913017273, + "step": 6434 + }, + { + "epoch": 0.74, + "learning_rate": 7.86257754887042e-08, + "logits/chosen": -2.7859878540039062, + "logits/rejected": -2.6443378925323486, + "logps/chosen": -368.04296875, + "logps/rejected": -341.12554931640625, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.367580771446228, + "rewards/margins": 2.7012882232666016, + "rewards/rejected": -2.333707332611084, + "step": 6435 + }, + { + "epoch": 0.74, + "learning_rate": 7.859065901907995e-08, + "logits/chosen": -3.4898462295532227, + "logits/rejected": -3.5386297702789307, + "logps/chosen": -222.08287048339844, + "logps/rejected": -194.33514404296875, + "loss": 0.2954, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6948533654212952, + "rewards/margins": 1.697340726852417, + "rewards/rejected": -1.0024874210357666, + "step": 6436 + }, + { + "epoch": 0.74, + "learning_rate": 7.855554254945569e-08, + "logits/chosen": -3.062453031539917, + "logits/rejected": -3.28981876373291, + "logps/chosen": -184.9267578125, + "logps/rejected": -275.02825927734375, + "loss": 0.385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35631585121154785, + "rewards/margins": 1.3602895736694336, + "rewards/rejected": -1.7166054248809814, + "step": 6437 + }, + { + "epoch": 0.74, + "learning_rate": 7.852042607983144e-08, + "logits/chosen": -2.796786308288574, + "logits/rejected": -2.7710378170013428, + "logps/chosen": -197.33758544921875, + "logps/rejected": -208.72271728515625, + "loss": 0.6486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7828726768493652, + "rewards/margins": 1.1915662288665771, + "rewards/rejected": -1.974439024925232, + "step": 6438 + }, + { + "epoch": 0.74, + "learning_rate": 7.848530961020718e-08, + "logits/chosen": -2.8964333534240723, + "logits/rejected": -2.9952995777130127, + "logps/chosen": -296.0469970703125, + "logps/rejected": -177.16619873046875, + "loss": 0.613, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7515465021133423, + "rewards/margins": 0.48830538988113403, + "rewards/rejected": -1.2398518323898315, + "step": 6439 + }, + { + "epoch": 0.74, + "learning_rate": 7.845019314058292e-08, + "logits/chosen": -2.991663932800293, + "logits/rejected": -2.755199670791626, + "logps/chosen": -169.64254760742188, + "logps/rejected": -281.62921142578125, + "loss": 0.5176, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24918991327285767, + "rewards/margins": 1.3706790208816528, + "rewards/rejected": -1.6198689937591553, + "step": 6440 + }, + { + "epoch": 0.74, + "learning_rate": 7.841507667095866e-08, + "logits/chosen": -2.790821075439453, + "logits/rejected": -2.8970396518707275, + "logps/chosen": -342.53814697265625, + "logps/rejected": -256.77569580078125, + "loss": 0.5321, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12828460335731506, + "rewards/margins": 2.112487316131592, + "rewards/rejected": -2.240771770477295, + "step": 6441 + }, + { + "epoch": 0.74, + "learning_rate": 7.837996020133443e-08, + "logits/chosen": -2.9871037006378174, + "logits/rejected": -3.110673189163208, + "logps/chosen": -242.28240966796875, + "logps/rejected": -267.92486572265625, + "loss": 0.2266, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33631861209869385, + "rewards/margins": 2.2164556980133057, + "rewards/rejected": -1.8801369667053223, + "step": 6442 + }, + { + "epoch": 0.74, + "learning_rate": 7.834484373171017e-08, + "logits/chosen": -3.826474666595459, + "logits/rejected": -3.6512093544006348, + "logps/chosen": -154.7283172607422, + "logps/rejected": -178.46890258789062, + "loss": 0.3323, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.051804088056087494, + "rewards/margins": 1.9189543724060059, + "rewards/rejected": -1.9707584381103516, + "step": 6443 + }, + { + "epoch": 0.74, + "learning_rate": 7.830972726208591e-08, + "logits/chosen": -2.5595662593841553, + "logits/rejected": -2.4383225440979004, + "logps/chosen": -205.67005920410156, + "logps/rejected": -321.8591003417969, + "loss": 0.3382, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03614449501037598, + "rewards/margins": 2.0067455768585205, + "rewards/rejected": -1.970601201057434, + "step": 6444 + }, + { + "epoch": 0.74, + "learning_rate": 7.827461079246165e-08, + "logits/chosen": -2.9525718688964844, + "logits/rejected": -2.8749959468841553, + "logps/chosen": -196.96963500976562, + "logps/rejected": -239.5850830078125, + "loss": 0.6275, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14940962195396423, + "rewards/margins": 0.7104698419570923, + "rewards/rejected": -0.8598795533180237, + "step": 6445 + }, + { + "epoch": 0.74, + "learning_rate": 7.823949432283742e-08, + "logits/chosen": -2.9717981815338135, + "logits/rejected": -2.896757125854492, + "logps/chosen": -242.0891571044922, + "logps/rejected": -261.220458984375, + "loss": 0.1385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5329132080078125, + "rewards/margins": 3.6137008666992188, + "rewards/rejected": -3.0807876586914062, + "step": 6446 + }, + { + "epoch": 0.74, + "learning_rate": 7.820437785321316e-08, + "logits/chosen": -2.7301371097564697, + "logits/rejected": -2.7544052600860596, + "logps/chosen": -308.7724609375, + "logps/rejected": -376.7019958496094, + "loss": 0.33, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16255971789360046, + "rewards/margins": 3.549356460571289, + "rewards/rejected": -3.386796474456787, + "step": 6447 + }, + { + "epoch": 0.74, + "learning_rate": 7.81692613835889e-08, + "logits/chosen": -3.586872100830078, + "logits/rejected": -3.6079599857330322, + "logps/chosen": -224.94430541992188, + "logps/rejected": -261.7461853027344, + "loss": 0.294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4623001217842102, + "rewards/margins": 2.1842904090881348, + "rewards/rejected": -2.6465904712677, + "step": 6448 + }, + { + "epoch": 0.74, + "learning_rate": 7.813414491396464e-08, + "logits/chosen": -2.221890926361084, + "logits/rejected": -2.5076801776885986, + "logps/chosen": -368.9783020019531, + "logps/rejected": -338.59368896484375, + "loss": 0.2227, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3380822539329529, + "rewards/margins": 2.8566441535949707, + "rewards/rejected": -2.518561840057373, + "step": 6449 + }, + { + "epoch": 0.74, + "learning_rate": 7.80990284443404e-08, + "logits/chosen": -2.3335511684417725, + "logits/rejected": -2.4149727821350098, + "logps/chosen": -363.0848388671875, + "logps/rejected": -289.3457946777344, + "loss": 0.4195, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2778128683567047, + "rewards/margins": 2.289132833480835, + "rewards/rejected": -2.566945791244507, + "step": 6450 + }, + { + "epoch": 0.74, + "learning_rate": 7.806391197471613e-08, + "logits/chosen": -3.797441005706787, + "logits/rejected": -3.7945451736450195, + "logps/chosen": -388.6646423339844, + "logps/rejected": -509.83148193359375, + "loss": 0.4355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0017329156398773193, + "rewards/margins": 2.1972265243530273, + "rewards/rejected": -2.1989593505859375, + "step": 6451 + }, + { + "epoch": 0.74, + "learning_rate": 7.802879550509189e-08, + "logits/chosen": -2.4465785026550293, + "logits/rejected": -2.6925251483917236, + "logps/chosen": -443.947998046875, + "logps/rejected": -427.9801330566406, + "loss": 0.4823, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20724400877952576, + "rewards/margins": 1.4689677953720093, + "rewards/rejected": -1.6762118339538574, + "step": 6452 + }, + { + "epoch": 0.74, + "learning_rate": 7.799367903546763e-08, + "logits/chosen": -2.629246234893799, + "logits/rejected": -2.5227861404418945, + "logps/chosen": -204.33900451660156, + "logps/rejected": -199.80282592773438, + "loss": 0.4519, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6496258974075317, + "rewards/margins": 0.9693929553031921, + "rewards/rejected": -1.619018793106079, + "step": 6453 + }, + { + "epoch": 0.74, + "learning_rate": 7.795856256584338e-08, + "logits/chosen": -1.957680106163025, + "logits/rejected": -2.047872304916382, + "logps/chosen": -403.48095703125, + "logps/rejected": -300.4806213378906, + "loss": 0.6239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018184572458267212, + "rewards/margins": 0.6611497402191162, + "rewards/rejected": -0.6793343424797058, + "step": 6454 + }, + { + "epoch": 0.74, + "learning_rate": 7.792344609621912e-08, + "logits/chosen": -2.8011691570281982, + "logits/rejected": -3.073173761367798, + "logps/chosen": -248.10963439941406, + "logps/rejected": -205.0605010986328, + "loss": 0.419, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01224873960018158, + "rewards/margins": 1.0846678018569946, + "rewards/rejected": -1.0724190473556519, + "step": 6455 + }, + { + "epoch": 0.74, + "learning_rate": 7.788832962659486e-08, + "logits/chosen": -2.9114861488342285, + "logits/rejected": -3.0614700317382812, + "logps/chosen": -162.31491088867188, + "logps/rejected": -206.80274963378906, + "loss": 0.4839, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.042829930782318115, + "rewards/margins": 1.900216817855835, + "rewards/rejected": -1.8573869466781616, + "step": 6456 + }, + { + "epoch": 0.74, + "learning_rate": 7.78532131569706e-08, + "logits/chosen": -3.803100109100342, + "logits/rejected": -3.5004806518554688, + "logps/chosen": -289.9686279296875, + "logps/rejected": -228.89007568359375, + "loss": 0.347, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5590882897377014, + "rewards/margins": 1.36179518699646, + "rewards/rejected": -0.8027068376541138, + "step": 6457 + }, + { + "epoch": 0.74, + "learning_rate": 7.781809668734637e-08, + "logits/chosen": -3.0667452812194824, + "logits/rejected": -2.779268741607666, + "logps/chosen": -157.91685485839844, + "logps/rejected": -111.47301483154297, + "loss": 0.6946, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3417852222919464, + "rewards/margins": 0.4036257863044739, + "rewards/rejected": -0.7454110383987427, + "step": 6458 + }, + { + "epoch": 0.74, + "learning_rate": 7.778298021772211e-08, + "logits/chosen": -2.27905011177063, + "logits/rejected": -2.396794557571411, + "logps/chosen": -260.6044616699219, + "logps/rejected": -199.41409301757812, + "loss": 0.4745, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09461800754070282, + "rewards/margins": 1.592972755432129, + "rewards/rejected": -1.4983547925949097, + "step": 6459 + }, + { + "epoch": 0.74, + "learning_rate": 7.774786374809785e-08, + "logits/chosen": -2.370687961578369, + "logits/rejected": -2.562455892562866, + "logps/chosen": -413.1358337402344, + "logps/rejected": -236.24578857421875, + "loss": 0.4719, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04253290593624115, + "rewards/margins": 1.184563398361206, + "rewards/rejected": -1.1420304775238037, + "step": 6460 + }, + { + "epoch": 0.74, + "learning_rate": 7.77127472784736e-08, + "logits/chosen": -2.976067304611206, + "logits/rejected": -3.039309024810791, + "logps/chosen": -188.83444213867188, + "logps/rejected": -283.1610107421875, + "loss": 0.3443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34446537494659424, + "rewards/margins": 2.617556095123291, + "rewards/rejected": -2.9620213508605957, + "step": 6461 + }, + { + "epoch": 0.74, + "learning_rate": 7.767763080884935e-08, + "logits/chosen": -2.663421630859375, + "logits/rejected": -2.738847494125366, + "logps/chosen": -343.8318786621094, + "logps/rejected": -344.9078063964844, + "loss": 0.3352, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24475470185279846, + "rewards/margins": 1.6283448934555054, + "rewards/rejected": -1.3835902214050293, + "step": 6462 + }, + { + "epoch": 0.75, + "learning_rate": 7.76425143392251e-08, + "logits/chosen": -3.265627145767212, + "logits/rejected": -2.7743136882781982, + "logps/chosen": -167.58860778808594, + "logps/rejected": -235.95474243164062, + "loss": 0.6213, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5353987216949463, + "rewards/margins": 1.347619891166687, + "rewards/rejected": -1.8830187320709229, + "step": 6463 + }, + { + "epoch": 0.75, + "learning_rate": 7.760739786960084e-08, + "logits/chosen": -2.7575244903564453, + "logits/rejected": -2.8512141704559326, + "logps/chosen": -198.43621826171875, + "logps/rejected": -201.7295684814453, + "loss": 0.4046, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.023220881819725037, + "rewards/margins": 1.2861474752426147, + "rewards/rejected": -1.262926697731018, + "step": 6464 + }, + { + "epoch": 0.75, + "learning_rate": 7.757228139997658e-08, + "logits/chosen": -2.938166856765747, + "logits/rejected": -2.9401979446411133, + "logps/chosen": -410.30169677734375, + "logps/rejected": -249.59146118164062, + "loss": 0.3811, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44342729449272156, + "rewards/margins": 1.0467824935913086, + "rewards/rejected": -0.6033551096916199, + "step": 6465 + }, + { + "epoch": 0.75, + "learning_rate": 7.753716493035234e-08, + "logits/chosen": -2.6886961460113525, + "logits/rejected": -2.662539482116699, + "logps/chosen": -313.06982421875, + "logps/rejected": -261.8187255859375, + "loss": 0.5557, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7038354873657227, + "rewards/margins": 1.35688054561615, + "rewards/rejected": -2.060716152191162, + "step": 6466 + }, + { + "epoch": 0.75, + "learning_rate": 7.750204846072808e-08, + "logits/chosen": -3.495527744293213, + "logits/rejected": -3.6539969444274902, + "logps/chosen": -125.57868957519531, + "logps/rejected": -165.42938232421875, + "loss": 0.3294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12933042645454407, + "rewards/margins": 1.6982178688049316, + "rewards/rejected": -1.8275482654571533, + "step": 6467 + }, + { + "epoch": 0.75, + "learning_rate": 7.746693199110382e-08, + "logits/chosen": -2.175307035446167, + "logits/rejected": -2.2903549671173096, + "logps/chosen": -273.85369873046875, + "logps/rejected": -317.2001037597656, + "loss": 0.3841, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3220002055168152, + "rewards/margins": 1.8508973121643066, + "rewards/rejected": -1.5288970470428467, + "step": 6468 + }, + { + "epoch": 0.75, + "learning_rate": 7.743181552147957e-08, + "logits/chosen": -3.516238212585449, + "logits/rejected": -3.28208065032959, + "logps/chosen": -338.69451904296875, + "logps/rejected": -224.90167236328125, + "loss": 0.3856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19431215524673462, + "rewards/margins": 1.2812305688858032, + "rewards/rejected": -1.475542664527893, + "step": 6469 + }, + { + "epoch": 0.75, + "learning_rate": 7.739669905185533e-08, + "logits/chosen": -3.49448823928833, + "logits/rejected": -3.4101572036743164, + "logps/chosen": -317.0505676269531, + "logps/rejected": -309.2614440917969, + "loss": 0.307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24605783820152283, + "rewards/margins": 3.482137441635132, + "rewards/rejected": -3.236079692840576, + "step": 6470 + }, + { + "epoch": 0.75, + "learning_rate": 7.736158258223107e-08, + "logits/chosen": -3.598710060119629, + "logits/rejected": -3.518826723098755, + "logps/chosen": -155.84597778320312, + "logps/rejected": -220.13134765625, + "loss": 0.4369, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2908773422241211, + "rewards/margins": 0.7860502004623413, + "rewards/rejected": -0.495172917842865, + "step": 6471 + }, + { + "epoch": 0.75, + "learning_rate": 7.732646611260681e-08, + "logits/chosen": -2.9399354457855225, + "logits/rejected": -2.9047601222991943, + "logps/chosen": -146.7411651611328, + "logps/rejected": -236.7665557861328, + "loss": 0.6512, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6469926834106445, + "rewards/margins": 0.5870485305786133, + "rewards/rejected": -1.2340412139892578, + "step": 6472 + }, + { + "epoch": 0.75, + "learning_rate": 7.729134964298255e-08, + "logits/chosen": -2.8062145709991455, + "logits/rejected": -2.9400105476379395, + "logps/chosen": -387.98846435546875, + "logps/rejected": -378.82940673828125, + "loss": 0.4501, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2947269678115845, + "rewards/margins": 1.979923129081726, + "rewards/rejected": -2.2746500968933105, + "step": 6473 + }, + { + "epoch": 0.75, + "learning_rate": 7.725623317335831e-08, + "logits/chosen": -2.7872681617736816, + "logits/rejected": -2.628687858581543, + "logps/chosen": -215.11219787597656, + "logps/rejected": -183.9815673828125, + "loss": 0.8189, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4634980261325836, + "rewards/margins": 0.2766202688217163, + "rewards/rejected": -0.7401183843612671, + "step": 6474 + }, + { + "epoch": 0.75, + "learning_rate": 7.722111670373405e-08, + "logits/chosen": -2.7798078060150146, + "logits/rejected": -2.8525900840759277, + "logps/chosen": -261.1553649902344, + "logps/rejected": -85.6917953491211, + "loss": 0.5672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3356441855430603, + "rewards/margins": 0.5613486766815186, + "rewards/rejected": -0.8969928026199341, + "step": 6475 + }, + { + "epoch": 0.75, + "learning_rate": 7.71860002341098e-08, + "logits/chosen": -3.0236315727233887, + "logits/rejected": -3.264103651046753, + "logps/chosen": -310.6517333984375, + "logps/rejected": -256.31103515625, + "loss": 0.1968, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04125623404979706, + "rewards/margins": 3.073268413543701, + "rewards/rejected": -3.0320119857788086, + "step": 6476 + }, + { + "epoch": 0.75, + "learning_rate": 7.715088376448554e-08, + "logits/chosen": -3.1249635219573975, + "logits/rejected": -3.0256919860839844, + "logps/chosen": -243.515380859375, + "logps/rejected": -299.82598876953125, + "loss": 0.6058, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3581535220146179, + "rewards/margins": 1.5307197570800781, + "rewards/rejected": -1.1725661754608154, + "step": 6477 + }, + { + "epoch": 0.75, + "learning_rate": 7.711576729486129e-08, + "logits/chosen": -2.7523279190063477, + "logits/rejected": -3.046518564224243, + "logps/chosen": -236.1796875, + "logps/rejected": -226.11782836914062, + "loss": 0.3512, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22763609886169434, + "rewards/margins": 1.9732410907745361, + "rewards/rejected": -2.2008771896362305, + "step": 6478 + }, + { + "epoch": 0.75, + "learning_rate": 7.708065082523703e-08, + "logits/chosen": -3.1514666080474854, + "logits/rejected": -3.377070426940918, + "logps/chosen": -254.89974975585938, + "logps/rejected": -245.13015747070312, + "loss": 0.4774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4769400358200073, + "rewards/margins": 1.8159513473510742, + "rewards/rejected": -2.292891502380371, + "step": 6479 + }, + { + "epoch": 0.75, + "learning_rate": 7.704553435561278e-08, + "logits/chosen": -2.430668830871582, + "logits/rejected": -2.4704012870788574, + "logps/chosen": -369.027587890625, + "logps/rejected": -246.21453857421875, + "loss": 0.4602, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01004992425441742, + "rewards/margins": 2.0527431964874268, + "rewards/rejected": -2.0426933765411377, + "step": 6480 + }, + { + "epoch": 0.75, + "learning_rate": 7.701041788598853e-08, + "logits/chosen": -2.7770090103149414, + "logits/rejected": -2.5841822624206543, + "logps/chosen": -407.4916076660156, + "logps/rejected": -360.21533203125, + "loss": 0.1751, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07695138454437256, + "rewards/margins": 2.216918706893921, + "rewards/rejected": -2.139967203140259, + "step": 6481 + }, + { + "epoch": 0.75, + "learning_rate": 7.697530141636427e-08, + "logits/chosen": -2.646050214767456, + "logits/rejected": -2.833428382873535, + "logps/chosen": -100.26174926757812, + "logps/rejected": -241.74966430664062, + "loss": 0.1991, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22922126948833466, + "rewards/margins": 2.1988589763641357, + "rewards/rejected": -1.9696377515792847, + "step": 6482 + }, + { + "epoch": 0.75, + "learning_rate": 7.694018494674002e-08, + "logits/chosen": -3.226240634918213, + "logits/rejected": -3.4811646938323975, + "logps/chosen": -91.27635192871094, + "logps/rejected": -236.05487060546875, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14726081490516663, + "rewards/margins": 2.4725241661071777, + "rewards/rejected": -2.325263023376465, + "step": 6483 + }, + { + "epoch": 0.75, + "learning_rate": 7.690506847711576e-08, + "logits/chosen": -3.153597593307495, + "logits/rejected": -3.48480486869812, + "logps/chosen": -135.2821044921875, + "logps/rejected": -205.56961059570312, + "loss": 0.5563, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06084933876991272, + "rewards/margins": 1.118162751197815, + "rewards/rejected": -1.057313323020935, + "step": 6484 + }, + { + "epoch": 0.75, + "learning_rate": 7.68699520074915e-08, + "logits/chosen": -3.8122591972351074, + "logits/rejected": -3.4576544761657715, + "logps/chosen": -229.5215301513672, + "logps/rejected": -236.5208282470703, + "loss": 0.2303, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5939233899116516, + "rewards/margins": 2.7691822052001953, + "rewards/rejected": -2.1752588748931885, + "step": 6485 + }, + { + "epoch": 0.75, + "learning_rate": 7.683483553786725e-08, + "logits/chosen": -2.7706494331359863, + "logits/rejected": -2.802107095718384, + "logps/chosen": -139.47958374023438, + "logps/rejected": -203.73521423339844, + "loss": 0.3474, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4977773129940033, + "rewards/margins": 1.4175978899002075, + "rewards/rejected": -0.9198206067085266, + "step": 6486 + }, + { + "epoch": 0.75, + "learning_rate": 7.679971906824301e-08, + "logits/chosen": -2.6499171257019043, + "logits/rejected": -2.5231680870056152, + "logps/chosen": -394.2127685546875, + "logps/rejected": -271.6762390136719, + "loss": 0.2705, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018754959106445312, + "rewards/margins": 1.9008960723876953, + "rewards/rejected": -1.8821409940719604, + "step": 6487 + }, + { + "epoch": 0.75, + "learning_rate": 7.676460259861875e-08, + "logits/chosen": -2.912457227706909, + "logits/rejected": -2.9687581062316895, + "logps/chosen": -466.8310241699219, + "logps/rejected": -396.41912841796875, + "loss": 0.3558, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.304269015789032, + "rewards/margins": 1.706907033920288, + "rewards/rejected": -1.4026378393173218, + "step": 6488 + }, + { + "epoch": 0.75, + "learning_rate": 7.672948612899449e-08, + "logits/chosen": -3.2861125469207764, + "logits/rejected": -3.310567855834961, + "logps/chosen": -323.02691650390625, + "logps/rejected": -328.266357421875, + "loss": 0.1278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1362614929676056, + "rewards/margins": 3.3413617610931396, + "rewards/rejected": -3.477623224258423, + "step": 6489 + }, + { + "epoch": 0.75, + "learning_rate": 7.669436965937023e-08, + "logits/chosen": -3.835117816925049, + "logits/rejected": -3.4782800674438477, + "logps/chosen": -256.03033447265625, + "logps/rejected": -191.8092803955078, + "loss": 0.2582, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17934419214725494, + "rewards/margins": 1.3111696243286133, + "rewards/rejected": -1.1318254470825195, + "step": 6490 + }, + { + "epoch": 0.75, + "learning_rate": 7.6659253189746e-08, + "logits/chosen": -2.8962535858154297, + "logits/rejected": -2.8031957149505615, + "logps/chosen": -287.2691955566406, + "logps/rejected": -274.0860900878906, + "loss": 0.7863, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09967190027236938, + "rewards/margins": 0.35114896297454834, + "rewards/rejected": -0.4508208930492401, + "step": 6491 + }, + { + "epoch": 0.75, + "learning_rate": 7.662413672012174e-08, + "logits/chosen": -2.1327548027038574, + "logits/rejected": -2.3547451496124268, + "logps/chosen": -432.81805419921875, + "logps/rejected": -262.3018798828125, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.015278682112693787, + "rewards/margins": 1.9702988862991333, + "rewards/rejected": -1.9550203084945679, + "step": 6492 + }, + { + "epoch": 0.75, + "learning_rate": 7.658902025049748e-08, + "logits/chosen": -3.563631057739258, + "logits/rejected": -3.6595945358276367, + "logps/chosen": -385.14483642578125, + "logps/rejected": -184.54776000976562, + "loss": 0.5147, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42551517486572266, + "rewards/margins": 1.3504798412322998, + "rewards/rejected": -1.7759950160980225, + "step": 6493 + }, + { + "epoch": 0.75, + "learning_rate": 7.655390378087322e-08, + "logits/chosen": -3.4432175159454346, + "logits/rejected": -3.4605228900909424, + "logps/chosen": -241.50985717773438, + "logps/rejected": -262.1628112792969, + "loss": 0.3358, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05820727348327637, + "rewards/margins": 2.607933282852173, + "rewards/rejected": -2.6661407947540283, + "step": 6494 + }, + { + "epoch": 0.75, + "learning_rate": 7.651878731124897e-08, + "logits/chosen": -3.345986843109131, + "logits/rejected": -3.1482748985290527, + "logps/chosen": -154.74949645996094, + "logps/rejected": -147.97879028320312, + "loss": 0.4647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47857144474983215, + "rewards/margins": 1.7878497838974, + "rewards/rejected": -2.266421318054199, + "step": 6495 + }, + { + "epoch": 0.75, + "learning_rate": 7.648367084162471e-08, + "logits/chosen": -2.862565755844116, + "logits/rejected": -3.1387453079223633, + "logps/chosen": -285.0855712890625, + "logps/rejected": -311.04852294921875, + "loss": 0.7567, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29181766510009766, + "rewards/margins": 0.827465832233429, + "rewards/rejected": -1.1192835569381714, + "step": 6496 + }, + { + "epoch": 0.75, + "learning_rate": 7.644855437200047e-08, + "logits/chosen": -2.906141757965088, + "logits/rejected": -3.2622570991516113, + "logps/chosen": -245.92922973632812, + "logps/rejected": -175.70713806152344, + "loss": 0.3614, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09336479008197784, + "rewards/margins": 1.1728240251541138, + "rewards/rejected": -1.2661888599395752, + "step": 6497 + }, + { + "epoch": 0.75, + "learning_rate": 7.641343790237621e-08, + "logits/chosen": -2.4805121421813965, + "logits/rejected": -2.6254630088806152, + "logps/chosen": -388.48260498046875, + "logps/rejected": -259.4971618652344, + "loss": 0.2122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44157448410987854, + "rewards/margins": 2.603321075439453, + "rewards/rejected": -2.1617465019226074, + "step": 6498 + }, + { + "epoch": 0.75, + "learning_rate": 7.637832143275196e-08, + "logits/chosen": -3.412341356277466, + "logits/rejected": -3.3840365409851074, + "logps/chosen": -231.84307861328125, + "logps/rejected": -217.4248046875, + "loss": 0.2935, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06419935822486877, + "rewards/margins": 2.5246706008911133, + "rewards/rejected": -2.58886981010437, + "step": 6499 + }, + { + "epoch": 0.75, + "learning_rate": 7.63432049631277e-08, + "logits/chosen": -2.933264970779419, + "logits/rejected": -3.0499138832092285, + "logps/chosen": -141.75442504882812, + "logps/rejected": -246.0009307861328, + "loss": 0.3028, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23343193531036377, + "rewards/margins": 3.0776004791259766, + "rewards/rejected": -3.311032772064209, + "step": 6500 + }, + { + "epoch": 0.75, + "learning_rate": 7.630808849350344e-08, + "logits/chosen": -2.55734920501709, + "logits/rejected": -2.356348991394043, + "logps/chosen": -227.35995483398438, + "logps/rejected": -170.04684448242188, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11066178977489471, + "rewards/margins": 2.1427924633026123, + "rewards/rejected": -2.032130718231201, + "step": 6501 + }, + { + "epoch": 0.75, + "learning_rate": 7.627297202387918e-08, + "logits/chosen": -3.721618175506592, + "logits/rejected": -3.640932083129883, + "logps/chosen": -220.23220825195312, + "logps/rejected": -271.0509033203125, + "loss": 1.3128, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3311026096343994, + "rewards/margins": 0.12671315670013428, + "rewards/rejected": -1.4578156471252441, + "step": 6502 + }, + { + "epoch": 0.75, + "learning_rate": 7.623785555425495e-08, + "logits/chosen": -3.321223735809326, + "logits/rejected": -3.2864201068878174, + "logps/chosen": -290.44427490234375, + "logps/rejected": -283.0976257324219, + "loss": 0.2498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2212362289428711, + "rewards/margins": 2.2553791999816895, + "rewards/rejected": -2.0341432094573975, + "step": 6503 + }, + { + "epoch": 0.75, + "learning_rate": 7.620273908463069e-08, + "logits/chosen": -2.6551616191864014, + "logits/rejected": -2.6672258377075195, + "logps/chosen": -212.0513916015625, + "logps/rejected": -185.9377899169922, + "loss": 0.3492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03805989772081375, + "rewards/margins": 1.279830813407898, + "rewards/rejected": -1.3178906440734863, + "step": 6504 + }, + { + "epoch": 0.75, + "learning_rate": 7.616762261500643e-08, + "logits/chosen": -3.584686279296875, + "logits/rejected": -3.767699718475342, + "logps/chosen": -104.268310546875, + "logps/rejected": -176.30438232421875, + "loss": 0.236, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3064998388290405, + "rewards/margins": 2.479572296142578, + "rewards/rejected": -1.1730725765228271, + "step": 6505 + }, + { + "epoch": 0.75, + "learning_rate": 7.613250614538217e-08, + "logits/chosen": -3.2867660522460938, + "logits/rejected": -3.0920982360839844, + "logps/chosen": -321.56304931640625, + "logps/rejected": -360.7660217285156, + "loss": 0.0888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34653210639953613, + "rewards/margins": 5.007863521575928, + "rewards/rejected": -4.661332130432129, + "step": 6506 + }, + { + "epoch": 0.75, + "learning_rate": 7.609738967575794e-08, + "logits/chosen": -2.9876575469970703, + "logits/rejected": -3.019923210144043, + "logps/chosen": -373.105712890625, + "logps/rejected": -399.05523681640625, + "loss": 0.3237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33076804876327515, + "rewards/margins": 2.3888180255889893, + "rewards/rejected": -2.71958589553833, + "step": 6507 + }, + { + "epoch": 0.75, + "learning_rate": 7.606227320613368e-08, + "logits/chosen": -2.9815986156463623, + "logits/rejected": -3.1906673908233643, + "logps/chosen": -122.9112777709961, + "logps/rejected": -163.3248748779297, + "loss": 0.442, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018664300441741943, + "rewards/margins": 1.7209234237670898, + "rewards/rejected": -1.7022590637207031, + "step": 6508 + }, + { + "epoch": 0.75, + "learning_rate": 7.602715673650942e-08, + "logits/chosen": -3.793114423751831, + "logits/rejected": -3.602126121520996, + "logps/chosen": -277.98248291015625, + "logps/rejected": -254.28778076171875, + "loss": 0.2329, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3082159161567688, + "rewards/margins": 2.5891823768615723, + "rewards/rejected": -2.2809667587280273, + "step": 6509 + }, + { + "epoch": 0.75, + "learning_rate": 7.599204026688516e-08, + "logits/chosen": -3.323521137237549, + "logits/rejected": -3.162421464920044, + "logps/chosen": -191.2319793701172, + "logps/rejected": -232.73318481445312, + "loss": 0.1884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6039155721664429, + "rewards/margins": 2.6276028156280518, + "rewards/rejected": -3.2315187454223633, + "step": 6510 + }, + { + "epoch": 0.75, + "learning_rate": 7.595692379726092e-08, + "logits/chosen": -2.9219470024108887, + "logits/rejected": -3.052826404571533, + "logps/chosen": -95.91213989257812, + "logps/rejected": -103.5226058959961, + "loss": 0.3907, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15509366989135742, + "rewards/margins": 1.2208197116851807, + "rewards/rejected": -1.3759132623672485, + "step": 6511 + }, + { + "epoch": 0.75, + "learning_rate": 7.592180732763666e-08, + "logits/chosen": -2.7765085697174072, + "logits/rejected": -2.6297311782836914, + "logps/chosen": -255.2647705078125, + "logps/rejected": -187.45362854003906, + "loss": 0.5367, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5771779417991638, + "rewards/margins": 1.3416435718536377, + "rewards/rejected": -1.9188215732574463, + "step": 6512 + }, + { + "epoch": 0.75, + "learning_rate": 7.58866908580124e-08, + "logits/chosen": -2.9897921085357666, + "logits/rejected": -3.4109253883361816, + "logps/chosen": -216.80075073242188, + "logps/rejected": -204.0928497314453, + "loss": 0.3939, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.115714892745018, + "rewards/margins": 1.4345275163650513, + "rewards/rejected": -1.3188127279281616, + "step": 6513 + }, + { + "epoch": 0.75, + "learning_rate": 7.585157438838815e-08, + "logits/chosen": -3.4047112464904785, + "logits/rejected": -3.05232572555542, + "logps/chosen": -345.5143127441406, + "logps/rejected": -203.15670776367188, + "loss": 0.3518, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41347840428352356, + "rewards/margins": 1.628831148147583, + "rewards/rejected": -1.2153527736663818, + "step": 6514 + }, + { + "epoch": 0.75, + "learning_rate": 7.58164579187639e-08, + "logits/chosen": -3.5607805252075195, + "logits/rejected": -3.3864588737487793, + "logps/chosen": -192.48434448242188, + "logps/rejected": -246.35940551757812, + "loss": 0.2178, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5536952614784241, + "rewards/margins": 2.3185577392578125, + "rewards/rejected": -1.7648625373840332, + "step": 6515 + }, + { + "epoch": 0.75, + "learning_rate": 7.578134144913965e-08, + "logits/chosen": -3.5859196186065674, + "logits/rejected": -3.5124669075012207, + "logps/chosen": -150.69947814941406, + "logps/rejected": -137.43556213378906, + "loss": 0.4847, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3370472490787506, + "rewards/margins": 0.8788605332374573, + "rewards/rejected": -0.5418132543563843, + "step": 6516 + }, + { + "epoch": 0.75, + "learning_rate": 7.574622497951539e-08, + "logits/chosen": -3.533740997314453, + "logits/rejected": -3.088996410369873, + "logps/chosen": -184.53836059570312, + "logps/rejected": -232.36146545410156, + "loss": 0.2832, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30797645449638367, + "rewards/margins": 1.9857368469238281, + "rewards/rejected": -1.677760362625122, + "step": 6517 + }, + { + "epoch": 0.75, + "learning_rate": 7.571110850989113e-08, + "logits/chosen": -2.908053398132324, + "logits/rejected": -2.9372589588165283, + "logps/chosen": -226.55169677734375, + "logps/rejected": -289.9132080078125, + "loss": 0.2553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26379287242889404, + "rewards/margins": 2.493405342102051, + "rewards/rejected": -2.2296125888824463, + "step": 6518 + }, + { + "epoch": 0.75, + "learning_rate": 7.56759920402669e-08, + "logits/chosen": -3.2346673011779785, + "logits/rejected": -3.3661887645721436, + "logps/chosen": -333.4208068847656, + "logps/rejected": -227.08041381835938, + "loss": 0.4064, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04627283662557602, + "rewards/margins": 1.468430757522583, + "rewards/rejected": -1.422157883644104, + "step": 6519 + }, + { + "epoch": 0.75, + "learning_rate": 7.564087557064263e-08, + "logits/chosen": -3.1157736778259277, + "logits/rejected": -3.807528495788574, + "logps/chosen": -191.6927947998047, + "logps/rejected": -255.67822265625, + "loss": 0.3196, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1446167230606079, + "rewards/margins": 2.9586405754089355, + "rewards/rejected": -2.814023971557617, + "step": 6520 + }, + { + "epoch": 0.75, + "learning_rate": 7.560575910101837e-08, + "logits/chosen": -3.30830979347229, + "logits/rejected": -2.6087608337402344, + "logps/chosen": -409.8335266113281, + "logps/rejected": -282.00921630859375, + "loss": 0.1598, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26857730746269226, + "rewards/margins": 2.9933900833129883, + "rewards/rejected": -2.7248125076293945, + "step": 6521 + }, + { + "epoch": 0.75, + "learning_rate": 7.557064263139412e-08, + "logits/chosen": -3.2779014110565186, + "logits/rejected": -3.1872267723083496, + "logps/chosen": -112.30634307861328, + "logps/rejected": -215.7979278564453, + "loss": 0.3798, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2633770704269409, + "rewards/margins": 1.795915961265564, + "rewards/rejected": -2.059292793273926, + "step": 6522 + }, + { + "epoch": 0.75, + "learning_rate": 7.553552616176987e-08, + "logits/chosen": -2.8517587184906006, + "logits/rejected": -3.4699535369873047, + "logps/chosen": -198.90774536132812, + "logps/rejected": -323.7243347167969, + "loss": 0.4929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5169811844825745, + "rewards/margins": 1.3740003108978271, + "rewards/rejected": -1.890981674194336, + "step": 6523 + }, + { + "epoch": 0.75, + "learning_rate": 7.550040969214562e-08, + "logits/chosen": -3.440267324447632, + "logits/rejected": -3.6587626934051514, + "logps/chosen": -291.5665283203125, + "logps/rejected": -326.76947021484375, + "loss": 0.4492, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11885866522789001, + "rewards/margins": 2.3523738384246826, + "rewards/rejected": -2.4712321758270264, + "step": 6524 + }, + { + "epoch": 0.75, + "learning_rate": 7.546529322252136e-08, + "logits/chosen": -3.2368786334991455, + "logits/rejected": -3.089158773422241, + "logps/chosen": -404.9744873046875, + "logps/rejected": -363.9091796875, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3825230002403259, + "rewards/margins": 3.537560224533081, + "rewards/rejected": -3.1550374031066895, + "step": 6525 + }, + { + "epoch": 0.75, + "learning_rate": 7.54301767528971e-08, + "logits/chosen": -3.444892644882202, + "logits/rejected": -3.101372718811035, + "logps/chosen": -278.055908203125, + "logps/rejected": -326.1573486328125, + "loss": 0.6762, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5893330574035645, + "rewards/margins": 0.5603678226470947, + "rewards/rejected": -1.1497007608413696, + "step": 6526 + }, + { + "epoch": 0.75, + "learning_rate": 7.539506028327285e-08, + "logits/chosen": -3.052907943725586, + "logits/rejected": -2.8268730640411377, + "logps/chosen": -278.0086975097656, + "logps/rejected": -356.0139465332031, + "loss": 0.2018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4018939137458801, + "rewards/margins": 2.136855125427246, + "rewards/rejected": -2.5387492179870605, + "step": 6527 + }, + { + "epoch": 0.75, + "learning_rate": 7.53599438136486e-08, + "logits/chosen": -3.8642566204071045, + "logits/rejected": -3.7533626556396484, + "logps/chosen": -132.16558837890625, + "logps/rejected": -190.14651489257812, + "loss": 0.2779, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3817184269428253, + "rewards/margins": 2.039013147354126, + "rewards/rejected": -2.420731544494629, + "step": 6528 + }, + { + "epoch": 0.75, + "learning_rate": 7.532482734402434e-08, + "logits/chosen": -3.681105852127075, + "logits/rejected": -3.413938045501709, + "logps/chosen": -161.257080078125, + "logps/rejected": -190.24114990234375, + "loss": 0.6837, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0057098865509033, + "rewards/margins": 1.7423654794692993, + "rewards/rejected": -2.748075485229492, + "step": 6529 + }, + { + "epoch": 0.75, + "learning_rate": 7.528971087440008e-08, + "logits/chosen": -3.1546051502227783, + "logits/rejected": -3.4408740997314453, + "logps/chosen": -375.99066162109375, + "logps/rejected": -393.6514587402344, + "loss": 0.3022, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04169077426195145, + "rewards/margins": 2.201643943786621, + "rewards/rejected": -2.2433345317840576, + "step": 6530 + }, + { + "epoch": 0.75, + "learning_rate": 7.525459440477583e-08, + "logits/chosen": -3.6739842891693115, + "logits/rejected": -3.5017249584198, + "logps/chosen": -387.4170837402344, + "logps/rejected": -368.8540954589844, + "loss": 0.4688, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4473057985305786, + "rewards/margins": 1.2038755416870117, + "rewards/rejected": -1.6511814594268799, + "step": 6531 + }, + { + "epoch": 0.75, + "learning_rate": 7.521947793515159e-08, + "logits/chosen": -3.5818443298339844, + "logits/rejected": -3.6348114013671875, + "logps/chosen": -291.43756103515625, + "logps/rejected": -295.6882019042969, + "loss": 0.4599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26185524463653564, + "rewards/margins": 1.2087764739990234, + "rewards/rejected": -1.4706318378448486, + "step": 6532 + }, + { + "epoch": 0.75, + "learning_rate": 7.518436146552733e-08, + "logits/chosen": -3.0696208477020264, + "logits/rejected": -2.701815128326416, + "logps/chosen": -391.3270568847656, + "logps/rejected": -257.32366943359375, + "loss": 0.3493, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1496993899345398, + "rewards/margins": 1.5526443719863892, + "rewards/rejected": -1.7023438215255737, + "step": 6533 + }, + { + "epoch": 0.75, + "learning_rate": 7.514924499590307e-08, + "logits/chosen": -3.4806714057922363, + "logits/rejected": -3.2448201179504395, + "logps/chosen": -282.55810546875, + "logps/rejected": -318.3855285644531, + "loss": 0.4413, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25659283995628357, + "rewards/margins": 1.9953383207321167, + "rewards/rejected": -1.7387455701828003, + "step": 6534 + }, + { + "epoch": 0.75, + "learning_rate": 7.511412852627881e-08, + "logits/chosen": -2.794063091278076, + "logits/rejected": -2.7325100898742676, + "logps/chosen": -284.23211669921875, + "logps/rejected": -277.35882568359375, + "loss": 0.4252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3852311968803406, + "rewards/margins": 0.9680879712104797, + "rewards/rejected": -1.3533190488815308, + "step": 6535 + }, + { + "epoch": 0.75, + "learning_rate": 7.507901205665458e-08, + "logits/chosen": -2.3681752681732178, + "logits/rejected": -2.484633445739746, + "logps/chosen": -477.0494689941406, + "logps/rejected": -317.46539306640625, + "loss": 0.3556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0661531463265419, + "rewards/margins": 1.3548974990844727, + "rewards/rejected": -1.421050786972046, + "step": 6536 + }, + { + "epoch": 0.75, + "learning_rate": 7.504389558703032e-08, + "logits/chosen": -3.001845359802246, + "logits/rejected": -3.1923954486846924, + "logps/chosen": -209.1826171875, + "logps/rejected": -233.042724609375, + "loss": 0.3112, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28428834676742554, + "rewards/margins": 1.8297929763793945, + "rewards/rejected": -1.5455045700073242, + "step": 6537 + }, + { + "epoch": 0.75, + "learning_rate": 7.500877911740606e-08, + "logits/chosen": -2.6471452713012695, + "logits/rejected": -2.7673840522766113, + "logps/chosen": -351.2968444824219, + "logps/rejected": -392.42919921875, + "loss": 0.2806, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2282203882932663, + "rewards/margins": 1.928169846534729, + "rewards/rejected": -2.1563901901245117, + "step": 6538 + }, + { + "epoch": 0.75, + "learning_rate": 7.497366264778181e-08, + "logits/chosen": -3.1957271099090576, + "logits/rejected": -2.5698862075805664, + "logps/chosen": -314.2757873535156, + "logps/rejected": -216.6915283203125, + "loss": 0.1217, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2935863733291626, + "rewards/margins": 2.604506015777588, + "rewards/rejected": -2.3109195232391357, + "step": 6539 + }, + { + "epoch": 0.75, + "learning_rate": 7.493854617815755e-08, + "logits/chosen": -3.0691072940826416, + "logits/rejected": -3.3866875171661377, + "logps/chosen": -184.68789672851562, + "logps/rejected": -204.42652893066406, + "loss": 0.1982, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29127073287963867, + "rewards/margins": 3.587167501449585, + "rewards/rejected": -3.295896530151367, + "step": 6540 + }, + { + "epoch": 0.75, + "learning_rate": 7.49034297085333e-08, + "logits/chosen": -3.2182934284210205, + "logits/rejected": -3.0686349868774414, + "logps/chosen": -361.7158203125, + "logps/rejected": -281.7326965332031, + "loss": 0.1886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1662176251411438, + "rewards/margins": 2.864023208618164, + "rewards/rejected": -3.030241012573242, + "step": 6541 + }, + { + "epoch": 0.75, + "learning_rate": 7.486831323890905e-08, + "logits/chosen": -3.263988494873047, + "logits/rejected": -3.339576482772827, + "logps/chosen": -356.34869384765625, + "logps/rejected": -208.26235961914062, + "loss": 0.3911, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43283194303512573, + "rewards/margins": 1.680727243423462, + "rewards/rejected": -2.1135592460632324, + "step": 6542 + }, + { + "epoch": 0.75, + "learning_rate": 7.483319676928479e-08, + "logits/chosen": -3.0811853408813477, + "logits/rejected": -2.7996299266815186, + "logps/chosen": -325.34234619140625, + "logps/rejected": -204.7603759765625, + "loss": 0.3678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027760900557041168, + "rewards/margins": 1.0378828048706055, + "rewards/rejected": -1.0656436681747437, + "step": 6543 + }, + { + "epoch": 0.75, + "learning_rate": 7.479808029966053e-08, + "logits/chosen": -3.185659885406494, + "logits/rejected": -3.025155544281006, + "logps/chosen": -376.037353515625, + "logps/rejected": -253.80215454101562, + "loss": 0.4801, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18034932017326355, + "rewards/margins": 0.9969501495361328, + "rewards/rejected": -1.1772993803024292, + "step": 6544 + }, + { + "epoch": 0.75, + "learning_rate": 7.476296383003628e-08, + "logits/chosen": -3.1137399673461914, + "logits/rejected": -3.4895453453063965, + "logps/chosen": -131.1956024169922, + "logps/rejected": -309.62750244140625, + "loss": 0.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1104879379272461, + "rewards/margins": 2.5583739280700684, + "rewards/rejected": -2.6688621044158936, + "step": 6545 + }, + { + "epoch": 0.75, + "learning_rate": 7.472784736041202e-08, + "logits/chosen": -2.944122076034546, + "logits/rejected": -2.8698127269744873, + "logps/chosen": -209.2655487060547, + "logps/rejected": -254.73280334472656, + "loss": 0.6446, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4455290138721466, + "rewards/margins": 1.7793720960617065, + "rewards/rejected": -1.3338429927825928, + "step": 6546 + }, + { + "epoch": 0.75, + "learning_rate": 7.469273089078778e-08, + "logits/chosen": -3.407456874847412, + "logits/rejected": -3.568614959716797, + "logps/chosen": -121.22969055175781, + "logps/rejected": -140.32835388183594, + "loss": 0.4275, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2390754222869873, + "rewards/margins": 1.1134474277496338, + "rewards/rejected": -1.3525227308273315, + "step": 6547 + }, + { + "epoch": 0.75, + "learning_rate": 7.465761442116352e-08, + "logits/chosen": -2.6996140480041504, + "logits/rejected": -2.9075300693511963, + "logps/chosen": -273.2840881347656, + "logps/rejected": -247.98765563964844, + "loss": 0.5271, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15210485458374023, + "rewards/margins": 0.7527235746383667, + "rewards/rejected": -0.9048283696174622, + "step": 6548 + }, + { + "epoch": 0.75, + "learning_rate": 7.462249795153927e-08, + "logits/chosen": -3.72564697265625, + "logits/rejected": -3.3327300548553467, + "logps/chosen": -172.9583740234375, + "logps/rejected": -177.21322631835938, + "loss": 0.4148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21348682045936584, + "rewards/margins": 1.2866634130477905, + "rewards/rejected": -1.500150203704834, + "step": 6549 + }, + { + "epoch": 0.76, + "learning_rate": 7.458738148191501e-08, + "logits/chosen": -2.6583213806152344, + "logits/rejected": -2.983236789703369, + "logps/chosen": -224.46682739257812, + "logps/rejected": -317.26763916015625, + "loss": 0.4317, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05847051739692688, + "rewards/margins": 0.8639366626739502, + "rewards/rejected": -0.8054660558700562, + "step": 6550 + }, + { + "epoch": 0.76, + "learning_rate": 7.455226501229077e-08, + "logits/chosen": -3.823631525039673, + "logits/rejected": -3.8988664150238037, + "logps/chosen": -157.77517700195312, + "logps/rejected": -160.58628845214844, + "loss": 0.1956, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7096014022827148, + "rewards/margins": 2.6385297775268555, + "rewards/rejected": -1.9289283752441406, + "step": 6551 + }, + { + "epoch": 0.76, + "learning_rate": 7.45171485426665e-08, + "logits/chosen": -2.4724228382110596, + "logits/rejected": -2.536151885986328, + "logps/chosen": -271.87908935546875, + "logps/rejected": -237.53294372558594, + "loss": 0.2861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.303237646818161, + "rewards/margins": 1.7927935123443604, + "rewards/rejected": -2.0960309505462646, + "step": 6552 + }, + { + "epoch": 0.76, + "learning_rate": 7.448203207304226e-08, + "logits/chosen": -2.82582426071167, + "logits/rejected": -2.897369861602783, + "logps/chosen": -275.49566650390625, + "logps/rejected": -283.8038635253906, + "loss": 0.3681, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33534103631973267, + "rewards/margins": 1.918959140777588, + "rewards/rejected": -1.5836180448532104, + "step": 6553 + }, + { + "epoch": 0.76, + "learning_rate": 7.4446915603418e-08, + "logits/chosen": -3.2628626823425293, + "logits/rejected": -3.2031784057617188, + "logps/chosen": -174.4482421875, + "logps/rejected": -131.67965698242188, + "loss": 0.5132, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3746340572834015, + "rewards/margins": 1.8896312713623047, + "rewards/rejected": -2.264265298843384, + "step": 6554 + }, + { + "epoch": 0.76, + "learning_rate": 7.441179913379375e-08, + "logits/chosen": -3.6295006275177, + "logits/rejected": -3.622544765472412, + "logps/chosen": -340.1664123535156, + "logps/rejected": -265.8690185546875, + "loss": 0.4776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7618261575698853, + "rewards/margins": 1.3425447940826416, + "rewards/rejected": -2.1043710708618164, + "step": 6555 + }, + { + "epoch": 0.76, + "learning_rate": 7.43766826641695e-08, + "logits/chosen": -3.0347893238067627, + "logits/rejected": -2.8768563270568848, + "logps/chosen": -397.5435791015625, + "logps/rejected": -216.635498046875, + "loss": 0.2661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09412747621536255, + "rewards/margins": 1.7820837497711182, + "rewards/rejected": -1.876211166381836, + "step": 6556 + }, + { + "epoch": 0.76, + "learning_rate": 7.434156619454524e-08, + "logits/chosen": -2.8980870246887207, + "logits/rejected": -2.7699642181396484, + "logps/chosen": -286.5835876464844, + "logps/rejected": -198.95071411132812, + "loss": 0.4486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8167777061462402, + "rewards/margins": 1.9117138385772705, + "rewards/rejected": -2.7284913063049316, + "step": 6557 + }, + { + "epoch": 0.76, + "learning_rate": 7.430644972492099e-08, + "logits/chosen": -3.6914734840393066, + "logits/rejected": -3.420201301574707, + "logps/chosen": -330.1303405761719, + "logps/rejected": -233.630615234375, + "loss": 0.5795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4414750933647156, + "rewards/margins": 1.1753051280975342, + "rewards/rejected": -1.6167802810668945, + "step": 6558 + }, + { + "epoch": 0.76, + "learning_rate": 7.427133325529673e-08, + "logits/chosen": -3.2196602821350098, + "logits/rejected": -2.7053022384643555, + "logps/chosen": -191.9552001953125, + "logps/rejected": -210.07772827148438, + "loss": 0.3756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5732313990592957, + "rewards/margins": 2.16591477394104, + "rewards/rejected": -2.7391462326049805, + "step": 6559 + }, + { + "epoch": 0.76, + "learning_rate": 7.423621678567247e-08, + "logits/chosen": -3.453099012374878, + "logits/rejected": -3.1872401237487793, + "logps/chosen": -195.47744750976562, + "logps/rejected": -231.50392150878906, + "loss": 0.3645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4133031964302063, + "rewards/margins": 1.0815770626068115, + "rewards/rejected": -1.494880199432373, + "step": 6560 + }, + { + "epoch": 0.76, + "learning_rate": 7.420110031604822e-08, + "logits/chosen": -2.5221362113952637, + "logits/rejected": -2.389615535736084, + "logps/chosen": -269.4752502441406, + "logps/rejected": -215.35653686523438, + "loss": 0.3649, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.100715771317482, + "rewards/margins": 1.5482146739959717, + "rewards/rejected": -1.4474987983703613, + "step": 6561 + }, + { + "epoch": 0.76, + "learning_rate": 7.416598384642397e-08, + "logits/chosen": -3.8376359939575195, + "logits/rejected": -3.818405866622925, + "logps/chosen": -162.30712890625, + "logps/rejected": -178.76254272460938, + "loss": 0.2959, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26024556159973145, + "rewards/margins": 1.7581939697265625, + "rewards/rejected": -2.018439292907715, + "step": 6562 + }, + { + "epoch": 0.76, + "learning_rate": 7.413086737679972e-08, + "logits/chosen": -3.087174892425537, + "logits/rejected": -3.609579563140869, + "logps/chosen": -270.89105224609375, + "logps/rejected": -223.346435546875, + "loss": 0.4289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5051583647727966, + "rewards/margins": 3.220599412918091, + "rewards/rejected": -3.725757598876953, + "step": 6563 + }, + { + "epoch": 0.76, + "learning_rate": 7.409575090717546e-08, + "logits/chosen": -3.2304420471191406, + "logits/rejected": -2.9432637691497803, + "logps/chosen": -173.7431640625, + "logps/rejected": -219.8651580810547, + "loss": 0.2819, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08300942182540894, + "rewards/margins": 1.6432474851608276, + "rewards/rejected": -1.5602381229400635, + "step": 6564 + }, + { + "epoch": 0.76, + "learning_rate": 7.406063443755121e-08, + "logits/chosen": -3.2650394439697266, + "logits/rejected": -3.551687240600586, + "logps/chosen": -382.53240966796875, + "logps/rejected": -364.50823974609375, + "loss": 0.2217, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06722836196422577, + "rewards/margins": 2.3627755641937256, + "rewards/rejected": -2.2955472469329834, + "step": 6565 + }, + { + "epoch": 0.76, + "learning_rate": 7.402551796792695e-08, + "logits/chosen": -3.673449993133545, + "logits/rejected": -3.650412082672119, + "logps/chosen": -116.31795501708984, + "logps/rejected": -153.26809692382812, + "loss": 0.3034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07303275167942047, + "rewards/margins": 1.7260956764221191, + "rewards/rejected": -1.6530628204345703, + "step": 6566 + }, + { + "epoch": 0.76, + "learning_rate": 7.399040149830271e-08, + "logits/chosen": -3.037778615951538, + "logits/rejected": -3.0889666080474854, + "logps/chosen": -315.35723876953125, + "logps/rejected": -297.5625915527344, + "loss": 0.5119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.043115824460983276, + "rewards/margins": 1.1498191356658936, + "rewards/rejected": -1.1929349899291992, + "step": 6567 + }, + { + "epoch": 0.76, + "learning_rate": 7.395528502867845e-08, + "logits/chosen": -2.7946603298187256, + "logits/rejected": -2.781637668609619, + "logps/chosen": -229.590087890625, + "logps/rejected": -322.6257019042969, + "loss": 0.2951, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22338059544563293, + "rewards/margins": 1.4999415874481201, + "rewards/rejected": -1.2765610218048096, + "step": 6568 + }, + { + "epoch": 0.76, + "learning_rate": 7.39201685590542e-08, + "logits/chosen": -3.799649953842163, + "logits/rejected": -3.091071605682373, + "logps/chosen": -292.37030029296875, + "logps/rejected": -172.63653564453125, + "loss": 0.109, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6050893664360046, + "rewards/margins": 2.57327938079834, + "rewards/rejected": -1.9681899547576904, + "step": 6569 + }, + { + "epoch": 0.76, + "learning_rate": 7.388505208942994e-08, + "logits/chosen": -2.6681320667266846, + "logits/rejected": -2.2829763889312744, + "logps/chosen": -232.72991943359375, + "logps/rejected": -234.8534698486328, + "loss": 0.721, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4991178512573242, + "rewards/margins": 1.1059459447860718, + "rewards/rejected": -1.605063796043396, + "step": 6570 + }, + { + "epoch": 0.76, + "learning_rate": 7.384993561980568e-08, + "logits/chosen": -2.693619966506958, + "logits/rejected": -2.6823458671569824, + "logps/chosen": -188.834228515625, + "logps/rejected": -186.0283203125, + "loss": 0.3496, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8836168050765991, + "rewards/margins": 2.0868711471557617, + "rewards/rejected": -1.2032543420791626, + "step": 6571 + }, + { + "epoch": 0.76, + "learning_rate": 7.381481915018144e-08, + "logits/chosen": -2.8669321537017822, + "logits/rejected": -3.1156327724456787, + "logps/chosen": -394.03985595703125, + "logps/rejected": -349.4714050292969, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4411448836326599, + "rewards/margins": 1.8779528141021729, + "rewards/rejected": -2.3190977573394775, + "step": 6572 + }, + { + "epoch": 0.76, + "learning_rate": 7.377970268055718e-08, + "logits/chosen": -3.653082847595215, + "logits/rejected": -3.659794569015503, + "logps/chosen": -138.87667846679688, + "logps/rejected": -165.00100708007812, + "loss": 0.2209, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6658039689064026, + "rewards/margins": 2.3833413124084473, + "rewards/rejected": -1.7175371646881104, + "step": 6573 + }, + { + "epoch": 0.76, + "learning_rate": 7.374458621093292e-08, + "logits/chosen": -3.5320115089416504, + "logits/rejected": -3.4295780658721924, + "logps/chosen": -311.75421142578125, + "logps/rejected": -240.383544921875, + "loss": 0.2954, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20077693462371826, + "rewards/margins": 1.664237380027771, + "rewards/rejected": -1.4634605646133423, + "step": 6574 + }, + { + "epoch": 0.76, + "learning_rate": 7.370946974130867e-08, + "logits/chosen": -3.0961413383483887, + "logits/rejected": -3.030369520187378, + "logps/chosen": -223.82855224609375, + "logps/rejected": -285.28643798828125, + "loss": 0.3682, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.018676668405532837, + "rewards/margins": 1.618493676185608, + "rewards/rejected": -1.599817156791687, + "step": 6575 + }, + { + "epoch": 0.76, + "learning_rate": 7.367435327168441e-08, + "logits/chosen": -2.709744691848755, + "logits/rejected": -2.7605671882629395, + "logps/chosen": -300.780029296875, + "logps/rejected": -239.3838653564453, + "loss": 0.5513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22409212589263916, + "rewards/margins": 1.4853479862213135, + "rewards/rejected": -1.709439992904663, + "step": 6576 + }, + { + "epoch": 0.76, + "learning_rate": 7.363923680206015e-08, + "logits/chosen": -2.934593677520752, + "logits/rejected": -2.5984690189361572, + "logps/chosen": -341.60540771484375, + "logps/rejected": -358.59320068359375, + "loss": 0.1804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11304014921188354, + "rewards/margins": 2.775171995162964, + "rewards/rejected": -2.888212203979492, + "step": 6577 + }, + { + "epoch": 0.76, + "learning_rate": 7.360412033243591e-08, + "logits/chosen": -2.4159858226776123, + "logits/rejected": -2.5416922569274902, + "logps/chosen": -148.5828399658203, + "logps/rejected": -169.5196533203125, + "loss": 0.6456, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3145846128463745, + "rewards/margins": 1.2747557163238525, + "rewards/rejected": -1.5893402099609375, + "step": 6578 + }, + { + "epoch": 0.76, + "learning_rate": 7.356900386281165e-08, + "logits/chosen": -2.7396240234375, + "logits/rejected": -3.0702242851257324, + "logps/chosen": -124.77307891845703, + "logps/rejected": -189.300537109375, + "loss": 0.324, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04652014374732971, + "rewards/margins": 2.0168776512145996, + "rewards/rejected": -1.9703575372695923, + "step": 6579 + }, + { + "epoch": 0.76, + "learning_rate": 7.35338873931874e-08, + "logits/chosen": -3.2427845001220703, + "logits/rejected": -3.3166089057922363, + "logps/chosen": -385.1775207519531, + "logps/rejected": -345.2815246582031, + "loss": 0.4005, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4405224621295929, + "rewards/margins": 2.231431007385254, + "rewards/rejected": -2.6719532012939453, + "step": 6580 + }, + { + "epoch": 0.76, + "learning_rate": 7.349877092356314e-08, + "logits/chosen": -3.1772992610931396, + "logits/rejected": -3.5933563709259033, + "logps/chosen": -285.979248046875, + "logps/rejected": -338.789306640625, + "loss": 0.2289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1195153146982193, + "rewards/margins": 3.2299652099609375, + "rewards/rejected": -3.349480628967285, + "step": 6581 + }, + { + "epoch": 0.76, + "learning_rate": 7.34636544539389e-08, + "logits/chosen": -2.854369878768921, + "logits/rejected": -3.33432674407959, + "logps/chosen": -255.5343780517578, + "logps/rejected": -345.756591796875, + "loss": 0.3158, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03855688124895096, + "rewards/margins": 3.160937547683716, + "rewards/rejected": -3.1223807334899902, + "step": 6582 + }, + { + "epoch": 0.76, + "learning_rate": 7.342853798431464e-08, + "logits/chosen": -3.3230390548706055, + "logits/rejected": -3.6015889644622803, + "logps/chosen": -386.1002197265625, + "logps/rejected": -301.2063903808594, + "loss": 1.0807, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8801460266113281, + "rewards/margins": -0.1642184853553772, + "rewards/rejected": -0.7159274816513062, + "step": 6583 + }, + { + "epoch": 0.76, + "learning_rate": 7.339342151469039e-08, + "logits/chosen": -3.225224494934082, + "logits/rejected": -3.286911725997925, + "logps/chosen": -223.21884155273438, + "logps/rejected": -247.82131958007812, + "loss": 0.3229, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11151683330535889, + "rewards/margins": 1.7528144121170044, + "rewards/rejected": -1.641297459602356, + "step": 6584 + }, + { + "epoch": 0.76, + "learning_rate": 7.335830504506613e-08, + "logits/chosen": -3.4800806045532227, + "logits/rejected": -3.612044095993042, + "logps/chosen": -344.05462646484375, + "logps/rejected": -362.6707763671875, + "loss": 0.4374, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46219661831855774, + "rewards/margins": 1.9599944353103638, + "rewards/rejected": -1.497797966003418, + "step": 6585 + }, + { + "epoch": 0.76, + "learning_rate": 7.332318857544189e-08, + "logits/chosen": -2.48722505569458, + "logits/rejected": -2.799565553665161, + "logps/chosen": -293.5058898925781, + "logps/rejected": -328.15283203125, + "loss": 0.1652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33406656980514526, + "rewards/margins": 2.9243407249450684, + "rewards/rejected": -2.5902740955352783, + "step": 6586 + }, + { + "epoch": 0.76, + "learning_rate": 7.328807210581763e-08, + "logits/chosen": -2.4580211639404297, + "logits/rejected": -2.5151443481445312, + "logps/chosen": -385.6964111328125, + "logps/rejected": -387.6826171875, + "loss": 0.1973, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3196598291397095, + "rewards/margins": 2.1024599075317383, + "rewards/rejected": -1.7827999591827393, + "step": 6587 + }, + { + "epoch": 0.76, + "learning_rate": 7.325295563619337e-08, + "logits/chosen": -2.8867688179016113, + "logits/rejected": -2.915910482406616, + "logps/chosen": -304.27899169921875, + "logps/rejected": -221.94970703125, + "loss": 0.3937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5438318848609924, + "rewards/margins": 1.2057363986968994, + "rewards/rejected": -1.749568223953247, + "step": 6588 + }, + { + "epoch": 0.76, + "learning_rate": 7.321783916656912e-08, + "logits/chosen": -2.8618574142456055, + "logits/rejected": -3.043779134750366, + "logps/chosen": -370.2707214355469, + "logps/rejected": -199.16653442382812, + "loss": 0.2611, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6325554847717285, + "rewards/margins": 2.4252612590789795, + "rewards/rejected": -1.792705774307251, + "step": 6589 + }, + { + "epoch": 0.76, + "learning_rate": 7.318272269694486e-08, + "logits/chosen": -2.5004491806030273, + "logits/rejected": -2.4756076335906982, + "logps/chosen": -387.2206115722656, + "logps/rejected": -300.1881103515625, + "loss": 0.219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12096215039491653, + "rewards/margins": 2.4372568130493164, + "rewards/rejected": -2.5582189559936523, + "step": 6590 + }, + { + "epoch": 0.76, + "learning_rate": 7.31476062273206e-08, + "logits/chosen": -2.73048734664917, + "logits/rejected": -2.8974485397338867, + "logps/chosen": -216.206787109375, + "logps/rejected": -452.98919677734375, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6143167614936829, + "rewards/margins": 2.635281562805176, + "rewards/rejected": -3.249598264694214, + "step": 6591 + }, + { + "epoch": 0.76, + "learning_rate": 7.311248975769636e-08, + "logits/chosen": -2.9269661903381348, + "logits/rejected": -2.8026859760284424, + "logps/chosen": -351.3409423828125, + "logps/rejected": -222.53414916992188, + "loss": 0.4706, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4823496341705322, + "rewards/margins": 1.4993293285369873, + "rewards/rejected": -1.9816789627075195, + "step": 6592 + }, + { + "epoch": 0.76, + "learning_rate": 7.30773732880721e-08, + "logits/chosen": -3.4206268787384033, + "logits/rejected": -3.165123701095581, + "logps/chosen": -219.85240173339844, + "logps/rejected": -161.5966796875, + "loss": 0.4695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4366489052772522, + "rewards/margins": 0.7438679337501526, + "rewards/rejected": -1.1805168390274048, + "step": 6593 + }, + { + "epoch": 0.76, + "learning_rate": 7.304225681844785e-08, + "logits/chosen": -3.3184702396392822, + "logits/rejected": -3.3513588905334473, + "logps/chosen": -289.2336120605469, + "logps/rejected": -285.96282958984375, + "loss": 0.2545, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08510027080774307, + "rewards/margins": 2.770881175994873, + "rewards/rejected": -2.8559815883636475, + "step": 6594 + }, + { + "epoch": 0.76, + "learning_rate": 7.300714034882359e-08, + "logits/chosen": -3.0261547565460205, + "logits/rejected": -3.293872117996216, + "logps/chosen": -365.9212646484375, + "logps/rejected": -216.68141174316406, + "loss": 0.6271, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23850193619728088, + "rewards/margins": 0.6997445225715637, + "rewards/rejected": -0.9382464289665222, + "step": 6595 + }, + { + "epoch": 0.76, + "learning_rate": 7.297202387919934e-08, + "logits/chosen": -2.764916181564331, + "logits/rejected": -2.733583688735962, + "logps/chosen": -253.4749755859375, + "logps/rejected": -271.82354736328125, + "loss": 0.3237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2616100013256073, + "rewards/margins": 2.655777931213379, + "rewards/rejected": -2.9173879623413086, + "step": 6596 + }, + { + "epoch": 0.76, + "learning_rate": 7.293690740957509e-08, + "logits/chosen": -3.0367796421051025, + "logits/rejected": -2.8973419666290283, + "logps/chosen": -119.31654357910156, + "logps/rejected": -163.88279724121094, + "loss": 0.151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6820077896118164, + "rewards/margins": 2.5302162170410156, + "rewards/rejected": -1.8482081890106201, + "step": 6597 + }, + { + "epoch": 0.76, + "learning_rate": 7.290179093995084e-08, + "logits/chosen": -3.1570887565612793, + "logits/rejected": -3.130584716796875, + "logps/chosen": -281.2960205078125, + "logps/rejected": -351.78338623046875, + "loss": 0.6444, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26752883195877075, + "rewards/margins": 1.0191702842712402, + "rewards/rejected": -1.2866990566253662, + "step": 6598 + }, + { + "epoch": 0.76, + "learning_rate": 7.286667447032658e-08, + "logits/chosen": -3.4877877235412598, + "logits/rejected": -3.5196831226348877, + "logps/chosen": -141.37815856933594, + "logps/rejected": -268.1093444824219, + "loss": 0.4137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.015153162181377411, + "rewards/margins": 2.7920899391174316, + "rewards/rejected": -2.8072431087493896, + "step": 6599 + }, + { + "epoch": 0.76, + "learning_rate": 7.283155800070233e-08, + "logits/chosen": -3.2647998332977295, + "logits/rejected": -3.3458619117736816, + "logps/chosen": -207.10198974609375, + "logps/rejected": -146.5789794921875, + "loss": 0.3883, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22705727815628052, + "rewards/margins": 1.4499037265777588, + "rewards/rejected": -1.222846508026123, + "step": 6600 + }, + { + "epoch": 0.76, + "learning_rate": 7.279644153107807e-08, + "logits/chosen": -3.068354368209839, + "logits/rejected": -3.0070900917053223, + "logps/chosen": -154.88912963867188, + "logps/rejected": -164.224853515625, + "loss": 0.2916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4312378764152527, + "rewards/margins": 1.673109531402588, + "rewards/rejected": -2.1043477058410645, + "step": 6601 + }, + { + "epoch": 0.76, + "learning_rate": 7.276132506145382e-08, + "logits/chosen": -2.479203224182129, + "logits/rejected": -2.439589023590088, + "logps/chosen": -410.0198974609375, + "logps/rejected": -344.8086242675781, + "loss": 0.2098, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07584428787231445, + "rewards/margins": 1.9041697978973389, + "rewards/rejected": -1.8283253908157349, + "step": 6602 + }, + { + "epoch": 0.76, + "learning_rate": 7.272620859182957e-08, + "logits/chosen": -3.464813709259033, + "logits/rejected": -3.331787347793579, + "logps/chosen": -210.48318481445312, + "logps/rejected": -194.12054443359375, + "loss": 0.4011, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.040523916482925415, + "rewards/margins": 1.0720840692520142, + "rewards/rejected": -1.0315600633621216, + "step": 6603 + }, + { + "epoch": 0.76, + "learning_rate": 7.269109212220531e-08, + "logits/chosen": -2.8950209617614746, + "logits/rejected": -2.8414676189422607, + "logps/chosen": -244.26309204101562, + "logps/rejected": -216.5749969482422, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2093818634748459, + "rewards/margins": 2.6461052894592285, + "rewards/rejected": -2.436723232269287, + "step": 6604 + }, + { + "epoch": 0.76, + "learning_rate": 7.265597565258105e-08, + "logits/chosen": -2.4789164066314697, + "logits/rejected": -2.731724262237549, + "logps/chosen": -415.61572265625, + "logps/rejected": -332.0397033691406, + "loss": 0.2105, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5051683187484741, + "rewards/margins": 2.543698787689209, + "rewards/rejected": -2.0385305881500244, + "step": 6605 + }, + { + "epoch": 0.76, + "learning_rate": 7.26208591829568e-08, + "logits/chosen": -3.04355525970459, + "logits/rejected": -3.0584754943847656, + "logps/chosen": -145.44419860839844, + "logps/rejected": -183.7479248046875, + "loss": 0.2454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11683131009340286, + "rewards/margins": 1.7397232055664062, + "rewards/rejected": -1.8565545082092285, + "step": 6606 + }, + { + "epoch": 0.76, + "learning_rate": 7.258574271333254e-08, + "logits/chosen": -3.069213628768921, + "logits/rejected": -3.0839767456054688, + "logps/chosen": -382.6197509765625, + "logps/rejected": -352.81036376953125, + "loss": 0.4289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4472340941429138, + "rewards/margins": 1.5382022857666016, + "rewards/rejected": -1.9854364395141602, + "step": 6607 + }, + { + "epoch": 0.76, + "learning_rate": 7.25506262437083e-08, + "logits/chosen": -2.5663466453552246, + "logits/rejected": -2.448749303817749, + "logps/chosen": -431.25640869140625, + "logps/rejected": -407.8976135253906, + "loss": 0.41, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08725781738758087, + "rewards/margins": 1.3570433855056763, + "rewards/rejected": -1.4443011283874512, + "step": 6608 + }, + { + "epoch": 0.76, + "learning_rate": 7.251550977408404e-08, + "logits/chosen": -3.1010258197784424, + "logits/rejected": -3.592433452606201, + "logps/chosen": -124.57841491699219, + "logps/rejected": -231.12217712402344, + "loss": 0.6285, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5493834018707275, + "rewards/margins": 1.2481483221054077, + "rewards/rejected": -1.7975317239761353, + "step": 6609 + }, + { + "epoch": 0.76, + "learning_rate": 7.248039330445979e-08, + "logits/chosen": -2.9423599243164062, + "logits/rejected": -3.1406102180480957, + "logps/chosen": -195.53724670410156, + "logps/rejected": -139.0950927734375, + "loss": 0.2655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09939060360193253, + "rewards/margins": 1.8934528827667236, + "rewards/rejected": -1.9928436279296875, + "step": 6610 + }, + { + "epoch": 0.76, + "learning_rate": 7.244527683483553e-08, + "logits/chosen": -3.5579724311828613, + "logits/rejected": -3.3107471466064453, + "logps/chosen": -177.3703155517578, + "logps/rejected": -177.27903747558594, + "loss": 0.1512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08712656050920486, + "rewards/margins": 2.6514687538146973, + "rewards/rejected": -2.5643422603607178, + "step": 6611 + }, + { + "epoch": 0.76, + "learning_rate": 7.241016036521129e-08, + "logits/chosen": -3.035994529724121, + "logits/rejected": -3.455512285232544, + "logps/chosen": -324.8985290527344, + "logps/rejected": -379.5412902832031, + "loss": 0.3732, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20846936106681824, + "rewards/margins": 2.7955129146575928, + "rewards/rejected": -2.587043523788452, + "step": 6612 + }, + { + "epoch": 0.76, + "learning_rate": 7.237504389558703e-08, + "logits/chosen": -3.4439263343811035, + "logits/rejected": -3.084343910217285, + "logps/chosen": -136.7746124267578, + "logps/rejected": -274.33209228515625, + "loss": 0.7127, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46779265999794006, + "rewards/margins": 1.4575713872909546, + "rewards/rejected": -1.9253641366958618, + "step": 6613 + }, + { + "epoch": 0.76, + "learning_rate": 7.233992742596278e-08, + "logits/chosen": -3.0896201133728027, + "logits/rejected": -2.9064269065856934, + "logps/chosen": -542.2271728515625, + "logps/rejected": -318.9859619140625, + "loss": 0.4517, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.030910730361938477, + "rewards/margins": 1.4932682514190674, + "rewards/rejected": -1.4623576402664185, + "step": 6614 + }, + { + "epoch": 0.76, + "learning_rate": 7.230481095633852e-08, + "logits/chosen": -3.2109017372131348, + "logits/rejected": -3.528550386428833, + "logps/chosen": -137.2234649658203, + "logps/rejected": -335.4786682128906, + "loss": 0.3683, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08276181668043137, + "rewards/margins": 2.5111641883850098, + "rewards/rejected": -2.4284024238586426, + "step": 6615 + }, + { + "epoch": 0.76, + "learning_rate": 7.226969448671426e-08, + "logits/chosen": -2.943584442138672, + "logits/rejected": -3.2587332725524902, + "logps/chosen": -268.35321044921875, + "logps/rejected": -353.7181091308594, + "loss": 0.2004, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12791457772254944, + "rewards/margins": 3.424621820449829, + "rewards/rejected": -3.2967071533203125, + "step": 6616 + }, + { + "epoch": 0.76, + "learning_rate": 7.223457801709002e-08, + "logits/chosen": -2.634294271469116, + "logits/rejected": -2.826751232147217, + "logps/chosen": -314.0805969238281, + "logps/rejected": -284.1463928222656, + "loss": 0.5663, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7083326578140259, + "rewards/margins": 1.1154899597167969, + "rewards/rejected": -1.8238226175308228, + "step": 6617 + }, + { + "epoch": 0.76, + "learning_rate": 7.219946154746576e-08, + "logits/chosen": -2.7137644290924072, + "logits/rejected": -2.4702041149139404, + "logps/chosen": -162.53102111816406, + "logps/rejected": -209.2808380126953, + "loss": 0.3577, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0676794946193695, + "rewards/margins": 2.1183981895446777, + "rewards/rejected": -2.0507185459136963, + "step": 6618 + }, + { + "epoch": 0.76, + "learning_rate": 7.21643450778415e-08, + "logits/chosen": -3.59616756439209, + "logits/rejected": -3.283940315246582, + "logps/chosen": -138.39112854003906, + "logps/rejected": -252.56558227539062, + "loss": 0.3458, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0873841643333435, + "rewards/margins": 2.8912107944488525, + "rewards/rejected": -2.978595018386841, + "step": 6619 + }, + { + "epoch": 0.76, + "learning_rate": 7.212922860821725e-08, + "logits/chosen": -3.281773567199707, + "logits/rejected": -3.1433534622192383, + "logps/chosen": -450.8482666015625, + "logps/rejected": -387.91387939453125, + "loss": 0.536, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31919652223587036, + "rewards/margins": 0.6780502796173096, + "rewards/rejected": -0.9972468018531799, + "step": 6620 + }, + { + "epoch": 0.76, + "learning_rate": 7.209411213859299e-08, + "logits/chosen": -3.5525543689727783, + "logits/rejected": -3.659085512161255, + "logps/chosen": -212.59400939941406, + "logps/rejected": -214.4329376220703, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6282408237457275, + "rewards/margins": 3.8010103702545166, + "rewards/rejected": -3.172769546508789, + "step": 6621 + }, + { + "epoch": 0.76, + "learning_rate": 7.205899566896873e-08, + "logits/chosen": -3.501840591430664, + "logits/rejected": -3.640934944152832, + "logps/chosen": -366.5970458984375, + "logps/rejected": -329.14691162109375, + "loss": 0.1273, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44421055912971497, + "rewards/margins": 4.386341094970703, + "rewards/rejected": -3.9421300888061523, + "step": 6622 + }, + { + "epoch": 0.76, + "learning_rate": 7.202387919934449e-08, + "logits/chosen": -3.627495288848877, + "logits/rejected": -3.5513572692871094, + "logps/chosen": -250.59255981445312, + "logps/rejected": -246.3236846923828, + "loss": 0.409, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3748490512371063, + "rewards/margins": 1.4611618518829346, + "rewards/rejected": -1.8360109329223633, + "step": 6623 + }, + { + "epoch": 0.76, + "learning_rate": 7.198876272972023e-08, + "logits/chosen": -2.953254222869873, + "logits/rejected": -3.1718297004699707, + "logps/chosen": -260.9346923828125, + "logps/rejected": -284.1019287109375, + "loss": 0.1047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13985252380371094, + "rewards/margins": 3.4038891792297363, + "rewards/rejected": -3.2640368938446045, + "step": 6624 + }, + { + "epoch": 0.76, + "learning_rate": 7.195364626009598e-08, + "logits/chosen": -2.8256802558898926, + "logits/rejected": -2.689800262451172, + "logps/chosen": -274.3408508300781, + "logps/rejected": -280.6924743652344, + "loss": 0.3296, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.524655818939209, + "rewards/margins": 2.620244026184082, + "rewards/rejected": -2.095588207244873, + "step": 6625 + }, + { + "epoch": 0.76, + "learning_rate": 7.191852979047172e-08, + "logits/chosen": -3.253890037536621, + "logits/rejected": -3.3274035453796387, + "logps/chosen": -349.3419494628906, + "logps/rejected": -321.2421875, + "loss": 0.2225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10230762511491776, + "rewards/margins": 2.3544728755950928, + "rewards/rejected": -2.2521653175354004, + "step": 6626 + }, + { + "epoch": 0.76, + "learning_rate": 7.188341332084748e-08, + "logits/chosen": -2.6496565341949463, + "logits/rejected": -3.2538821697235107, + "logps/chosen": -293.84222412109375, + "logps/rejected": -279.19268798828125, + "loss": 0.6286, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27105391025543213, + "rewards/margins": 1.2021445035934448, + "rewards/rejected": -0.9310905337333679, + "step": 6627 + }, + { + "epoch": 0.76, + "learning_rate": 7.184829685122322e-08, + "logits/chosen": -3.8767013549804688, + "logits/rejected": -3.9895691871643066, + "logps/chosen": -236.79617309570312, + "logps/rejected": -275.4668884277344, + "loss": 0.2359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17204219102859497, + "rewards/margins": 2.416640281677246, + "rewards/rejected": -2.5886824131011963, + "step": 6628 + }, + { + "epoch": 0.76, + "learning_rate": 7.181318038159897e-08, + "logits/chosen": -3.6003079414367676, + "logits/rejected": -3.406780481338501, + "logps/chosen": -385.0673522949219, + "logps/rejected": -255.81765747070312, + "loss": 0.4563, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6024448275566101, + "rewards/margins": 1.4827666282653809, + "rewards/rejected": -0.8803219795227051, + "step": 6629 + }, + { + "epoch": 0.76, + "learning_rate": 7.177806391197471e-08, + "logits/chosen": -2.565763235092163, + "logits/rejected": -3.053691864013672, + "logps/chosen": -269.202880859375, + "logps/rejected": -309.23834228515625, + "loss": 0.2889, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2925536632537842, + "rewards/margins": 2.461045026779175, + "rewards/rejected": -2.1684916019439697, + "step": 6630 + }, + { + "epoch": 0.76, + "learning_rate": 7.174294744235047e-08, + "logits/chosen": -3.250678539276123, + "logits/rejected": -2.79311466217041, + "logps/chosen": -204.64784240722656, + "logps/rejected": -173.93641662597656, + "loss": 1.0101, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5714042782783508, + "rewards/margins": -0.042747579514980316, + "rewards/rejected": -0.5286567211151123, + "step": 6631 + }, + { + "epoch": 0.76, + "learning_rate": 7.17078309727262e-08, + "logits/chosen": -2.7324862480163574, + "logits/rejected": -2.448641061782837, + "logps/chosen": -399.866455078125, + "logps/rejected": -241.06732177734375, + "loss": 0.2154, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2321067750453949, + "rewards/margins": 2.325801134109497, + "rewards/rejected": -2.0936942100524902, + "step": 6632 + }, + { + "epoch": 0.76, + "learning_rate": 7.167271450310195e-08, + "logits/chosen": -2.811323642730713, + "logits/rejected": -2.7569775581359863, + "logps/chosen": -103.21920776367188, + "logps/rejected": -268.3199462890625, + "loss": 0.2326, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3497190475463867, + "rewards/margins": 2.877138614654541, + "rewards/rejected": -3.226858139038086, + "step": 6633 + }, + { + "epoch": 0.76, + "learning_rate": 7.16375980334777e-08, + "logits/chosen": -3.074343204498291, + "logits/rejected": -3.1325931549072266, + "logps/chosen": -159.3836669921875, + "logps/rejected": -293.4352111816406, + "loss": 0.1144, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1650058478116989, + "rewards/margins": 2.8727447986602783, + "rewards/rejected": -2.7077388763427734, + "step": 6634 + }, + { + "epoch": 0.76, + "learning_rate": 7.160248156385344e-08, + "logits/chosen": -2.786348342895508, + "logits/rejected": -2.751363754272461, + "logps/chosen": -226.59130859375, + "logps/rejected": -234.95745849609375, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8025518655776978, + "rewards/margins": 0.8964369893074036, + "rewards/rejected": -1.6989887952804565, + "step": 6635 + }, + { + "epoch": 0.77, + "learning_rate": 7.156736509422918e-08, + "logits/chosen": -3.1613729000091553, + "logits/rejected": -3.12076997756958, + "logps/chosen": -352.45343017578125, + "logps/rejected": -250.51727294921875, + "loss": 0.2797, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09752234816551208, + "rewards/margins": 1.8817801475524902, + "rewards/rejected": -1.7842576503753662, + "step": 6636 + }, + { + "epoch": 0.77, + "learning_rate": 7.153224862460494e-08, + "logits/chosen": -3.170042037963867, + "logits/rejected": -2.876321792602539, + "logps/chosen": -397.5350341796875, + "logps/rejected": -249.6288299560547, + "loss": 0.0954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01793639361858368, + "rewards/margins": 2.633105754852295, + "rewards/rejected": -2.6510422229766846, + "step": 6637 + }, + { + "epoch": 0.77, + "learning_rate": 7.149713215498068e-08, + "logits/chosen": -3.916865348815918, + "logits/rejected": -3.617325782775879, + "logps/chosen": -282.83282470703125, + "logps/rejected": -254.2771453857422, + "loss": 0.6052, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21804767847061157, + "rewards/margins": 0.9236118793487549, + "rewards/rejected": -1.1416594982147217, + "step": 6638 + }, + { + "epoch": 0.77, + "learning_rate": 7.146201568535643e-08, + "logits/chosen": -3.4050631523132324, + "logits/rejected": -3.0166072845458984, + "logps/chosen": -364.1839294433594, + "logps/rejected": -260.59027099609375, + "loss": 0.1856, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.423850417137146, + "rewards/margins": 2.8847544193267822, + "rewards/rejected": -2.460904121398926, + "step": 6639 + }, + { + "epoch": 0.77, + "learning_rate": 7.142689921573217e-08, + "logits/chosen": -3.8831093311309814, + "logits/rejected": -3.748851776123047, + "logps/chosen": -411.30816650390625, + "logps/rejected": -311.664306640625, + "loss": 0.5335, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018756315112113953, + "rewards/margins": 1.118965744972229, + "rewards/rejected": -1.1002094745635986, + "step": 6640 + }, + { + "epoch": 0.77, + "learning_rate": 7.139178274610792e-08, + "logits/chosen": -2.618786573410034, + "logits/rejected": -2.669257879257202, + "logps/chosen": -441.3109130859375, + "logps/rejected": -335.4688720703125, + "loss": 0.5434, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.048127755522727966, + "rewards/margins": 1.0577514171600342, + "rewards/rejected": -1.009623646736145, + "step": 6641 + }, + { + "epoch": 0.77, + "learning_rate": 7.135666627648366e-08, + "logits/chosen": -2.6366286277770996, + "logits/rejected": -2.548802375793457, + "logps/chosen": -218.60044860839844, + "logps/rejected": -215.73910522460938, + "loss": 0.3389, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.039543554186820984, + "rewards/margins": 2.1690666675567627, + "rewards/rejected": -2.2086100578308105, + "step": 6642 + }, + { + "epoch": 0.77, + "learning_rate": 7.132154980685942e-08, + "logits/chosen": -3.4602255821228027, + "logits/rejected": -3.6024680137634277, + "logps/chosen": -175.3814697265625, + "logps/rejected": -175.419921875, + "loss": 0.4408, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3367140293121338, + "rewards/margins": 1.3090533018112183, + "rewards/rejected": -0.9723392128944397, + "step": 6643 + }, + { + "epoch": 0.77, + "learning_rate": 7.128643333723516e-08, + "logits/chosen": -2.9610910415649414, + "logits/rejected": -2.975351572036743, + "logps/chosen": -169.8816680908203, + "logps/rejected": -238.64947509765625, + "loss": 0.389, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5003246068954468, + "rewards/margins": 2.149965763092041, + "rewards/rejected": -2.6502902507781982, + "step": 6644 + }, + { + "epoch": 0.77, + "learning_rate": 7.125131686761091e-08, + "logits/chosen": -3.1338865756988525, + "logits/rejected": -3.1535000801086426, + "logps/chosen": -250.19027709960938, + "logps/rejected": -214.16444396972656, + "loss": 0.4156, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26782864332199097, + "rewards/margins": 1.165224552154541, + "rewards/rejected": -0.89739590883255, + "step": 6645 + }, + { + "epoch": 0.77, + "learning_rate": 7.121620039798665e-08, + "logits/chosen": -3.5538368225097656, + "logits/rejected": -3.584465503692627, + "logps/chosen": -376.7685241699219, + "logps/rejected": -244.937255859375, + "loss": 0.3278, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2897607386112213, + "rewards/margins": 1.399794578552246, + "rewards/rejected": -1.6895554065704346, + "step": 6646 + }, + { + "epoch": 0.77, + "learning_rate": 7.11810839283624e-08, + "logits/chosen": -3.461435317993164, + "logits/rejected": -3.579923629760742, + "logps/chosen": -238.83140563964844, + "logps/rejected": -269.3743591308594, + "loss": 0.3737, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1338203251361847, + "rewards/margins": 2.5245859622955322, + "rewards/rejected": -2.39076566696167, + "step": 6647 + }, + { + "epoch": 0.77, + "learning_rate": 7.114596745873815e-08, + "logits/chosen": -2.7149853706359863, + "logits/rejected": -2.9587318897247314, + "logps/chosen": -331.7323913574219, + "logps/rejected": -207.65380859375, + "loss": 0.3228, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7141863107681274, + "rewards/margins": 1.951920509338379, + "rewards/rejected": -1.237734079360962, + "step": 6648 + }, + { + "epoch": 0.77, + "learning_rate": 7.111085098911389e-08, + "logits/chosen": -2.763516664505005, + "logits/rejected": -2.8042593002319336, + "logps/chosen": -331.2662353515625, + "logps/rejected": -303.4269104003906, + "loss": 0.1619, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20973020792007446, + "rewards/margins": 2.8871233463287354, + "rewards/rejected": -2.6773931980133057, + "step": 6649 + }, + { + "epoch": 0.77, + "learning_rate": 7.107573451948963e-08, + "logits/chosen": -2.8691704273223877, + "logits/rejected": -2.6528162956237793, + "logps/chosen": -342.21435546875, + "logps/rejected": -269.13677978515625, + "loss": 0.2727, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40638381242752075, + "rewards/margins": 2.143589735031128, + "rewards/rejected": -1.7372061014175415, + "step": 6650 + }, + { + "epoch": 0.77, + "learning_rate": 7.104061804986538e-08, + "logits/chosen": -3.4815313816070557, + "logits/rejected": -3.369663715362549, + "logps/chosen": -344.0582275390625, + "logps/rejected": -278.8072204589844, + "loss": 0.2923, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5736908912658691, + "rewards/margins": 1.7734524011611938, + "rewards/rejected": -1.1997616291046143, + "step": 6651 + }, + { + "epoch": 0.77, + "learning_rate": 7.100550158024112e-08, + "logits/chosen": -3.1271679401397705, + "logits/rejected": -3.412827491760254, + "logps/chosen": -192.120361328125, + "logps/rejected": -193.65478515625, + "loss": 0.2803, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1226220354437828, + "rewards/margins": 2.0264298915863037, + "rewards/rejected": -1.9038078784942627, + "step": 6652 + }, + { + "epoch": 0.77, + "learning_rate": 7.097038511061688e-08, + "logits/chosen": -3.3209421634674072, + "logits/rejected": -3.6667425632476807, + "logps/chosen": -152.48516845703125, + "logps/rejected": -171.1188507080078, + "loss": 0.3763, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1553243100643158, + "rewards/margins": 2.03448748588562, + "rewards/rejected": -2.1898114681243896, + "step": 6653 + }, + { + "epoch": 0.77, + "learning_rate": 7.093526864099262e-08, + "logits/chosen": -2.957042694091797, + "logits/rejected": -2.5807225704193115, + "logps/chosen": -326.23101806640625, + "logps/rejected": -416.5695495605469, + "loss": 0.4976, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26947540044784546, + "rewards/margins": 1.2970519065856934, + "rewards/rejected": -1.5665273666381836, + "step": 6654 + }, + { + "epoch": 0.77, + "learning_rate": 7.090015217136837e-08, + "logits/chosen": -3.4777767658233643, + "logits/rejected": -3.6004629135131836, + "logps/chosen": -357.6923522949219, + "logps/rejected": -329.5279541015625, + "loss": 0.3539, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30500689148902893, + "rewards/margins": 1.3186278343200684, + "rewards/rejected": -1.623634696006775, + "step": 6655 + }, + { + "epoch": 0.77, + "learning_rate": 7.086503570174411e-08, + "logits/chosen": -3.143831491470337, + "logits/rejected": -3.323920726776123, + "logps/chosen": -205.88693237304688, + "logps/rejected": -174.4905242919922, + "loss": 0.363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05166993290185928, + "rewards/margins": 1.30592942237854, + "rewards/rejected": -1.357599139213562, + "step": 6656 + }, + { + "epoch": 0.77, + "learning_rate": 7.082991923211987e-08, + "logits/chosen": -3.742042064666748, + "logits/rejected": -3.53684663772583, + "logps/chosen": -161.99057006835938, + "logps/rejected": -224.2655029296875, + "loss": 0.224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0058508701622486115, + "rewards/margins": 2.103346586227417, + "rewards/rejected": -2.0974957942962646, + "step": 6657 + }, + { + "epoch": 0.77, + "learning_rate": 7.079480276249561e-08, + "logits/chosen": -2.90797758102417, + "logits/rejected": -3.008984088897705, + "logps/chosen": -285.32415771484375, + "logps/rejected": -241.7268524169922, + "loss": 0.2385, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15643614530563354, + "rewards/margins": 2.2269272804260254, + "rewards/rejected": -2.070491313934326, + "step": 6658 + }, + { + "epoch": 0.77, + "learning_rate": 7.075968629287136e-08, + "logits/chosen": -3.953744411468506, + "logits/rejected": -3.896888017654419, + "logps/chosen": -107.08198547363281, + "logps/rejected": -168.3124237060547, + "loss": 0.2167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09443998336791992, + "rewards/margins": 2.375124216079712, + "rewards/rejected": -2.469564437866211, + "step": 6659 + }, + { + "epoch": 0.77, + "learning_rate": 7.07245698232471e-08, + "logits/chosen": -3.9205381870269775, + "logits/rejected": -3.880673885345459, + "logps/chosen": -171.4766845703125, + "logps/rejected": -141.7198944091797, + "loss": 0.3566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05676780641078949, + "rewards/margins": 1.5634183883666992, + "rewards/rejected": -1.6201860904693604, + "step": 6660 + }, + { + "epoch": 0.77, + "learning_rate": 7.068945335362286e-08, + "logits/chosen": -2.9770309925079346, + "logits/rejected": -2.7663726806640625, + "logps/chosen": -593.0457763671875, + "logps/rejected": -283.5436706542969, + "loss": 0.4398, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.059510380029678345, + "rewards/margins": 1.713625192642212, + "rewards/rejected": -1.654114842414856, + "step": 6661 + }, + { + "epoch": 0.77, + "learning_rate": 7.06543368839986e-08, + "logits/chosen": -3.2980799674987793, + "logits/rejected": -3.4159045219421387, + "logps/chosen": -126.28732299804688, + "logps/rejected": -130.4154052734375, + "loss": 0.5646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6020300388336182, + "rewards/margins": 0.33821365237236023, + "rewards/rejected": -0.9402437210083008, + "step": 6662 + }, + { + "epoch": 0.77, + "learning_rate": 7.061922041437434e-08, + "logits/chosen": -2.407522439956665, + "logits/rejected": -2.7059106826782227, + "logps/chosen": -401.61114501953125, + "logps/rejected": -251.32638549804688, + "loss": 0.2221, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8054996132850647, + "rewards/margins": 2.454982042312622, + "rewards/rejected": -1.6494823694229126, + "step": 6663 + }, + { + "epoch": 0.77, + "learning_rate": 7.058410394475008e-08, + "logits/chosen": -3.126997470855713, + "logits/rejected": -3.265401840209961, + "logps/chosen": -129.668212890625, + "logps/rejected": -239.59869384765625, + "loss": 0.2923, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1536291241645813, + "rewards/margins": 2.4989748001098633, + "rewards/rejected": -2.652604103088379, + "step": 6664 + }, + { + "epoch": 0.77, + "learning_rate": 7.054898747512583e-08, + "logits/chosen": -3.0334644317626953, + "logits/rejected": -2.9264278411865234, + "logps/chosen": -547.9782104492188, + "logps/rejected": -377.5182189941406, + "loss": 0.7001, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5975138545036316, + "rewards/margins": 0.7485095262527466, + "rewards/rejected": -1.346023440361023, + "step": 6665 + }, + { + "epoch": 0.77, + "learning_rate": 7.051387100550157e-08, + "logits/chosen": -3.206604480743408, + "logits/rejected": -3.213071346282959, + "logps/chosen": -310.5701904296875, + "logps/rejected": -199.7858428955078, + "loss": 0.7248, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30779728293418884, + "rewards/margins": 0.45612674951553345, + "rewards/rejected": -0.7639240026473999, + "step": 6666 + }, + { + "epoch": 0.77, + "learning_rate": 7.047875453587733e-08, + "logits/chosen": -3.0999605655670166, + "logits/rejected": -3.1131157875061035, + "logps/chosen": -187.26983642578125, + "logps/rejected": -247.4287567138672, + "loss": 0.1693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.278534471988678, + "rewards/margins": 2.5923070907592773, + "rewards/rejected": -2.313772678375244, + "step": 6667 + }, + { + "epoch": 0.77, + "learning_rate": 7.044363806625307e-08, + "logits/chosen": -2.938352584838867, + "logits/rejected": -3.012949228286743, + "logps/chosen": -295.8277893066406, + "logps/rejected": -250.16600036621094, + "loss": 0.2989, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29879462718963623, + "rewards/margins": 1.6152398586273193, + "rewards/rejected": -1.914034366607666, + "step": 6668 + }, + { + "epoch": 0.77, + "learning_rate": 7.040852159662882e-08, + "logits/chosen": -2.5871517658233643, + "logits/rejected": -2.361168146133423, + "logps/chosen": -194.32699584960938, + "logps/rejected": -208.1702880859375, + "loss": 0.4451, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3959242105484009, + "rewards/margins": 1.2236363887786865, + "rewards/rejected": -1.6195604801177979, + "step": 6669 + }, + { + "epoch": 0.77, + "learning_rate": 7.037340512700456e-08, + "logits/chosen": -2.299639940261841, + "logits/rejected": -2.3219780921936035, + "logps/chosen": -207.6853485107422, + "logps/rejected": -233.10833740234375, + "loss": 0.4056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5891304612159729, + "rewards/margins": 1.2911396026611328, + "rewards/rejected": -1.8802701234817505, + "step": 6670 + }, + { + "epoch": 0.77, + "learning_rate": 7.03382886573803e-08, + "logits/chosen": -3.1508078575134277, + "logits/rejected": -3.2363715171813965, + "logps/chosen": -194.78114318847656, + "logps/rejected": -209.69784545898438, + "loss": 0.4915, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5257551074028015, + "rewards/margins": 1.1251301765441895, + "rewards/rejected": -1.6508853435516357, + "step": 6671 + }, + { + "epoch": 0.77, + "learning_rate": 7.030317218775606e-08, + "logits/chosen": -3.638791561126709, + "logits/rejected": -3.496417284011841, + "logps/chosen": -287.460205078125, + "logps/rejected": -246.27737426757812, + "loss": 0.4654, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6346075534820557, + "rewards/margins": 1.455268383026123, + "rewards/rejected": -2.0898756980895996, + "step": 6672 + }, + { + "epoch": 0.77, + "learning_rate": 7.02680557181318e-08, + "logits/chosen": -2.658177375793457, + "logits/rejected": -2.5061192512512207, + "logps/chosen": -121.78240966796875, + "logps/rejected": -216.1166229248047, + "loss": 0.3421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22282692790031433, + "rewards/margins": 1.6694157123565674, + "rewards/rejected": -1.8922427892684937, + "step": 6673 + }, + { + "epoch": 0.77, + "learning_rate": 7.023293924850755e-08, + "logits/chosen": -3.0012474060058594, + "logits/rejected": -3.2033538818359375, + "logps/chosen": -195.69949340820312, + "logps/rejected": -164.20249938964844, + "loss": 1.0208, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0998694896697998, + "rewards/margins": -0.4673287570476532, + "rewards/rejected": -0.632540762424469, + "step": 6674 + }, + { + "epoch": 0.77, + "learning_rate": 7.019782277888329e-08, + "logits/chosen": -3.6494016647338867, + "logits/rejected": -3.647291660308838, + "logps/chosen": -305.6177978515625, + "logps/rejected": -291.98211669921875, + "loss": 0.3706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.005999669432640076, + "rewards/margins": 1.6800059080123901, + "rewards/rejected": -1.6860055923461914, + "step": 6675 + }, + { + "epoch": 0.77, + "learning_rate": 7.016270630925904e-08, + "logits/chosen": -2.1459193229675293, + "logits/rejected": -2.3247013092041016, + "logps/chosen": -304.48175048828125, + "logps/rejected": -278.87744140625, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19487211108207703, + "rewards/margins": 2.0786263942718506, + "rewards/rejected": -2.27349853515625, + "step": 6676 + }, + { + "epoch": 0.77, + "learning_rate": 7.012758983963479e-08, + "logits/chosen": -2.4983439445495605, + "logits/rejected": -2.3572745323181152, + "logps/chosen": -368.85272216796875, + "logps/rejected": -395.04937744140625, + "loss": 0.5107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08674860000610352, + "rewards/margins": 0.9968321323394775, + "rewards/rejected": -1.083580732345581, + "step": 6677 + }, + { + "epoch": 0.77, + "learning_rate": 7.009247337001054e-08, + "logits/chosen": -3.2500767707824707, + "logits/rejected": -3.1882596015930176, + "logps/chosen": -307.1824035644531, + "logps/rejected": -298.03289794921875, + "loss": 0.4929, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41428425908088684, + "rewards/margins": 1.2904757261276245, + "rewards/rejected": -1.7047600746154785, + "step": 6678 + }, + { + "epoch": 0.77, + "learning_rate": 7.005735690038628e-08, + "logits/chosen": -3.0657925605773926, + "logits/rejected": -2.844149112701416, + "logps/chosen": -267.1283874511719, + "logps/rejected": -245.80654907226562, + "loss": 0.2491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.053504228591918945, + "rewards/margins": 2.0084354877471924, + "rewards/rejected": -1.9549312591552734, + "step": 6679 + }, + { + "epoch": 0.77, + "learning_rate": 7.002224043076202e-08, + "logits/chosen": -3.683476448059082, + "logits/rejected": -3.304522752761841, + "logps/chosen": -232.00079345703125, + "logps/rejected": -151.9200897216797, + "loss": 0.6956, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5242166519165039, + "rewards/margins": 0.9087802171707153, + "rewards/rejected": -1.4329968690872192, + "step": 6680 + }, + { + "epoch": 0.77, + "learning_rate": 6.998712396113776e-08, + "logits/chosen": -2.591982364654541, + "logits/rejected": -2.9178860187530518, + "logps/chosen": -253.76431274414062, + "logps/rejected": -218.83712768554688, + "loss": 0.545, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46065133810043335, + "rewards/margins": 2.1175832748413086, + "rewards/rejected": -2.5782344341278076, + "step": 6681 + }, + { + "epoch": 0.77, + "learning_rate": 6.995200749151351e-08, + "logits/chosen": -3.4662060737609863, + "logits/rejected": -3.510436534881592, + "logps/chosen": -182.4345703125, + "logps/rejected": -218.57492065429688, + "loss": 0.2201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14503085613250732, + "rewards/margins": 1.7174546718597412, + "rewards/rejected": -1.862485647201538, + "step": 6682 + }, + { + "epoch": 0.77, + "learning_rate": 6.991689102188926e-08, + "logits/chosen": -3.5678062438964844, + "logits/rejected": -3.510648250579834, + "logps/chosen": -273.54888916015625, + "logps/rejected": -229.04791259765625, + "loss": 0.2161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05583646893501282, + "rewards/margins": 1.977405071258545, + "rewards/rejected": -2.0332415103912354, + "step": 6683 + }, + { + "epoch": 0.77, + "learning_rate": 6.988177455226501e-08, + "logits/chosen": -3.1037189960479736, + "logits/rejected": -2.9564051628112793, + "logps/chosen": -169.9220733642578, + "logps/rejected": -156.9796600341797, + "loss": 0.4259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.051321886479854584, + "rewards/margins": 1.4328068494796753, + "rewards/rejected": -1.484128713607788, + "step": 6684 + }, + { + "epoch": 0.77, + "learning_rate": 6.984665808264075e-08, + "logits/chosen": -4.028126239776611, + "logits/rejected": -3.4977357387542725, + "logps/chosen": -285.17303466796875, + "logps/rejected": -243.04954528808594, + "loss": 0.7534, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6664621829986572, + "rewards/margins": 1.6112804412841797, + "rewards/rejected": -2.277742624282837, + "step": 6685 + }, + { + "epoch": 0.77, + "learning_rate": 6.98115416130165e-08, + "logits/chosen": -3.0857648849487305, + "logits/rejected": -2.881385326385498, + "logps/chosen": -207.3128662109375, + "logps/rejected": -205.68197631835938, + "loss": 0.3716, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40231770277023315, + "rewards/margins": 1.7757490873336792, + "rewards/rejected": -2.1780667304992676, + "step": 6686 + }, + { + "epoch": 0.77, + "learning_rate": 6.977642514339224e-08, + "logits/chosen": -3.3062264919281006, + "logits/rejected": -3.8700459003448486, + "logps/chosen": -327.03826904296875, + "logps/rejected": -308.7749328613281, + "loss": 0.2786, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4860183596611023, + "rewards/margins": 3.414578437805176, + "rewards/rejected": -2.928560256958008, + "step": 6687 + }, + { + "epoch": 0.77, + "learning_rate": 6.9741308673768e-08, + "logits/chosen": -3.3069076538085938, + "logits/rejected": -2.9672775268554688, + "logps/chosen": -197.88490295410156, + "logps/rejected": -231.03875732421875, + "loss": 0.3622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3107379674911499, + "rewards/margins": 1.9883081912994385, + "rewards/rejected": -2.299046039581299, + "step": 6688 + }, + { + "epoch": 0.77, + "learning_rate": 6.970619220414374e-08, + "logits/chosen": -2.2867507934570312, + "logits/rejected": -2.3371777534484863, + "logps/chosen": -329.5579833984375, + "logps/rejected": -273.6596984863281, + "loss": 0.3441, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1117854118347168, + "rewards/margins": 1.8767683506011963, + "rewards/rejected": -1.9885536432266235, + "step": 6689 + }, + { + "epoch": 0.77, + "learning_rate": 6.967107573451949e-08, + "logits/chosen": -2.403434991836548, + "logits/rejected": -2.490604877471924, + "logps/chosen": -321.72454833984375, + "logps/rejected": -294.1709899902344, + "loss": 0.378, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1586051881313324, + "rewards/margins": 1.4651830196380615, + "rewards/rejected": -1.3065778017044067, + "step": 6690 + }, + { + "epoch": 0.77, + "learning_rate": 6.963595926489523e-08, + "logits/chosen": -3.4632797241210938, + "logits/rejected": -2.8650028705596924, + "logps/chosen": -224.56494140625, + "logps/rejected": -221.48805236816406, + "loss": 0.5331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09693184494972229, + "rewards/margins": 0.7122924327850342, + "rewards/rejected": -0.8092243075370789, + "step": 6691 + }, + { + "epoch": 0.77, + "learning_rate": 6.960084279527099e-08, + "logits/chosen": -2.65057373046875, + "logits/rejected": -2.860349178314209, + "logps/chosen": -269.19403076171875, + "logps/rejected": -252.79336547851562, + "loss": 0.3104, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.005473703145980835, + "rewards/margins": 2.2291879653930664, + "rewards/rejected": -2.23466157913208, + "step": 6692 + }, + { + "epoch": 0.77, + "learning_rate": 6.956572632564673e-08, + "logits/chosen": -2.7140681743621826, + "logits/rejected": -2.541001319885254, + "logps/chosen": -173.672119140625, + "logps/rejected": -200.89979553222656, + "loss": 0.2195, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3719238042831421, + "rewards/margins": 1.954592227935791, + "rewards/rejected": -1.5826683044433594, + "step": 6693 + }, + { + "epoch": 0.77, + "learning_rate": 6.953060985602247e-08, + "logits/chosen": -2.369978904724121, + "logits/rejected": -2.412426710128784, + "logps/chosen": -491.5198059082031, + "logps/rejected": -305.1441650390625, + "loss": 0.5659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3981339931488037, + "rewards/margins": 0.8056747913360596, + "rewards/rejected": -1.2038087844848633, + "step": 6694 + }, + { + "epoch": 0.77, + "learning_rate": 6.949549338639822e-08, + "logits/chosen": -3.1163010597229004, + "logits/rejected": -2.7899465560913086, + "logps/chosen": -249.82479858398438, + "logps/rejected": -238.3206787109375, + "loss": 0.4163, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6367157101631165, + "rewards/margins": 0.979875922203064, + "rewards/rejected": -1.6165915727615356, + "step": 6695 + }, + { + "epoch": 0.77, + "learning_rate": 6.946037691677396e-08, + "logits/chosen": -2.7141871452331543, + "logits/rejected": -2.260397434234619, + "logps/chosen": -238.58926391601562, + "logps/rejected": -253.93014526367188, + "loss": 0.3872, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3174598813056946, + "rewards/margins": 2.9219038486480713, + "rewards/rejected": -3.2393639087677, + "step": 6696 + }, + { + "epoch": 0.77, + "learning_rate": 6.94252604471497e-08, + "logits/chosen": -3.612842559814453, + "logits/rejected": -3.3571887016296387, + "logps/chosen": -202.368408203125, + "logps/rejected": -128.33477783203125, + "loss": 0.628, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7427929043769836, + "rewards/margins": 0.3290589451789856, + "rewards/rejected": -1.0718518495559692, + "step": 6697 + }, + { + "epoch": 0.77, + "learning_rate": 6.939014397752546e-08, + "logits/chosen": -2.4758729934692383, + "logits/rejected": -2.435983896255493, + "logps/chosen": -231.92620849609375, + "logps/rejected": -224.98638916015625, + "loss": 0.5777, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08920292556285858, + "rewards/margins": 0.8306562900543213, + "rewards/rejected": -0.9198591709136963, + "step": 6698 + }, + { + "epoch": 0.77, + "learning_rate": 6.93550275079012e-08, + "logits/chosen": -2.631636381149292, + "logits/rejected": -2.5341367721557617, + "logps/chosen": -167.33358764648438, + "logps/rejected": -202.0367889404297, + "loss": 0.5301, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14251533150672913, + "rewards/margins": 1.1371806859970093, + "rewards/rejected": -0.994665265083313, + "step": 6699 + }, + { + "epoch": 0.77, + "learning_rate": 6.931991103827695e-08, + "logits/chosen": -3.4491076469421387, + "logits/rejected": -3.3249030113220215, + "logps/chosen": -280.7845153808594, + "logps/rejected": -340.880126953125, + "loss": 0.3092, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23002813756465912, + "rewards/margins": 2.1367220878601074, + "rewards/rejected": -1.906693935394287, + "step": 6700 + }, + { + "epoch": 0.77, + "learning_rate": 6.928479456865269e-08, + "logits/chosen": -4.165854454040527, + "logits/rejected": -3.9465556144714355, + "logps/chosen": -308.9183349609375, + "logps/rejected": -217.78736877441406, + "loss": 0.3733, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2005363404750824, + "rewards/margins": 2.523404121398926, + "rewards/rejected": -2.723940372467041, + "step": 6701 + }, + { + "epoch": 0.77, + "learning_rate": 6.924967809902845e-08, + "logits/chosen": -3.0210328102111816, + "logits/rejected": -3.1567909717559814, + "logps/chosen": -249.5545654296875, + "logps/rejected": -296.9821472167969, + "loss": 0.3312, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.028331607580184937, + "rewards/margins": 1.805163025856018, + "rewards/rejected": -1.8334946632385254, + "step": 6702 + }, + { + "epoch": 0.77, + "learning_rate": 6.921456162940419e-08, + "logits/chosen": -2.8079795837402344, + "logits/rejected": -2.703418254852295, + "logps/chosen": -250.61407470703125, + "logps/rejected": -208.05105590820312, + "loss": 0.509, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38428425788879395, + "rewards/margins": 1.064389705657959, + "rewards/rejected": -1.448673963546753, + "step": 6703 + }, + { + "epoch": 0.77, + "learning_rate": 6.917944515977994e-08, + "logits/chosen": -3.065878391265869, + "logits/rejected": -3.3464512825012207, + "logps/chosen": -207.82421875, + "logps/rejected": -180.16165161132812, + "loss": 0.308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10318169742822647, + "rewards/margins": 1.5119572877883911, + "rewards/rejected": -1.6151390075683594, + "step": 6704 + }, + { + "epoch": 0.77, + "learning_rate": 6.914432869015568e-08, + "logits/chosen": -2.4405694007873535, + "logits/rejected": -2.912033796310425, + "logps/chosen": -171.74766540527344, + "logps/rejected": -220.39625549316406, + "loss": 0.3265, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20217232406139374, + "rewards/margins": 1.8300931453704834, + "rewards/rejected": -1.6279206275939941, + "step": 6705 + }, + { + "epoch": 0.77, + "learning_rate": 6.910921222053144e-08, + "logits/chosen": -3.0810532569885254, + "logits/rejected": -2.9901366233825684, + "logps/chosen": -201.556884765625, + "logps/rejected": -231.21929931640625, + "loss": 0.4034, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16011367738246918, + "rewards/margins": 1.2109529972076416, + "rewards/rejected": -1.0508393049240112, + "step": 6706 + }, + { + "epoch": 0.77, + "learning_rate": 6.907409575090718e-08, + "logits/chosen": -3.664017677307129, + "logits/rejected": -3.8751847743988037, + "logps/chosen": -128.16595458984375, + "logps/rejected": -208.48812866210938, + "loss": 0.3481, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02687002532184124, + "rewards/margins": 1.6927893161773682, + "rewards/rejected": -1.665919542312622, + "step": 6707 + }, + { + "epoch": 0.77, + "learning_rate": 6.903897928128292e-08, + "logits/chosen": -3.779604434967041, + "logits/rejected": -3.592599868774414, + "logps/chosen": -331.807861328125, + "logps/rejected": -268.95294189453125, + "loss": 0.5366, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34618818759918213, + "rewards/margins": 1.5669677257537842, + "rewards/rejected": -1.9131557941436768, + "step": 6708 + }, + { + "epoch": 0.77, + "learning_rate": 6.900386281165867e-08, + "logits/chosen": -1.9799443483352661, + "logits/rejected": -2.3324592113494873, + "logps/chosen": -388.5754699707031, + "logps/rejected": -304.8343200683594, + "loss": 0.2636, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.041544921696186066, + "rewards/margins": 1.6176334619522095, + "rewards/rejected": -1.576088547706604, + "step": 6709 + }, + { + "epoch": 0.77, + "learning_rate": 6.896874634203441e-08, + "logits/chosen": -3.3241117000579834, + "logits/rejected": -3.6553640365600586, + "logps/chosen": -156.78369140625, + "logps/rejected": -184.84396362304688, + "loss": 0.4084, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5225830078125, + "rewards/margins": 1.7056708335876465, + "rewards/rejected": -2.2282538414001465, + "step": 6710 + }, + { + "epoch": 0.77, + "learning_rate": 6.893362987241015e-08, + "logits/chosen": -2.772982597351074, + "logits/rejected": -2.9071977138519287, + "logps/chosen": -292.4184875488281, + "logps/rejected": -298.1151123046875, + "loss": 0.4043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6170984506607056, + "rewards/margins": 2.4586594104766846, + "rewards/rejected": -3.0757577419281006, + "step": 6711 + }, + { + "epoch": 0.77, + "learning_rate": 6.88985134027859e-08, + "logits/chosen": -2.6297972202301025, + "logits/rejected": -2.673067331314087, + "logps/chosen": -212.06954956054688, + "logps/rejected": -323.43798828125, + "loss": 0.3525, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1884596347808838, + "rewards/margins": 2.799792766571045, + "rewards/rejected": -2.9882524013519287, + "step": 6712 + }, + { + "epoch": 0.77, + "learning_rate": 6.886339693316165e-08, + "logits/chosen": -3.1794915199279785, + "logits/rejected": -3.0216410160064697, + "logps/chosen": -152.64456176757812, + "logps/rejected": -226.64984130859375, + "loss": 0.0764, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31350061297416687, + "rewards/margins": 3.30137038230896, + "rewards/rejected": -2.9878697395324707, + "step": 6713 + }, + { + "epoch": 0.77, + "learning_rate": 6.88282804635374e-08, + "logits/chosen": -2.8323333263397217, + "logits/rejected": -2.676301956176758, + "logps/chosen": -170.78399658203125, + "logps/rejected": -266.1107177734375, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47175097465515137, + "rewards/margins": 2.5664544105529785, + "rewards/rejected": -2.094703197479248, + "step": 6714 + }, + { + "epoch": 0.77, + "learning_rate": 6.879316399391314e-08, + "logits/chosen": -3.6261677742004395, + "logits/rejected": -3.4445090293884277, + "logps/chosen": -209.65435791015625, + "logps/rejected": -152.09315490722656, + "loss": 0.4167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34582990407943726, + "rewards/margins": 1.3734242916107178, + "rewards/rejected": -1.7192540168762207, + "step": 6715 + }, + { + "epoch": 0.77, + "learning_rate": 6.87580475242889e-08, + "logits/chosen": -3.9616589546203613, + "logits/rejected": -4.160224437713623, + "logps/chosen": -448.9508972167969, + "logps/rejected": -395.95074462890625, + "loss": 0.4243, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1326984465122223, + "rewards/margins": 1.4185104370117188, + "rewards/rejected": -1.2858119010925293, + "step": 6716 + }, + { + "epoch": 0.77, + "learning_rate": 6.872293105466463e-08, + "logits/chosen": -3.589077949523926, + "logits/rejected": -3.7929000854492188, + "logps/chosen": -97.75932312011719, + "logps/rejected": -164.31167602539062, + "loss": 0.2832, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5288053154945374, + "rewards/margins": 2.668259620666504, + "rewards/rejected": -2.1394546031951904, + "step": 6717 + }, + { + "epoch": 0.77, + "learning_rate": 6.868781458504038e-08, + "logits/chosen": -3.2190661430358887, + "logits/rejected": -3.196746587753296, + "logps/chosen": -131.4701385498047, + "logps/rejected": -240.82192993164062, + "loss": 0.336, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2786732017993927, + "rewards/margins": 2.012192726135254, + "rewards/rejected": -1.7335196733474731, + "step": 6718 + }, + { + "epoch": 0.77, + "learning_rate": 6.865269811541613e-08, + "logits/chosen": -2.5830647945404053, + "logits/rejected": -2.7446999549865723, + "logps/chosen": -197.2930908203125, + "logps/rejected": -220.03729248046875, + "loss": 0.5581, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6865454912185669, + "rewards/margins": 1.1993141174316406, + "rewards/rejected": -1.885859489440918, + "step": 6719 + }, + { + "epoch": 0.77, + "learning_rate": 6.861758164579187e-08, + "logits/chosen": -2.7141997814178467, + "logits/rejected": -3.18156099319458, + "logps/chosen": -194.86984252929688, + "logps/rejected": -290.25103759765625, + "loss": 0.3386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5052317380905151, + "rewards/margins": 2.147275447845459, + "rewards/rejected": -2.6525073051452637, + "step": 6720 + }, + { + "epoch": 0.77, + "learning_rate": 6.858246517616762e-08, + "logits/chosen": -3.4171056747436523, + "logits/rejected": -3.4894349575042725, + "logps/chosen": -254.89073181152344, + "logps/rejected": -218.66152954101562, + "loss": 0.3948, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1206553652882576, + "rewards/margins": 2.0240213871002197, + "rewards/rejected": -2.144676685333252, + "step": 6721 + }, + { + "epoch": 0.77, + "learning_rate": 6.854734870654336e-08, + "logits/chosen": -3.953467845916748, + "logits/rejected": -3.4559316635131836, + "logps/chosen": -222.63113403320312, + "logps/rejected": -149.2471160888672, + "loss": 0.5424, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4133056700229645, + "rewards/margins": 1.4775272607803345, + "rewards/rejected": -1.8908329010009766, + "step": 6722 + }, + { + "epoch": 0.78, + "learning_rate": 6.851223223691912e-08, + "logits/chosen": -3.2970376014709473, + "logits/rejected": -3.287259578704834, + "logps/chosen": -74.08699035644531, + "logps/rejected": -177.20535278320312, + "loss": 0.314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2937467098236084, + "rewards/margins": 1.9125843048095703, + "rewards/rejected": -2.2063307762145996, + "step": 6723 + }, + { + "epoch": 0.78, + "learning_rate": 6.847711576729486e-08, + "logits/chosen": -3.0797908306121826, + "logits/rejected": -3.4974417686462402, + "logps/chosen": -96.86317443847656, + "logps/rejected": -199.33294677734375, + "loss": 0.2675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09369315952062607, + "rewards/margins": 2.2629525661468506, + "rewards/rejected": -2.1692593097686768, + "step": 6724 + }, + { + "epoch": 0.78, + "learning_rate": 6.84419992976706e-08, + "logits/chosen": -2.5442748069763184, + "logits/rejected": -2.6962130069732666, + "logps/chosen": -333.09405517578125, + "logps/rejected": -353.643798828125, + "loss": 0.2489, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11452893912792206, + "rewards/margins": 3.045459270477295, + "rewards/rejected": -2.9309303760528564, + "step": 6725 + }, + { + "epoch": 0.78, + "learning_rate": 6.840688282804635e-08, + "logits/chosen": -3.0283076763153076, + "logits/rejected": -3.076552152633667, + "logps/chosen": -127.31571960449219, + "logps/rejected": -241.12603759765625, + "loss": 0.8069, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8784939050674438, + "rewards/margins": 1.5343942642211914, + "rewards/rejected": -2.4128880500793457, + "step": 6726 + }, + { + "epoch": 0.78, + "learning_rate": 6.83717663584221e-08, + "logits/chosen": -3.639894962310791, + "logits/rejected": -3.145113468170166, + "logps/chosen": -125.79888916015625, + "logps/rejected": -113.52177429199219, + "loss": 0.5894, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27103590965270996, + "rewards/margins": 0.8505220413208008, + "rewards/rejected": -1.1215579509735107, + "step": 6727 + }, + { + "epoch": 0.78, + "learning_rate": 6.833664988879783e-08, + "logits/chosen": -2.985213041305542, + "logits/rejected": -2.97275447845459, + "logps/chosen": -238.44332885742188, + "logps/rejected": -400.7316589355469, + "loss": 0.2834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3571183383464813, + "rewards/margins": 2.4925742149353027, + "rewards/rejected": -2.8496925830841064, + "step": 6728 + }, + { + "epoch": 0.78, + "learning_rate": 6.830153341917359e-08, + "logits/chosen": -3.8371853828430176, + "logits/rejected": -4.0859270095825195, + "logps/chosen": -209.26126098632812, + "logps/rejected": -244.828369140625, + "loss": 0.2125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37621572613716125, + "rewards/margins": 2.1031761169433594, + "rewards/rejected": -2.479391574859619, + "step": 6729 + }, + { + "epoch": 0.78, + "learning_rate": 6.826641694954933e-08, + "logits/chosen": -3.2145960330963135, + "logits/rejected": -2.757065773010254, + "logps/chosen": -282.7794189453125, + "logps/rejected": -288.9834899902344, + "loss": 0.4508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4973795413970947, + "rewards/margins": 1.7604317665100098, + "rewards/rejected": -2.2578113079071045, + "step": 6730 + }, + { + "epoch": 0.78, + "learning_rate": 6.823130047992508e-08, + "logits/chosen": -3.1276392936706543, + "logits/rejected": -3.2401773929595947, + "logps/chosen": -164.40130615234375, + "logps/rejected": -211.66004943847656, + "loss": 0.2822, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4409925639629364, + "rewards/margins": 2.64860200881958, + "rewards/rejected": -2.207609176635742, + "step": 6731 + }, + { + "epoch": 0.78, + "learning_rate": 6.819618401030082e-08, + "logits/chosen": -2.721419095993042, + "logits/rejected": -2.879438877105713, + "logps/chosen": -205.6863250732422, + "logps/rejected": -365.1981506347656, + "loss": 0.4535, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08001469075679779, + "rewards/margins": 1.901955246925354, + "rewards/rejected": -1.8219406604766846, + "step": 6732 + }, + { + "epoch": 0.78, + "learning_rate": 6.816106754067658e-08, + "logits/chosen": -3.140615463256836, + "logits/rejected": -2.9657602310180664, + "logps/chosen": -189.47483825683594, + "logps/rejected": -248.07785034179688, + "loss": 0.2646, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19043980538845062, + "rewards/margins": 2.8594305515289307, + "rewards/rejected": -2.6689910888671875, + "step": 6733 + }, + { + "epoch": 0.78, + "learning_rate": 6.812595107105232e-08, + "logits/chosen": -3.192077159881592, + "logits/rejected": -3.2576041221618652, + "logps/chosen": -292.09600830078125, + "logps/rejected": -383.9446105957031, + "loss": 0.3518, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5639198422431946, + "rewards/margins": 3.098264694213867, + "rewards/rejected": -2.5343449115753174, + "step": 6734 + }, + { + "epoch": 0.78, + "learning_rate": 6.809083460142807e-08, + "logits/chosen": -3.1543610095977783, + "logits/rejected": -3.1077637672424316, + "logps/chosen": -253.5218505859375, + "logps/rejected": -260.5858154296875, + "loss": 0.9603, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7870556116104126, + "rewards/margins": 0.790581226348877, + "rewards/rejected": -2.57763671875, + "step": 6735 + }, + { + "epoch": 0.78, + "learning_rate": 6.805571813180381e-08, + "logits/chosen": -3.3754167556762695, + "logits/rejected": -3.0748298168182373, + "logps/chosen": -194.5589141845703, + "logps/rejected": -154.12843322753906, + "loss": 0.3291, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48940175771713257, + "rewards/margins": 1.190769076347351, + "rewards/rejected": -1.6801708936691284, + "step": 6736 + }, + { + "epoch": 0.78, + "learning_rate": 6.802060166217957e-08, + "logits/chosen": -2.3164124488830566, + "logits/rejected": -2.206270456314087, + "logps/chosen": -167.54647827148438, + "logps/rejected": -346.6318359375, + "loss": 0.2911, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49878543615341187, + "rewards/margins": 2.0892560482025146, + "rewards/rejected": -1.590470552444458, + "step": 6737 + }, + { + "epoch": 0.78, + "learning_rate": 6.798548519255531e-08, + "logits/chosen": -2.5072081089019775, + "logits/rejected": -2.5344462394714355, + "logps/chosen": -391.7557678222656, + "logps/rejected": -384.0386047363281, + "loss": 0.7118, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6141217947006226, + "rewards/margins": 0.5466809272766113, + "rewards/rejected": -1.1608027219772339, + "step": 6738 + }, + { + "epoch": 0.78, + "learning_rate": 6.795036872293105e-08, + "logits/chosen": -3.234917402267456, + "logits/rejected": -3.0003583431243896, + "logps/chosen": -140.91015625, + "logps/rejected": -212.99766540527344, + "loss": 0.4784, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5570802688598633, + "rewards/margins": 0.9766693115234375, + "rewards/rejected": -1.5337495803833008, + "step": 6739 + }, + { + "epoch": 0.78, + "learning_rate": 6.79152522533068e-08, + "logits/chosen": -3.507350444793701, + "logits/rejected": -3.6469550132751465, + "logps/chosen": -300.06341552734375, + "logps/rejected": -289.1601257324219, + "loss": 0.4252, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21855475008487701, + "rewards/margins": 1.4474685192108154, + "rewards/rejected": -1.2289139032363892, + "step": 6740 + }, + { + "epoch": 0.78, + "learning_rate": 6.788013578368254e-08, + "logits/chosen": -3.430755138397217, + "logits/rejected": -3.4893906116485596, + "logps/chosen": -233.16452026367188, + "logps/rejected": -234.4214324951172, + "loss": 0.3757, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48202037811279297, + "rewards/margins": 1.6968953609466553, + "rewards/rejected": -2.1789157390594482, + "step": 6741 + }, + { + "epoch": 0.78, + "learning_rate": 6.784501931405828e-08, + "logits/chosen": -2.916330337524414, + "logits/rejected": -3.223453998565674, + "logps/chosen": -380.7151794433594, + "logps/rejected": -315.21337890625, + "loss": 0.5683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11315461993217468, + "rewards/margins": 0.6936452984809875, + "rewards/rejected": -0.8067998886108398, + "step": 6742 + }, + { + "epoch": 0.78, + "learning_rate": 6.780990284443404e-08, + "logits/chosen": -2.292304515838623, + "logits/rejected": -2.218099594116211, + "logps/chosen": -379.0765686035156, + "logps/rejected": -304.52001953125, + "loss": 0.7581, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03990750014781952, + "rewards/margins": 0.5408694744110107, + "rewards/rejected": -0.5009620189666748, + "step": 6743 + }, + { + "epoch": 0.78, + "learning_rate": 6.777478637480978e-08, + "logits/chosen": -3.335839033126831, + "logits/rejected": -3.1805925369262695, + "logps/chosen": -159.94342041015625, + "logps/rejected": -213.87875366210938, + "loss": 0.4119, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15493294596672058, + "rewards/margins": 1.755934715270996, + "rewards/rejected": -1.6010017395019531, + "step": 6744 + }, + { + "epoch": 0.78, + "learning_rate": 6.773966990518553e-08, + "logits/chosen": -3.0763607025146484, + "logits/rejected": -3.220470905303955, + "logps/chosen": -193.21755981445312, + "logps/rejected": -246.71707153320312, + "loss": 0.3546, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.157790407538414, + "rewards/margins": 1.964208722114563, + "rewards/rejected": -1.8064182996749878, + "step": 6745 + }, + { + "epoch": 0.78, + "learning_rate": 6.770455343556127e-08, + "logits/chosen": -4.093331336975098, + "logits/rejected": -3.918449878692627, + "logps/chosen": -273.9168701171875, + "logps/rejected": -225.55035400390625, + "loss": 0.4165, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0437413826584816, + "rewards/margins": 1.8876534700393677, + "rewards/rejected": -1.9313948154449463, + "step": 6746 + }, + { + "epoch": 0.78, + "learning_rate": 6.766943696593703e-08, + "logits/chosen": -2.823594093322754, + "logits/rejected": -2.828108549118042, + "logps/chosen": -178.00807189941406, + "logps/rejected": -167.67849731445312, + "loss": 0.3685, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14032909274101257, + "rewards/margins": 1.632938265800476, + "rewards/rejected": -1.773267388343811, + "step": 6747 + }, + { + "epoch": 0.78, + "learning_rate": 6.763432049631277e-08, + "logits/chosen": -3.5244622230529785, + "logits/rejected": -2.9853432178497314, + "logps/chosen": -326.2583312988281, + "logps/rejected": -375.45306396484375, + "loss": 0.4111, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3484554588794708, + "rewards/margins": 1.8319854736328125, + "rewards/rejected": -1.4835301637649536, + "step": 6748 + }, + { + "epoch": 0.78, + "learning_rate": 6.759920402668852e-08, + "logits/chosen": -3.5182995796203613, + "logits/rejected": -3.472938299179077, + "logps/chosen": -178.6217041015625, + "logps/rejected": -212.73318481445312, + "loss": 0.4376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2701353132724762, + "rewards/margins": 1.1895174980163574, + "rewards/rejected": -1.4596527814865112, + "step": 6749 + }, + { + "epoch": 0.78, + "learning_rate": 6.756408755706426e-08, + "logits/chosen": -3.1557199954986572, + "logits/rejected": -3.30847430229187, + "logps/chosen": -186.4058837890625, + "logps/rejected": -349.13092041015625, + "loss": 0.4101, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15429064631462097, + "rewards/margins": 1.6989136934280396, + "rewards/rejected": -1.5446232557296753, + "step": 6750 + }, + { + "epoch": 0.78, + "learning_rate": 6.752897108744001e-08, + "logits/chosen": -2.824800729751587, + "logits/rejected": -2.909651279449463, + "logps/chosen": -237.24766540527344, + "logps/rejected": -248.22439575195312, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04972691833972931, + "rewards/margins": 2.4869258403778076, + "rewards/rejected": -2.5366525650024414, + "step": 6751 + }, + { + "epoch": 0.78, + "learning_rate": 6.749385461781576e-08, + "logits/chosen": -2.609004497528076, + "logits/rejected": -2.7477595806121826, + "logps/chosen": -244.7191162109375, + "logps/rejected": -409.5538635253906, + "loss": 0.151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06118001043796539, + "rewards/margins": 2.9354121685028076, + "rewards/rejected": -2.874232292175293, + "step": 6752 + }, + { + "epoch": 0.78, + "learning_rate": 6.74587381481915e-08, + "logits/chosen": -3.1830174922943115, + "logits/rejected": -3.420806407928467, + "logps/chosen": -366.7705078125, + "logps/rejected": -322.2606201171875, + "loss": 0.2428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03745002672076225, + "rewards/margins": 1.7738221883773804, + "rewards/rejected": -1.8112722635269165, + "step": 6753 + }, + { + "epoch": 0.78, + "learning_rate": 6.742362167856725e-08, + "logits/chosen": -3.294076919555664, + "logits/rejected": -3.3481709957122803, + "logps/chosen": -104.99702453613281, + "logps/rejected": -195.2013702392578, + "loss": 0.3271, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17698076367378235, + "rewards/margins": 2.5153186321258545, + "rewards/rejected": -2.3383378982543945, + "step": 6754 + }, + { + "epoch": 0.78, + "learning_rate": 6.738850520894299e-08, + "logits/chosen": -3.2617311477661133, + "logits/rejected": -3.1979317665100098, + "logps/chosen": -250.09423828125, + "logps/rejected": -282.5974426269531, + "loss": 0.4308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41982129216194153, + "rewards/margins": 2.2229840755462646, + "rewards/rejected": -2.642805576324463, + "step": 6755 + }, + { + "epoch": 0.78, + "learning_rate": 6.735338873931873e-08, + "logits/chosen": -2.6085855960845947, + "logits/rejected": -2.5268099308013916, + "logps/chosen": -385.3382873535156, + "logps/rejected": -295.58770751953125, + "loss": 0.5526, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1918623149394989, + "rewards/margins": 1.3135995864868164, + "rewards/rejected": -1.1217373609542847, + "step": 6756 + }, + { + "epoch": 0.78, + "learning_rate": 6.731827226969448e-08, + "logits/chosen": -3.7514588832855225, + "logits/rejected": -3.706843852996826, + "logps/chosen": -352.1486511230469, + "logps/rejected": -377.5674743652344, + "loss": 0.7044, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6865320205688477, + "rewards/margins": 1.8352543115615845, + "rewards/rejected": -2.5217862129211426, + "step": 6757 + }, + { + "epoch": 0.78, + "learning_rate": 6.728315580007023e-08, + "logits/chosen": -3.1319174766540527, + "logits/rejected": -2.9577760696411133, + "logps/chosen": -346.95306396484375, + "logps/rejected": -297.95489501953125, + "loss": 0.3384, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23028407990932465, + "rewards/margins": 2.3285415172576904, + "rewards/rejected": -2.5588254928588867, + "step": 6758 + }, + { + "epoch": 0.78, + "learning_rate": 6.724803933044598e-08, + "logits/chosen": -3.114828109741211, + "logits/rejected": -3.174327850341797, + "logps/chosen": -287.14227294921875, + "logps/rejected": -172.8885040283203, + "loss": 0.4205, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.050583165138959885, + "rewards/margins": 1.3143872022628784, + "rewards/rejected": -1.2638039588928223, + "step": 6759 + }, + { + "epoch": 0.78, + "learning_rate": 6.721292286082172e-08, + "logits/chosen": -2.3377041816711426, + "logits/rejected": -2.2714385986328125, + "logps/chosen": -445.12890625, + "logps/rejected": -344.93212890625, + "loss": 0.5583, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18782201409339905, + "rewards/margins": 1.1310091018676758, + "rewards/rejected": -1.3188310861587524, + "step": 6760 + }, + { + "epoch": 0.78, + "learning_rate": 6.717780639119747e-08, + "logits/chosen": -3.6790966987609863, + "logits/rejected": -3.457943916320801, + "logps/chosen": -190.5650634765625, + "logps/rejected": -183.3907470703125, + "loss": 0.3711, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03913339972496033, + "rewards/margins": 1.7355867624282837, + "rewards/rejected": -1.69645357131958, + "step": 6761 + }, + { + "epoch": 0.78, + "learning_rate": 6.714268992157321e-08, + "logits/chosen": -2.9540276527404785, + "logits/rejected": -3.2087879180908203, + "logps/chosen": -310.9033203125, + "logps/rejected": -377.8594665527344, + "loss": 0.3364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07904690504074097, + "rewards/margins": 2.6294355392456055, + "rewards/rejected": -2.708482503890991, + "step": 6762 + }, + { + "epoch": 0.78, + "learning_rate": 6.710757345194897e-08, + "logits/chosen": -3.121577262878418, + "logits/rejected": -2.9825241565704346, + "logps/chosen": -190.72232055664062, + "logps/rejected": -249.54039001464844, + "loss": 0.2601, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32170891761779785, + "rewards/margins": 2.3561086654663086, + "rewards/rejected": -2.6778173446655273, + "step": 6763 + }, + { + "epoch": 0.78, + "learning_rate": 6.707245698232471e-08, + "logits/chosen": -2.6934216022491455, + "logits/rejected": -2.5509557723999023, + "logps/chosen": -219.39537048339844, + "logps/rejected": -198.03453063964844, + "loss": 0.3693, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.053799454122781754, + "rewards/margins": 2.354480266571045, + "rewards/rejected": -2.4082798957824707, + "step": 6764 + }, + { + "epoch": 0.78, + "learning_rate": 6.703734051270045e-08, + "logits/chosen": -3.3928158283233643, + "logits/rejected": -3.656370162963867, + "logps/chosen": -200.74713134765625, + "logps/rejected": -196.04647827148438, + "loss": 0.4213, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24045610427856445, + "rewards/margins": 1.6359903812408447, + "rewards/rejected": -1.3955341577529907, + "step": 6765 + }, + { + "epoch": 0.78, + "learning_rate": 6.70022240430762e-08, + "logits/chosen": -3.3586671352386475, + "logits/rejected": -3.0453810691833496, + "logps/chosen": -281.5528869628906, + "logps/rejected": -233.470458984375, + "loss": 0.21, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18034525215625763, + "rewards/margins": 1.91818368434906, + "rewards/rejected": -1.7378385066986084, + "step": 6766 + }, + { + "epoch": 0.78, + "learning_rate": 6.696710757345194e-08, + "logits/chosen": -2.9963741302490234, + "logits/rejected": -3.0809192657470703, + "logps/chosen": -325.0373229980469, + "logps/rejected": -338.1695556640625, + "loss": 0.3675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08370541781187057, + "rewards/margins": 1.7142030000686646, + "rewards/rejected": -1.6304974555969238, + "step": 6767 + }, + { + "epoch": 0.78, + "learning_rate": 6.69319911038277e-08, + "logits/chosen": -3.9133057594299316, + "logits/rejected": -4.145673751831055, + "logps/chosen": -141.12066650390625, + "logps/rejected": -197.3154296875, + "loss": 0.2254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10973822325468063, + "rewards/margins": 2.369058847427368, + "rewards/rejected": -2.47879695892334, + "step": 6768 + }, + { + "epoch": 0.78, + "learning_rate": 6.689687463420344e-08, + "logits/chosen": -3.24739933013916, + "logits/rejected": -2.9996776580810547, + "logps/chosen": -219.65505981445312, + "logps/rejected": -140.8477325439453, + "loss": 0.3351, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08886474370956421, + "rewards/margins": 1.2973827123641968, + "rewards/rejected": -1.2085179090499878, + "step": 6769 + }, + { + "epoch": 0.78, + "learning_rate": 6.686175816457918e-08, + "logits/chosen": -3.363778591156006, + "logits/rejected": -2.9350457191467285, + "logps/chosen": -222.83401489257812, + "logps/rejected": -140.0408477783203, + "loss": 0.3642, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30889570713043213, + "rewards/margins": 1.2068887948989868, + "rewards/rejected": -0.8979930877685547, + "step": 6770 + }, + { + "epoch": 0.78, + "learning_rate": 6.682664169495493e-08, + "logits/chosen": -3.74440336227417, + "logits/rejected": -3.0728302001953125, + "logps/chosen": -344.6134338378906, + "logps/rejected": -234.85174560546875, + "loss": 0.4611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15568304061889648, + "rewards/margins": 1.9276459217071533, + "rewards/rejected": -2.08332896232605, + "step": 6771 + }, + { + "epoch": 0.78, + "learning_rate": 6.679152522533067e-08, + "logits/chosen": -3.187014102935791, + "logits/rejected": -2.7611026763916016, + "logps/chosen": -400.0369873046875, + "logps/rejected": -366.6341552734375, + "loss": 0.248, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5380492210388184, + "rewards/margins": 2.0029382705688477, + "rewards/rejected": -1.4648890495300293, + "step": 6772 + }, + { + "epoch": 0.78, + "learning_rate": 6.675640875570641e-08, + "logits/chosen": -3.7856593132019043, + "logits/rejected": -3.3120570182800293, + "logps/chosen": -176.43128967285156, + "logps/rejected": -233.589599609375, + "loss": 0.2833, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28247469663619995, + "rewards/margins": 2.0179269313812256, + "rewards/rejected": -2.300401449203491, + "step": 6773 + }, + { + "epoch": 0.78, + "learning_rate": 6.672129228608217e-08, + "logits/chosen": -2.836500406265259, + "logits/rejected": -2.7420554161071777, + "logps/chosen": -241.29519653320312, + "logps/rejected": -273.9883117675781, + "loss": 0.2079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23826636373996735, + "rewards/margins": 2.014457941055298, + "rewards/rejected": -1.7761915922164917, + "step": 6774 + }, + { + "epoch": 0.78, + "learning_rate": 6.668617581645791e-08, + "logits/chosen": -2.2259774208068848, + "logits/rejected": -2.2946832180023193, + "logps/chosen": -295.7873229980469, + "logps/rejected": -362.86376953125, + "loss": 0.3509, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06256968528032303, + "rewards/margins": 1.7125999927520752, + "rewards/rejected": -1.7751696109771729, + "step": 6775 + }, + { + "epoch": 0.78, + "learning_rate": 6.665105934683366e-08, + "logits/chosen": -2.91729736328125, + "logits/rejected": -3.104247570037842, + "logps/chosen": -243.43309020996094, + "logps/rejected": -226.74729919433594, + "loss": 0.4557, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.028580009937286377, + "rewards/margins": 1.3193166255950928, + "rewards/rejected": -1.2907365560531616, + "step": 6776 + }, + { + "epoch": 0.78, + "learning_rate": 6.66159428772094e-08, + "logits/chosen": -2.8264846801757812, + "logits/rejected": -2.886767864227295, + "logps/chosen": -249.47784423828125, + "logps/rejected": -345.46807861328125, + "loss": 0.1872, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2977813482284546, + "rewards/margins": 2.637699842453003, + "rewards/rejected": -2.339918613433838, + "step": 6777 + }, + { + "epoch": 0.78, + "learning_rate": 6.658082640758516e-08, + "logits/chosen": -3.1995604038238525, + "logits/rejected": -3.302152633666992, + "logps/chosen": -242.20147705078125, + "logps/rejected": -239.53463745117188, + "loss": 0.3446, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4255322515964508, + "rewards/margins": 2.058684825897217, + "rewards/rejected": -1.633152723312378, + "step": 6778 + }, + { + "epoch": 0.78, + "learning_rate": 6.65457099379609e-08, + "logits/chosen": -3.6889188289642334, + "logits/rejected": -3.450836181640625, + "logps/chosen": -315.62506103515625, + "logps/rejected": -386.10528564453125, + "loss": 0.145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4720994234085083, + "rewards/margins": 2.7224912643432617, + "rewards/rejected": -3.1945910453796387, + "step": 6779 + }, + { + "epoch": 0.78, + "learning_rate": 6.651059346833665e-08, + "logits/chosen": -3.357210874557495, + "logits/rejected": -3.532439708709717, + "logps/chosen": -130.021240234375, + "logps/rejected": -150.75794982910156, + "loss": 0.3988, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1779811531305313, + "rewards/margins": 1.5697866678237915, + "rewards/rejected": -1.7477679252624512, + "step": 6780 + }, + { + "epoch": 0.78, + "learning_rate": 6.647547699871239e-08, + "logits/chosen": -2.7669804096221924, + "logits/rejected": -2.6656670570373535, + "logps/chosen": -343.6640625, + "logps/rejected": -286.4060974121094, + "loss": 0.2533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.322679340839386, + "rewards/margins": 1.9554792642593384, + "rewards/rejected": -1.6328001022338867, + "step": 6781 + }, + { + "epoch": 0.78, + "learning_rate": 6.644036052908815e-08, + "logits/chosen": -3.6595458984375, + "logits/rejected": -3.6741576194763184, + "logps/chosen": -319.9183349609375, + "logps/rejected": -178.3214111328125, + "loss": 0.3645, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021932169795036316, + "rewards/margins": 1.3726439476013184, + "rewards/rejected": -1.394576072692871, + "step": 6782 + }, + { + "epoch": 0.78, + "learning_rate": 6.640524405946389e-08, + "logits/chosen": -2.7663049697875977, + "logits/rejected": -2.773707866668701, + "logps/chosen": -298.0637512207031, + "logps/rejected": -331.1943054199219, + "loss": 0.6273, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21168047189712524, + "rewards/margins": 1.6439831256866455, + "rewards/rejected": -1.8556636571884155, + "step": 6783 + }, + { + "epoch": 0.78, + "learning_rate": 6.637012758983963e-08, + "logits/chosen": -2.413454532623291, + "logits/rejected": -2.4851861000061035, + "logps/chosen": -313.65740966796875, + "logps/rejected": -325.2154846191406, + "loss": 0.744, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6182162165641785, + "rewards/margins": 0.6682770848274231, + "rewards/rejected": -1.2864933013916016, + "step": 6784 + }, + { + "epoch": 0.78, + "learning_rate": 6.633501112021538e-08, + "logits/chosen": -2.8339695930480957, + "logits/rejected": -2.8127851486206055, + "logps/chosen": -255.8428955078125, + "logps/rejected": -264.88763427734375, + "loss": 0.3299, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5841181874275208, + "rewards/margins": 1.8814163208007812, + "rewards/rejected": -1.2972981929779053, + "step": 6785 + }, + { + "epoch": 0.78, + "learning_rate": 6.629989465059112e-08, + "logits/chosen": -3.440809965133667, + "logits/rejected": -3.3545117378234863, + "logps/chosen": -276.4635009765625, + "logps/rejected": -277.1734313964844, + "loss": 0.8527, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1395409107208252, + "rewards/margins": -0.1582939624786377, + "rewards/rejected": -0.981246829032898, + "step": 6786 + }, + { + "epoch": 0.78, + "learning_rate": 6.626477818096686e-08, + "logits/chosen": -2.8321609497070312, + "logits/rejected": -2.9400930404663086, + "logps/chosen": -277.513427734375, + "logps/rejected": -272.1589050292969, + "loss": 0.3343, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26306143403053284, + "rewards/margins": 2.845707416534424, + "rewards/rejected": -2.582645893096924, + "step": 6787 + }, + { + "epoch": 0.78, + "learning_rate": 6.622966171134262e-08, + "logits/chosen": -3.9973134994506836, + "logits/rejected": -3.818408966064453, + "logps/chosen": -282.3304138183594, + "logps/rejected": -231.53199768066406, + "loss": 0.3774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5747742056846619, + "rewards/margins": 1.8720349073410034, + "rewards/rejected": -2.4468092918395996, + "step": 6788 + }, + { + "epoch": 0.78, + "learning_rate": 6.619454524171836e-08, + "logits/chosen": -3.307832717895508, + "logits/rejected": -3.1887705326080322, + "logps/chosen": -175.63357543945312, + "logps/rejected": -128.66934204101562, + "loss": 0.5811, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8687562346458435, + "rewards/margins": 0.5754307508468628, + "rewards/rejected": -1.444187045097351, + "step": 6789 + }, + { + "epoch": 0.78, + "learning_rate": 6.615942877209411e-08, + "logits/chosen": -3.338956594467163, + "logits/rejected": -3.8372104167938232, + "logps/chosen": -232.51220703125, + "logps/rejected": -250.4418182373047, + "loss": 0.3114, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.045563653111457825, + "rewards/margins": 2.0462989807128906, + "rewards/rejected": -2.000735282897949, + "step": 6790 + }, + { + "epoch": 0.78, + "learning_rate": 6.612431230246985e-08, + "logits/chosen": -3.0698294639587402, + "logits/rejected": -3.0826964378356934, + "logps/chosen": -267.5284423828125, + "logps/rejected": -252.05587768554688, + "loss": 0.598, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6337540149688721, + "rewards/margins": 1.0007356405258179, + "rewards/rejected": -1.6344897747039795, + "step": 6791 + }, + { + "epoch": 0.78, + "learning_rate": 6.60891958328456e-08, + "logits/chosen": -3.7292609214782715, + "logits/rejected": -3.66865873336792, + "logps/chosen": -259.9373779296875, + "logps/rejected": -200.75469970703125, + "loss": 0.4879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01669202744960785, + "rewards/margins": 2.5636091232299805, + "rewards/rejected": -2.580300807952881, + "step": 6792 + }, + { + "epoch": 0.78, + "learning_rate": 6.605407936322135e-08, + "logits/chosen": -2.5751566886901855, + "logits/rejected": -2.7059214115142822, + "logps/chosen": -278.82928466796875, + "logps/rejected": -324.973876953125, + "loss": 0.1452, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.011072635650634766, + "rewards/margins": 3.033458948135376, + "rewards/rejected": -3.022386312484741, + "step": 6793 + }, + { + "epoch": 0.78, + "learning_rate": 6.60189628935971e-08, + "logits/chosen": -2.929518699645996, + "logits/rejected": -3.1311535835266113, + "logps/chosen": -176.34332275390625, + "logps/rejected": -283.4391174316406, + "loss": 0.4829, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5106788873672485, + "rewards/margins": 1.731013536453247, + "rewards/rejected": -2.241692304611206, + "step": 6794 + }, + { + "epoch": 0.78, + "learning_rate": 6.598384642397284e-08, + "logits/chosen": -2.2326102256774902, + "logits/rejected": -2.2284975051879883, + "logps/chosen": -493.1743469238281, + "logps/rejected": -302.8863220214844, + "loss": 0.2954, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02053987979888916, + "rewards/margins": 2.1247901916503906, + "rewards/rejected": -2.1453301906585693, + "step": 6795 + }, + { + "epoch": 0.78, + "learning_rate": 6.59487299543486e-08, + "logits/chosen": -3.254190444946289, + "logits/rejected": -3.0437073707580566, + "logps/chosen": -251.1642608642578, + "logps/rejected": -206.36558532714844, + "loss": 0.5846, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20217253267765045, + "rewards/margins": 1.5759482383728027, + "rewards/rejected": -1.778120756149292, + "step": 6796 + }, + { + "epoch": 0.78, + "learning_rate": 6.591361348472433e-08, + "logits/chosen": -3.623806953430176, + "logits/rejected": -3.25968861579895, + "logps/chosen": -213.21212768554688, + "logps/rejected": -194.13119506835938, + "loss": 0.3354, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3657728433609009, + "rewards/margins": 1.3045611381530762, + "rewards/rejected": -0.9387881755828857, + "step": 6797 + }, + { + "epoch": 0.78, + "learning_rate": 6.587849701510009e-08, + "logits/chosen": -2.8545451164245605, + "logits/rejected": -2.9689009189605713, + "logps/chosen": -428.3218994140625, + "logps/rejected": -402.47186279296875, + "loss": 0.1716, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11881456524133682, + "rewards/margins": 2.7766900062561035, + "rewards/rejected": -2.8955044746398926, + "step": 6798 + }, + { + "epoch": 0.78, + "learning_rate": 6.584338054547583e-08, + "logits/chosen": -3.151240348815918, + "logits/rejected": -3.191244125366211, + "logps/chosen": -248.50411987304688, + "logps/rejected": -275.0198059082031, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07250522077083588, + "rewards/margins": 2.02418851852417, + "rewards/rejected": -1.9516834020614624, + "step": 6799 + }, + { + "epoch": 0.78, + "learning_rate": 6.580826407585157e-08, + "logits/chosen": -2.534738063812256, + "logits/rejected": -2.695298194885254, + "logps/chosen": -344.35687255859375, + "logps/rejected": -233.5052490234375, + "loss": 0.3189, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15096017718315125, + "rewards/margins": 1.6633756160736084, + "rewards/rejected": -1.5124156475067139, + "step": 6800 + }, + { + "epoch": 0.78, + "learning_rate": 6.577314760622731e-08, + "logits/chosen": -3.6435608863830566, + "logits/rejected": -3.618330717086792, + "logps/chosen": -372.45458984375, + "logps/rejected": -424.6346435546875, + "loss": 0.3405, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36903122067451477, + "rewards/margins": 1.4902716875076294, + "rewards/rejected": -1.8593029975891113, + "step": 6801 + }, + { + "epoch": 0.78, + "learning_rate": 6.573803113660306e-08, + "logits/chosen": -3.243924617767334, + "logits/rejected": -3.005852699279785, + "logps/chosen": -290.582275390625, + "logps/rejected": -231.67242431640625, + "loss": 0.3837, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26784196496009827, + "rewards/margins": 1.3116226196289062, + "rewards/rejected": -1.5794646739959717, + "step": 6802 + }, + { + "epoch": 0.78, + "learning_rate": 6.57029146669788e-08, + "logits/chosen": -3.0750694274902344, + "logits/rejected": -3.0231032371520996, + "logps/chosen": -265.1117858886719, + "logps/rejected": -264.3658447265625, + "loss": 0.2899, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07153302431106567, + "rewards/margins": 2.1334519386291504, + "rewards/rejected": -2.2049849033355713, + "step": 6803 + }, + { + "epoch": 0.78, + "learning_rate": 6.566779819735456e-08, + "logits/chosen": -3.1575279235839844, + "logits/rejected": -3.2016336917877197, + "logps/chosen": -188.55160522460938, + "logps/rejected": -404.25933837890625, + "loss": 0.39, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49122166633605957, + "rewards/margins": 2.0866000652313232, + "rewards/rejected": -2.577822208404541, + "step": 6804 + }, + { + "epoch": 0.78, + "learning_rate": 6.56326817277303e-08, + "logits/chosen": -2.8930015563964844, + "logits/rejected": -2.9958088397979736, + "logps/chosen": -224.76318359375, + "logps/rejected": -237.51742553710938, + "loss": 0.3313, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5508943796157837, + "rewards/margins": 1.6808643341064453, + "rewards/rejected": -1.1299699544906616, + "step": 6805 + }, + { + "epoch": 0.78, + "learning_rate": 6.559756525810605e-08, + "logits/chosen": -3.676809549331665, + "logits/rejected": -3.416433095932007, + "logps/chosen": -290.9303894042969, + "logps/rejected": -263.67584228515625, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29353535175323486, + "rewards/margins": 3.2508864402770996, + "rewards/rejected": -3.544421911239624, + "step": 6806 + }, + { + "epoch": 0.78, + "learning_rate": 6.55624487884818e-08, + "logits/chosen": -3.5082030296325684, + "logits/rejected": -3.3803114891052246, + "logps/chosen": -291.1971435546875, + "logps/rejected": -139.90341186523438, + "loss": 0.3721, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.008053764700889587, + "rewards/margins": 1.2492146492004395, + "rewards/rejected": -1.2572684288024902, + "step": 6807 + }, + { + "epoch": 0.78, + "learning_rate": 6.552733231885755e-08, + "logits/chosen": -3.324326515197754, + "logits/rejected": -3.219181776046753, + "logps/chosen": -137.62664794921875, + "logps/rejected": -160.95101928710938, + "loss": 0.3778, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22813719511032104, + "rewards/margins": 1.5762128829956055, + "rewards/rejected": -1.8043501377105713, + "step": 6808 + }, + { + "epoch": 0.78, + "learning_rate": 6.549221584923329e-08, + "logits/chosen": -3.5332770347595215, + "logits/rejected": -3.6086230278015137, + "logps/chosen": -145.14193725585938, + "logps/rejected": -185.52456665039062, + "loss": 0.6059, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7385649085044861, + "rewards/margins": 1.2557134628295898, + "rewards/rejected": -1.9942783117294312, + "step": 6809 + }, + { + "epoch": 0.79, + "learning_rate": 6.545709937960904e-08, + "logits/chosen": -2.8462533950805664, + "logits/rejected": -2.814697027206421, + "logps/chosen": -200.31187438964844, + "logps/rejected": -177.14085388183594, + "loss": 0.29, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22135767340660095, + "rewards/margins": 2.3665382862091064, + "rewards/rejected": -2.1451804637908936, + "step": 6810 + }, + { + "epoch": 0.79, + "learning_rate": 6.542198290998478e-08, + "logits/chosen": -4.075146198272705, + "logits/rejected": -3.4060144424438477, + "logps/chosen": -462.3502197265625, + "logps/rejected": -235.717041015625, + "loss": 0.6143, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1132855415344238, + "rewards/margins": 1.1087597608566284, + "rewards/rejected": -2.222045421600342, + "step": 6811 + }, + { + "epoch": 0.79, + "learning_rate": 6.538686644036054e-08, + "logits/chosen": -2.4754040241241455, + "logits/rejected": -2.546112298965454, + "logps/chosen": -123.22860717773438, + "logps/rejected": -258.0232238769531, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18078333139419556, + "rewards/margins": 2.830596923828125, + "rewards/rejected": -3.011380195617676, + "step": 6812 + }, + { + "epoch": 0.79, + "learning_rate": 6.535174997073628e-08, + "logits/chosen": -2.5822649002075195, + "logits/rejected": -2.540574550628662, + "logps/chosen": -429.48834228515625, + "logps/rejected": -270.4426574707031, + "loss": 0.5582, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24676388502120972, + "rewards/margins": 0.573560893535614, + "rewards/rejected": -0.3267970085144043, + "step": 6813 + }, + { + "epoch": 0.79, + "learning_rate": 6.531663350111202e-08, + "logits/chosen": -3.6841418743133545, + "logits/rejected": -3.4845662117004395, + "logps/chosen": -405.1519775390625, + "logps/rejected": -234.80450439453125, + "loss": 0.4682, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4059818387031555, + "rewards/margins": 1.407465934753418, + "rewards/rejected": -1.8134475946426392, + "step": 6814 + }, + { + "epoch": 0.79, + "learning_rate": 6.528151703148777e-08, + "logits/chosen": -2.416717290878296, + "logits/rejected": -2.509876251220703, + "logps/chosen": -348.9311218261719, + "logps/rejected": -206.58132934570312, + "loss": 0.6232, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3263876736164093, + "rewards/margins": 0.6023266315460205, + "rewards/rejected": -0.9287142753601074, + "step": 6815 + }, + { + "epoch": 0.79, + "learning_rate": 6.524640056186351e-08, + "logits/chosen": -3.0835540294647217, + "logits/rejected": -3.078287124633789, + "logps/chosen": -410.3081359863281, + "logps/rejected": -322.372802734375, + "loss": 0.144, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44953134655952454, + "rewards/margins": 4.033242225646973, + "rewards/rejected": -3.5837106704711914, + "step": 6816 + }, + { + "epoch": 0.79, + "learning_rate": 6.521128409223925e-08, + "logits/chosen": -3.110285758972168, + "logits/rejected": -2.9107210636138916, + "logps/chosen": -252.9046173095703, + "logps/rejected": -179.50257873535156, + "loss": 0.4523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26467981934547424, + "rewards/margins": 1.929786205291748, + "rewards/rejected": -2.1944661140441895, + "step": 6817 + }, + { + "epoch": 0.79, + "learning_rate": 6.5176167622615e-08, + "logits/chosen": -3.3176422119140625, + "logits/rejected": -3.4946208000183105, + "logps/chosen": -288.4944763183594, + "logps/rejected": -193.10409545898438, + "loss": 0.6439, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23782888054847717, + "rewards/margins": 1.6781200170516968, + "rewards/rejected": -1.9159488677978516, + "step": 6818 + }, + { + "epoch": 0.79, + "learning_rate": 6.514105115299075e-08, + "logits/chosen": -3.3733420372009277, + "logits/rejected": -3.5341546535491943, + "logps/chosen": -280.0050048828125, + "logps/rejected": -271.9837951660156, + "loss": 0.2155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07101286947727203, + "rewards/margins": 1.7053579092025757, + "rewards/rejected": -1.6343448162078857, + "step": 6819 + }, + { + "epoch": 0.79, + "learning_rate": 6.510593468336649e-08, + "logits/chosen": -3.0949971675872803, + "logits/rejected": -2.9136245250701904, + "logps/chosen": -252.19403076171875, + "logps/rejected": -139.9955291748047, + "loss": 0.3175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005233511328697205, + "rewards/margins": 1.298378586769104, + "rewards/rejected": -1.3036121129989624, + "step": 6820 + }, + { + "epoch": 0.79, + "learning_rate": 6.507081821374224e-08, + "logits/chosen": -2.731044054031372, + "logits/rejected": -3.0995235443115234, + "logps/chosen": -289.49560546875, + "logps/rejected": -297.180908203125, + "loss": 0.4163, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21933630108833313, + "rewards/margins": 1.7601103782653809, + "rewards/rejected": -1.9794467687606812, + "step": 6821 + }, + { + "epoch": 0.79, + "learning_rate": 6.503570174411798e-08, + "logits/chosen": -2.9692575931549072, + "logits/rejected": -3.4178466796875, + "logps/chosen": -204.48736572265625, + "logps/rejected": -250.20037841796875, + "loss": 0.394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24500279128551483, + "rewards/margins": 1.3969091176986694, + "rewards/rejected": -1.6419118642807007, + "step": 6822 + }, + { + "epoch": 0.79, + "learning_rate": 6.500058527449374e-08, + "logits/chosen": -3.520697593688965, + "logits/rejected": -3.402129650115967, + "logps/chosen": -382.16400146484375, + "logps/rejected": -261.93499755859375, + "loss": 0.379, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13567866384983063, + "rewards/margins": 1.9754241704940796, + "rewards/rejected": -1.8397455215454102, + "step": 6823 + }, + { + "epoch": 0.79, + "learning_rate": 6.496546880486948e-08, + "logits/chosen": -2.646972179412842, + "logits/rejected": -2.864185333251953, + "logps/chosen": -104.6274185180664, + "logps/rejected": -171.93748474121094, + "loss": 0.6351, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5043372511863708, + "rewards/margins": 0.32665354013442993, + "rewards/rejected": -0.8309907913208008, + "step": 6824 + }, + { + "epoch": 0.79, + "learning_rate": 6.493035233524523e-08, + "logits/chosen": -3.1588165760040283, + "logits/rejected": -3.0602598190307617, + "logps/chosen": -315.955810546875, + "logps/rejected": -276.62884521484375, + "loss": 0.2363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.004336923360824585, + "rewards/margins": 2.298098564147949, + "rewards/rejected": -2.3024356365203857, + "step": 6825 + }, + { + "epoch": 0.79, + "learning_rate": 6.489523586562097e-08, + "logits/chosen": -3.055269718170166, + "logits/rejected": -2.679443597793579, + "logps/chosen": -233.97152709960938, + "logps/rejected": -209.08705139160156, + "loss": 0.3441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.025846242904663086, + "rewards/margins": 1.4674662351608276, + "rewards/rejected": -1.4933124780654907, + "step": 6826 + }, + { + "epoch": 0.79, + "learning_rate": 6.486011939599673e-08, + "logits/chosen": -3.4356706142425537, + "logits/rejected": -3.129436492919922, + "logps/chosen": -291.881591796875, + "logps/rejected": -176.69021606445312, + "loss": 0.2119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3934863805770874, + "rewards/margins": 2.326484203338623, + "rewards/rejected": -2.719970464706421, + "step": 6827 + }, + { + "epoch": 0.79, + "learning_rate": 6.482500292637247e-08, + "logits/chosen": -2.8660202026367188, + "logits/rejected": -3.2505035400390625, + "logps/chosen": -141.67660522460938, + "logps/rejected": -170.54150390625, + "loss": 0.9362, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7895552515983582, + "rewards/margins": 0.45149314403533936, + "rewards/rejected": -1.2410484552383423, + "step": 6828 + }, + { + "epoch": 0.79, + "learning_rate": 6.478988645674822e-08, + "logits/chosen": -2.951704263687134, + "logits/rejected": -2.724363088607788, + "logps/chosen": -218.65257263183594, + "logps/rejected": -338.906005859375, + "loss": 0.181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11450247466564178, + "rewards/margins": 2.7068371772766113, + "rewards/rejected": -2.8213396072387695, + "step": 6829 + }, + { + "epoch": 0.79, + "learning_rate": 6.475476998712396e-08, + "logits/chosen": -2.655768632888794, + "logits/rejected": -2.5083253383636475, + "logps/chosen": -318.69573974609375, + "logps/rejected": -212.20181274414062, + "loss": 0.6272, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14670154452323914, + "rewards/margins": 0.700103759765625, + "rewards/rejected": -0.8468053340911865, + "step": 6830 + }, + { + "epoch": 0.79, + "learning_rate": 6.47196535174997e-08, + "logits/chosen": -3.526679039001465, + "logits/rejected": -3.421184778213501, + "logps/chosen": -184.98226928710938, + "logps/rejected": -212.30490112304688, + "loss": 0.2182, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2574253976345062, + "rewards/margins": 2.6191964149475098, + "rewards/rejected": -2.3617711067199707, + "step": 6831 + }, + { + "epoch": 0.79, + "learning_rate": 6.468453704787545e-08, + "logits/chosen": -3.065885066986084, + "logits/rejected": -3.1498255729675293, + "logps/chosen": -160.78990173339844, + "logps/rejected": -182.32052612304688, + "loss": 0.2004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9904659986495972, + "rewards/margins": 2.6232385635375977, + "rewards/rejected": -1.6327725648880005, + "step": 6832 + }, + { + "epoch": 0.79, + "learning_rate": 6.46494205782512e-08, + "logits/chosen": -3.322476625442505, + "logits/rejected": -3.3201043605804443, + "logps/chosen": -263.06976318359375, + "logps/rejected": -278.8676452636719, + "loss": 0.2113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1049315333366394, + "rewards/margins": 2.6032118797302246, + "rewards/rejected": -2.4982802867889404, + "step": 6833 + }, + { + "epoch": 0.79, + "learning_rate": 6.461430410862694e-08, + "logits/chosen": -3.4816198348999023, + "logits/rejected": -3.0614264011383057, + "logps/chosen": -537.0972900390625, + "logps/rejected": -357.44903564453125, + "loss": 0.2455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09117260575294495, + "rewards/margins": 1.797135353088379, + "rewards/rejected": -1.8883081674575806, + "step": 6834 + }, + { + "epoch": 0.79, + "learning_rate": 6.457918763900269e-08, + "logits/chosen": -3.2478866577148438, + "logits/rejected": -3.455711841583252, + "logps/chosen": -265.21160888671875, + "logps/rejected": -258.420654296875, + "loss": 0.2991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28122416138648987, + "rewards/margins": 1.3312146663665771, + "rewards/rejected": -1.6124387979507446, + "step": 6835 + }, + { + "epoch": 0.79, + "learning_rate": 6.454407116937843e-08, + "logits/chosen": -2.487886905670166, + "logits/rejected": -2.1274523735046387, + "logps/chosen": -198.15780639648438, + "logps/rejected": -287.2400207519531, + "loss": 0.4461, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22394424676895142, + "rewards/margins": 1.759330153465271, + "rewards/rejected": -1.5353859663009644, + "step": 6836 + }, + { + "epoch": 0.79, + "learning_rate": 6.450895469975418e-08, + "logits/chosen": -3.334784507751465, + "logits/rejected": -3.4259417057037354, + "logps/chosen": -205.77072143554688, + "logps/rejected": -342.7296142578125, + "loss": 0.4599, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22416365146636963, + "rewards/margins": 1.3657113313674927, + "rewards/rejected": -1.141547679901123, + "step": 6837 + }, + { + "epoch": 0.79, + "learning_rate": 6.447383823012992e-08, + "logits/chosen": -2.63106632232666, + "logits/rejected": -2.457960605621338, + "logps/chosen": -315.22119140625, + "logps/rejected": -244.2825927734375, + "loss": 0.4086, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23190326988697052, + "rewards/margins": 1.2602914571762085, + "rewards/rejected": -1.0283881425857544, + "step": 6838 + }, + { + "epoch": 0.79, + "learning_rate": 6.443872176050568e-08, + "logits/chosen": -3.7818500995635986, + "logits/rejected": -3.5495409965515137, + "logps/chosen": -322.14459228515625, + "logps/rejected": -368.0477294921875, + "loss": 0.2012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2929685115814209, + "rewards/margins": 3.0361342430114746, + "rewards/rejected": -2.7431657314300537, + "step": 6839 + }, + { + "epoch": 0.79, + "learning_rate": 6.440360529088142e-08, + "logits/chosen": -3.5276098251342773, + "logits/rejected": -3.145176649093628, + "logps/chosen": -187.4014434814453, + "logps/rejected": -226.14730834960938, + "loss": 0.364, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1325991451740265, + "rewards/margins": 1.8107569217681885, + "rewards/rejected": -1.9433560371398926, + "step": 6840 + }, + { + "epoch": 0.79, + "learning_rate": 6.436848882125717e-08, + "logits/chosen": -3.291114330291748, + "logits/rejected": -3.165273427963257, + "logps/chosen": -313.86248779296875, + "logps/rejected": -209.34783935546875, + "loss": 0.3262, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6855396032333374, + "rewards/margins": 1.6952064037322998, + "rewards/rejected": -2.3807458877563477, + "step": 6841 + }, + { + "epoch": 0.79, + "learning_rate": 6.433337235163291e-08, + "logits/chosen": -2.854081630706787, + "logits/rejected": -3.0671939849853516, + "logps/chosen": -362.5175476074219, + "logps/rejected": -205.03860473632812, + "loss": 0.3861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19321422278881073, + "rewards/margins": 1.5972611904144287, + "rewards/rejected": -1.7904754877090454, + "step": 6842 + }, + { + "epoch": 0.79, + "learning_rate": 6.429825588200867e-08, + "logits/chosen": -2.711979389190674, + "logits/rejected": -2.952932119369507, + "logps/chosen": -265.2267761230469, + "logps/rejected": -213.13897705078125, + "loss": 0.4093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7980820536613464, + "rewards/margins": 1.2786426544189453, + "rewards/rejected": -2.0767245292663574, + "step": 6843 + }, + { + "epoch": 0.79, + "learning_rate": 6.426313941238441e-08, + "logits/chosen": -3.6388802528381348, + "logits/rejected": -3.4347779750823975, + "logps/chosen": -252.03073120117188, + "logps/rejected": -196.06954956054688, + "loss": 0.6733, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17113324999809265, + "rewards/margins": 0.9060022830963135, + "rewards/rejected": -1.0771355628967285, + "step": 6844 + }, + { + "epoch": 0.79, + "learning_rate": 6.422802294276015e-08, + "logits/chosen": -3.3597936630249023, + "logits/rejected": -3.6872520446777344, + "logps/chosen": -192.5332794189453, + "logps/rejected": -356.21484375, + "loss": 0.6125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03905147314071655, + "rewards/margins": 1.770747184753418, + "rewards/rejected": -1.8097987174987793, + "step": 6845 + }, + { + "epoch": 0.79, + "learning_rate": 6.41929064731359e-08, + "logits/chosen": -3.152092456817627, + "logits/rejected": -2.998012065887451, + "logps/chosen": -268.07440185546875, + "logps/rejected": -295.1825866699219, + "loss": 0.2639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06760285794734955, + "rewards/margins": 2.1348395347595215, + "rewards/rejected": -2.2024424076080322, + "step": 6846 + }, + { + "epoch": 0.79, + "learning_rate": 6.415779000351164e-08, + "logits/chosen": -3.4751336574554443, + "logits/rejected": -3.2016994953155518, + "logps/chosen": -331.830322265625, + "logps/rejected": -226.12942504882812, + "loss": 0.3918, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40196648240089417, + "rewards/margins": 1.5671952962875366, + "rewards/rejected": -1.9691617488861084, + "step": 6847 + }, + { + "epoch": 0.79, + "learning_rate": 6.412267353388738e-08, + "logits/chosen": -3.483635663986206, + "logits/rejected": -3.550340414047241, + "logps/chosen": -148.66677856445312, + "logps/rejected": -149.79293823242188, + "loss": 0.4052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.020672565326094627, + "rewards/margins": 1.3683110475540161, + "rewards/rejected": -1.3476386070251465, + "step": 6848 + }, + { + "epoch": 0.79, + "learning_rate": 6.408755706426314e-08, + "logits/chosen": -3.5034255981445312, + "logits/rejected": -3.1293697357177734, + "logps/chosen": -292.9563293457031, + "logps/rejected": -232.19862365722656, + "loss": 0.3061, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.163407564163208, + "rewards/margins": 2.2269904613494873, + "rewards/rejected": -2.3903980255126953, + "step": 6849 + }, + { + "epoch": 0.79, + "learning_rate": 6.405244059463888e-08, + "logits/chosen": -3.2432289123535156, + "logits/rejected": -2.998201847076416, + "logps/chosen": -224.025390625, + "logps/rejected": -338.45501708984375, + "loss": 0.5427, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5834694504737854, + "rewards/margins": 0.4937437176704407, + "rewards/rejected": -1.077213168144226, + "step": 6850 + }, + { + "epoch": 0.79, + "learning_rate": 6.401732412501463e-08, + "logits/chosen": -2.7048275470733643, + "logits/rejected": -2.884060859680176, + "logps/chosen": -341.81610107421875, + "logps/rejected": -250.89706420898438, + "loss": 0.3224, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32893475890159607, + "rewards/margins": 1.8751518726348877, + "rewards/rejected": -2.2040867805480957, + "step": 6851 + }, + { + "epoch": 0.79, + "learning_rate": 6.398220765539037e-08, + "logits/chosen": -2.9495837688446045, + "logits/rejected": -3.0840003490448, + "logps/chosen": -244.6237335205078, + "logps/rejected": -247.16070556640625, + "loss": 0.3171, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23765386641025543, + "rewards/margins": 2.435872793197632, + "rewards/rejected": -2.198219060897827, + "step": 6852 + }, + { + "epoch": 0.79, + "learning_rate": 6.394709118576613e-08, + "logits/chosen": -2.576324939727783, + "logits/rejected": -2.7576746940612793, + "logps/chosen": -357.1170349121094, + "logps/rejected": -321.25433349609375, + "loss": 0.2696, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14340072870254517, + "rewards/margins": 2.000047445297241, + "rewards/rejected": -1.8566467761993408, + "step": 6853 + }, + { + "epoch": 0.79, + "learning_rate": 6.391197471614187e-08, + "logits/chosen": -2.7901506423950195, + "logits/rejected": -2.627124309539795, + "logps/chosen": -246.06459045410156, + "logps/rejected": -317.5487976074219, + "loss": 0.4376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11575320363044739, + "rewards/margins": 1.1238127946853638, + "rewards/rejected": -1.2395659685134888, + "step": 6854 + }, + { + "epoch": 0.79, + "learning_rate": 6.387685824651762e-08, + "logits/chosen": -3.6491472721099854, + "logits/rejected": -3.2009377479553223, + "logps/chosen": -164.7742919921875, + "logps/rejected": -257.3173828125, + "loss": 0.1883, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15473785996437073, + "rewards/margins": 3.5676796436309814, + "rewards/rejected": -3.4129416942596436, + "step": 6855 + }, + { + "epoch": 0.79, + "learning_rate": 6.384174177689336e-08, + "logits/chosen": -3.6167948246002197, + "logits/rejected": -3.400479316711426, + "logps/chosen": -255.48751831054688, + "logps/rejected": -237.25625610351562, + "loss": 0.2229, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7463910579681396, + "rewards/margins": 2.4294629096984863, + "rewards/rejected": -1.6830718517303467, + "step": 6856 + }, + { + "epoch": 0.79, + "learning_rate": 6.380662530726912e-08, + "logits/chosen": -3.2815089225769043, + "logits/rejected": -2.9852240085601807, + "logps/chosen": -215.11795043945312, + "logps/rejected": -200.4466552734375, + "loss": 0.4554, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32384806871414185, + "rewards/margins": 1.28456711769104, + "rewards/rejected": -1.608415126800537, + "step": 6857 + }, + { + "epoch": 0.79, + "learning_rate": 6.377150883764486e-08, + "logits/chosen": -2.9041736125946045, + "logits/rejected": -2.953225612640381, + "logps/chosen": -364.7045593261719, + "logps/rejected": -420.2625732421875, + "loss": 0.3503, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0018400996923446655, + "rewards/margins": 2.2849137783050537, + "rewards/rejected": -2.283073663711548, + "step": 6858 + }, + { + "epoch": 0.79, + "learning_rate": 6.37363923680206e-08, + "logits/chosen": -3.1414053440093994, + "logits/rejected": -3.195016860961914, + "logps/chosen": -169.529052734375, + "logps/rejected": -186.9878387451172, + "loss": 0.3968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13334645330905914, + "rewards/margins": 1.7156486511230469, + "rewards/rejected": -1.8489950895309448, + "step": 6859 + }, + { + "epoch": 0.79, + "learning_rate": 6.370127589839635e-08, + "logits/chosen": -2.6622955799102783, + "logits/rejected": -2.5270578861236572, + "logps/chosen": -284.8570556640625, + "logps/rejected": -298.7757568359375, + "loss": 0.3865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1261647641658783, + "rewards/margins": 1.1796964406967163, + "rewards/rejected": -1.3058613538742065, + "step": 6860 + }, + { + "epoch": 0.79, + "learning_rate": 6.366615942877209e-08, + "logits/chosen": -3.5499696731567383, + "logits/rejected": -3.4584922790527344, + "logps/chosen": -256.90087890625, + "logps/rejected": -313.9091491699219, + "loss": 0.2595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07758432626724243, + "rewards/margins": 2.432985305786133, + "rewards/rejected": -2.5105698108673096, + "step": 6861 + }, + { + "epoch": 0.79, + "learning_rate": 6.363104295914783e-08, + "logits/chosen": -2.6109440326690674, + "logits/rejected": -2.77958083152771, + "logps/chosen": -285.7755126953125, + "logps/rejected": -317.174560546875, + "loss": 0.4043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7892271876335144, + "rewards/margins": 2.113048553466797, + "rewards/rejected": -2.902275562286377, + "step": 6862 + }, + { + "epoch": 0.79, + "learning_rate": 6.359592648952359e-08, + "logits/chosen": -3.326241970062256, + "logits/rejected": -3.6112136840820312, + "logps/chosen": -240.3668212890625, + "logps/rejected": -319.6700439453125, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6217849254608154, + "rewards/margins": 2.2391417026519775, + "rewards/rejected": -1.617356538772583, + "step": 6863 + }, + { + "epoch": 0.79, + "learning_rate": 6.356081001989933e-08, + "logits/chosen": -2.931922435760498, + "logits/rejected": -2.992692470550537, + "logps/chosen": -202.0928192138672, + "logps/rejected": -172.689208984375, + "loss": 0.164, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21721456944942474, + "rewards/margins": 2.461024284362793, + "rewards/rejected": -2.243809700012207, + "step": 6864 + }, + { + "epoch": 0.79, + "learning_rate": 6.352569355027507e-08, + "logits/chosen": -2.940180778503418, + "logits/rejected": -2.844433307647705, + "logps/chosen": -400.3804931640625, + "logps/rejected": -334.21319580078125, + "loss": 0.5852, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3360590934753418, + "rewards/margins": 1.5003061294555664, + "rewards/rejected": -1.164246916770935, + "step": 6865 + }, + { + "epoch": 0.79, + "learning_rate": 6.349057708065082e-08, + "logits/chosen": -2.623579502105713, + "logits/rejected": -2.619565010070801, + "logps/chosen": -155.14590454101562, + "logps/rejected": -297.96221923828125, + "loss": 0.3125, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6103005409240723, + "rewards/margins": 2.128880500793457, + "rewards/rejected": -1.5185800790786743, + "step": 6866 + }, + { + "epoch": 0.79, + "learning_rate": 6.345546061102656e-08, + "logits/chosen": -3.420469284057617, + "logits/rejected": -3.5028417110443115, + "logps/chosen": -313.4007263183594, + "logps/rejected": -336.6433410644531, + "loss": 0.2345, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06480172276496887, + "rewards/margins": 2.2274088859558105, + "rewards/rejected": -2.162606954574585, + "step": 6867 + }, + { + "epoch": 0.79, + "learning_rate": 6.342034414140232e-08, + "logits/chosen": -3.0210304260253906, + "logits/rejected": -3.022796869277954, + "logps/chosen": -139.5048370361328, + "logps/rejected": -221.88027954101562, + "loss": 0.2173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10810017585754395, + "rewards/margins": 3.3061201572418213, + "rewards/rejected": -3.4142203330993652, + "step": 6868 + }, + { + "epoch": 0.79, + "learning_rate": 6.338522767177806e-08, + "logits/chosen": -2.8449249267578125, + "logits/rejected": -2.953348159790039, + "logps/chosen": -201.3939208984375, + "logps/rejected": -243.2868194580078, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14871792495250702, + "rewards/margins": 2.3179965019226074, + "rewards/rejected": -2.169278383255005, + "step": 6869 + }, + { + "epoch": 0.79, + "learning_rate": 6.335011120215381e-08, + "logits/chosen": -2.5218968391418457, + "logits/rejected": -2.669893503189087, + "logps/chosen": -384.6403503417969, + "logps/rejected": -268.40850830078125, + "loss": 0.2024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5844606757164001, + "rewards/margins": 2.635908603668213, + "rewards/rejected": -2.051448106765747, + "step": 6870 + }, + { + "epoch": 0.79, + "learning_rate": 6.331499473252955e-08, + "logits/chosen": -3.293840169906616, + "logits/rejected": -3.332963466644287, + "logps/chosen": -173.6635284423828, + "logps/rejected": -100.22616577148438, + "loss": 0.5461, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.044323503971099854, + "rewards/margins": 0.6928259134292603, + "rewards/rejected": -0.7371494174003601, + "step": 6871 + }, + { + "epoch": 0.79, + "learning_rate": 6.32798782629053e-08, + "logits/chosen": -2.722712516784668, + "logits/rejected": -3.0634419918060303, + "logps/chosen": -238.21566772460938, + "logps/rejected": -293.7730407714844, + "loss": 0.2832, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05865603685379028, + "rewards/margins": 2.214850664138794, + "rewards/rejected": -2.1561944484710693, + "step": 6872 + }, + { + "epoch": 0.79, + "learning_rate": 6.324476179328105e-08, + "logits/chosen": -2.326263427734375, + "logits/rejected": -2.5099539756774902, + "logps/chosen": -237.26934814453125, + "logps/rejected": -272.9694519042969, + "loss": 0.2892, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09439561516046524, + "rewards/margins": 2.2884979248046875, + "rewards/rejected": -2.1941025257110596, + "step": 6873 + }, + { + "epoch": 0.79, + "learning_rate": 6.32096453236568e-08, + "logits/chosen": -3.479593515396118, + "logits/rejected": -3.4320316314697266, + "logps/chosen": -281.2029724121094, + "logps/rejected": -246.97161865234375, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5093991160392761, + "rewards/margins": 3.6741886138916016, + "rewards/rejected": -3.1647894382476807, + "step": 6874 + }, + { + "epoch": 0.79, + "learning_rate": 6.317452885403254e-08, + "logits/chosen": -3.695598840713501, + "logits/rejected": -3.857558250427246, + "logps/chosen": -83.91822814941406, + "logps/rejected": -229.68289184570312, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2006939798593521, + "rewards/margins": 1.437912940979004, + "rewards/rejected": -1.6386069059371948, + "step": 6875 + }, + { + "epoch": 0.79, + "learning_rate": 6.313941238440828e-08, + "logits/chosen": -3.2645087242126465, + "logits/rejected": -3.4484758377075195, + "logps/chosen": -386.1628112792969, + "logps/rejected": -269.38214111328125, + "loss": 0.324, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2500818073749542, + "rewards/margins": 2.142085313796997, + "rewards/rejected": -1.8920035362243652, + "step": 6876 + }, + { + "epoch": 0.79, + "learning_rate": 6.310429591478403e-08, + "logits/chosen": -2.999725580215454, + "logits/rejected": -3.0121803283691406, + "logps/chosen": -245.6441650390625, + "logps/rejected": -250.72647094726562, + "loss": 0.622, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6914530396461487, + "rewards/margins": 0.8629512786865234, + "rewards/rejected": -1.5544042587280273, + "step": 6877 + }, + { + "epoch": 0.79, + "learning_rate": 6.306917944515977e-08, + "logits/chosen": -3.822054386138916, + "logits/rejected": -4.000582695007324, + "logps/chosen": -298.1544494628906, + "logps/rejected": -362.7718505859375, + "loss": 0.4517, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3701764941215515, + "rewards/margins": 1.3521809577941895, + "rewards/rejected": -1.7223577499389648, + "step": 6878 + }, + { + "epoch": 0.79, + "learning_rate": 6.303406297553552e-08, + "logits/chosen": -2.864751100540161, + "logits/rejected": -3.339650869369507, + "logps/chosen": -184.45489501953125, + "logps/rejected": -283.1103820800781, + "loss": 0.379, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3413684368133545, + "rewards/margins": 2.1201016902923584, + "rewards/rejected": -1.778733253479004, + "step": 6879 + }, + { + "epoch": 0.79, + "learning_rate": 6.299894650591127e-08, + "logits/chosen": -2.5102121829986572, + "logits/rejected": -2.468625545501709, + "logps/chosen": -313.90234375, + "logps/rejected": -302.5935974121094, + "loss": 0.4965, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2958912253379822, + "rewards/margins": 1.7813442945480347, + "rewards/rejected": -1.4854531288146973, + "step": 6880 + }, + { + "epoch": 0.79, + "learning_rate": 6.296383003628701e-08, + "logits/chosen": -2.8440263271331787, + "logits/rejected": -2.8773467540740967, + "logps/chosen": -390.4270935058594, + "logps/rejected": -218.5861358642578, + "loss": 0.3455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.028527140617370605, + "rewards/margins": 1.5096914768218994, + "rewards/rejected": -1.5382184982299805, + "step": 6881 + }, + { + "epoch": 0.79, + "learning_rate": 6.292871356666276e-08, + "logits/chosen": -2.917264938354492, + "logits/rejected": -3.1138463020324707, + "logps/chosen": -405.18316650390625, + "logps/rejected": -313.18994140625, + "loss": 0.352, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22794890403747559, + "rewards/margins": 2.0674829483032227, + "rewards/rejected": -1.8395342826843262, + "step": 6882 + }, + { + "epoch": 0.79, + "learning_rate": 6.28935970970385e-08, + "logits/chosen": -2.9692158699035645, + "logits/rejected": -3.1525657176971436, + "logps/chosen": -166.4990234375, + "logps/rejected": -141.53643798828125, + "loss": 0.3239, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14463672041893005, + "rewards/margins": 1.2622301578521729, + "rewards/rejected": -1.11759352684021, + "step": 6883 + }, + { + "epoch": 0.79, + "learning_rate": 6.285848062741426e-08, + "logits/chosen": -3.0730855464935303, + "logits/rejected": -3.3860981464385986, + "logps/chosen": -299.76727294921875, + "logps/rejected": -227.4004669189453, + "loss": 0.4042, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3222920596599579, + "rewards/margins": 1.438347339630127, + "rewards/rejected": -1.7606395483016968, + "step": 6884 + }, + { + "epoch": 0.79, + "learning_rate": 6.282336415779e-08, + "logits/chosen": -1.9104608297348022, + "logits/rejected": -1.9323811531066895, + "logps/chosen": -243.21144104003906, + "logps/rejected": -290.5775451660156, + "loss": 0.4571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.024943001568317413, + "rewards/margins": 0.9101272225379944, + "rewards/rejected": -0.935070276260376, + "step": 6885 + }, + { + "epoch": 0.79, + "learning_rate": 6.278824768816575e-08, + "logits/chosen": -2.300570249557495, + "logits/rejected": -2.4551186561584473, + "logps/chosen": -201.5582275390625, + "logps/rejected": -360.60711669921875, + "loss": 0.339, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3555089831352234, + "rewards/margins": 2.3074445724487305, + "rewards/rejected": -2.6629533767700195, + "step": 6886 + }, + { + "epoch": 0.79, + "learning_rate": 6.275313121854149e-08, + "logits/chosen": -2.9524686336517334, + "logits/rejected": -3.247201919555664, + "logps/chosen": -156.8269500732422, + "logps/rejected": -273.0232238769531, + "loss": 0.2901, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08610832691192627, + "rewards/margins": 3.798002004623413, + "rewards/rejected": -3.88411021232605, + "step": 6887 + }, + { + "epoch": 0.79, + "learning_rate": 6.271801474891725e-08, + "logits/chosen": -2.7733888626098633, + "logits/rejected": -2.8369503021240234, + "logps/chosen": -333.38275146484375, + "logps/rejected": -369.7525634765625, + "loss": 0.3303, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03532882034778595, + "rewards/margins": 1.796326756477356, + "rewards/rejected": -1.8316556215286255, + "step": 6888 + }, + { + "epoch": 0.79, + "learning_rate": 6.268289827929299e-08, + "logits/chosen": -3.195446491241455, + "logits/rejected": -3.092073917388916, + "logps/chosen": -237.06082153320312, + "logps/rejected": -227.07858276367188, + "loss": 0.331, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20463049411773682, + "rewards/margins": 1.7048801183700562, + "rewards/rejected": -1.5002496242523193, + "step": 6889 + }, + { + "epoch": 0.79, + "learning_rate": 6.264778180966873e-08, + "logits/chosen": -2.7557051181793213, + "logits/rejected": -3.221975564956665, + "logps/chosen": -137.88192749023438, + "logps/rejected": -329.6307678222656, + "loss": 0.4012, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28547370433807373, + "rewards/margins": 1.6646965742111206, + "rewards/rejected": -1.3792228698730469, + "step": 6890 + }, + { + "epoch": 0.79, + "learning_rate": 6.261266534004448e-08, + "logits/chosen": -3.069709300994873, + "logits/rejected": -2.861220359802246, + "logps/chosen": -302.48358154296875, + "logps/rejected": -173.47406005859375, + "loss": 0.4968, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5652791857719421, + "rewards/margins": 0.6199121475219727, + "rewards/rejected": -1.1851913928985596, + "step": 6891 + }, + { + "epoch": 0.79, + "learning_rate": 6.257754887042022e-08, + "logits/chosen": -3.047985076904297, + "logits/rejected": -3.5076189041137695, + "logps/chosen": -239.9783935546875, + "logps/rejected": -179.6811981201172, + "loss": 0.213, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08551289141178131, + "rewards/margins": 2.653794050216675, + "rewards/rejected": -2.5682809352874756, + "step": 6892 + }, + { + "epoch": 0.79, + "learning_rate": 6.254243240079596e-08, + "logits/chosen": -3.0185861587524414, + "logits/rejected": -2.981876850128174, + "logps/chosen": -132.935546875, + "logps/rejected": -149.70928955078125, + "loss": 0.3026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33074119687080383, + "rewards/margins": 2.257427453994751, + "rewards/rejected": -2.5881686210632324, + "step": 6893 + }, + { + "epoch": 0.79, + "learning_rate": 6.250731593117172e-08, + "logits/chosen": -2.7466025352478027, + "logits/rejected": -3.099634885787964, + "logps/chosen": -374.97882080078125, + "logps/rejected": -318.3617858886719, + "loss": 0.1656, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6082127094268799, + "rewards/margins": 2.6734707355499268, + "rewards/rejected": -2.0652577877044678, + "step": 6894 + }, + { + "epoch": 0.79, + "learning_rate": 6.247219946154746e-08, + "logits/chosen": -3.1095850467681885, + "logits/rejected": -2.889153480529785, + "logps/chosen": -264.2078857421875, + "logps/rejected": -310.78070068359375, + "loss": 0.5093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1999027281999588, + "rewards/margins": 2.5612244606018066, + "rewards/rejected": -2.761127233505249, + "step": 6895 + }, + { + "epoch": 0.79, + "learning_rate": 6.243708299192321e-08, + "logits/chosen": -2.182610034942627, + "logits/rejected": -2.1407625675201416, + "logps/chosen": -272.81500244140625, + "logps/rejected": -214.58163452148438, + "loss": 0.6788, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05769185721874237, + "rewards/margins": 0.58591628074646, + "rewards/rejected": -0.5282244086265564, + "step": 6896 + }, + { + "epoch": 0.8, + "learning_rate": 6.240196652229895e-08, + "logits/chosen": -3.597557306289673, + "logits/rejected": -3.5217063426971436, + "logps/chosen": -265.26788330078125, + "logps/rejected": -181.65621948242188, + "loss": 0.3204, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1283518373966217, + "rewards/margins": 2.2489266395568848, + "rewards/rejected": -2.120574951171875, + "step": 6897 + }, + { + "epoch": 0.8, + "learning_rate": 6.23668500526747e-08, + "logits/chosen": -2.474308729171753, + "logits/rejected": -2.304273843765259, + "logps/chosen": -155.16506958007812, + "logps/rejected": -262.0389099121094, + "loss": 0.7607, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3279498219490051, + "rewards/margins": 0.6810014247894287, + "rewards/rejected": -1.008951187133789, + "step": 6898 + }, + { + "epoch": 0.8, + "learning_rate": 6.233173358305045e-08, + "logits/chosen": -2.9308013916015625, + "logits/rejected": -2.7349352836608887, + "logps/chosen": -293.2691955566406, + "logps/rejected": -327.2069091796875, + "loss": 0.2305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.009263008832931519, + "rewards/margins": 1.9479718208312988, + "rewards/rejected": -1.9572348594665527, + "step": 6899 + }, + { + "epoch": 0.8, + "learning_rate": 6.22966171134262e-08, + "logits/chosen": -3.306802272796631, + "logits/rejected": -3.810076951980591, + "logps/chosen": -296.97674560546875, + "logps/rejected": -372.856689453125, + "loss": 0.2529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48467814922332764, + "rewards/margins": 2.0994582176208496, + "rewards/rejected": -2.584136486053467, + "step": 6900 + }, + { + "epoch": 0.8, + "learning_rate": 6.226150064380194e-08, + "logits/chosen": -2.7982587814331055, + "logits/rejected": -3.0903711318969727, + "logps/chosen": -273.94537353515625, + "logps/rejected": -270.28802490234375, + "loss": 0.3282, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5703680515289307, + "rewards/margins": 2.5379395484924316, + "rewards/rejected": -1.967571496963501, + "step": 6901 + }, + { + "epoch": 0.8, + "learning_rate": 6.22263841741777e-08, + "logits/chosen": -3.9200849533081055, + "logits/rejected": -4.1158246994018555, + "logps/chosen": -100.15008544921875, + "logps/rejected": -110.8628921508789, + "loss": 0.6877, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07611338794231415, + "rewards/margins": 1.6607253551483154, + "rewards/rejected": -1.5846121311187744, + "step": 6902 + }, + { + "epoch": 0.8, + "learning_rate": 6.219126770455344e-08, + "logits/chosen": -2.787980794906616, + "logits/rejected": -2.9087626934051514, + "logps/chosen": -195.42503356933594, + "logps/rejected": -332.37939453125, + "loss": 0.4914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18333151936531067, + "rewards/margins": 2.3424930572509766, + "rewards/rejected": -2.5258243083953857, + "step": 6903 + }, + { + "epoch": 0.8, + "learning_rate": 6.215615123492918e-08, + "logits/chosen": -2.5948338508605957, + "logits/rejected": -2.721376895904541, + "logps/chosen": -254.96873474121094, + "logps/rejected": -193.92007446289062, + "loss": 0.4182, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17162173986434937, + "rewards/margins": 1.5634822845458984, + "rewards/rejected": -1.7351038455963135, + "step": 6904 + }, + { + "epoch": 0.8, + "learning_rate": 6.212103476530493e-08, + "logits/chosen": -2.6099069118499756, + "logits/rejected": -2.510551691055298, + "logps/chosen": -172.3911895751953, + "logps/rejected": -141.64857482910156, + "loss": 0.7413, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8225361108779907, + "rewards/margins": 0.17527207732200623, + "rewards/rejected": -0.9978082180023193, + "step": 6905 + }, + { + "epoch": 0.8, + "learning_rate": 6.208591829568067e-08, + "logits/chosen": -3.2003750801086426, + "logits/rejected": -3.176957607269287, + "logps/chosen": -231.46014404296875, + "logps/rejected": -241.0935516357422, + "loss": 0.8482, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8179933428764343, + "rewards/margins": 0.5122226476669312, + "rewards/rejected": -1.3302160501480103, + "step": 6906 + }, + { + "epoch": 0.8, + "learning_rate": 6.205080182605641e-08, + "logits/chosen": -3.539546012878418, + "logits/rejected": -3.355071783065796, + "logps/chosen": -377.757080078125, + "logps/rejected": -294.52069091796875, + "loss": 0.2315, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13650892674922943, + "rewards/margins": 2.0078136920928955, + "rewards/rejected": -2.144322395324707, + "step": 6907 + }, + { + "epoch": 0.8, + "learning_rate": 6.201568535643217e-08, + "logits/chosen": -2.9643073081970215, + "logits/rejected": -2.795623779296875, + "logps/chosen": -159.75967407226562, + "logps/rejected": -219.87701416015625, + "loss": 0.4528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4945213794708252, + "rewards/margins": 1.444279670715332, + "rewards/rejected": -1.9388009309768677, + "step": 6908 + }, + { + "epoch": 0.8, + "learning_rate": 6.19805688868079e-08, + "logits/chosen": -2.997471809387207, + "logits/rejected": -3.167635440826416, + "logps/chosen": -222.56500244140625, + "logps/rejected": -184.3179168701172, + "loss": 0.4369, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40444353222846985, + "rewards/margins": 1.0463972091674805, + "rewards/rejected": -1.4508408308029175, + "step": 6909 + }, + { + "epoch": 0.8, + "learning_rate": 6.194545241718365e-08, + "logits/chosen": -2.7558436393737793, + "logits/rejected": -2.6311488151550293, + "logps/chosen": -243.82489013671875, + "logps/rejected": -283.2449951171875, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.040884919464588165, + "rewards/margins": 2.5031933784484863, + "rewards/rejected": -2.462308883666992, + "step": 6910 + }, + { + "epoch": 0.8, + "learning_rate": 6.19103359475594e-08, + "logits/chosen": -3.2598183155059814, + "logits/rejected": -3.002974033355713, + "logps/chosen": -762.5930786132812, + "logps/rejected": -241.87672424316406, + "loss": 0.2788, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26440906524658203, + "rewards/margins": 1.8656747341156006, + "rewards/rejected": -1.6012656688690186, + "step": 6911 + }, + { + "epoch": 0.8, + "learning_rate": 6.187521947793514e-08, + "logits/chosen": -3.274794578552246, + "logits/rejected": -2.8384978771209717, + "logps/chosen": -280.97576904296875, + "logps/rejected": -170.1022491455078, + "loss": 0.3195, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31578171253204346, + "rewards/margins": 1.8131417036056519, + "rewards/rejected": -2.1289234161376953, + "step": 6912 + }, + { + "epoch": 0.8, + "learning_rate": 6.18401030083109e-08, + "logits/chosen": -3.918534278869629, + "logits/rejected": -3.837541341781616, + "logps/chosen": -230.48626708984375, + "logps/rejected": -130.49130249023438, + "loss": 0.362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43590372800827026, + "rewards/margins": 1.5405082702636719, + "rewards/rejected": -1.976412057876587, + "step": 6913 + }, + { + "epoch": 0.8, + "learning_rate": 6.180498653868664e-08, + "logits/chosen": -2.946974277496338, + "logits/rejected": -2.5889389514923096, + "logps/chosen": -313.0987243652344, + "logps/rejected": -469.38592529296875, + "loss": 0.4123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38947802782058716, + "rewards/margins": 1.4635546207427979, + "rewards/rejected": -1.8530325889587402, + "step": 6914 + }, + { + "epoch": 0.8, + "learning_rate": 6.176987006906239e-08, + "logits/chosen": -3.1787405014038086, + "logits/rejected": -2.780752182006836, + "logps/chosen": -297.644775390625, + "logps/rejected": -498.3953857421875, + "loss": 0.3652, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1990986466407776, + "rewards/margins": 1.4441978931427002, + "rewards/rejected": -1.643296480178833, + "step": 6915 + }, + { + "epoch": 0.8, + "learning_rate": 6.173475359943813e-08, + "logits/chosen": -2.603199005126953, + "logits/rejected": -2.515979528427124, + "logps/chosen": -389.23907470703125, + "logps/rejected": -172.3328857421875, + "loss": 0.2646, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32855284214019775, + "rewards/margins": 1.4370816946029663, + "rewards/rejected": -1.1085288524627686, + "step": 6916 + }, + { + "epoch": 0.8, + "learning_rate": 6.169963712981388e-08, + "logits/chosen": -2.464099407196045, + "logits/rejected": -2.589663505554199, + "logps/chosen": -338.3741455078125, + "logps/rejected": -247.61146545410156, + "loss": 0.4167, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9013187885284424, + "rewards/margins": 1.0838890075683594, + "rewards/rejected": -1.9852077960968018, + "step": 6917 + }, + { + "epoch": 0.8, + "learning_rate": 6.166452066018962e-08, + "logits/chosen": -2.22609543800354, + "logits/rejected": -2.554272413253784, + "logps/chosen": -494.99359130859375, + "logps/rejected": -327.25946044921875, + "loss": 0.2194, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1254196166992188, + "rewards/margins": 1.926288366317749, + "rewards/rejected": -0.8008686304092407, + "step": 6918 + }, + { + "epoch": 0.8, + "learning_rate": 6.162940419056538e-08, + "logits/chosen": -2.949626922607422, + "logits/rejected": -2.933156967163086, + "logps/chosen": -343.3488464355469, + "logps/rejected": -282.4388732910156, + "loss": 0.2131, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34560346603393555, + "rewards/margins": 1.7155488729476929, + "rewards/rejected": -1.3699454069137573, + "step": 6919 + }, + { + "epoch": 0.8, + "learning_rate": 6.159428772094112e-08, + "logits/chosen": -3.5891637802124023, + "logits/rejected": -3.947050094604492, + "logps/chosen": -260.6293640136719, + "logps/rejected": -343.9156188964844, + "loss": 0.6376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.533272385597229, + "rewards/margins": 0.4418676197528839, + "rewards/rejected": -0.9751399755477905, + "step": 6920 + }, + { + "epoch": 0.8, + "learning_rate": 6.155917125131686e-08, + "logits/chosen": -3.2585983276367188, + "logits/rejected": -3.1968188285827637, + "logps/chosen": -88.00006866455078, + "logps/rejected": -175.18128967285156, + "loss": 0.3037, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21215826272964478, + "rewards/margins": 2.4365768432617188, + "rewards/rejected": -2.648735284805298, + "step": 6921 + }, + { + "epoch": 0.8, + "learning_rate": 6.152405478169261e-08, + "logits/chosen": -2.611107587814331, + "logits/rejected": -2.6224775314331055, + "logps/chosen": -433.9884948730469, + "logps/rejected": -303.4171142578125, + "loss": 0.2259, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1996743083000183, + "rewards/margins": 2.1115474700927734, + "rewards/rejected": -2.3112220764160156, + "step": 6922 + }, + { + "epoch": 0.8, + "learning_rate": 6.148893831206835e-08, + "logits/chosen": -3.101010322570801, + "logits/rejected": -2.757798433303833, + "logps/chosen": -274.4108581542969, + "logps/rejected": -173.5542755126953, + "loss": 0.2533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44520407915115356, + "rewards/margins": 2.0208957195281982, + "rewards/rejected": -1.5756916999816895, + "step": 6923 + }, + { + "epoch": 0.8, + "learning_rate": 6.14538218424441e-08, + "logits/chosen": -3.267371654510498, + "logits/rejected": -3.592923641204834, + "logps/chosen": -254.95144653320312, + "logps/rejected": -304.2880554199219, + "loss": 0.4551, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27280062437057495, + "rewards/margins": 2.593517303466797, + "rewards/rejected": -2.8663182258605957, + "step": 6924 + }, + { + "epoch": 0.8, + "learning_rate": 6.141870537281985e-08, + "logits/chosen": -2.834172487258911, + "logits/rejected": -2.7362983226776123, + "logps/chosen": -182.0997314453125, + "logps/rejected": -315.0140075683594, + "loss": 0.3146, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1976601481437683, + "rewards/margins": 2.2280611991882324, + "rewards/rejected": -2.0304012298583984, + "step": 6925 + }, + { + "epoch": 0.8, + "learning_rate": 6.138358890319559e-08, + "logits/chosen": -2.9553768634796143, + "logits/rejected": -2.9969754219055176, + "logps/chosen": -225.42721557617188, + "logps/rejected": -342.064208984375, + "loss": 0.4131, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4038824439048767, + "rewards/margins": 2.1553030014038086, + "rewards/rejected": -1.7514206171035767, + "step": 6926 + }, + { + "epoch": 0.8, + "learning_rate": 6.134847243357134e-08, + "logits/chosen": -3.8258414268493652, + "logits/rejected": -3.7468862533569336, + "logps/chosen": -259.67449951171875, + "logps/rejected": -223.55364990234375, + "loss": 0.6097, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7806622982025146, + "rewards/margins": 0.4712674021720886, + "rewards/rejected": -1.2519296407699585, + "step": 6927 + }, + { + "epoch": 0.8, + "learning_rate": 6.131335596394708e-08, + "logits/chosen": -2.5032310485839844, + "logits/rejected": -2.8347997665405273, + "logps/chosen": -174.287841796875, + "logps/rejected": -136.6529998779297, + "loss": 0.6625, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.333992063999176, + "rewards/margins": 1.1687606573104858, + "rewards/rejected": -1.502752661705017, + "step": 6928 + }, + { + "epoch": 0.8, + "learning_rate": 6.127823949432284e-08, + "logits/chosen": -2.940782070159912, + "logits/rejected": -2.869699001312256, + "logps/chosen": -409.40771484375, + "logps/rejected": -196.79666137695312, + "loss": 0.2217, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12029287219047546, + "rewards/margins": 2.224592685699463, + "rewards/rejected": -2.104300022125244, + "step": 6929 + }, + { + "epoch": 0.8, + "learning_rate": 6.124312302469858e-08, + "logits/chosen": -3.107272148132324, + "logits/rejected": -2.8803775310516357, + "logps/chosen": -170.6290740966797, + "logps/rejected": -226.51034545898438, + "loss": 0.4088, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20312818884849548, + "rewards/margins": 2.205857753753662, + "rewards/rejected": -2.002729654312134, + "step": 6930 + }, + { + "epoch": 0.8, + "learning_rate": 6.120800655507433e-08, + "logits/chosen": -3.258471727371216, + "logits/rejected": -3.3949851989746094, + "logps/chosen": -278.8154296875, + "logps/rejected": -296.8592529296875, + "loss": 0.3238, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4642886519432068, + "rewards/margins": 2.1703591346740723, + "rewards/rejected": -1.7060701847076416, + "step": 6931 + }, + { + "epoch": 0.8, + "learning_rate": 6.117289008545007e-08, + "logits/chosen": -3.1768593788146973, + "logits/rejected": -3.000978946685791, + "logps/chosen": -283.0621337890625, + "logps/rejected": -188.79864501953125, + "loss": 0.3427, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14919519424438477, + "rewards/margins": 1.2274787425994873, + "rewards/rejected": -1.0782835483551025, + "step": 6932 + }, + { + "epoch": 0.8, + "learning_rate": 6.113777361582583e-08, + "logits/chosen": -2.4183101654052734, + "logits/rejected": -2.380706787109375, + "logps/chosen": -356.02069091796875, + "logps/rejected": -338.8557434082031, + "loss": 0.3731, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05462533235549927, + "rewards/margins": 1.1229419708251953, + "rewards/rejected": -1.1775673627853394, + "step": 6933 + }, + { + "epoch": 0.8, + "learning_rate": 6.110265714620157e-08, + "logits/chosen": -3.0102524757385254, + "logits/rejected": -2.858203172683716, + "logps/chosen": -232.3582763671875, + "logps/rejected": -268.3809509277344, + "loss": 0.5292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.356096476316452, + "rewards/margins": 0.9975271224975586, + "rewards/rejected": -1.353623628616333, + "step": 6934 + }, + { + "epoch": 0.8, + "learning_rate": 6.106754067657731e-08, + "logits/chosen": -3.631277084350586, + "logits/rejected": -3.4281816482543945, + "logps/chosen": -377.9190368652344, + "logps/rejected": -270.5397644042969, + "loss": 0.523, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14649976789951324, + "rewards/margins": 1.7951287031173706, + "rewards/rejected": -1.9416284561157227, + "step": 6935 + }, + { + "epoch": 0.8, + "learning_rate": 6.103242420695306e-08, + "logits/chosen": -2.726210594177246, + "logits/rejected": -2.5786521434783936, + "logps/chosen": -360.8613586425781, + "logps/rejected": -198.5255126953125, + "loss": 0.5184, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44467517733573914, + "rewards/margins": 1.5484180450439453, + "rewards/rejected": -1.993093490600586, + "step": 6936 + }, + { + "epoch": 0.8, + "learning_rate": 6.09973077373288e-08, + "logits/chosen": -3.5762124061584473, + "logits/rejected": -3.3063290119171143, + "logps/chosen": -138.4512939453125, + "logps/rejected": -213.74795532226562, + "loss": 0.3364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4511685371398926, + "rewards/margins": 1.6646990776062012, + "rewards/rejected": -2.1158676147460938, + "step": 6937 + }, + { + "epoch": 0.8, + "learning_rate": 6.096219126770454e-08, + "logits/chosen": -3.0415868759155273, + "logits/rejected": -3.5145955085754395, + "logps/chosen": -169.2500457763672, + "logps/rejected": -188.27256774902344, + "loss": 0.3711, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2367323935031891, + "rewards/margins": 1.586731195449829, + "rewards/rejected": -1.8234636783599854, + "step": 6938 + }, + { + "epoch": 0.8, + "learning_rate": 6.09270747980803e-08, + "logits/chosen": -3.4779908657073975, + "logits/rejected": -3.5100722312927246, + "logps/chosen": -212.20701599121094, + "logps/rejected": -202.8245849609375, + "loss": 0.2601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007272630929946899, + "rewards/margins": 1.5324013233184814, + "rewards/rejected": -1.5251288414001465, + "step": 6939 + }, + { + "epoch": 0.8, + "learning_rate": 6.089195832845604e-08, + "logits/chosen": -3.240521192550659, + "logits/rejected": -3.134890079498291, + "logps/chosen": -263.38372802734375, + "logps/rejected": -176.42288208007812, + "loss": 0.4734, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20650431513786316, + "rewards/margins": 0.7102005481719971, + "rewards/rejected": -0.5036962032318115, + "step": 6940 + }, + { + "epoch": 0.8, + "learning_rate": 6.085684185883179e-08, + "logits/chosen": -2.7310709953308105, + "logits/rejected": -2.876204013824463, + "logps/chosen": -345.68121337890625, + "logps/rejected": -270.18353271484375, + "loss": 1.2103, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5730695128440857, + "rewards/margins": -0.058474019169807434, + "rewards/rejected": -0.5145954489707947, + "step": 6941 + }, + { + "epoch": 0.8, + "learning_rate": 6.082172538920753e-08, + "logits/chosen": -3.2182700634002686, + "logits/rejected": -3.009902000427246, + "logps/chosen": -184.67543029785156, + "logps/rejected": -218.2108917236328, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49612462520599365, + "rewards/margins": 1.9733840227127075, + "rewards/rejected": -2.469508647918701, + "step": 6942 + }, + { + "epoch": 0.8, + "learning_rate": 6.078660891958329e-08, + "logits/chosen": -2.7201318740844727, + "logits/rejected": -2.8363242149353027, + "logps/chosen": -177.3883514404297, + "logps/rejected": -335.1317138671875, + "loss": 0.3641, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01637636125087738, + "rewards/margins": 1.9553626775741577, + "rewards/rejected": -1.9389863014221191, + "step": 6943 + }, + { + "epoch": 0.8, + "learning_rate": 6.075149244995903e-08, + "logits/chosen": -3.4872140884399414, + "logits/rejected": -3.4439072608947754, + "logps/chosen": -108.7239990234375, + "logps/rejected": -104.06349182128906, + "loss": 0.4399, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31020018458366394, + "rewards/margins": 1.2907323837280273, + "rewards/rejected": -0.9805324077606201, + "step": 6944 + }, + { + "epoch": 0.8, + "learning_rate": 6.071637598033478e-08, + "logits/chosen": -3.571685791015625, + "logits/rejected": -3.5475401878356934, + "logps/chosen": -291.85186767578125, + "logps/rejected": -167.1832733154297, + "loss": 0.4401, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11681388318538666, + "rewards/margins": 1.0881768465042114, + "rewards/rejected": -1.2049907445907593, + "step": 6945 + }, + { + "epoch": 0.8, + "learning_rate": 6.068125951071052e-08, + "logits/chosen": -3.552715301513672, + "logits/rejected": -3.6484150886535645, + "logps/chosen": -149.06678771972656, + "logps/rejected": -261.9591064453125, + "loss": 0.2519, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11034746468067169, + "rewards/margins": 2.1226608753204346, + "rewards/rejected": -2.0123133659362793, + "step": 6946 + }, + { + "epoch": 0.8, + "learning_rate": 6.064614304108627e-08, + "logits/chosen": -3.2166638374328613, + "logits/rejected": -2.947073459625244, + "logps/chosen": -408.6716613769531, + "logps/rejected": -231.74752807617188, + "loss": 0.4649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1596297025680542, + "rewards/margins": 1.518135666847229, + "rewards/rejected": -1.6777653694152832, + "step": 6947 + }, + { + "epoch": 0.8, + "learning_rate": 6.061102657146202e-08, + "logits/chosen": -3.2877695560455322, + "logits/rejected": -3.192363739013672, + "logps/chosen": -306.6596374511719, + "logps/rejected": -312.8467102050781, + "loss": 0.313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31004273891448975, + "rewards/margins": 1.2941596508026123, + "rewards/rejected": -1.604202389717102, + "step": 6948 + }, + { + "epoch": 0.8, + "learning_rate": 6.057591010183777e-08, + "logits/chosen": -3.3210089206695557, + "logits/rejected": -3.657902240753174, + "logps/chosen": -322.0251159667969, + "logps/rejected": -265.26373291015625, + "loss": 0.5508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4957643747329712, + "rewards/margins": 1.5602669715881348, + "rewards/rejected": -2.0560312271118164, + "step": 6949 + }, + { + "epoch": 0.8, + "learning_rate": 6.054079363221351e-08, + "logits/chosen": -2.9940710067749023, + "logits/rejected": -2.818845510482788, + "logps/chosen": -281.94183349609375, + "logps/rejected": -253.54766845703125, + "loss": 0.4584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10900469124317169, + "rewards/margins": 1.3168742656707764, + "rewards/rejected": -1.207869529724121, + "step": 6950 + }, + { + "epoch": 0.8, + "learning_rate": 6.050567716258925e-08, + "logits/chosen": -2.786530017852783, + "logits/rejected": -2.8202271461486816, + "logps/chosen": -373.8701477050781, + "logps/rejected": -268.00238037109375, + "loss": 0.4243, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.431793212890625, + "rewards/margins": 0.9536972045898438, + "rewards/rejected": -0.5219039916992188, + "step": 6951 + }, + { + "epoch": 0.8, + "learning_rate": 6.047056069296499e-08, + "logits/chosen": -3.3679487705230713, + "logits/rejected": -3.334458351135254, + "logps/chosen": -307.3662109375, + "logps/rejected": -378.37200927734375, + "loss": 0.2732, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3789092004299164, + "rewards/margins": 3.028428316116333, + "rewards/rejected": -2.6495189666748047, + "step": 6952 + }, + { + "epoch": 0.8, + "learning_rate": 6.043544422334074e-08, + "logits/chosen": -3.672128200531006, + "logits/rejected": -3.238973379135132, + "logps/chosen": -233.14779663085938, + "logps/rejected": -182.70388793945312, + "loss": 0.4148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42179447412490845, + "rewards/margins": 1.2678064107894897, + "rewards/rejected": -1.689600944519043, + "step": 6953 + }, + { + "epoch": 0.8, + "learning_rate": 6.040032775371649e-08, + "logits/chosen": -3.3454489707946777, + "logits/rejected": -3.46112060546875, + "logps/chosen": -348.00030517578125, + "logps/rejected": -336.1090087890625, + "loss": 0.4976, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7184159159660339, + "rewards/margins": 1.3401222229003906, + "rewards/rejected": -2.0585379600524902, + "step": 6954 + }, + { + "epoch": 0.8, + "learning_rate": 6.036521128409224e-08, + "logits/chosen": -3.1476855278015137, + "logits/rejected": -3.1050431728363037, + "logps/chosen": -258.1007080078125, + "logps/rejected": -290.1959228515625, + "loss": 0.2843, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1695144772529602, + "rewards/margins": 2.185655355453491, + "rewards/rejected": -2.3551697731018066, + "step": 6955 + }, + { + "epoch": 0.8, + "learning_rate": 6.033009481446798e-08, + "logits/chosen": -2.3613100051879883, + "logits/rejected": -2.4268555641174316, + "logps/chosen": -299.7089538574219, + "logps/rejected": -276.5127258300781, + "loss": 0.2545, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6070857644081116, + "rewards/margins": 2.0665175914764404, + "rewards/rejected": -1.4594316482543945, + "step": 6956 + }, + { + "epoch": 0.8, + "learning_rate": 6.029497834484372e-08, + "logits/chosen": -3.551363945007324, + "logits/rejected": -3.451401710510254, + "logps/chosen": -248.30703735351562, + "logps/rejected": -206.882080078125, + "loss": 0.4627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09178006649017334, + "rewards/margins": 1.6204028129577637, + "rewards/rejected": -1.712182641029358, + "step": 6957 + }, + { + "epoch": 0.8, + "learning_rate": 6.025986187521947e-08, + "logits/chosen": -3.796799659729004, + "logits/rejected": -3.544156551361084, + "logps/chosen": -254.96571350097656, + "logps/rejected": -222.07470703125, + "loss": 0.4221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1552901417016983, + "rewards/margins": 1.7896525859832764, + "rewards/rejected": -1.944942831993103, + "step": 6958 + }, + { + "epoch": 0.8, + "learning_rate": 6.022474540559521e-08, + "logits/chosen": -3.125786066055298, + "logits/rejected": -3.0213370323181152, + "logps/chosen": -197.5426025390625, + "logps/rejected": -224.67538452148438, + "loss": 0.2828, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021479979157447815, + "rewards/margins": 2.7360010147094727, + "rewards/rejected": -2.7145206928253174, + "step": 6959 + }, + { + "epoch": 0.8, + "learning_rate": 6.018962893597097e-08, + "logits/chosen": -4.134567737579346, + "logits/rejected": -4.047092437744141, + "logps/chosen": -172.82894897460938, + "logps/rejected": -163.48060607910156, + "loss": 0.735, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3580150008201599, + "rewards/margins": 0.7979310154914856, + "rewards/rejected": -1.1559460163116455, + "step": 6960 + }, + { + "epoch": 0.8, + "learning_rate": 6.015451246634671e-08, + "logits/chosen": -3.091373920440674, + "logits/rejected": -2.932126760482788, + "logps/chosen": -392.9687805175781, + "logps/rejected": -211.02505493164062, + "loss": 0.3932, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.034972578287124634, + "rewards/margins": 1.2965271472930908, + "rewards/rejected": -1.261554479598999, + "step": 6961 + }, + { + "epoch": 0.8, + "learning_rate": 6.011939599672246e-08, + "logits/chosen": -2.552109718322754, + "logits/rejected": -2.490813732147217, + "logps/chosen": -400.44610595703125, + "logps/rejected": -294.38690185546875, + "loss": 0.4477, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6155077219009399, + "rewards/margins": 1.6363301277160645, + "rewards/rejected": -1.020822525024414, + "step": 6962 + }, + { + "epoch": 0.8, + "learning_rate": 6.00842795270982e-08, + "logits/chosen": -3.0347847938537598, + "logits/rejected": -2.9451870918273926, + "logps/chosen": -239.75819396972656, + "logps/rejected": -246.9307098388672, + "loss": 0.3737, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3552280068397522, + "rewards/margins": 2.036203384399414, + "rewards/rejected": -1.680975317955017, + "step": 6963 + }, + { + "epoch": 0.8, + "learning_rate": 6.004916305747396e-08, + "logits/chosen": -3.2504942417144775, + "logits/rejected": -3.2064993381500244, + "logps/chosen": -162.08993530273438, + "logps/rejected": -220.43035888671875, + "loss": 0.5576, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5907617211341858, + "rewards/margins": 0.9056981205940247, + "rewards/rejected": -1.4964598417282104, + "step": 6964 + }, + { + "epoch": 0.8, + "learning_rate": 6.00140465878497e-08, + "logits/chosen": -2.9582526683807373, + "logits/rejected": -2.9211533069610596, + "logps/chosen": -326.4638366699219, + "logps/rejected": -235.81405639648438, + "loss": 0.2787, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18441319465637207, + "rewards/margins": 2.012279510498047, + "rewards/rejected": -1.8278663158416748, + "step": 6965 + }, + { + "epoch": 0.8, + "learning_rate": 5.997893011822545e-08, + "logits/chosen": -2.8860208988189697, + "logits/rejected": -2.95339298248291, + "logps/chosen": -553.3525390625, + "logps/rejected": -364.57867431640625, + "loss": 0.2466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3985903263092041, + "rewards/margins": 1.9060683250427246, + "rewards/rejected": -1.5074779987335205, + "step": 6966 + }, + { + "epoch": 0.8, + "learning_rate": 5.994381364860119e-08, + "logits/chosen": -3.1683881282806396, + "logits/rejected": -3.268204689025879, + "logps/chosen": -209.6517333984375, + "logps/rejected": -103.41073608398438, + "loss": 0.5608, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02602100372314453, + "rewards/margins": 0.48678308725357056, + "rewards/rejected": -0.460762083530426, + "step": 6967 + }, + { + "epoch": 0.8, + "learning_rate": 5.990869717897693e-08, + "logits/chosen": -2.0475707054138184, + "logits/rejected": -1.8462293148040771, + "logps/chosen": -174.15228271484375, + "logps/rejected": -299.77227783203125, + "loss": 0.3458, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09349679201841354, + "rewards/margins": 1.8732316493988037, + "rewards/rejected": -1.7797349691390991, + "step": 6968 + }, + { + "epoch": 0.8, + "learning_rate": 5.987358070935267e-08, + "logits/chosen": -2.874101161956787, + "logits/rejected": -3.023653745651245, + "logps/chosen": -248.56207275390625, + "logps/rejected": -206.61537170410156, + "loss": 0.2285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7519725561141968, + "rewards/margins": 1.5925652980804443, + "rewards/rejected": -0.840592622756958, + "step": 6969 + }, + { + "epoch": 0.8, + "learning_rate": 5.983846423972843e-08, + "logits/chosen": -2.492197036743164, + "logits/rejected": -2.4313254356384277, + "logps/chosen": -324.7337646484375, + "logps/rejected": -346.2574157714844, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1870432198047638, + "rewards/margins": 1.3263425827026367, + "rewards/rejected": -1.5133857727050781, + "step": 6970 + }, + { + "epoch": 0.8, + "learning_rate": 5.980334777010417e-08, + "logits/chosen": -3.0768752098083496, + "logits/rejected": -2.810382843017578, + "logps/chosen": -249.8797607421875, + "logps/rejected": -206.31436157226562, + "loss": 0.7506, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3719184100627899, + "rewards/margins": 0.4831152856349945, + "rewards/rejected": -0.8550336956977844, + "step": 6971 + }, + { + "epoch": 0.8, + "learning_rate": 5.976823130047992e-08, + "logits/chosen": -3.348515510559082, + "logits/rejected": -3.0292627811431885, + "logps/chosen": -175.6802520751953, + "logps/rejected": -144.98263549804688, + "loss": 0.3266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24199409782886505, + "rewards/margins": 1.965043544769287, + "rewards/rejected": -2.2070374488830566, + "step": 6972 + }, + { + "epoch": 0.8, + "learning_rate": 5.973311483085566e-08, + "logits/chosen": -3.467180013656616, + "logits/rejected": -3.434614658355713, + "logps/chosen": -294.4759216308594, + "logps/rejected": -309.9489440917969, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5537824034690857, + "rewards/margins": 3.0176942348480225, + "rewards/rejected": -2.463911771774292, + "step": 6973 + }, + { + "epoch": 0.8, + "learning_rate": 5.969799836123142e-08, + "logits/chosen": -3.4978654384613037, + "logits/rejected": -3.0073935985565186, + "logps/chosen": -456.33123779296875, + "logps/rejected": -285.3265686035156, + "loss": 0.4403, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08993859589099884, + "rewards/margins": 1.3635438680648804, + "rewards/rejected": -1.2736053466796875, + "step": 6974 + }, + { + "epoch": 0.8, + "learning_rate": 5.966288189160716e-08, + "logits/chosen": -3.3789520263671875, + "logits/rejected": -3.3220598697662354, + "logps/chosen": -181.70730590820312, + "logps/rejected": -154.29824829101562, + "loss": 0.7178, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47566860914230347, + "rewards/margins": 1.03249192237854, + "rewards/rejected": -1.5081605911254883, + "step": 6975 + }, + { + "epoch": 0.8, + "learning_rate": 5.962776542198291e-08, + "logits/chosen": -3.2670490741729736, + "logits/rejected": -3.387162685394287, + "logps/chosen": -158.5674285888672, + "logps/rejected": -260.0092468261719, + "loss": 0.2315, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09496927261352539, + "rewards/margins": 2.3262791633605957, + "rewards/rejected": -2.421248197555542, + "step": 6976 + }, + { + "epoch": 0.8, + "learning_rate": 5.959264895235865e-08, + "logits/chosen": -3.5319101810455322, + "logits/rejected": -3.590482711791992, + "logps/chosen": -366.9053039550781, + "logps/rejected": -351.73193359375, + "loss": 0.2228, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9525577425956726, + "rewards/margins": 2.5397140979766846, + "rewards/rejected": -1.5871565341949463, + "step": 6977 + }, + { + "epoch": 0.8, + "learning_rate": 5.95575324827344e-08, + "logits/chosen": -3.3996987342834473, + "logits/rejected": -3.0378060340881348, + "logps/chosen": -319.0865173339844, + "logps/rejected": -176.8582763671875, + "loss": 0.393, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16276530921459198, + "rewards/margins": 1.1855764389038086, + "rewards/rejected": -1.0228111743927002, + "step": 6978 + }, + { + "epoch": 0.8, + "learning_rate": 5.9522416013110146e-08, + "logits/chosen": -3.3175482749938965, + "logits/rejected": -3.3471503257751465, + "logps/chosen": -164.73910522460938, + "logps/rejected": -241.93838500976562, + "loss": 0.1989, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09648928791284561, + "rewards/margins": 3.6138267517089844, + "rewards/rejected": -3.5173375606536865, + "step": 6979 + }, + { + "epoch": 0.8, + "learning_rate": 5.9487299543485894e-08, + "logits/chosen": -2.9146969318389893, + "logits/rejected": -2.7408101558685303, + "logps/chosen": -159.3497314453125, + "logps/rejected": -190.52230834960938, + "loss": 0.4084, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1908813714981079, + "rewards/margins": 1.473282814025879, + "rewards/rejected": -1.6641643047332764, + "step": 6980 + }, + { + "epoch": 0.8, + "learning_rate": 5.9452183073861634e-08, + "logits/chosen": -3.195327043533325, + "logits/rejected": -2.9846999645233154, + "logps/chosen": -499.8396911621094, + "logps/rejected": -320.21441650390625, + "loss": 0.3131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12372083961963654, + "rewards/margins": 1.925856351852417, + "rewards/rejected": -2.049577236175537, + "step": 6981 + }, + { + "epoch": 0.8, + "learning_rate": 5.941706660423739e-08, + "logits/chosen": -3.4754958152770996, + "logits/rejected": -3.6751952171325684, + "logps/chosen": -63.35642623901367, + "logps/rejected": -156.18824768066406, + "loss": 0.3988, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38382238149642944, + "rewards/margins": 1.3138604164123535, + "rewards/rejected": -0.9300379753112793, + "step": 6982 + }, + { + "epoch": 0.81, + "learning_rate": 5.938195013461313e-08, + "logits/chosen": -3.7706804275512695, + "logits/rejected": -3.7852325439453125, + "logps/chosen": -158.6859130859375, + "logps/rejected": -240.93341064453125, + "loss": 0.5957, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6287675499916077, + "rewards/margins": 2.1492199897766113, + "rewards/rejected": -2.777987480163574, + "step": 6983 + }, + { + "epoch": 0.81, + "learning_rate": 5.934683366498888e-08, + "logits/chosen": -3.154961585998535, + "logits/rejected": -3.1029090881347656, + "logps/chosen": -470.1600646972656, + "logps/rejected": -329.5315856933594, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5353565812110901, + "rewards/margins": 1.5144104957580566, + "rewards/rejected": -0.9790538549423218, + "step": 6984 + }, + { + "epoch": 0.81, + "learning_rate": 5.931171719536462e-08, + "logits/chosen": -2.795490026473999, + "logits/rejected": -2.897111415863037, + "logps/chosen": -425.0279541015625, + "logps/rejected": -292.76336669921875, + "loss": 0.4729, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.037370458245277405, + "rewards/margins": 1.4702348709106445, + "rewards/rejected": -1.4328645467758179, + "step": 6985 + }, + { + "epoch": 0.81, + "learning_rate": 5.927660072574037e-08, + "logits/chosen": -2.7410759925842285, + "logits/rejected": -2.5345304012298584, + "logps/chosen": -363.58935546875, + "logps/rejected": -357.5035400390625, + "loss": 0.2231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16033661365509033, + "rewards/margins": 2.76309871673584, + "rewards/rejected": -2.9234352111816406, + "step": 6986 + }, + { + "epoch": 0.81, + "learning_rate": 5.924148425611611e-08, + "logits/chosen": -1.930738925933838, + "logits/rejected": -1.9003853797912598, + "logps/chosen": -497.39703369140625, + "logps/rejected": -380.62359619140625, + "loss": 0.4549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4081801772117615, + "rewards/margins": 1.057241678237915, + "rewards/rejected": -1.4654216766357422, + "step": 6987 + }, + { + "epoch": 0.81, + "learning_rate": 5.9206367786491865e-08, + "logits/chosen": -2.314234733581543, + "logits/rejected": -2.4437315464019775, + "logps/chosen": -380.39361572265625, + "logps/rejected": -383.2861328125, + "loss": 0.387, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33161747455596924, + "rewards/margins": 3.4700002670288086, + "rewards/rejected": -3.1383824348449707, + "step": 6988 + }, + { + "epoch": 0.81, + "learning_rate": 5.9171251316867606e-08, + "logits/chosen": -2.6744751930236816, + "logits/rejected": -2.9168758392333984, + "logps/chosen": -240.18701171875, + "logps/rejected": -384.57073974609375, + "loss": 0.3944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14186100661754608, + "rewards/margins": 1.9036368131637573, + "rewards/rejected": -2.0454978942871094, + "step": 6989 + }, + { + "epoch": 0.81, + "learning_rate": 5.913613484724336e-08, + "logits/chosen": -2.638814687728882, + "logits/rejected": -2.533006429672241, + "logps/chosen": -143.0211639404297, + "logps/rejected": -138.11163330078125, + "loss": 0.3834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4954966604709625, + "rewards/margins": 1.3878452777862549, + "rewards/rejected": -1.883341908454895, + "step": 6990 + }, + { + "epoch": 0.81, + "learning_rate": 5.91010183776191e-08, + "logits/chosen": -2.7294762134552, + "logits/rejected": -2.838654041290283, + "logps/chosen": -145.97850036621094, + "logps/rejected": -157.5864715576172, + "loss": 0.7478, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0701386108994484, + "rewards/margins": 0.5330374240875244, + "rewards/rejected": -0.6031760573387146, + "step": 6991 + }, + { + "epoch": 0.81, + "learning_rate": 5.906590190799485e-08, + "logits/chosen": -2.2638461589813232, + "logits/rejected": -2.4907429218292236, + "logps/chosen": -342.3465576171875, + "logps/rejected": -342.537353515625, + "loss": 0.4364, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5741224884986877, + "rewards/margins": 1.1225779056549072, + "rewards/rejected": -0.5484555959701538, + "step": 6992 + }, + { + "epoch": 0.81, + "learning_rate": 5.9030785438370595e-08, + "logits/chosen": -3.102147102355957, + "logits/rejected": -3.0724592208862305, + "logps/chosen": -235.44862365722656, + "logps/rejected": -198.119384765625, + "loss": 0.5575, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5492009520530701, + "rewards/margins": 1.7081482410430908, + "rewards/rejected": -1.1589473485946655, + "step": 6993 + }, + { + "epoch": 0.81, + "learning_rate": 5.899566896874634e-08, + "logits/chosen": -4.08859920501709, + "logits/rejected": -3.5882415771484375, + "logps/chosen": -229.02944946289062, + "logps/rejected": -188.18609619140625, + "loss": 0.392, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07418704032897949, + "rewards/margins": 1.1349377632141113, + "rewards/rejected": -1.0607508420944214, + "step": 6994 + }, + { + "epoch": 0.81, + "learning_rate": 5.896055249912208e-08, + "logits/chosen": -2.73305606842041, + "logits/rejected": -3.0745415687561035, + "logps/chosen": -208.04461669921875, + "logps/rejected": -162.91372680664062, + "loss": 0.4648, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25808724761009216, + "rewards/margins": 1.4644243717193604, + "rewards/rejected": -1.7225115299224854, + "step": 6995 + }, + { + "epoch": 0.81, + "learning_rate": 5.8925436029497836e-08, + "logits/chosen": -2.9460697174072266, + "logits/rejected": -2.4610087871551514, + "logps/chosen": -257.615478515625, + "logps/rejected": -197.93988037109375, + "loss": 0.476, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07551130652427673, + "rewards/margins": 0.999491810798645, + "rewards/rejected": -0.9239804744720459, + "step": 6996 + }, + { + "epoch": 0.81, + "learning_rate": 5.889031955987358e-08, + "logits/chosen": -2.903322219848633, + "logits/rejected": -2.9148881435394287, + "logps/chosen": -268.1269226074219, + "logps/rejected": -171.7779998779297, + "loss": 0.4742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43259888887405396, + "rewards/margins": 2.082779884338379, + "rewards/rejected": -2.515378713607788, + "step": 6997 + }, + { + "epoch": 0.81, + "learning_rate": 5.885520309024933e-08, + "logits/chosen": -3.9176628589630127, + "logits/rejected": -3.814957857131958, + "logps/chosen": -335.3868713378906, + "logps/rejected": -344.2461242675781, + "loss": 0.3694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09627486765384674, + "rewards/margins": 2.1334798336029053, + "rewards/rejected": -2.229754686355591, + "step": 6998 + }, + { + "epoch": 0.81, + "learning_rate": 5.882008662062507e-08, + "logits/chosen": -3.500988006591797, + "logits/rejected": -3.604020118713379, + "logps/chosen": -174.91546630859375, + "logps/rejected": -249.53306579589844, + "loss": 0.1505, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.058472298085689545, + "rewards/margins": 2.4385275840759277, + "rewards/rejected": -2.3800554275512695, + "step": 6999 + }, + { + "epoch": 0.81, + "learning_rate": 5.878497015100082e-08, + "logits/chosen": -2.493839979171753, + "logits/rejected": -2.7347664833068848, + "logps/chosen": -266.0604553222656, + "logps/rejected": -237.8282470703125, + "loss": 0.2504, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1719624102115631, + "rewards/margins": 2.023911952972412, + "rewards/rejected": -1.851949691772461, + "step": 7000 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -2.8362886905670166, + "eval_logits/rejected": -2.7954912185668945, + "eval_logps/chosen": -293.8367614746094, + "eval_logps/rejected": -237.78546142578125, + "eval_loss": 0.42330804467201233, + "eval_rewards/accuracies": 0.800000011920929, + "eval_rewards/chosen": 0.02181965485215187, + "eval_rewards/margins": 1.3661056756973267, + "eval_rewards/rejected": -1.3442859649658203, + "eval_runtime": 32.6673, + "eval_samples_per_second": 2.143, + "eval_steps_per_second": 1.071, + "step": 7000 + }, + { + "epoch": 0.81, + "learning_rate": 5.8749853681376566e-08, + "logits/chosen": -2.9051244258880615, + "logits/rejected": -2.9484338760375977, + "logps/chosen": -168.61651611328125, + "logps/rejected": -201.84576416015625, + "loss": 0.4599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6740067005157471, + "rewards/margins": 1.0631098747253418, + "rewards/rejected": -1.7371165752410889, + "step": 7001 + }, + { + "epoch": 0.81, + "learning_rate": 5.871473721175231e-08, + "logits/chosen": -3.4423139095306396, + "logits/rejected": -2.925476312637329, + "logps/chosen": -505.7796325683594, + "logps/rejected": -342.3538818359375, + "loss": 0.1506, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.799534261226654, + "rewards/margins": 2.4757723808288574, + "rewards/rejected": -1.6762381792068481, + "step": 7002 + }, + { + "epoch": 0.81, + "learning_rate": 5.8679620742128054e-08, + "logits/chosen": -3.111323118209839, + "logits/rejected": -3.1676225662231445, + "logps/chosen": -138.64088439941406, + "logps/rejected": -182.73558044433594, + "loss": 0.4436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05641554296016693, + "rewards/margins": 1.0240941047668457, + "rewards/rejected": -1.0805097818374634, + "step": 7003 + }, + { + "epoch": 0.81, + "learning_rate": 5.8644504272503794e-08, + "logits/chosen": -2.593442916870117, + "logits/rejected": -2.532731771469116, + "logps/chosen": -168.3059539794922, + "logps/rejected": -186.24221801757812, + "loss": 0.5824, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08572201430797577, + "rewards/margins": 0.7681580781936646, + "rewards/rejected": -0.6824361085891724, + "step": 7004 + }, + { + "epoch": 0.81, + "learning_rate": 5.860938780287955e-08, + "logits/chosen": -2.6649162769317627, + "logits/rejected": -2.6663284301757812, + "logps/chosen": -203.22933959960938, + "logps/rejected": -178.6185760498047, + "loss": 0.33, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24206435680389404, + "rewards/margins": 1.4337414503097534, + "rewards/rejected": -1.6758058071136475, + "step": 7005 + }, + { + "epoch": 0.81, + "learning_rate": 5.857427133325529e-08, + "logits/chosen": -3.45418119430542, + "logits/rejected": -3.2838146686553955, + "logps/chosen": -293.0616455078125, + "logps/rejected": -248.76087951660156, + "loss": 0.3189, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13096627593040466, + "rewards/margins": 1.6464900970458984, + "rewards/rejected": -1.515523910522461, + "step": 7006 + }, + { + "epoch": 0.81, + "learning_rate": 5.853915486363104e-08, + "logits/chosen": -2.945352554321289, + "logits/rejected": -2.8514316082000732, + "logps/chosen": -276.66265869140625, + "logps/rejected": -320.00323486328125, + "loss": 1.034, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0379970073699951, + "rewards/margins": 1.109207034111023, + "rewards/rejected": -2.1472041606903076, + "step": 7007 + }, + { + "epoch": 0.81, + "learning_rate": 5.850403839400678e-08, + "logits/chosen": -2.81357479095459, + "logits/rejected": -2.8694841861724854, + "logps/chosen": -245.12779235839844, + "logps/rejected": -246.33140563964844, + "loss": 0.4074, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7012097239494324, + "rewards/margins": 2.5055363178253174, + "rewards/rejected": -1.8043264150619507, + "step": 7008 + }, + { + "epoch": 0.81, + "learning_rate": 5.846892192438253e-08, + "logits/chosen": -3.196967601776123, + "logits/rejected": -3.756488561630249, + "logps/chosen": -142.46762084960938, + "logps/rejected": -294.9142761230469, + "loss": 0.2482, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13832253217697144, + "rewards/margins": 3.6251261234283447, + "rewards/rejected": -3.4868035316467285, + "step": 7009 + }, + { + "epoch": 0.81, + "learning_rate": 5.843380545475828e-08, + "logits/chosen": -3.41225004196167, + "logits/rejected": -3.477538585662842, + "logps/chosen": -94.22686004638672, + "logps/rejected": -141.69674682617188, + "loss": 0.3236, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3070080280303955, + "rewards/margins": 1.2230339050292969, + "rewards/rejected": -0.9160258769989014, + "step": 7010 + }, + { + "epoch": 0.81, + "learning_rate": 5.8398688985134025e-08, + "logits/chosen": -2.688107490539551, + "logits/rejected": -2.619704246520996, + "logps/chosen": -288.6092529296875, + "logps/rejected": -277.4413757324219, + "loss": 0.4864, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23326177895069122, + "rewards/margins": 0.8740946054458618, + "rewards/rejected": -1.107356309890747, + "step": 7011 + }, + { + "epoch": 0.81, + "learning_rate": 5.8363572515509766e-08, + "logits/chosen": -2.9327104091644287, + "logits/rejected": -2.9905567169189453, + "logps/chosen": -394.778076171875, + "logps/rejected": -358.7056884765625, + "loss": 0.5225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.00019050762057304382, + "rewards/margins": 0.944491446018219, + "rewards/rejected": -0.9446818828582764, + "step": 7012 + }, + { + "epoch": 0.81, + "learning_rate": 5.832845604588552e-08, + "logits/chosen": -2.5158278942108154, + "logits/rejected": -2.6115946769714355, + "logps/chosen": -248.7677001953125, + "logps/rejected": -173.294189453125, + "loss": 0.534, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03588390350341797, + "rewards/margins": 1.1677978038787842, + "rewards/rejected": -1.1319141387939453, + "step": 7013 + }, + { + "epoch": 0.81, + "learning_rate": 5.829333957626126e-08, + "logits/chosen": -3.4239258766174316, + "logits/rejected": -3.1715087890625, + "logps/chosen": -211.8265838623047, + "logps/rejected": -357.37579345703125, + "loss": 0.2278, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26790139079093933, + "rewards/margins": 3.259317398071289, + "rewards/rejected": -2.9914159774780273, + "step": 7014 + }, + { + "epoch": 0.81, + "learning_rate": 5.8258223106637014e-08, + "logits/chosen": -2.8229074478149414, + "logits/rejected": -2.7597951889038086, + "logps/chosen": -133.48080444335938, + "logps/rejected": -159.08413696289062, + "loss": 0.5817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4498964548110962, + "rewards/margins": 1.2256112098693848, + "rewards/rejected": -1.6755077838897705, + "step": 7015 + }, + { + "epoch": 0.81, + "learning_rate": 5.8223106637012755e-08, + "logits/chosen": -3.1342411041259766, + "logits/rejected": -3.2511019706726074, + "logps/chosen": -249.70947265625, + "logps/rejected": -240.67068481445312, + "loss": 0.3103, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1765601634979248, + "rewards/margins": 2.4823625087738037, + "rewards/rejected": -2.6589224338531494, + "step": 7016 + }, + { + "epoch": 0.81, + "learning_rate": 5.81879901673885e-08, + "logits/chosen": -3.510951042175293, + "logits/rejected": -3.157930374145508, + "logps/chosen": -304.3177490234375, + "logps/rejected": -216.09658813476562, + "loss": 0.3778, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11523973941802979, + "rewards/margins": 2.106722116470337, + "rewards/rejected": -2.221961736679077, + "step": 7017 + }, + { + "epoch": 0.81, + "learning_rate": 5.815287369776425e-08, + "logits/chosen": -2.5554585456848145, + "logits/rejected": -2.37892746925354, + "logps/chosen": -240.58502197265625, + "logps/rejected": -266.75213623046875, + "loss": 0.2241, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4494015574455261, + "rewards/margins": 2.2332239151000977, + "rewards/rejected": -1.7838222980499268, + "step": 7018 + }, + { + "epoch": 0.81, + "learning_rate": 5.8117757228139996e-08, + "logits/chosen": -2.719301700592041, + "logits/rejected": -2.9576923847198486, + "logps/chosen": -276.9740295410156, + "logps/rejected": -332.9812316894531, + "loss": 0.2072, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4142550230026245, + "rewards/margins": 3.1963658332824707, + "rewards/rejected": -2.7821106910705566, + "step": 7019 + }, + { + "epoch": 0.81, + "learning_rate": 5.808264075851574e-08, + "logits/chosen": -2.835714817047119, + "logits/rejected": -2.7402169704437256, + "logps/chosen": -236.01895141601562, + "logps/rejected": -228.68280029296875, + "loss": 0.5754, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0045690275728702545, + "rewards/margins": 1.1852267980575562, + "rewards/rejected": -1.1897958517074585, + "step": 7020 + }, + { + "epoch": 0.81, + "learning_rate": 5.804752428889149e-08, + "logits/chosen": -3.075382709503174, + "logits/rejected": -2.87271785736084, + "logps/chosen": -274.2955017089844, + "logps/rejected": -283.7748718261719, + "loss": 0.4992, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22629694640636444, + "rewards/margins": 0.9724595546722412, + "rewards/rejected": -1.198756456375122, + "step": 7021 + }, + { + "epoch": 0.81, + "learning_rate": 5.801240781926723e-08, + "logits/chosen": -2.9359560012817383, + "logits/rejected": -2.6708314418792725, + "logps/chosen": -215.87551879882812, + "logps/rejected": -177.6206512451172, + "loss": 0.397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33864274621009827, + "rewards/margins": 1.317594051361084, + "rewards/rejected": -1.6562367677688599, + "step": 7022 + }, + { + "epoch": 0.81, + "learning_rate": 5.797729134964298e-08, + "logits/chosen": -3.1327714920043945, + "logits/rejected": -2.90097713470459, + "logps/chosen": -416.82562255859375, + "logps/rejected": -414.0065002441406, + "loss": 0.3975, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04780092090368271, + "rewards/margins": 1.8778820037841797, + "rewards/rejected": -1.8300807476043701, + "step": 7023 + }, + { + "epoch": 0.81, + "learning_rate": 5.7942174880018726e-08, + "logits/chosen": -2.749321222305298, + "logits/rejected": -2.6748499870300293, + "logps/chosen": -230.70370483398438, + "logps/rejected": -291.6788024902344, + "loss": 0.3914, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13591018319129944, + "rewards/margins": 1.7908746004104614, + "rewards/rejected": -1.9267847537994385, + "step": 7024 + }, + { + "epoch": 0.81, + "learning_rate": 5.790705841039447e-08, + "logits/chosen": -3.303863048553467, + "logits/rejected": -3.4979522228240967, + "logps/chosen": -270.1163330078125, + "logps/rejected": -263.6848449707031, + "loss": 0.2674, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2693692743778229, + "rewards/margins": 2.038541555404663, + "rewards/rejected": -1.7691724300384521, + "step": 7025 + }, + { + "epoch": 0.81, + "learning_rate": 5.7871941940770214e-08, + "logits/chosen": -2.8216116428375244, + "logits/rejected": -2.847590446472168, + "logps/chosen": -215.08717346191406, + "logps/rejected": -276.6814880371094, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09741765260696411, + "rewards/margins": 3.1318373680114746, + "rewards/rejected": -3.0344197750091553, + "step": 7026 + }, + { + "epoch": 0.81, + "learning_rate": 5.783682547114597e-08, + "logits/chosen": -3.912646770477295, + "logits/rejected": -3.8543429374694824, + "logps/chosen": -236.34674072265625, + "logps/rejected": -296.74420166015625, + "loss": 0.3249, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10732920467853546, + "rewards/margins": 2.2522826194763184, + "rewards/rejected": -2.359611749649048, + "step": 7027 + }, + { + "epoch": 0.81, + "learning_rate": 5.780170900152171e-08, + "logits/chosen": -3.098193883895874, + "logits/rejected": -2.8091702461242676, + "logps/chosen": -306.3274230957031, + "logps/rejected": -257.2455749511719, + "loss": 0.2656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15268273651599884, + "rewards/margins": 2.8946781158447266, + "rewards/rejected": -3.047360420227051, + "step": 7028 + }, + { + "epoch": 0.81, + "learning_rate": 5.776659253189746e-08, + "logits/chosen": -3.50736665725708, + "logits/rejected": -3.6484546661376953, + "logps/chosen": -182.63880920410156, + "logps/rejected": -234.1070098876953, + "loss": 0.1804, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.43419313430786133, + "rewards/margins": 4.1922993659973145, + "rewards/rejected": -3.758105993270874, + "step": 7029 + }, + { + "epoch": 0.81, + "learning_rate": 5.77314760622732e-08, + "logits/chosen": -3.3539843559265137, + "logits/rejected": -3.6536202430725098, + "logps/chosen": -81.39289093017578, + "logps/rejected": -188.53878784179688, + "loss": 0.3056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0565045103430748, + "rewards/margins": 3.0868730545043945, + "rewards/rejected": -3.1433777809143066, + "step": 7030 + }, + { + "epoch": 0.81, + "learning_rate": 5.769635959264895e-08, + "logits/chosen": -3.204451560974121, + "logits/rejected": -3.3115899562835693, + "logps/chosen": -140.57106018066406, + "logps/rejected": -300.8236999511719, + "loss": 0.3448, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.012392483651638031, + "rewards/margins": 1.391034722328186, + "rewards/rejected": -1.378642201423645, + "step": 7031 + }, + { + "epoch": 0.81, + "learning_rate": 5.76612431230247e-08, + "logits/chosen": -3.5460705757141113, + "logits/rejected": -4.080048084259033, + "logps/chosen": -131.97496032714844, + "logps/rejected": -260.0867004394531, + "loss": 0.4248, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3123064339160919, + "rewards/margins": 1.5114145278930664, + "rewards/rejected": -1.8237210512161255, + "step": 7032 + }, + { + "epoch": 0.81, + "learning_rate": 5.7626126653400444e-08, + "logits/chosen": -3.5782439708709717, + "logits/rejected": -3.3531854152679443, + "logps/chosen": -405.2745666503906, + "logps/rejected": -454.8512878417969, + "loss": 0.1471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0387142114341259, + "rewards/margins": 2.743417739868164, + "rewards/rejected": -2.782132148742676, + "step": 7033 + }, + { + "epoch": 0.81, + "learning_rate": 5.7591010183776185e-08, + "logits/chosen": -2.707460403442383, + "logits/rejected": -2.593702793121338, + "logps/chosen": -432.5107727050781, + "logps/rejected": -462.3116149902344, + "loss": 0.2387, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5372724533081055, + "rewards/margins": 2.7662675380706787, + "rewards/rejected": -2.2289953231811523, + "step": 7034 + }, + { + "epoch": 0.81, + "learning_rate": 5.755589371415194e-08, + "logits/chosen": -3.147020101547241, + "logits/rejected": -3.1884093284606934, + "logps/chosen": -71.85221099853516, + "logps/rejected": -282.9247741699219, + "loss": 0.5986, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04178383946418762, + "rewards/margins": 1.0045280456542969, + "rewards/rejected": -1.046311855316162, + "step": 7035 + }, + { + "epoch": 0.81, + "learning_rate": 5.752077724452768e-08, + "logits/chosen": -3.1600430011749268, + "logits/rejected": -3.267702341079712, + "logps/chosen": -354.5407409667969, + "logps/rejected": -251.55972290039062, + "loss": 0.1661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8089035749435425, + "rewards/margins": 2.563283920288086, + "rewards/rejected": -1.7543803453445435, + "step": 7036 + }, + { + "epoch": 0.81, + "learning_rate": 5.7485660774903433e-08, + "logits/chosen": -3.1817052364349365, + "logits/rejected": -3.1565184593200684, + "logps/chosen": -80.84439086914062, + "logps/rejected": -166.3172149658203, + "loss": 0.3198, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38280996680259705, + "rewards/margins": 2.0058820247650146, + "rewards/rejected": -1.6230719089508057, + "step": 7037 + }, + { + "epoch": 0.81, + "learning_rate": 5.7450544305279174e-08, + "logits/chosen": -2.54144287109375, + "logits/rejected": -2.425483226776123, + "logps/chosen": -242.839111328125, + "logps/rejected": -261.7880859375, + "loss": 0.3703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.004092380404472351, + "rewards/margins": 2.301875352859497, + "rewards/rejected": -2.3059678077697754, + "step": 7038 + }, + { + "epoch": 0.81, + "learning_rate": 5.741542783565492e-08, + "logits/chosen": -3.841797113418579, + "logits/rejected": -3.5687458515167236, + "logps/chosen": -386.20355224609375, + "logps/rejected": -313.2921447753906, + "loss": 0.3021, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9863862991333008, + "rewards/margins": 1.861578106880188, + "rewards/rejected": -2.847964286804199, + "step": 7039 + }, + { + "epoch": 0.81, + "learning_rate": 5.738031136603066e-08, + "logits/chosen": -2.8970139026641846, + "logits/rejected": -3.2469797134399414, + "logps/chosen": -196.3568115234375, + "logps/rejected": -218.02694702148438, + "loss": 0.1967, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14216555655002594, + "rewards/margins": 2.558840751647949, + "rewards/rejected": -2.416675329208374, + "step": 7040 + }, + { + "epoch": 0.81, + "learning_rate": 5.7345194896406416e-08, + "logits/chosen": -3.126605987548828, + "logits/rejected": -3.1066880226135254, + "logps/chosen": -282.1741027832031, + "logps/rejected": -166.8070068359375, + "loss": 0.2532, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4176447093486786, + "rewards/margins": 1.9743317365646362, + "rewards/rejected": -1.5566872358322144, + "step": 7041 + }, + { + "epoch": 0.81, + "learning_rate": 5.7310078426782156e-08, + "logits/chosen": -3.0865578651428223, + "logits/rejected": -3.6900711059570312, + "logps/chosen": -153.4751739501953, + "logps/rejected": -252.004150390625, + "loss": 0.476, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4377203583717346, + "rewards/margins": 1.9476804733276367, + "rewards/rejected": -2.3854007720947266, + "step": 7042 + }, + { + "epoch": 0.81, + "learning_rate": 5.727496195715791e-08, + "logits/chosen": -2.827249765396118, + "logits/rejected": -2.659858226776123, + "logps/chosen": -267.361083984375, + "logps/rejected": -293.8900451660156, + "loss": 0.3435, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5057290196418762, + "rewards/margins": 2.0650031566619873, + "rewards/rejected": -1.5592741966247559, + "step": 7043 + }, + { + "epoch": 0.81, + "learning_rate": 5.723984548753365e-08, + "logits/chosen": -3.050323486328125, + "logits/rejected": -3.104583740234375, + "logps/chosen": -306.00384521484375, + "logps/rejected": -311.7962646484375, + "loss": 0.407, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0699100494384766, + "rewards/margins": 1.2356595993041992, + "rewards/rejected": -2.305569648742676, + "step": 7044 + }, + { + "epoch": 0.81, + "learning_rate": 5.72047290179094e-08, + "logits/chosen": -3.1819331645965576, + "logits/rejected": -2.8479716777801514, + "logps/chosen": -343.167724609375, + "logps/rejected": -222.93482971191406, + "loss": 0.3684, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14843156933784485, + "rewards/margins": 1.267166256904602, + "rewards/rejected": -1.11873459815979, + "step": 7045 + }, + { + "epoch": 0.81, + "learning_rate": 5.7169612548285145e-08, + "logits/chosen": -3.110764265060425, + "logits/rejected": -2.930696964263916, + "logps/chosen": -280.049560546875, + "logps/rejected": -333.2024230957031, + "loss": 0.2563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03746870905160904, + "rewards/margins": 2.4979894161224365, + "rewards/rejected": -2.5354583263397217, + "step": 7046 + }, + { + "epoch": 0.81, + "learning_rate": 5.713449607866089e-08, + "logits/chosen": -2.9193787574768066, + "logits/rejected": -3.132394313812256, + "logps/chosen": -295.9837341308594, + "logps/rejected": -485.7255554199219, + "loss": 0.5455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5538333654403687, + "rewards/margins": 1.6877435445785522, + "rewards/rejected": -2.241576910018921, + "step": 7047 + }, + { + "epoch": 0.81, + "learning_rate": 5.709937960903663e-08, + "logits/chosen": -3.5206079483032227, + "logits/rejected": -3.667102575302124, + "logps/chosen": -158.73484802246094, + "logps/rejected": -110.18072509765625, + "loss": 0.5939, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15645790100097656, + "rewards/margins": 0.9980670213699341, + "rewards/rejected": -1.1545250415802002, + "step": 7048 + }, + { + "epoch": 0.81, + "learning_rate": 5.706426313941239e-08, + "logits/chosen": -2.5946037769317627, + "logits/rejected": -2.6919455528259277, + "logps/chosen": -427.58929443359375, + "logps/rejected": -314.52093505859375, + "loss": 0.3701, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36567479372024536, + "rewards/margins": 2.1707184314727783, + "rewards/rejected": -1.8050434589385986, + "step": 7049 + }, + { + "epoch": 0.81, + "learning_rate": 5.702914666978813e-08, + "logits/chosen": -2.777996778488159, + "logits/rejected": -2.699032783508301, + "logps/chosen": -264.9035949707031, + "logps/rejected": -259.9676513671875, + "loss": 0.4568, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18916812539100647, + "rewards/margins": 1.6577956676483154, + "rewards/rejected": -1.8469637632369995, + "step": 7050 + }, + { + "epoch": 0.81, + "learning_rate": 5.699403020016388e-08, + "logits/chosen": -3.391815662384033, + "logits/rejected": -3.2201616764068604, + "logps/chosen": -405.2630615234375, + "logps/rejected": -167.54440307617188, + "loss": 0.199, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3981601595878601, + "rewards/margins": 2.1905150413513184, + "rewards/rejected": -1.792354941368103, + "step": 7051 + }, + { + "epoch": 0.81, + "learning_rate": 5.695891373053962e-08, + "logits/chosen": -3.227337598800659, + "logits/rejected": -3.163895606994629, + "logps/chosen": -180.7281951904297, + "logps/rejected": -389.0982971191406, + "loss": 0.2694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07126118987798691, + "rewards/margins": 1.6756329536437988, + "rewards/rejected": -1.7468942403793335, + "step": 7052 + }, + { + "epoch": 0.81, + "learning_rate": 5.692379726091536e-08, + "logits/chosen": -3.6207454204559326, + "logits/rejected": -3.599393367767334, + "logps/chosen": -406.77947998046875, + "logps/rejected": -295.5972595214844, + "loss": 0.3406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5041491985321045, + "rewards/margins": 1.401402235031128, + "rewards/rejected": -1.905551552772522, + "step": 7053 + }, + { + "epoch": 0.81, + "learning_rate": 5.6888680791291116e-08, + "logits/chosen": -3.334864377975464, + "logits/rejected": -3.2278780937194824, + "logps/chosen": -300.92242431640625, + "logps/rejected": -257.08489990234375, + "loss": 0.6339, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49210023880004883, + "rewards/margins": 1.449122428894043, + "rewards/rejected": -1.9412225484848022, + "step": 7054 + }, + { + "epoch": 0.81, + "learning_rate": 5.685356432166686e-08, + "logits/chosen": -3.036513566970825, + "logits/rejected": -2.95225191116333, + "logps/chosen": -261.46881103515625, + "logps/rejected": -257.7674255371094, + "loss": 0.4225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41657161712646484, + "rewards/margins": 1.2301901578903198, + "rewards/rejected": -1.6467617750167847, + "step": 7055 + }, + { + "epoch": 0.81, + "learning_rate": 5.6818447852042604e-08, + "logits/chosen": -2.8878931999206543, + "logits/rejected": -3.1583895683288574, + "logps/chosen": -276.63275146484375, + "logps/rejected": -254.91697692871094, + "loss": 0.3093, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7066150903701782, + "rewards/margins": 2.489630937576294, + "rewards/rejected": -1.7830158472061157, + "step": 7056 + }, + { + "epoch": 0.81, + "learning_rate": 5.6783331382418345e-08, + "logits/chosen": -3.782564878463745, + "logits/rejected": -3.586973190307617, + "logps/chosen": -97.64024353027344, + "logps/rejected": -116.10865783691406, + "loss": 0.5069, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28266292810440063, + "rewards/margins": 1.0339677333831787, + "rewards/rejected": -1.3166306018829346, + "step": 7057 + }, + { + "epoch": 0.81, + "learning_rate": 5.67482149127941e-08, + "logits/chosen": -2.8402411937713623, + "logits/rejected": -2.9018378257751465, + "logps/chosen": -227.19827270507812, + "logps/rejected": -317.8902587890625, + "loss": 0.2066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3257594108581543, + "rewards/margins": 2.291743040084839, + "rewards/rejected": -2.617502450942993, + "step": 7058 + }, + { + "epoch": 0.81, + "learning_rate": 5.671309844316984e-08, + "logits/chosen": -3.6081953048706055, + "logits/rejected": -3.2927210330963135, + "logps/chosen": -306.507080078125, + "logps/rejected": -231.87355041503906, + "loss": 0.3894, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3053259253501892, + "rewards/margins": 1.331560492515564, + "rewards/rejected": -1.02623450756073, + "step": 7059 + }, + { + "epoch": 0.81, + "learning_rate": 5.667798197354559e-08, + "logits/chosen": -3.3207836151123047, + "logits/rejected": -3.1705055236816406, + "logps/chosen": -468.75201416015625, + "logps/rejected": -237.28713989257812, + "loss": 0.3831, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4280315637588501, + "rewards/margins": 1.8350305557250977, + "rewards/rejected": -1.406998872756958, + "step": 7060 + }, + { + "epoch": 0.81, + "learning_rate": 5.6642865503921334e-08, + "logits/chosen": -3.359374761581421, + "logits/rejected": -3.183685064315796, + "logps/chosen": -282.3336181640625, + "logps/rejected": -287.80010986328125, + "loss": 0.5238, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2787518799304962, + "rewards/margins": 1.3831183910369873, + "rewards/rejected": -1.1043665409088135, + "step": 7061 + }, + { + "epoch": 0.81, + "learning_rate": 5.660774903429708e-08, + "logits/chosen": -3.4322497844696045, + "logits/rejected": -3.386507034301758, + "logps/chosen": -255.1927490234375, + "logps/rejected": -295.40985107421875, + "loss": 0.3453, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19994470477104187, + "rewards/margins": 3.092789888381958, + "rewards/rejected": -2.8928451538085938, + "step": 7062 + }, + { + "epoch": 0.81, + "learning_rate": 5.657263256467283e-08, + "logits/chosen": -3.438406467437744, + "logits/rejected": -3.175628900527954, + "logps/chosen": -251.47628784179688, + "logps/rejected": -294.6208190917969, + "loss": 0.3252, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4550783038139343, + "rewards/margins": 2.113854169845581, + "rewards/rejected": -2.56893253326416, + "step": 7063 + }, + { + "epoch": 0.81, + "learning_rate": 5.6537516095048576e-08, + "logits/chosen": -3.1073198318481445, + "logits/rejected": -3.1878671646118164, + "logps/chosen": -341.9613037109375, + "logps/rejected": -292.1611328125, + "loss": 0.4168, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14789438247680664, + "rewards/margins": 1.339041829109192, + "rewards/rejected": -1.1911474466323853, + "step": 7064 + }, + { + "epoch": 0.81, + "learning_rate": 5.6502399625424316e-08, + "logits/chosen": -3.2322499752044678, + "logits/rejected": -3.1525685787200928, + "logps/chosen": -230.96800231933594, + "logps/rejected": -400.3896484375, + "loss": 0.2376, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08606119453907013, + "rewards/margins": 2.494959831237793, + "rewards/rejected": -2.4088988304138184, + "step": 7065 + }, + { + "epoch": 0.81, + "learning_rate": 5.646728315580007e-08, + "logits/chosen": -2.995748996734619, + "logits/rejected": -2.8926000595092773, + "logps/chosen": -176.25741577148438, + "logps/rejected": -110.837646484375, + "loss": 0.639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6685460805892944, + "rewards/margins": 0.40451210737228394, + "rewards/rejected": -1.0730582475662231, + "step": 7066 + }, + { + "epoch": 0.81, + "learning_rate": 5.643216668617581e-08, + "logits/chosen": -2.9526665210723877, + "logits/rejected": -2.7665019035339355, + "logps/chosen": -206.21615600585938, + "logps/rejected": -306.4855651855469, + "loss": 0.6685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3203727602958679, + "rewards/margins": 0.38039350509643555, + "rewards/rejected": -0.7007662653923035, + "step": 7067 + }, + { + "epoch": 0.81, + "learning_rate": 5.6397050216551565e-08, + "logits/chosen": -2.9689197540283203, + "logits/rejected": -2.6862335205078125, + "logps/chosen": -215.55511474609375, + "logps/rejected": -297.5643005371094, + "loss": 1.1776, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9492173790931702, + "rewards/margins": -0.48783284425735474, + "rewards/rejected": -0.46138453483581543, + "step": 7068 + }, + { + "epoch": 0.81, + "learning_rate": 5.6361933746927305e-08, + "logits/chosen": -3.0212974548339844, + "logits/rejected": -3.2587404251098633, + "logps/chosen": -154.463623046875, + "logps/rejected": -272.8995666503906, + "loss": 0.2192, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5217602849006653, + "rewards/margins": 2.1935935020446777, + "rewards/rejected": -1.6718332767486572, + "step": 7069 + }, + { + "epoch": 0.82, + "learning_rate": 5.632681727730305e-08, + "logits/chosen": -3.0041098594665527, + "logits/rejected": -2.9872727394104004, + "logps/chosen": -169.76791381835938, + "logps/rejected": -142.58639526367188, + "loss": 0.3135, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2464982569217682, + "rewards/margins": 1.506847858428955, + "rewards/rejected": -1.2603496313095093, + "step": 7070 + }, + { + "epoch": 0.82, + "learning_rate": 5.629170080767879e-08, + "logits/chosen": -3.275524139404297, + "logits/rejected": -3.119741201400757, + "logps/chosen": -257.7836608886719, + "logps/rejected": -196.55044555664062, + "loss": 0.5916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17790032923221588, + "rewards/margins": 2.461857318878174, + "rewards/rejected": -2.6397576332092285, + "step": 7071 + }, + { + "epoch": 0.82, + "learning_rate": 5.625658433805455e-08, + "logits/chosen": -3.1378724575042725, + "logits/rejected": -3.2077736854553223, + "logps/chosen": -247.58306884765625, + "logps/rejected": -176.12905883789062, + "loss": 0.4589, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1516421139240265, + "rewards/margins": 1.2360289096832275, + "rewards/rejected": -1.0843868255615234, + "step": 7072 + }, + { + "epoch": 0.82, + "learning_rate": 5.622146786843029e-08, + "logits/chosen": -2.735647201538086, + "logits/rejected": -2.6106834411621094, + "logps/chosen": -338.3728332519531, + "logps/rejected": -319.166015625, + "loss": 0.2904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06503213196992874, + "rewards/margins": 1.5332378149032593, + "rewards/rejected": -1.5982699394226074, + "step": 7073 + }, + { + "epoch": 0.82, + "learning_rate": 5.618635139880604e-08, + "logits/chosen": -2.7515780925750732, + "logits/rejected": -2.9231112003326416, + "logps/chosen": -324.6691589355469, + "logps/rejected": -332.10443115234375, + "loss": 0.1665, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21707721054553986, + "rewards/margins": 2.3996899127960205, + "rewards/rejected": -2.182612657546997, + "step": 7074 + }, + { + "epoch": 0.82, + "learning_rate": 5.615123492918178e-08, + "logits/chosen": -3.1166272163391113, + "logits/rejected": -3.191347360610962, + "logps/chosen": -201.09896850585938, + "logps/rejected": -209.6114044189453, + "loss": 0.4058, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21603630483150482, + "rewards/margins": 2.0323991775512695, + "rewards/rejected": -1.816362977027893, + "step": 7075 + }, + { + "epoch": 0.82, + "learning_rate": 5.611611845955753e-08, + "logits/chosen": -3.6852574348449707, + "logits/rejected": -3.6955854892730713, + "logps/chosen": -197.50306701660156, + "logps/rejected": -266.58428955078125, + "loss": 0.4192, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5733964443206787, + "rewards/margins": 1.1692750453948975, + "rewards/rejected": -1.7426716089248657, + "step": 7076 + }, + { + "epoch": 0.82, + "learning_rate": 5.6081001989933276e-08, + "logits/chosen": -2.519528388977051, + "logits/rejected": -2.7924022674560547, + "logps/chosen": -439.73358154296875, + "logps/rejected": -240.33419799804688, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010356217622756958, + "rewards/margins": 0.5724209547042847, + "rewards/rejected": -0.5620647668838501, + "step": 7077 + }, + { + "epoch": 0.82, + "learning_rate": 5.6045885520309024e-08, + "logits/chosen": -2.952589273452759, + "logits/rejected": -2.894784450531006, + "logps/chosen": -248.90403747558594, + "logps/rejected": -184.84259033203125, + "loss": 0.4159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2236766666173935, + "rewards/margins": 0.8994431495666504, + "rewards/rejected": -1.1231197118759155, + "step": 7078 + }, + { + "epoch": 0.82, + "learning_rate": 5.6010769050684764e-08, + "logits/chosen": -3.8146562576293945, + "logits/rejected": -3.951063394546509, + "logps/chosen": -301.46038818359375, + "logps/rejected": -192.5223846435547, + "loss": 0.5273, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.269353985786438, + "rewards/margins": 1.4656658172607422, + "rewards/rejected": -1.7350196838378906, + "step": 7079 + }, + { + "epoch": 0.82, + "learning_rate": 5.597565258106052e-08, + "logits/chosen": -2.900376319885254, + "logits/rejected": -2.872828722000122, + "logps/chosen": -250.26870727539062, + "logps/rejected": -216.10549926757812, + "loss": 0.4067, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5317414999008179, + "rewards/margins": 3.1164350509643555, + "rewards/rejected": -3.648176670074463, + "step": 7080 + }, + { + "epoch": 0.82, + "learning_rate": 5.594053611143626e-08, + "logits/chosen": -3.8526978492736816, + "logits/rejected": -3.902538537979126, + "logps/chosen": -370.6311950683594, + "logps/rejected": -410.0972900390625, + "loss": 0.8254, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6767392754554749, + "rewards/margins": 1.161439061164856, + "rewards/rejected": -1.8381781578063965, + "step": 7081 + }, + { + "epoch": 0.82, + "learning_rate": 5.590541964181201e-08, + "logits/chosen": -2.8272626399993896, + "logits/rejected": -2.8615105152130127, + "logps/chosen": -685.072021484375, + "logps/rejected": -641.4850463867188, + "loss": 0.5283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45681530237197876, + "rewards/margins": 1.2376024723052979, + "rewards/rejected": -1.6944177150726318, + "step": 7082 + }, + { + "epoch": 0.82, + "learning_rate": 5.587030317218775e-08, + "logits/chosen": -3.2007060050964355, + "logits/rejected": -3.1233367919921875, + "logps/chosen": -462.0911865234375, + "logps/rejected": -291.7182312011719, + "loss": 0.3886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2548060417175293, + "rewards/margins": 1.5906261205673218, + "rewards/rejected": -1.845432162284851, + "step": 7083 + }, + { + "epoch": 0.82, + "learning_rate": 5.58351867025635e-08, + "logits/chosen": -3.1351892948150635, + "logits/rejected": -3.1308822631835938, + "logps/chosen": -281.2229309082031, + "logps/rejected": -290.6504211425781, + "loss": 0.406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.695325493812561, + "rewards/margins": 1.2385369539260864, + "rewards/rejected": -1.9338624477386475, + "step": 7084 + }, + { + "epoch": 0.82, + "learning_rate": 5.580007023293925e-08, + "logits/chosen": -2.4682297706604004, + "logits/rejected": -2.5058419704437256, + "logps/chosen": -317.8101501464844, + "logps/rejected": -314.1389465332031, + "loss": 0.4079, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00435948371887207, + "rewards/margins": 1.0696160793304443, + "rewards/rejected": -1.0652565956115723, + "step": 7085 + }, + { + "epoch": 0.82, + "learning_rate": 5.5764953763314995e-08, + "logits/chosen": -3.4052867889404297, + "logits/rejected": -3.580050230026245, + "logps/chosen": -256.4182434082031, + "logps/rejected": -269.349365234375, + "loss": 0.183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12558375298976898, + "rewards/margins": 2.2278125286102295, + "rewards/rejected": -2.1022286415100098, + "step": 7086 + }, + { + "epoch": 0.82, + "learning_rate": 5.5729837293690736e-08, + "logits/chosen": -2.6451597213745117, + "logits/rejected": -2.76731538772583, + "logps/chosen": -283.95550537109375, + "logps/rejected": -195.28419494628906, + "loss": 0.5736, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0431140661239624, + "rewards/margins": 1.1461060047149658, + "rewards/rejected": -1.1892200708389282, + "step": 7087 + }, + { + "epoch": 0.82, + "learning_rate": 5.569472082406649e-08, + "logits/chosen": -2.9680416584014893, + "logits/rejected": -2.699769973754883, + "logps/chosen": -234.42343139648438, + "logps/rejected": -245.4241943359375, + "loss": 0.3785, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15280485153198242, + "rewards/margins": 1.6252148151397705, + "rewards/rejected": -1.472409963607788, + "step": 7088 + }, + { + "epoch": 0.82, + "learning_rate": 5.565960435444223e-08, + "logits/chosen": -3.1271395683288574, + "logits/rejected": -2.9621267318725586, + "logps/chosen": -193.8024139404297, + "logps/rejected": -215.2017364501953, + "loss": 0.8313, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16300687193870544, + "rewards/margins": 1.9391807317733765, + "rewards/rejected": -1.7761738300323486, + "step": 7089 + }, + { + "epoch": 0.82, + "learning_rate": 5.562448788481798e-08, + "logits/chosen": -2.6596081256866455, + "logits/rejected": -2.748577117919922, + "logps/chosen": -313.57952880859375, + "logps/rejected": -202.201171875, + "loss": 0.3118, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08566740900278091, + "rewards/margins": 1.4105364084243774, + "rewards/rejected": -1.3248690366744995, + "step": 7090 + }, + { + "epoch": 0.82, + "learning_rate": 5.5589371415193725e-08, + "logits/chosen": -3.806964159011841, + "logits/rejected": -3.5072247982025146, + "logps/chosen": -234.96473693847656, + "logps/rejected": -151.3390655517578, + "loss": 0.3696, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1668606698513031, + "rewards/margins": 1.820281744003296, + "rewards/rejected": -1.9871424436569214, + "step": 7091 + }, + { + "epoch": 0.82, + "learning_rate": 5.555425494556947e-08, + "logits/chosen": -2.7330214977264404, + "logits/rejected": -3.023786783218384, + "logps/chosen": -213.3662567138672, + "logps/rejected": -260.3422546386719, + "loss": 0.3295, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3202010989189148, + "rewards/margins": 1.8877284526824951, + "rewards/rejected": -2.2079296112060547, + "step": 7092 + }, + { + "epoch": 0.82, + "learning_rate": 5.551913847594521e-08, + "logits/chosen": -2.5855352878570557, + "logits/rejected": -2.600278854370117, + "logps/chosen": -253.9904327392578, + "logps/rejected": -192.25364685058594, + "loss": 0.733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6619327664375305, + "rewards/margins": 0.7816194295883179, + "rewards/rejected": -1.4435522556304932, + "step": 7093 + }, + { + "epoch": 0.82, + "learning_rate": 5.5484022006320966e-08, + "logits/chosen": -2.436964988708496, + "logits/rejected": -2.318171739578247, + "logps/chosen": -299.86834716796875, + "logps/rejected": -215.6698760986328, + "loss": 0.2903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6762658953666687, + "rewards/margins": 1.7401716709136963, + "rewards/rejected": -2.4164376258850098, + "step": 7094 + }, + { + "epoch": 0.82, + "learning_rate": 5.544890553669671e-08, + "logits/chosen": -3.92901611328125, + "logits/rejected": -3.7500557899475098, + "logps/chosen": -140.6545867919922, + "logps/rejected": -180.45822143554688, + "loss": 0.208, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10577109456062317, + "rewards/margins": 2.3322598934173584, + "rewards/rejected": -2.2264890670776367, + "step": 7095 + }, + { + "epoch": 0.82, + "learning_rate": 5.541378906707246e-08, + "logits/chosen": -3.7872354984283447, + "logits/rejected": -3.5539309978485107, + "logps/chosen": -358.5177917480469, + "logps/rejected": -232.00234985351562, + "loss": 0.2649, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2504768669605255, + "rewards/margins": 2.7441132068634033, + "rewards/rejected": -2.9945898056030273, + "step": 7096 + }, + { + "epoch": 0.82, + "learning_rate": 5.53786725974482e-08, + "logits/chosen": -3.2905468940734863, + "logits/rejected": -3.0145328044891357, + "logps/chosen": -206.1131134033203, + "logps/rejected": -196.57994079589844, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11245220899581909, + "rewards/margins": 1.570654273033142, + "rewards/rejected": -1.6831063032150269, + "step": 7097 + }, + { + "epoch": 0.82, + "learning_rate": 5.534355612782395e-08, + "logits/chosen": -3.4214577674865723, + "logits/rejected": -3.5702950954437256, + "logps/chosen": -280.1068420410156, + "logps/rejected": -238.83953857421875, + "loss": 0.2664, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36264070868492126, + "rewards/margins": 1.8021775484085083, + "rewards/rejected": -2.164818286895752, + "step": 7098 + }, + { + "epoch": 0.82, + "learning_rate": 5.5308439658199696e-08, + "logits/chosen": -3.536156177520752, + "logits/rejected": -3.0974879264831543, + "logps/chosen": -346.6443176269531, + "logps/rejected": -288.46331787109375, + "loss": 0.5007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4743553400039673, + "rewards/margins": 1.0005881786346436, + "rewards/rejected": -1.4749436378479004, + "step": 7099 + }, + { + "epoch": 0.82, + "learning_rate": 5.5273323188575436e-08, + "logits/chosen": -3.426936149597168, + "logits/rejected": -3.062103033065796, + "logps/chosen": -289.6976318359375, + "logps/rejected": -211.37982177734375, + "loss": 0.3313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.035894013941287994, + "rewards/margins": 1.4520708322525024, + "rewards/rejected": -1.4879648685455322, + "step": 7100 + }, + { + "epoch": 0.82, + "learning_rate": 5.5238206718951184e-08, + "logits/chosen": -3.3491029739379883, + "logits/rejected": -3.363377094268799, + "logps/chosen": -100.10203552246094, + "logps/rejected": -208.3076629638672, + "loss": 0.4727, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1137009933590889, + "rewards/margins": 1.5027084350585938, + "rewards/rejected": -1.616409420967102, + "step": 7101 + }, + { + "epoch": 0.82, + "learning_rate": 5.520309024932693e-08, + "logits/chosen": -3.3375144004821777, + "logits/rejected": -3.3835184574127197, + "logps/chosen": -223.378173828125, + "logps/rejected": -188.72727966308594, + "loss": 0.6059, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48113158345222473, + "rewards/margins": 0.9668976664543152, + "rewards/rejected": -1.4480292797088623, + "step": 7102 + }, + { + "epoch": 0.82, + "learning_rate": 5.516797377970268e-08, + "logits/chosen": -3.001511573791504, + "logits/rejected": -3.3444631099700928, + "logps/chosen": -189.11050415039062, + "logps/rejected": -207.21458435058594, + "loss": 0.3631, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11339936405420303, + "rewards/margins": 2.3371145725250244, + "rewards/rejected": -2.22371506690979, + "step": 7103 + }, + { + "epoch": 0.82, + "learning_rate": 5.513285731007842e-08, + "logits/chosen": -4.2181525230407715, + "logits/rejected": -4.147733211517334, + "logps/chosen": -204.74851989746094, + "logps/rejected": -289.3227844238281, + "loss": 0.4376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14262992143630981, + "rewards/margins": 0.936086893081665, + "rewards/rejected": -1.07871675491333, + "step": 7104 + }, + { + "epoch": 0.82, + "learning_rate": 5.509774084045417e-08, + "logits/chosen": -3.318091630935669, + "logits/rejected": -3.257157802581787, + "logps/chosen": -185.50595092773438, + "logps/rejected": -183.26112365722656, + "loss": 0.2922, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23624268174171448, + "rewards/margins": 2.1310958862304688, + "rewards/rejected": -1.894853115081787, + "step": 7105 + }, + { + "epoch": 0.82, + "learning_rate": 5.506262437082991e-08, + "logits/chosen": -2.317072629928589, + "logits/rejected": -2.347303867340088, + "logps/chosen": -384.7866516113281, + "logps/rejected": -403.66741943359375, + "loss": 0.2828, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08356037735939026, + "rewards/margins": 1.3286412954330444, + "rewards/rejected": -1.2450810670852661, + "step": 7106 + }, + { + "epoch": 0.82, + "learning_rate": 5.502750790120566e-08, + "logits/chosen": -3.4087719917297363, + "logits/rejected": -3.592648506164551, + "logps/chosen": -241.17202758789062, + "logps/rejected": -239.26170349121094, + "loss": 0.5878, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24762877821922302, + "rewards/margins": 1.3556208610534668, + "rewards/rejected": -1.6032495498657227, + "step": 7107 + }, + { + "epoch": 0.82, + "learning_rate": 5.499239143158141e-08, + "logits/chosen": -3.2962396144866943, + "logits/rejected": -3.17842960357666, + "logps/chosen": -261.74749755859375, + "logps/rejected": -230.29367065429688, + "loss": 0.2935, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.393573522567749, + "rewards/margins": 1.8459275960922241, + "rewards/rejected": -2.2395009994506836, + "step": 7108 + }, + { + "epoch": 0.82, + "learning_rate": 5.4957274961957155e-08, + "logits/chosen": -3.3644227981567383, + "logits/rejected": -3.234234571456909, + "logps/chosen": -333.8115539550781, + "logps/rejected": -214.1597137451172, + "loss": 0.376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5869728326797485, + "rewards/margins": 1.588766098022461, + "rewards/rejected": -2.175739049911499, + "step": 7109 + }, + { + "epoch": 0.82, + "learning_rate": 5.4922158492332896e-08, + "logits/chosen": -3.1037120819091797, + "logits/rejected": -2.7979791164398193, + "logps/chosen": -191.82717895507812, + "logps/rejected": -261.0047302246094, + "loss": 0.4103, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6177102327346802, + "rewards/margins": 1.1065542697906494, + "rewards/rejected": -1.7242646217346191, + "step": 7110 + }, + { + "epoch": 0.82, + "learning_rate": 5.488704202270865e-08, + "logits/chosen": -2.9968347549438477, + "logits/rejected": -3.0185165405273438, + "logps/chosen": -320.313232421875, + "logps/rejected": -307.19061279296875, + "loss": 0.3367, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16595155000686646, + "rewards/margins": 2.8731255531311035, + "rewards/rejected": -2.707174062728882, + "step": 7111 + }, + { + "epoch": 0.82, + "learning_rate": 5.485192555308439e-08, + "logits/chosen": -3.476222276687622, + "logits/rejected": -3.2149643898010254, + "logps/chosen": -275.4999084472656, + "logps/rejected": -322.5150146484375, + "loss": 0.3746, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2037954330444336, + "rewards/margins": 0.9272400140762329, + "rewards/rejected": -1.1310354471206665, + "step": 7112 + }, + { + "epoch": 0.82, + "learning_rate": 5.4816809083460144e-08, + "logits/chosen": -3.3259406089782715, + "logits/rejected": -3.5277209281921387, + "logps/chosen": -128.9437713623047, + "logps/rejected": -287.9943542480469, + "loss": 0.4801, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02124711498618126, + "rewards/margins": 1.0774606466293335, + "rewards/rejected": -1.0987077951431274, + "step": 7113 + }, + { + "epoch": 0.82, + "learning_rate": 5.4781692613835885e-08, + "logits/chosen": -2.591050148010254, + "logits/rejected": -2.9913177490234375, + "logps/chosen": -509.6435546875, + "logps/rejected": -333.4090270996094, + "loss": 0.3701, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15163275599479675, + "rewards/margins": 1.787489891052246, + "rewards/rejected": -1.635857105255127, + "step": 7114 + }, + { + "epoch": 0.82, + "learning_rate": 5.474657614421163e-08, + "logits/chosen": -3.7225074768066406, + "logits/rejected": -3.3928699493408203, + "logps/chosen": -327.8751220703125, + "logps/rejected": -282.6995849609375, + "loss": 0.3936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4192380905151367, + "rewards/margins": 1.2128431797027588, + "rewards/rejected": -1.6320812702178955, + "step": 7115 + }, + { + "epoch": 0.82, + "learning_rate": 5.471145967458738e-08, + "logits/chosen": -3.586422920227051, + "logits/rejected": -3.3939459323883057, + "logps/chosen": -266.06298828125, + "logps/rejected": -253.4732666015625, + "loss": 0.2199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08766976743936539, + "rewards/margins": 2.016558885574341, + "rewards/rejected": -2.1042287349700928, + "step": 7116 + }, + { + "epoch": 0.82, + "learning_rate": 5.4676343204963126e-08, + "logits/chosen": -2.8334364891052246, + "logits/rejected": -3.1148083209991455, + "logps/chosen": -202.23394775390625, + "logps/rejected": -258.0196838378906, + "loss": 0.5485, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14447063207626343, + "rewards/margins": 1.9710063934326172, + "rewards/rejected": -2.1154768466949463, + "step": 7117 + }, + { + "epoch": 0.82, + "learning_rate": 5.464122673533887e-08, + "logits/chosen": -2.603865623474121, + "logits/rejected": -2.776315927505493, + "logps/chosen": -342.0797119140625, + "logps/rejected": -309.4161071777344, + "loss": 0.3552, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5659746527671814, + "rewards/margins": 2.0174472332000732, + "rewards/rejected": -2.5834217071533203, + "step": 7118 + }, + { + "epoch": 0.82, + "learning_rate": 5.460611026571462e-08, + "logits/chosen": -3.1425440311431885, + "logits/rejected": -3.1078341007232666, + "logps/chosen": -220.35784912109375, + "logps/rejected": -158.66111755371094, + "loss": 1.0974, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9560684561729431, + "rewards/margins": 0.19644419848918915, + "rewards/rejected": -1.152512550354004, + "step": 7119 + }, + { + "epoch": 0.82, + "learning_rate": 5.457099379609036e-08, + "logits/chosen": -3.786813735961914, + "logits/rejected": -3.804370403289795, + "logps/chosen": -376.12359619140625, + "logps/rejected": -353.0893859863281, + "loss": 0.0876, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18266546726226807, + "rewards/margins": 2.940398693084717, + "rewards/rejected": -2.7577333450317383, + "step": 7120 + }, + { + "epoch": 0.82, + "learning_rate": 5.4535877326466115e-08, + "logits/chosen": -2.7635393142700195, + "logits/rejected": -2.6908092498779297, + "logps/chosen": -477.5245361328125, + "logps/rejected": -335.4281921386719, + "loss": 0.7203, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20364710688591003, + "rewards/margins": 1.1700901985168457, + "rewards/rejected": -0.9664431214332581, + "step": 7121 + }, + { + "epoch": 0.82, + "learning_rate": 5.4500760856841856e-08, + "logits/chosen": -2.2166295051574707, + "logits/rejected": -2.214165210723877, + "logps/chosen": -179.15115356445312, + "logps/rejected": -159.06268310546875, + "loss": 0.7361, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21381348371505737, + "rewards/margins": 0.5099722146987915, + "rewards/rejected": -0.7237857580184937, + "step": 7122 + }, + { + "epoch": 0.82, + "learning_rate": 5.44656443872176e-08, + "logits/chosen": -3.256969451904297, + "logits/rejected": -3.2838261127471924, + "logps/chosen": -142.9159393310547, + "logps/rejected": -159.37301635742188, + "loss": 0.5007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7400006055831909, + "rewards/margins": 1.1559216976165771, + "rewards/rejected": -1.895922303199768, + "step": 7123 + }, + { + "epoch": 0.82, + "learning_rate": 5.4430527917593344e-08, + "logits/chosen": -3.1001839637756348, + "logits/rejected": -2.84311842918396, + "logps/chosen": -221.5218963623047, + "logps/rejected": -209.4567413330078, + "loss": 0.5703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4193398356437683, + "rewards/margins": 0.5012933611869812, + "rewards/rejected": -0.9206331968307495, + "step": 7124 + }, + { + "epoch": 0.82, + "learning_rate": 5.43954114479691e-08, + "logits/chosen": -3.7675180435180664, + "logits/rejected": -3.6534531116485596, + "logps/chosen": -140.07489013671875, + "logps/rejected": -137.56500244140625, + "loss": 0.3777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3066025972366333, + "rewards/margins": 2.194904327392578, + "rewards/rejected": -1.8883017301559448, + "step": 7125 + }, + { + "epoch": 0.82, + "learning_rate": 5.436029497834484e-08, + "logits/chosen": -2.5257420539855957, + "logits/rejected": -2.8547449111938477, + "logps/chosen": -196.67913818359375, + "logps/rejected": -252.99290466308594, + "loss": 0.3526, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05926968902349472, + "rewards/margins": 2.667520046234131, + "rewards/rejected": -2.608250617980957, + "step": 7126 + }, + { + "epoch": 0.82, + "learning_rate": 5.432517850872059e-08, + "logits/chosen": -3.7101993560791016, + "logits/rejected": -3.991537094116211, + "logps/chosen": -174.7431182861328, + "logps/rejected": -246.19772338867188, + "loss": 0.2321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03859903663396835, + "rewards/margins": 1.957377552986145, + "rewards/rejected": -1.9187785387039185, + "step": 7127 + }, + { + "epoch": 0.82, + "learning_rate": 5.429006203909633e-08, + "logits/chosen": -2.4049227237701416, + "logits/rejected": -2.3480963706970215, + "logps/chosen": -274.2750244140625, + "logps/rejected": -314.7406311035156, + "loss": 0.4021, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0894150510430336, + "rewards/margins": 1.4254539012908936, + "rewards/rejected": -1.514868974685669, + "step": 7128 + }, + { + "epoch": 0.82, + "learning_rate": 5.425494556947208e-08, + "logits/chosen": -3.3813998699188232, + "logits/rejected": -3.408198833465576, + "logps/chosen": -116.60267639160156, + "logps/rejected": -209.93798828125, + "loss": 0.5026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1124332994222641, + "rewards/margins": 1.3458800315856934, + "rewards/rejected": -1.4583134651184082, + "step": 7129 + }, + { + "epoch": 0.82, + "learning_rate": 5.421982909984783e-08, + "logits/chosen": -3.2054264545440674, + "logits/rejected": -2.9508614540100098, + "logps/chosen": -229.07440185546875, + "logps/rejected": -150.50164794921875, + "loss": 0.4218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5230340361595154, + "rewards/margins": 1.2570263147354126, + "rewards/rejected": -1.7800602912902832, + "step": 7130 + }, + { + "epoch": 0.82, + "learning_rate": 5.4184712630223574e-08, + "logits/chosen": -3.1950016021728516, + "logits/rejected": -3.2600908279418945, + "logps/chosen": -181.82037353515625, + "logps/rejected": -197.30224609375, + "loss": 0.382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20639705657958984, + "rewards/margins": 1.3572319746017456, + "rewards/rejected": -1.563629150390625, + "step": 7131 + }, + { + "epoch": 0.82, + "learning_rate": 5.4149596160599315e-08, + "logits/chosen": -2.3831636905670166, + "logits/rejected": -2.250735282897949, + "logps/chosen": -372.8475341796875, + "logps/rejected": -401.07257080078125, + "loss": 0.49, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20846614241600037, + "rewards/margins": 0.998725175857544, + "rewards/rejected": -1.2071913480758667, + "step": 7132 + }, + { + "epoch": 0.82, + "learning_rate": 5.411447969097507e-08, + "logits/chosen": -3.5911002159118652, + "logits/rejected": -3.5294382572174072, + "logps/chosen": -217.8195037841797, + "logps/rejected": -245.01553344726562, + "loss": 0.2001, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38681507110595703, + "rewards/margins": 3.208322048187256, + "rewards/rejected": -2.821506977081299, + "step": 7133 + }, + { + "epoch": 0.82, + "learning_rate": 5.407936322135081e-08, + "logits/chosen": -2.926800489425659, + "logits/rejected": -3.1174659729003906, + "logps/chosen": -552.4227294921875, + "logps/rejected": -190.94989013671875, + "loss": 0.3774, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29180508852005005, + "rewards/margins": 2.3337221145629883, + "rewards/rejected": -2.041917085647583, + "step": 7134 + }, + { + "epoch": 0.82, + "learning_rate": 5.4044246751726563e-08, + "logits/chosen": -2.7444968223571777, + "logits/rejected": -2.9273557662963867, + "logps/chosen": -355.1973571777344, + "logps/rejected": -276.47833251953125, + "loss": 0.1702, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12823784351348877, + "rewards/margins": 4.171582221984863, + "rewards/rejected": -4.299820423126221, + "step": 7135 + }, + { + "epoch": 0.82, + "learning_rate": 5.4009130282102304e-08, + "logits/chosen": -3.17832612991333, + "logits/rejected": -3.4203057289123535, + "logps/chosen": -206.61595153808594, + "logps/rejected": -215.2950439453125, + "loss": 0.7406, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9272035956382751, + "rewards/margins": 0.761696994304657, + "rewards/rejected": -1.6889004707336426, + "step": 7136 + }, + { + "epoch": 0.82, + "learning_rate": 5.397401381247805e-08, + "logits/chosen": -3.2560009956359863, + "logits/rejected": -2.881807804107666, + "logps/chosen": -235.8383331298828, + "logps/rejected": -182.64089965820312, + "loss": 1.0773, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7596648931503296, + "rewards/margins": 0.6561343669891357, + "rewards/rejected": -1.4157992601394653, + "step": 7137 + }, + { + "epoch": 0.82, + "learning_rate": 5.39388973428538e-08, + "logits/chosen": -3.241103172302246, + "logits/rejected": -3.3165550231933594, + "logps/chosen": -222.8415069580078, + "logps/rejected": -182.9735107421875, + "loss": 0.2551, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22179633378982544, + "rewards/margins": 2.108006477355957, + "rewards/rejected": -2.3298027515411377, + "step": 7138 + }, + { + "epoch": 0.82, + "learning_rate": 5.3903780873229546e-08, + "logits/chosen": -3.2768564224243164, + "logits/rejected": -3.292141914367676, + "logps/chosen": -245.9980926513672, + "logps/rejected": -278.5570983886719, + "loss": 0.395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22541603446006775, + "rewards/margins": 1.4013360738754272, + "rewards/rejected": -1.6267521381378174, + "step": 7139 + }, + { + "epoch": 0.82, + "learning_rate": 5.3868664403605286e-08, + "logits/chosen": -3.258636474609375, + "logits/rejected": -3.3108372688293457, + "logps/chosen": -206.52711486816406, + "logps/rejected": -167.8974609375, + "loss": 0.645, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7569208145141602, + "rewards/margins": 0.8343013525009155, + "rewards/rejected": -1.5912221670150757, + "step": 7140 + }, + { + "epoch": 0.82, + "learning_rate": 5.383354793398104e-08, + "logits/chosen": -2.9101128578186035, + "logits/rejected": -2.874324083328247, + "logps/chosen": -193.00894165039062, + "logps/rejected": -199.1323699951172, + "loss": 0.3838, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3595592975616455, + "rewards/margins": 1.5790296792984009, + "rewards/rejected": -1.9385889768600464, + "step": 7141 + }, + { + "epoch": 0.82, + "learning_rate": 5.379843146435678e-08, + "logits/chosen": -3.735628843307495, + "logits/rejected": -3.389333724975586, + "logps/chosen": -258.93865966796875, + "logps/rejected": -198.241943359375, + "loss": 0.1263, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31665122509002686, + "rewards/margins": 2.761840581893921, + "rewards/rejected": -2.4451892375946045, + "step": 7142 + }, + { + "epoch": 0.82, + "learning_rate": 5.376331499473253e-08, + "logits/chosen": -3.662111759185791, + "logits/rejected": -3.6177330017089844, + "logps/chosen": -190.16529846191406, + "logps/rejected": -195.35638427734375, + "loss": 0.3526, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14735686779022217, + "rewards/margins": 2.345254421234131, + "rewards/rejected": -2.197897434234619, + "step": 7143 + }, + { + "epoch": 0.82, + "learning_rate": 5.3728198525108275e-08, + "logits/chosen": -3.3476250171661377, + "logits/rejected": -3.492668628692627, + "logps/chosen": -238.50482177734375, + "logps/rejected": -223.25210571289062, + "loss": 0.4713, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6927677989006042, + "rewards/margins": 2.179865837097168, + "rewards/rejected": -2.872633457183838, + "step": 7144 + }, + { + "epoch": 0.82, + "learning_rate": 5.369308205548402e-08, + "logits/chosen": -2.8849036693573, + "logits/rejected": -2.997025728225708, + "logps/chosen": -325.3931579589844, + "logps/rejected": -387.0048522949219, + "loss": 0.5015, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0015087127685546875, + "rewards/margins": 1.183915376663208, + "rewards/rejected": -1.1854240894317627, + "step": 7145 + }, + { + "epoch": 0.82, + "learning_rate": 5.365796558585976e-08, + "logits/chosen": -3.024273157119751, + "logits/rejected": -3.23806095123291, + "logps/chosen": -273.0790100097656, + "logps/rejected": -349.974365234375, + "loss": 0.433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0017879605293273926, + "rewards/margins": 1.4102909564971924, + "rewards/rejected": -1.4120787382125854, + "step": 7146 + }, + { + "epoch": 0.82, + "learning_rate": 5.362284911623551e-08, + "logits/chosen": -3.270214796066284, + "logits/rejected": -3.303323984146118, + "logps/chosen": -261.51580810546875, + "logps/rejected": -432.56707763671875, + "loss": 0.657, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32239580154418945, + "rewards/margins": 1.277784824371338, + "rewards/rejected": -1.6001808643341064, + "step": 7147 + }, + { + "epoch": 0.82, + "learning_rate": 5.358773264661126e-08, + "logits/chosen": -3.13706111907959, + "logits/rejected": -2.816362142562866, + "logps/chosen": -238.332275390625, + "logps/rejected": -276.6945495605469, + "loss": 0.3635, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.294575035572052, + "rewards/margins": 2.0045483112335205, + "rewards/rejected": -2.2991232872009277, + "step": 7148 + }, + { + "epoch": 0.82, + "learning_rate": 5.3552616176987e-08, + "logits/chosen": -3.4177966117858887, + "logits/rejected": -3.2029786109924316, + "logps/chosen": -432.92755126953125, + "logps/rejected": -209.31185913085938, + "loss": 0.4904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1412757933139801, + "rewards/margins": 1.3529679775238037, + "rewards/rejected": -1.211692214012146, + "step": 7149 + }, + { + "epoch": 0.82, + "learning_rate": 5.351749970736275e-08, + "logits/chosen": -2.968412160873413, + "logits/rejected": -2.9250307083129883, + "logps/chosen": -280.2703857421875, + "logps/rejected": -262.619140625, + "loss": 0.3841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2529021203517914, + "rewards/margins": 1.0485327243804932, + "rewards/rejected": -1.3014349937438965, + "step": 7150 + }, + { + "epoch": 0.82, + "learning_rate": 5.348238323773849e-08, + "logits/chosen": -3.407029151916504, + "logits/rejected": -3.115996837615967, + "logps/chosen": -270.0254821777344, + "logps/rejected": -246.6224365234375, + "loss": 0.5909, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4621444344520569, + "rewards/margins": 1.432503342628479, + "rewards/rejected": -1.8946478366851807, + "step": 7151 + }, + { + "epoch": 0.82, + "learning_rate": 5.3447266768114247e-08, + "logits/chosen": -3.179720401763916, + "logits/rejected": -3.467926263809204, + "logps/chosen": -296.02886962890625, + "logps/rejected": -235.03262329101562, + "loss": 0.5672, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5206654071807861, + "rewards/margins": 0.8019474148750305, + "rewards/rejected": -1.3226128816604614, + "step": 7152 + }, + { + "epoch": 0.82, + "learning_rate": 5.341215029848999e-08, + "logits/chosen": -3.497636079788208, + "logits/rejected": -3.410590887069702, + "logps/chosen": -336.58538818359375, + "logps/rejected": -273.8204345703125, + "loss": 0.1595, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8151699900627136, + "rewards/margins": 2.796783208847046, + "rewards/rejected": -1.9816131591796875, + "step": 7153 + }, + { + "epoch": 0.82, + "learning_rate": 5.3377033828865734e-08, + "logits/chosen": -3.283355712890625, + "logits/rejected": -3.2371985912323, + "logps/chosen": -172.03509521484375, + "logps/rejected": -331.6355285644531, + "loss": 0.5594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44285818934440613, + "rewards/margins": 2.647660970687866, + "rewards/rejected": -3.0905191898345947, + "step": 7154 + }, + { + "epoch": 0.82, + "learning_rate": 5.334191735924148e-08, + "logits/chosen": -2.7365598678588867, + "logits/rejected": -2.6112942695617676, + "logps/chosen": -263.00067138671875, + "logps/rejected": -293.7089538574219, + "loss": 0.1302, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7234277725219727, + "rewards/margins": 3.002239942550659, + "rewards/rejected": -2.2788124084472656, + "step": 7155 + }, + { + "epoch": 0.82, + "learning_rate": 5.330680088961723e-08, + "logits/chosen": -2.6125688552856445, + "logits/rejected": -2.529881000518799, + "logps/chosen": -318.92181396484375, + "logps/rejected": -364.12603759765625, + "loss": 0.54, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7279793620109558, + "rewards/margins": 2.256826400756836, + "rewards/rejected": -2.9848058223724365, + "step": 7156 + }, + { + "epoch": 0.83, + "learning_rate": 5.327168441999297e-08, + "logits/chosen": -2.704072952270508, + "logits/rejected": -2.6610424518585205, + "logps/chosen": -332.5299072265625, + "logps/rejected": -322.45233154296875, + "loss": 0.1542, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6599267721176147, + "rewards/margins": 3.0830421447753906, + "rewards/rejected": -2.4231152534484863, + "step": 7157 + }, + { + "epoch": 0.83, + "learning_rate": 5.323656795036872e-08, + "logits/chosen": -3.3705644607543945, + "logits/rejected": -3.3625173568725586, + "logps/chosen": -317.4811096191406, + "logps/rejected": -281.2325439453125, + "loss": 0.1607, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12237464636564255, + "rewards/margins": 3.722754716873169, + "rewards/rejected": -3.600379705429077, + "step": 7158 + }, + { + "epoch": 0.83, + "learning_rate": 5.3201451480744464e-08, + "logits/chosen": -3.4120538234710693, + "logits/rejected": -3.447821855545044, + "logps/chosen": -124.3772964477539, + "logps/rejected": -180.6776885986328, + "loss": 1.1916, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5838055610656738, + "rewards/margins": -0.13256797194480896, + "rewards/rejected": -1.4512377977371216, + "step": 7159 + }, + { + "epoch": 0.83, + "learning_rate": 5.316633501112021e-08, + "logits/chosen": -2.476067543029785, + "logits/rejected": -2.604236602783203, + "logps/chosen": -394.2427978515625, + "logps/rejected": -293.75384521484375, + "loss": 0.5223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14745891094207764, + "rewards/margins": 1.5967051982879639, + "rewards/rejected": -1.744164228439331, + "step": 7160 + }, + { + "epoch": 0.83, + "learning_rate": 5.313121854149596e-08, + "logits/chosen": -3.1099724769592285, + "logits/rejected": -2.860887050628662, + "logps/chosen": -227.5123291015625, + "logps/rejected": -230.17047119140625, + "loss": 0.4825, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.334161639213562, + "rewards/margins": 0.9114069938659668, + "rewards/rejected": -1.2455687522888184, + "step": 7161 + }, + { + "epoch": 0.83, + "learning_rate": 5.3096102071871706e-08, + "logits/chosen": -3.5224366188049316, + "logits/rejected": -3.5810699462890625, + "logps/chosen": -286.3636169433594, + "logps/rejected": -216.73960876464844, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48567676544189453, + "rewards/margins": 3.5095911026000977, + "rewards/rejected": -3.0239145755767822, + "step": 7162 + }, + { + "epoch": 0.83, + "learning_rate": 5.3060985602247446e-08, + "logits/chosen": -2.6303887367248535, + "logits/rejected": -3.276106834411621, + "logps/chosen": -136.97982788085938, + "logps/rejected": -303.70794677734375, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1255500614643097, + "rewards/margins": 2.690877676010132, + "rewards/rejected": -2.5653276443481445, + "step": 7163 + }, + { + "epoch": 0.83, + "learning_rate": 5.30258691326232e-08, + "logits/chosen": -3.3790528774261475, + "logits/rejected": -2.9400475025177, + "logps/chosen": -362.17694091796875, + "logps/rejected": -177.6874542236328, + "loss": 0.3965, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3523489236831665, + "rewards/margins": 1.061598539352417, + "rewards/rejected": -1.413947343826294, + "step": 7164 + }, + { + "epoch": 0.83, + "learning_rate": 5.299075266299894e-08, + "logits/chosen": -3.048398017883301, + "logits/rejected": -2.827425479888916, + "logps/chosen": -411.25592041015625, + "logps/rejected": -397.6034851074219, + "loss": 0.1547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15595278143882751, + "rewards/margins": 2.9642677307128906, + "rewards/rejected": -3.12022066116333, + "step": 7165 + }, + { + "epoch": 0.83, + "learning_rate": 5.2955636193374695e-08, + "logits/chosen": -2.9141664505004883, + "logits/rejected": -3.1487598419189453, + "logps/chosen": -269.5955505371094, + "logps/rejected": -297.6686096191406, + "loss": 0.3057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19615979492664337, + "rewards/margins": 2.1887223720550537, + "rewards/rejected": -2.3848822116851807, + "step": 7166 + }, + { + "epoch": 0.83, + "learning_rate": 5.2920519723750435e-08, + "logits/chosen": -2.4134950637817383, + "logits/rejected": -2.3975229263305664, + "logps/chosen": -108.747314453125, + "logps/rejected": -303.2008056640625, + "loss": 0.2625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4618927240371704, + "rewards/margins": 2.3741979598999023, + "rewards/rejected": -2.8360908031463623, + "step": 7167 + }, + { + "epoch": 0.83, + "learning_rate": 5.288540325412618e-08, + "logits/chosen": -3.163064956665039, + "logits/rejected": -3.0900492668151855, + "logps/chosen": -155.19085693359375, + "logps/rejected": -236.49017333984375, + "loss": 0.7081, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1751229166984558, + "rewards/margins": 0.9185384511947632, + "rewards/rejected": -1.0936614274978638, + "step": 7168 + }, + { + "epoch": 0.83, + "learning_rate": 5.285028678450193e-08, + "logits/chosen": -2.591860771179199, + "logits/rejected": -2.758780002593994, + "logps/chosen": -124.29107666015625, + "logps/rejected": -136.834228515625, + "loss": 0.4042, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.039963752031326294, + "rewards/margins": 1.0059757232666016, + "rewards/rejected": -1.0459394454956055, + "step": 7169 + }, + { + "epoch": 0.83, + "learning_rate": 5.281517031487768e-08, + "logits/chosen": -3.179471969604492, + "logits/rejected": -3.239856243133545, + "logps/chosen": -395.94378662109375, + "logps/rejected": -269.49871826171875, + "loss": 0.6818, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6943668127059937, + "rewards/margins": 0.7898698449134827, + "rewards/rejected": -1.484236717224121, + "step": 7170 + }, + { + "epoch": 0.83, + "learning_rate": 5.278005384525342e-08, + "logits/chosen": -3.3077614307403564, + "logits/rejected": -3.3139548301696777, + "logps/chosen": -271.8511657714844, + "logps/rejected": -289.1751708984375, + "loss": 0.4971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11771325021982193, + "rewards/margins": 2.3311009407043457, + "rewards/rejected": -2.4488143920898438, + "step": 7171 + }, + { + "epoch": 0.83, + "learning_rate": 5.274493737562917e-08, + "logits/chosen": -3.085782051086426, + "logits/rejected": -3.1233084201812744, + "logps/chosen": -335.94659423828125, + "logps/rejected": -324.3938903808594, + "loss": 0.5618, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18316471576690674, + "rewards/margins": 1.665304183959961, + "rewards/rejected": -1.4821393489837646, + "step": 7172 + }, + { + "epoch": 0.83, + "learning_rate": 5.270982090600491e-08, + "logits/chosen": -3.1732585430145264, + "logits/rejected": -2.8213343620300293, + "logps/chosen": -429.6387634277344, + "logps/rejected": -335.6704406738281, + "loss": 0.158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23916402459144592, + "rewards/margins": 2.4587173461914062, + "rewards/rejected": -2.2195534706115723, + "step": 7173 + }, + { + "epoch": 0.83, + "learning_rate": 5.267470443638066e-08, + "logits/chosen": -3.4048473834991455, + "logits/rejected": -3.248838186264038, + "logps/chosen": -205.46798706054688, + "logps/rejected": -153.02816772460938, + "loss": 0.4758, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.017721980810165405, + "rewards/margins": 1.7634456157684326, + "rewards/rejected": -1.78116774559021, + "step": 7174 + }, + { + "epoch": 0.83, + "learning_rate": 5.2639587966756406e-08, + "logits/chosen": -2.9116263389587402, + "logits/rejected": -2.775219440460205, + "logps/chosen": -221.50941467285156, + "logps/rejected": -176.64328002929688, + "loss": 0.4434, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5129404664039612, + "rewards/margins": 1.1801316738128662, + "rewards/rejected": -0.6671911478042603, + "step": 7175 + }, + { + "epoch": 0.83, + "learning_rate": 5.2604471497132154e-08, + "logits/chosen": -2.0995755195617676, + "logits/rejected": -2.053757905960083, + "logps/chosen": -175.2369842529297, + "logps/rejected": -272.14349365234375, + "loss": 0.1189, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14458192884922028, + "rewards/margins": 2.9634451866149902, + "rewards/rejected": -2.8188633918762207, + "step": 7176 + }, + { + "epoch": 0.83, + "learning_rate": 5.2569355027507894e-08, + "logits/chosen": -2.9591825008392334, + "logits/rejected": -2.8129115104675293, + "logps/chosen": -239.23223876953125, + "logps/rejected": -276.94976806640625, + "loss": 0.6112, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25859880447387695, + "rewards/margins": 2.1577720642089844, + "rewards/rejected": -2.4163706302642822, + "step": 7177 + }, + { + "epoch": 0.83, + "learning_rate": 5.253423855788365e-08, + "logits/chosen": -3.009730339050293, + "logits/rejected": -2.8602051734924316, + "logps/chosen": -230.31021118164062, + "logps/rejected": -318.58575439453125, + "loss": 0.3124, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4576041102409363, + "rewards/margins": 2.6772921085357666, + "rewards/rejected": -3.1348962783813477, + "step": 7178 + }, + { + "epoch": 0.83, + "learning_rate": 5.249912208825939e-08, + "logits/chosen": -3.850715160369873, + "logits/rejected": -3.1356279850006104, + "logps/chosen": -274.42120361328125, + "logps/rejected": -186.80441284179688, + "loss": 0.4682, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.011641651391983032, + "rewards/margins": 1.5573867559432983, + "rewards/rejected": -1.5457451343536377, + "step": 7179 + }, + { + "epoch": 0.83, + "learning_rate": 5.246400561863514e-08, + "logits/chosen": -3.125565767288208, + "logits/rejected": -3.3291115760803223, + "logps/chosen": -345.64190673828125, + "logps/rejected": -352.2969665527344, + "loss": 0.6559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8319402933120728, + "rewards/margins": 0.9330559372901917, + "rewards/rejected": -1.7649961709976196, + "step": 7180 + }, + { + "epoch": 0.83, + "learning_rate": 5.242888914901088e-08, + "logits/chosen": -3.443859338760376, + "logits/rejected": -3.7568769454956055, + "logps/chosen": -191.58399963378906, + "logps/rejected": -230.5093994140625, + "loss": 0.1438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1696305274963379, + "rewards/margins": 2.773963451385498, + "rewards/rejected": -2.943593740463257, + "step": 7181 + }, + { + "epoch": 0.83, + "learning_rate": 5.239377267938663e-08, + "logits/chosen": -3.2939257621765137, + "logits/rejected": -3.2996158599853516, + "logps/chosen": -104.30058288574219, + "logps/rejected": -200.30947875976562, + "loss": 0.3947, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4234180450439453, + "rewards/margins": 1.401839256286621, + "rewards/rejected": -0.9784212708473206, + "step": 7182 + }, + { + "epoch": 0.83, + "learning_rate": 5.235865620976238e-08, + "logits/chosen": -3.3709359169006348, + "logits/rejected": -3.409945487976074, + "logps/chosen": -182.3096923828125, + "logps/rejected": -256.1904296875, + "loss": 0.3457, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14487504959106445, + "rewards/margins": 2.0211057662963867, + "rewards/rejected": -1.8762307167053223, + "step": 7183 + }, + { + "epoch": 0.83, + "learning_rate": 5.2323539740138125e-08, + "logits/chosen": -3.5443286895751953, + "logits/rejected": -3.599041700363159, + "logps/chosen": -198.54855346679688, + "logps/rejected": -210.27264404296875, + "loss": 0.2486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1781499981880188, + "rewards/margins": 1.9933366775512695, + "rewards/rejected": -1.8151865005493164, + "step": 7184 + }, + { + "epoch": 0.83, + "learning_rate": 5.2288423270513866e-08, + "logits/chosen": -3.443084239959717, + "logits/rejected": -3.182363986968994, + "logps/chosen": -316.8511657714844, + "logps/rejected": -204.3426513671875, + "loss": 0.3839, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15805649757385254, + "rewards/margins": 1.601384162902832, + "rewards/rejected": -1.44332754611969, + "step": 7185 + }, + { + "epoch": 0.83, + "learning_rate": 5.225330680088962e-08, + "logits/chosen": -3.361208200454712, + "logits/rejected": -3.5827627182006836, + "logps/chosen": -175.9075927734375, + "logps/rejected": -272.21466064453125, + "loss": 0.4503, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2877669930458069, + "rewards/margins": 2.4001147747039795, + "rewards/rejected": -2.1123478412628174, + "step": 7186 + }, + { + "epoch": 0.83, + "learning_rate": 5.221819033126536e-08, + "logits/chosen": -2.6030163764953613, + "logits/rejected": -2.63392972946167, + "logps/chosen": -176.7958526611328, + "logps/rejected": -189.87899780273438, + "loss": 0.4649, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07603834569454193, + "rewards/margins": 1.140049695968628, + "rewards/rejected": -1.0640113353729248, + "step": 7187 + }, + { + "epoch": 0.83, + "learning_rate": 5.2183073861641114e-08, + "logits/chosen": -3.3674750328063965, + "logits/rejected": -2.9322073459625244, + "logps/chosen": -392.8912353515625, + "logps/rejected": -294.6309814453125, + "loss": 0.5877, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41470810770988464, + "rewards/margins": 0.790001630783081, + "rewards/rejected": -1.2047096490859985, + "step": 7188 + }, + { + "epoch": 0.83, + "learning_rate": 5.2147957392016855e-08, + "logits/chosen": -3.626079559326172, + "logits/rejected": -3.2735772132873535, + "logps/chosen": -221.61485290527344, + "logps/rejected": -254.3193817138672, + "loss": 0.4348, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03566461801528931, + "rewards/margins": 0.9663242697715759, + "rewards/rejected": -0.9306596517562866, + "step": 7189 + }, + { + "epoch": 0.83, + "learning_rate": 5.21128409223926e-08, + "logits/chosen": -3.012845516204834, + "logits/rejected": -2.9319541454315186, + "logps/chosen": -218.39321899414062, + "logps/rejected": -237.50379943847656, + "loss": 0.2191, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2104346752166748, + "rewards/margins": 2.376249074935913, + "rewards/rejected": -2.1658146381378174, + "step": 7190 + }, + { + "epoch": 0.83, + "learning_rate": 5.207772445276834e-08, + "logits/chosen": -2.4714152812957764, + "logits/rejected": -2.6587796211242676, + "logps/chosen": -303.0846252441406, + "logps/rejected": -265.298095703125, + "loss": 0.619, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0017559044063091278, + "rewards/margins": 1.2833112478256226, + "rewards/rejected": -1.285067081451416, + "step": 7191 + }, + { + "epoch": 0.83, + "learning_rate": 5.2042607983144096e-08, + "logits/chosen": -3.123079776763916, + "logits/rejected": -2.7818398475646973, + "logps/chosen": -325.539306640625, + "logps/rejected": -290.9383544921875, + "loss": 0.4078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14213469624519348, + "rewards/margins": 2.0117945671081543, + "rewards/rejected": -2.1539292335510254, + "step": 7192 + }, + { + "epoch": 0.83, + "learning_rate": 5.200749151351984e-08, + "logits/chosen": -3.0964229106903076, + "logits/rejected": -3.21736478805542, + "logps/chosen": -156.13198852539062, + "logps/rejected": -273.4672546386719, + "loss": 0.1618, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23286586999893188, + "rewards/margins": 3.0485332012176514, + "rewards/rejected": -2.815667152404785, + "step": 7193 + }, + { + "epoch": 0.83, + "learning_rate": 5.197237504389559e-08, + "logits/chosen": -3.611042022705078, + "logits/rejected": -3.4054136276245117, + "logps/chosen": -316.1274108886719, + "logps/rejected": -222.89141845703125, + "loss": 0.3795, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2574393153190613, + "rewards/margins": 2.368043899536133, + "rewards/rejected": -2.1106045246124268, + "step": 7194 + }, + { + "epoch": 0.83, + "learning_rate": 5.193725857427133e-08, + "logits/chosen": -3.6205430030822754, + "logits/rejected": -3.3656296730041504, + "logps/chosen": -277.5799865722656, + "logps/rejected": -245.41415405273438, + "loss": 0.4914, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0701688528060913, + "rewards/margins": 0.8755013942718506, + "rewards/rejected": -1.945670247077942, + "step": 7195 + }, + { + "epoch": 0.83, + "learning_rate": 5.190214210464707e-08, + "logits/chosen": -3.636477470397949, + "logits/rejected": -3.79390549659729, + "logps/chosen": -334.841552734375, + "logps/rejected": -255.5904998779297, + "loss": 0.2382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10335685312747955, + "rewards/margins": 2.2148020267486572, + "rewards/rejected": -2.111445426940918, + "step": 7196 + }, + { + "epoch": 0.83, + "learning_rate": 5.1867025635022826e-08, + "logits/chosen": -3.139993190765381, + "logits/rejected": -3.1623871326446533, + "logps/chosen": -91.03340148925781, + "logps/rejected": -146.93853759765625, + "loss": 0.4306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0757780373096466, + "rewards/margins": 1.5053956508636475, + "rewards/rejected": -1.5811737775802612, + "step": 7197 + }, + { + "epoch": 0.83, + "learning_rate": 5.1831909165398566e-08, + "logits/chosen": -3.236751079559326, + "logits/rejected": -2.9461381435394287, + "logps/chosen": -220.8119659423828, + "logps/rejected": -329.17755126953125, + "loss": 0.2509, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01634964346885681, + "rewards/margins": 2.453500509262085, + "rewards/rejected": -2.4371509552001953, + "step": 7198 + }, + { + "epoch": 0.83, + "learning_rate": 5.1796792695774314e-08, + "logits/chosen": -2.899993658065796, + "logits/rejected": -3.0135269165039062, + "logps/chosen": -279.3945007324219, + "logps/rejected": -219.54110717773438, + "loss": 0.2633, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25677812099456787, + "rewards/margins": 2.150317430496216, + "rewards/rejected": -1.8935391902923584, + "step": 7199 + }, + { + "epoch": 0.83, + "learning_rate": 5.176167622615006e-08, + "logits/chosen": -3.0503158569335938, + "logits/rejected": -2.8781580924987793, + "logps/chosen": -190.38742065429688, + "logps/rejected": -215.14903259277344, + "loss": 0.2156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24159660935401917, + "rewards/margins": 2.0767951011657715, + "rewards/rejected": -2.318391799926758, + "step": 7200 + }, + { + "epoch": 0.83, + "learning_rate": 5.172655975652581e-08, + "logits/chosen": -2.822378158569336, + "logits/rejected": -2.9218201637268066, + "logps/chosen": -293.5276794433594, + "logps/rejected": -226.42941284179688, + "loss": 0.2741, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.048987261950969696, + "rewards/margins": 1.9472087621688843, + "rewards/rejected": -1.898221492767334, + "step": 7201 + }, + { + "epoch": 0.83, + "learning_rate": 5.169144328690155e-08, + "logits/chosen": -3.353611469268799, + "logits/rejected": -3.1339752674102783, + "logps/chosen": -135.82785034179688, + "logps/rejected": -135.29037475585938, + "loss": 0.3598, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1117946207523346, + "rewards/margins": 1.3920656442642212, + "rewards/rejected": -1.280271053314209, + "step": 7202 + }, + { + "epoch": 0.83, + "learning_rate": 5.16563268172773e-08, + "logits/chosen": -2.768587112426758, + "logits/rejected": -3.0902342796325684, + "logps/chosen": -141.28305053710938, + "logps/rejected": -317.7546081542969, + "loss": 0.3637, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.047416456043720245, + "rewards/margins": 1.8709005117416382, + "rewards/rejected": -1.9183168411254883, + "step": 7203 + }, + { + "epoch": 0.83, + "learning_rate": 5.162121034765304e-08, + "logits/chosen": -3.4882473945617676, + "logits/rejected": -3.610447883605957, + "logps/chosen": -129.71070861816406, + "logps/rejected": -211.37588500976562, + "loss": 0.4796, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7566113471984863, + "rewards/margins": 1.654716968536377, + "rewards/rejected": -2.4113283157348633, + "step": 7204 + }, + { + "epoch": 0.83, + "learning_rate": 5.15860938780288e-08, + "logits/chosen": -3.099691867828369, + "logits/rejected": -3.1153671741485596, + "logps/chosen": -327.21533203125, + "logps/rejected": -210.59390258789062, + "loss": 0.35, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14276161789894104, + "rewards/margins": 1.8347703218460083, + "rewards/rejected": -1.6920088529586792, + "step": 7205 + }, + { + "epoch": 0.83, + "learning_rate": 5.155097740840454e-08, + "logits/chosen": -3.4056687355041504, + "logits/rejected": -3.2464683055877686, + "logps/chosen": -217.2108917236328, + "logps/rejected": -291.6184387207031, + "loss": 0.3716, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5433745384216309, + "rewards/margins": 1.016566514968872, + "rewards/rejected": -1.5599409341812134, + "step": 7206 + }, + { + "epoch": 0.83, + "learning_rate": 5.1515860938780285e-08, + "logits/chosen": -2.0250158309936523, + "logits/rejected": -2.2212419509887695, + "logps/chosen": -479.78106689453125, + "logps/rejected": -411.8062744140625, + "loss": 0.6131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1372247189283371, + "rewards/margins": 1.1852023601531982, + "rewards/rejected": -1.3224271535873413, + "step": 7207 + }, + { + "epoch": 0.83, + "learning_rate": 5.1480744469156026e-08, + "logits/chosen": -2.562403440475464, + "logits/rejected": -2.5418102741241455, + "logps/chosen": -349.981201171875, + "logps/rejected": -278.78753662109375, + "loss": 0.2866, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3121623992919922, + "rewards/margins": 2.5945208072662354, + "rewards/rejected": -2.9066832065582275, + "step": 7208 + }, + { + "epoch": 0.83, + "learning_rate": 5.144562799953178e-08, + "logits/chosen": -2.324127197265625, + "logits/rejected": -2.371750593185425, + "logps/chosen": -321.9378662109375, + "logps/rejected": -198.25218200683594, + "loss": 0.1623, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36530056595802307, + "rewards/margins": 2.1439261436462402, + "rewards/rejected": -1.778625726699829, + "step": 7209 + }, + { + "epoch": 0.83, + "learning_rate": 5.141051152990752e-08, + "logits/chosen": -3.348931074142456, + "logits/rejected": -3.3629398345947266, + "logps/chosen": -217.61553955078125, + "logps/rejected": -209.7340087890625, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35177645087242126, + "rewards/margins": 0.9011094570159912, + "rewards/rejected": -1.2528859376907349, + "step": 7210 + }, + { + "epoch": 0.83, + "learning_rate": 5.1375395060283274e-08, + "logits/chosen": -3.1566390991210938, + "logits/rejected": -3.124048948287964, + "logps/chosen": -192.31640625, + "logps/rejected": -183.98548889160156, + "loss": 0.2128, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.726064145565033, + "rewards/margins": 2.169632911682129, + "rewards/rejected": -1.4435688257217407, + "step": 7211 + }, + { + "epoch": 0.83, + "learning_rate": 5.1340278590659015e-08, + "logits/chosen": -2.5206034183502197, + "logits/rejected": -2.8013477325439453, + "logps/chosen": -390.63671875, + "logps/rejected": -302.86993408203125, + "loss": 0.2822, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0093747079372406, + "rewards/margins": 2.381688356399536, + "rewards/rejected": -2.3723137378692627, + "step": 7212 + }, + { + "epoch": 0.83, + "learning_rate": 5.130516212103476e-08, + "logits/chosen": -2.8184759616851807, + "logits/rejected": -2.9844157695770264, + "logps/chosen": -229.88758850097656, + "logps/rejected": -334.8213806152344, + "loss": 0.6267, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20644442737102509, + "rewards/margins": 1.112413763999939, + "rewards/rejected": -1.3188581466674805, + "step": 7213 + }, + { + "epoch": 0.83, + "learning_rate": 5.127004565141051e-08, + "logits/chosen": -3.7543506622314453, + "logits/rejected": -3.337465286254883, + "logps/chosen": -441.5437316894531, + "logps/rejected": -210.23257446289062, + "loss": 0.2485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004851162433624268, + "rewards/margins": 2.1254005432128906, + "rewards/rejected": -2.120549440383911, + "step": 7214 + }, + { + "epoch": 0.83, + "learning_rate": 5.1234929181786256e-08, + "logits/chosen": -2.5159685611724854, + "logits/rejected": -2.5104219913482666, + "logps/chosen": -213.26303100585938, + "logps/rejected": -306.47900390625, + "loss": 0.4134, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3495781421661377, + "rewards/margins": 2.225933074951172, + "rewards/rejected": -2.5755109786987305, + "step": 7215 + }, + { + "epoch": 0.83, + "learning_rate": 5.1199812712162e-08, + "logits/chosen": -2.96331787109375, + "logits/rejected": -2.8253655433654785, + "logps/chosen": -318.27252197265625, + "logps/rejected": -357.6860046386719, + "loss": 0.1702, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12684068083763123, + "rewards/margins": 3.799116611480713, + "rewards/rejected": -3.672276020050049, + "step": 7216 + }, + { + "epoch": 0.83, + "learning_rate": 5.116469624253775e-08, + "logits/chosen": -3.1732683181762695, + "logits/rejected": -3.14555025100708, + "logps/chosen": -294.1593017578125, + "logps/rejected": -205.81202697753906, + "loss": 0.5041, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18109780550003052, + "rewards/margins": 1.5081470012664795, + "rewards/rejected": -1.3270492553710938, + "step": 7217 + }, + { + "epoch": 0.83, + "learning_rate": 5.112957977291349e-08, + "logits/chosen": -2.6620919704437256, + "logits/rejected": -2.485532283782959, + "logps/chosen": -229.7654266357422, + "logps/rejected": -450.20745849609375, + "loss": 0.6689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5382860898971558, + "rewards/margins": 2.340419292449951, + "rewards/rejected": -2.8787052631378174, + "step": 7218 + }, + { + "epoch": 0.83, + "learning_rate": 5.1094463303289245e-08, + "logits/chosen": -2.8578553199768066, + "logits/rejected": -2.8441576957702637, + "logps/chosen": -167.7171173095703, + "logps/rejected": -595.407470703125, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3701966404914856, + "rewards/margins": 1.8665001392364502, + "rewards/rejected": -2.236696720123291, + "step": 7219 + }, + { + "epoch": 0.83, + "learning_rate": 5.1059346833664986e-08, + "logits/chosen": -2.906580924987793, + "logits/rejected": -2.7348132133483887, + "logps/chosen": -194.05923461914062, + "logps/rejected": -294.0055847167969, + "loss": 0.3101, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36083829402923584, + "rewards/margins": 1.8688404560089111, + "rewards/rejected": -2.2296786308288574, + "step": 7220 + }, + { + "epoch": 0.83, + "learning_rate": 5.102423036404073e-08, + "logits/chosen": -3.084887981414795, + "logits/rejected": -3.2139639854431152, + "logps/chosen": -369.2535400390625, + "logps/rejected": -277.60015869140625, + "loss": 0.4728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2462189793586731, + "rewards/margins": 1.4211320877075195, + "rewards/rejected": -1.6673511266708374, + "step": 7221 + }, + { + "epoch": 0.83, + "learning_rate": 5.098911389441648e-08, + "logits/chosen": -3.1356067657470703, + "logits/rejected": -3.196457624435425, + "logps/chosen": -239.6969451904297, + "logps/rejected": -234.3162841796875, + "loss": 0.3056, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5528683662414551, + "rewards/margins": 1.5487558841705322, + "rewards/rejected": -0.9958874583244324, + "step": 7222 + }, + { + "epoch": 0.83, + "learning_rate": 5.095399742479223e-08, + "logits/chosen": -2.8404672145843506, + "logits/rejected": -2.8910112380981445, + "logps/chosen": -278.107666015625, + "logps/rejected": -287.7952880859375, + "loss": 0.2924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3784114122390747, + "rewards/margins": 2.1610982418060303, + "rewards/rejected": -2.5395095348358154, + "step": 7223 + }, + { + "epoch": 0.83, + "learning_rate": 5.091888095516797e-08, + "logits/chosen": -2.432145833969116, + "logits/rejected": -2.7687366008758545, + "logps/chosen": -316.23309326171875, + "logps/rejected": -257.31146240234375, + "loss": 0.652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4711979627609253, + "rewards/margins": 0.6853963136672974, + "rewards/rejected": -1.1565942764282227, + "step": 7224 + }, + { + "epoch": 0.83, + "learning_rate": 5.088376448554372e-08, + "logits/chosen": -3.3834495544433594, + "logits/rejected": -3.5673608779907227, + "logps/chosen": -198.2320556640625, + "logps/rejected": -154.30911254882812, + "loss": 0.3054, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3147566616535187, + "rewards/margins": 1.768181324005127, + "rewards/rejected": -2.0829379558563232, + "step": 7225 + }, + { + "epoch": 0.83, + "learning_rate": 5.084864801591946e-08, + "logits/chosen": -2.768056631088257, + "logits/rejected": -2.564816474914551, + "logps/chosen": -252.56637573242188, + "logps/rejected": -197.78660583496094, + "loss": 0.3662, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06370498239994049, + "rewards/margins": 1.0651905536651611, + "rewards/rejected": -1.0014855861663818, + "step": 7226 + }, + { + "epoch": 0.83, + "learning_rate": 5.081353154629521e-08, + "logits/chosen": -2.748734474182129, + "logits/rejected": -2.8341104984283447, + "logps/chosen": -226.97833251953125, + "logps/rejected": -319.068359375, + "loss": 0.3362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33532992005348206, + "rewards/margins": 2.2578370571136475, + "rewards/rejected": -2.5931670665740967, + "step": 7227 + }, + { + "epoch": 0.83, + "learning_rate": 5.077841507667096e-08, + "logits/chosen": -3.666344165802002, + "logits/rejected": -3.667032241821289, + "logps/chosen": -193.7274169921875, + "logps/rejected": -278.1099548339844, + "loss": 0.6655, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2482751607894897, + "rewards/margins": 0.45893168449401855, + "rewards/rejected": -1.7072067260742188, + "step": 7228 + }, + { + "epoch": 0.83, + "learning_rate": 5.0743298607046704e-08, + "logits/chosen": -3.3546183109283447, + "logits/rejected": -3.21582293510437, + "logps/chosen": -212.4844207763672, + "logps/rejected": -282.94671630859375, + "loss": 0.3959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5599007606506348, + "rewards/margins": 1.969846487045288, + "rewards/rejected": -2.529747247695923, + "step": 7229 + }, + { + "epoch": 0.83, + "learning_rate": 5.0708182137422445e-08, + "logits/chosen": -2.17800235748291, + "logits/rejected": -2.0206480026245117, + "logps/chosen": -295.41522216796875, + "logps/rejected": -363.5266418457031, + "loss": 0.2518, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.036778099834918976, + "rewards/margins": 2.0522620677948, + "rewards/rejected": -2.015484094619751, + "step": 7230 + }, + { + "epoch": 0.83, + "learning_rate": 5.06730656677982e-08, + "logits/chosen": -2.4390439987182617, + "logits/rejected": -2.4294915199279785, + "logps/chosen": -263.84326171875, + "logps/rejected": -312.64697265625, + "loss": 0.1233, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5868321061134338, + "rewards/margins": 3.057685136795044, + "rewards/rejected": -2.470852851867676, + "step": 7231 + }, + { + "epoch": 0.83, + "learning_rate": 5.063794919817394e-08, + "logits/chosen": -3.682229995727539, + "logits/rejected": -3.980869770050049, + "logps/chosen": -135.2556915283203, + "logps/rejected": -179.06053161621094, + "loss": 0.5635, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08606965839862823, + "rewards/margins": 0.59857177734375, + "rewards/rejected": -0.6846414804458618, + "step": 7232 + }, + { + "epoch": 0.83, + "learning_rate": 5.0602832728549693e-08, + "logits/chosen": -3.0537166595458984, + "logits/rejected": -2.945998191833496, + "logps/chosen": -248.93743896484375, + "logps/rejected": -200.779296875, + "loss": 0.3459, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.062008775770664215, + "rewards/margins": 2.0428574085235596, + "rewards/rejected": -1.9808486700057983, + "step": 7233 + }, + { + "epoch": 0.83, + "learning_rate": 5.0567716258925434e-08, + "logits/chosen": -3.6369733810424805, + "logits/rejected": -3.5469794273376465, + "logps/chosen": -301.2701721191406, + "logps/rejected": -249.73939514160156, + "loss": 0.2107, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32317203283309937, + "rewards/margins": 3.754246711730957, + "rewards/rejected": -3.431074619293213, + "step": 7234 + }, + { + "epoch": 0.83, + "learning_rate": 5.053259978930118e-08, + "logits/chosen": -2.945077657699585, + "logits/rejected": -2.824096441268921, + "logps/chosen": -293.1856689453125, + "logps/rejected": -293.32952880859375, + "loss": 0.1586, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22017619013786316, + "rewards/margins": 3.155290365219116, + "rewards/rejected": -2.9351143836975098, + "step": 7235 + }, + { + "epoch": 0.83, + "learning_rate": 5.049748331967693e-08, + "logits/chosen": -3.467777967453003, + "logits/rejected": -2.99243426322937, + "logps/chosen": -391.2261962890625, + "logps/rejected": -267.9790344238281, + "loss": 0.3607, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.334145188331604, + "rewards/margins": 2.3822762966156006, + "rewards/rejected": -2.048130989074707, + "step": 7236 + }, + { + "epoch": 0.83, + "learning_rate": 5.0462366850052676e-08, + "logits/chosen": -2.9586586952209473, + "logits/rejected": -2.5912508964538574, + "logps/chosen": -357.03900146484375, + "logps/rejected": -286.2179260253906, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5359609723091125, + "rewards/margins": 2.467419385910034, + "rewards/rejected": -1.9314583539962769, + "step": 7237 + }, + { + "epoch": 0.83, + "learning_rate": 5.0427250380428416e-08, + "logits/chosen": -3.7321577072143555, + "logits/rejected": -3.539353132247925, + "logps/chosen": -205.33250427246094, + "logps/rejected": -131.23487854003906, + "loss": 0.4874, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20737551152706146, + "rewards/margins": 1.2731231451034546, + "rewards/rejected": -1.4804986715316772, + "step": 7238 + }, + { + "epoch": 0.83, + "learning_rate": 5.039213391080417e-08, + "logits/chosen": -3.347930431365967, + "logits/rejected": -3.473954916000366, + "logps/chosen": -258.64031982421875, + "logps/rejected": -271.65936279296875, + "loss": 0.3053, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21302083134651184, + "rewards/margins": 2.312222719192505, + "rewards/rejected": -2.0992019176483154, + "step": 7239 + }, + { + "epoch": 0.83, + "learning_rate": 5.035701744117991e-08, + "logits/chosen": -3.5192980766296387, + "logits/rejected": -3.468867778778076, + "logps/chosen": -249.80577087402344, + "logps/rejected": -132.6614227294922, + "loss": 0.3805, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42423000931739807, + "rewards/margins": 1.6800646781921387, + "rewards/rejected": -1.255834698677063, + "step": 7240 + }, + { + "epoch": 0.83, + "learning_rate": 5.0321900971555665e-08, + "logits/chosen": -3.075322389602661, + "logits/rejected": -3.3324501514434814, + "logps/chosen": -236.03492736816406, + "logps/rejected": -216.5542449951172, + "loss": 0.3611, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21005485951900482, + "rewards/margins": 2.3397347927093506, + "rewards/rejected": -2.1296796798706055, + "step": 7241 + }, + { + "epoch": 0.83, + "learning_rate": 5.0286784501931405e-08, + "logits/chosen": -2.9063029289245605, + "logits/rejected": -3.1994709968566895, + "logps/chosen": -96.35302734375, + "logps/rejected": -187.04896545410156, + "loss": 0.3163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15326303243637085, + "rewards/margins": 1.758339285850525, + "rewards/rejected": -1.6050763130187988, + "step": 7242 + }, + { + "epoch": 0.83, + "learning_rate": 5.0251668032307146e-08, + "logits/chosen": -2.896296977996826, + "logits/rejected": -3.020688056945801, + "logps/chosen": -148.65745544433594, + "logps/rejected": -245.28164672851562, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014453485608100891, + "rewards/margins": 3.4737002849578857, + "rewards/rejected": -3.4592466354370117, + "step": 7243 + }, + { + "epoch": 0.84, + "learning_rate": 5.021655156268289e-08, + "logits/chosen": -3.448168992996216, + "logits/rejected": -3.0071046352386475, + "logps/chosen": -219.86044311523438, + "logps/rejected": -175.07923889160156, + "loss": 0.3534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2569941580295563, + "rewards/margins": 2.499329090118408, + "rewards/rejected": -2.7563230991363525, + "step": 7244 + }, + { + "epoch": 0.84, + "learning_rate": 5.018143509305864e-08, + "logits/chosen": -3.0672767162323, + "logits/rejected": -2.4708375930786133, + "logps/chosen": -203.8226776123047, + "logps/rejected": -227.91311645507812, + "loss": 0.2793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09652988612651825, + "rewards/margins": 1.445613145828247, + "rewards/rejected": -1.5421431064605713, + "step": 7245 + }, + { + "epoch": 0.84, + "learning_rate": 5.014631862343439e-08, + "logits/chosen": -3.2285068035125732, + "logits/rejected": -3.23307466506958, + "logps/chosen": -132.4745330810547, + "logps/rejected": -157.16644287109375, + "loss": 0.4672, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09372683614492416, + "rewards/margins": 1.4251713752746582, + "rewards/rejected": -1.331444501876831, + "step": 7246 + }, + { + "epoch": 0.84, + "learning_rate": 5.011120215381013e-08, + "logits/chosen": -2.9865612983703613, + "logits/rejected": -2.824361801147461, + "logps/chosen": -438.48919677734375, + "logps/rejected": -387.7099609375, + "loss": 0.1639, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14505872130393982, + "rewards/margins": 2.6284122467041016, + "rewards/rejected": -2.48335337638855, + "step": 7247 + }, + { + "epoch": 0.84, + "learning_rate": 5.007608568418588e-08, + "logits/chosen": -3.425402879714966, + "logits/rejected": -3.5531206130981445, + "logps/chosen": -213.7608642578125, + "logps/rejected": -115.87738037109375, + "loss": 0.2962, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21418142318725586, + "rewards/margins": 1.8235903978347778, + "rewards/rejected": -1.609408974647522, + "step": 7248 + }, + { + "epoch": 0.84, + "learning_rate": 5.004096921456162e-08, + "logits/chosen": -2.8044662475585938, + "logits/rejected": -2.8064823150634766, + "logps/chosen": -364.1567687988281, + "logps/rejected": -322.76470947265625, + "loss": 0.7284, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3767805099487305, + "rewards/margins": 0.5106115341186523, + "rewards/rejected": -1.8873920440673828, + "step": 7249 + }, + { + "epoch": 0.84, + "learning_rate": 5.0005852744937377e-08, + "logits/chosen": -2.4771475791931152, + "logits/rejected": -2.402222156524658, + "logps/chosen": -218.1021728515625, + "logps/rejected": -255.9054412841797, + "loss": 0.428, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2969256639480591, + "rewards/margins": 0.9961888790130615, + "rewards/rejected": -1.2931146621704102, + "step": 7250 + }, + { + "epoch": 0.84, + "learning_rate": 4.997073627531312e-08, + "logits/chosen": -3.127955675125122, + "logits/rejected": -3.6149754524230957, + "logps/chosen": -238.84793090820312, + "logps/rejected": -196.8673858642578, + "loss": 0.3447, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49891865253448486, + "rewards/margins": 1.5283573865890503, + "rewards/rejected": -2.027275800704956, + "step": 7251 + }, + { + "epoch": 0.84, + "learning_rate": 4.9935619805688864e-08, + "logits/chosen": -3.916311264038086, + "logits/rejected": -4.003241062164307, + "logps/chosen": -270.83734130859375, + "logps/rejected": -248.2839813232422, + "loss": 0.3046, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5412649512290955, + "rewards/margins": 1.9029667377471924, + "rewards/rejected": -2.4442317485809326, + "step": 7252 + }, + { + "epoch": 0.84, + "learning_rate": 4.990050333606461e-08, + "logits/chosen": -2.9026429653167725, + "logits/rejected": -3.0897152423858643, + "logps/chosen": -339.25823974609375, + "logps/rejected": -422.83331298828125, + "loss": 0.3412, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0689205527305603, + "rewards/margins": 2.0260913372039795, + "rewards/rejected": -1.9571707248687744, + "step": 7253 + }, + { + "epoch": 0.84, + "learning_rate": 4.986538686644036e-08, + "logits/chosen": -3.273560047149658, + "logits/rejected": -3.186000347137451, + "logps/chosen": -232.13180541992188, + "logps/rejected": -264.48175048828125, + "loss": 0.5452, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.791731059551239, + "rewards/margins": 2.1436400413513184, + "rewards/rejected": -2.935370922088623, + "step": 7254 + }, + { + "epoch": 0.84, + "learning_rate": 4.98302703968161e-08, + "logits/chosen": -2.9774131774902344, + "logits/rejected": -3.418776273727417, + "logps/chosen": -205.2285614013672, + "logps/rejected": -240.7371826171875, + "loss": 0.1197, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7197244167327881, + "rewards/margins": 3.134186267852783, + "rewards/rejected": -2.414462089538574, + "step": 7255 + }, + { + "epoch": 0.84, + "learning_rate": 4.9795153927191853e-08, + "logits/chosen": -2.7514281272888184, + "logits/rejected": -2.8240041732788086, + "logps/chosen": -359.6626281738281, + "logps/rejected": -293.65380859375, + "loss": 0.1972, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07779745012521744, + "rewards/margins": 3.210418701171875, + "rewards/rejected": -3.2882163524627686, + "step": 7256 + }, + { + "epoch": 0.84, + "learning_rate": 4.9760037457567594e-08, + "logits/chosen": -2.7410988807678223, + "logits/rejected": -2.9195165634155273, + "logps/chosen": -254.07411193847656, + "logps/rejected": -241.51846313476562, + "loss": 0.396, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21580596268177032, + "rewards/margins": 1.2213687896728516, + "rewards/rejected": -1.437174677848816, + "step": 7257 + }, + { + "epoch": 0.84, + "learning_rate": 4.972492098794335e-08, + "logits/chosen": -2.988142728805542, + "logits/rejected": -2.8688008785247803, + "logps/chosen": -307.84442138671875, + "logps/rejected": -313.1390380859375, + "loss": 0.1684, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32012486457824707, + "rewards/margins": 2.4511961936950684, + "rewards/rejected": -2.1310713291168213, + "step": 7258 + }, + { + "epoch": 0.84, + "learning_rate": 4.968980451831909e-08, + "logits/chosen": -3.0733094215393066, + "logits/rejected": -3.0168542861938477, + "logps/chosen": -218.33883666992188, + "logps/rejected": -149.92886352539062, + "loss": 0.5113, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2758692800998688, + "rewards/margins": 0.5334983468055725, + "rewards/rejected": -0.8093676567077637, + "step": 7259 + }, + { + "epoch": 0.84, + "learning_rate": 4.9654688048694836e-08, + "logits/chosen": -2.2442948818206787, + "logits/rejected": -2.142073631286621, + "logps/chosen": -206.916259765625, + "logps/rejected": -249.69842529296875, + "loss": 0.3911, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.015127800405025482, + "rewards/margins": 1.2381527423858643, + "rewards/rejected": -1.2532804012298584, + "step": 7260 + }, + { + "epoch": 0.84, + "learning_rate": 4.9619571579070576e-08, + "logits/chosen": -2.050347328186035, + "logits/rejected": -1.8694854974746704, + "logps/chosen": -292.9566955566406, + "logps/rejected": -397.5107421875, + "loss": 0.4842, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06792230159044266, + "rewards/margins": 1.1243079900741577, + "rewards/rejected": -1.0563856363296509, + "step": 7261 + }, + { + "epoch": 0.84, + "learning_rate": 4.958445510944633e-08, + "logits/chosen": -3.5581984519958496, + "logits/rejected": -3.472510814666748, + "logps/chosen": -292.0977783203125, + "logps/rejected": -161.60052490234375, + "loss": 0.4383, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8429885506629944, + "rewards/margins": 1.1656779050827026, + "rewards/rejected": -2.008666515350342, + "step": 7262 + }, + { + "epoch": 0.84, + "learning_rate": 4.954933863982207e-08, + "logits/chosen": -2.56957745552063, + "logits/rejected": -2.385180711746216, + "logps/chosen": -490.3033447265625, + "logps/rejected": -488.3107604980469, + "loss": 0.3647, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5026763081550598, + "rewards/margins": 1.6816668510437012, + "rewards/rejected": -1.1789904832839966, + "step": 7263 + }, + { + "epoch": 0.84, + "learning_rate": 4.9514222170197825e-08, + "logits/chosen": -2.736356019973755, + "logits/rejected": -2.840841293334961, + "logps/chosen": -332.1766662597656, + "logps/rejected": -302.17535400390625, + "loss": 0.407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38550567626953125, + "rewards/margins": 1.1064305305480957, + "rewards/rejected": -1.491936206817627, + "step": 7264 + }, + { + "epoch": 0.84, + "learning_rate": 4.9479105700573565e-08, + "logits/chosen": -3.0711867809295654, + "logits/rejected": -2.6153929233551025, + "logps/chosen": -309.967529296875, + "logps/rejected": -262.7599792480469, + "loss": 0.3124, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18580739200115204, + "rewards/margins": 2.1398074626922607, + "rewards/rejected": -1.9540001153945923, + "step": 7265 + }, + { + "epoch": 0.84, + "learning_rate": 4.944398923094931e-08, + "logits/chosen": -2.8674166202545166, + "logits/rejected": -2.814300298690796, + "logps/chosen": -228.36959838867188, + "logps/rejected": -248.02850341796875, + "loss": 0.3661, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19352349638938904, + "rewards/margins": 1.734015703201294, + "rewards/rejected": -1.927539348602295, + "step": 7266 + }, + { + "epoch": 0.84, + "learning_rate": 4.940887276132506e-08, + "logits/chosen": -2.610485792160034, + "logits/rejected": -2.459897756576538, + "logps/chosen": -257.58111572265625, + "logps/rejected": -268.7911071777344, + "loss": 0.5117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0519716739654541, + "rewards/margins": 1.9205138683319092, + "rewards/rejected": -1.9724856615066528, + "step": 7267 + }, + { + "epoch": 0.84, + "learning_rate": 4.937375629170081e-08, + "logits/chosen": -3.2745320796966553, + "logits/rejected": -3.3858420848846436, + "logps/chosen": -138.09228515625, + "logps/rejected": -228.33609008789062, + "loss": 0.7236, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8138493299484253, + "rewards/margins": 0.8385409116744995, + "rewards/rejected": -1.6523903608322144, + "step": 7268 + }, + { + "epoch": 0.84, + "learning_rate": 4.933863982207655e-08, + "logits/chosen": -2.5273661613464355, + "logits/rejected": -2.674689531326294, + "logps/chosen": -345.9493103027344, + "logps/rejected": -266.034423828125, + "loss": 0.7288, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.020972460508346558, + "rewards/margins": 1.1111276149749756, + "rewards/rejected": -1.0901551246643066, + "step": 7269 + }, + { + "epoch": 0.84, + "learning_rate": 4.93035233524523e-08, + "logits/chosen": -3.4082188606262207, + "logits/rejected": -3.466184377670288, + "logps/chosen": -235.87144470214844, + "logps/rejected": -366.4646301269531, + "loss": 0.6429, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4190317392349243, + "rewards/margins": 0.8734185695648193, + "rewards/rejected": -1.2924504280090332, + "step": 7270 + }, + { + "epoch": 0.84, + "learning_rate": 4.926840688282804e-08, + "logits/chosen": -2.5614054203033447, + "logits/rejected": -2.43047833442688, + "logps/chosen": -152.64076232910156, + "logps/rejected": -233.89634704589844, + "loss": 0.339, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.011704586446285248, + "rewards/margins": 1.699904441833496, + "rewards/rejected": -1.688199758529663, + "step": 7271 + }, + { + "epoch": 0.84, + "learning_rate": 4.9233290413203796e-08, + "logits/chosen": -3.010301113128662, + "logits/rejected": -3.005540609359741, + "logps/chosen": -157.85079956054688, + "logps/rejected": -173.25262451171875, + "loss": 0.4327, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1290259063243866, + "rewards/margins": 1.1687648296356201, + "rewards/rejected": -1.0397388935089111, + "step": 7272 + }, + { + "epoch": 0.84, + "learning_rate": 4.9198173943579536e-08, + "logits/chosen": -3.093505859375, + "logits/rejected": -3.0845139026641846, + "logps/chosen": -255.22900390625, + "logps/rejected": -300.4136657714844, + "loss": 0.4937, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18153756856918335, + "rewards/margins": 1.9888190031051636, + "rewards/rejected": -1.807281494140625, + "step": 7273 + }, + { + "epoch": 0.84, + "learning_rate": 4.9163057473955284e-08, + "logits/chosen": -2.9446828365325928, + "logits/rejected": -2.702209949493408, + "logps/chosen": -218.6045379638672, + "logps/rejected": -396.9185791015625, + "loss": 1.0241, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9380279779434204, + "rewards/margins": -0.0010988116264343262, + "rewards/rejected": -0.9369291067123413, + "step": 7274 + }, + { + "epoch": 0.84, + "learning_rate": 4.912794100433103e-08, + "logits/chosen": -3.0684814453125, + "logits/rejected": -3.1150918006896973, + "logps/chosen": -281.46478271484375, + "logps/rejected": -408.67950439453125, + "loss": 0.1734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04981401562690735, + "rewards/margins": 2.914843797683716, + "rewards/rejected": -2.964657783508301, + "step": 7275 + }, + { + "epoch": 0.84, + "learning_rate": 4.909282453470678e-08, + "logits/chosen": -3.40449857711792, + "logits/rejected": -3.3501219749450684, + "logps/chosen": -210.17259216308594, + "logps/rejected": -248.86895751953125, + "loss": 0.1804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07876773178577423, + "rewards/margins": 2.6153295040130615, + "rewards/rejected": -2.6940970420837402, + "step": 7276 + }, + { + "epoch": 0.84, + "learning_rate": 4.905770806508252e-08, + "logits/chosen": -2.9393742084503174, + "logits/rejected": -3.0858113765716553, + "logps/chosen": -245.57765197753906, + "logps/rejected": -315.68597412109375, + "loss": 0.1877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17368200421333313, + "rewards/margins": 2.679633617401123, + "rewards/rejected": -2.853315830230713, + "step": 7277 + }, + { + "epoch": 0.84, + "learning_rate": 4.902259159545827e-08, + "logits/chosen": -3.411923885345459, + "logits/rejected": -3.452000617980957, + "logps/chosen": -166.6035614013672, + "logps/rejected": -238.7212677001953, + "loss": 0.1763, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08537646383047104, + "rewards/margins": 3.3329763412475586, + "rewards/rejected": -3.2475998401641846, + "step": 7278 + }, + { + "epoch": 0.84, + "learning_rate": 4.898747512583401e-08, + "logits/chosen": -2.8583695888519287, + "logits/rejected": -3.092001438140869, + "logps/chosen": -168.74026489257812, + "logps/rejected": -158.2676544189453, + "loss": 0.2716, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0483776330947876, + "rewards/margins": 2.0308406352996826, + "rewards/rejected": -1.9824631214141846, + "step": 7279 + }, + { + "epoch": 0.84, + "learning_rate": 4.895235865620976e-08, + "logits/chosen": -3.1074063777923584, + "logits/rejected": -3.254213333129883, + "logps/chosen": -276.8778076171875, + "logps/rejected": -323.763671875, + "loss": 0.2042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27058011293411255, + "rewards/margins": 2.574982166290283, + "rewards/rejected": -2.3044021129608154, + "step": 7280 + }, + { + "epoch": 0.84, + "learning_rate": 4.891724218658551e-08, + "logits/chosen": -3.087440252304077, + "logits/rejected": -3.165618419647217, + "logps/chosen": -226.5810546875, + "logps/rejected": -188.57327270507812, + "loss": 0.8235, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1311836242675781, + "rewards/margins": 0.38190758228302, + "rewards/rejected": -1.5130912065505981, + "step": 7281 + }, + { + "epoch": 0.84, + "learning_rate": 4.8882125716961255e-08, + "logits/chosen": -2.828007936477661, + "logits/rejected": -2.893136501312256, + "logps/chosen": -198.54664611816406, + "logps/rejected": -234.611572265625, + "loss": 0.381, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3042939305305481, + "rewards/margins": 1.5410435199737549, + "rewards/rejected": -1.2367496490478516, + "step": 7282 + }, + { + "epoch": 0.84, + "learning_rate": 4.8847009247336996e-08, + "logits/chosen": -3.4766130447387695, + "logits/rejected": -3.467379093170166, + "logps/chosen": -204.6961669921875, + "logps/rejected": -179.06390380859375, + "loss": 0.4449, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18237724900245667, + "rewards/margins": 3.2531626224517822, + "rewards/rejected": -3.435539960861206, + "step": 7283 + }, + { + "epoch": 0.84, + "learning_rate": 4.881189277771275e-08, + "logits/chosen": -2.37133526802063, + "logits/rejected": -2.424757480621338, + "logps/chosen": -399.0733642578125, + "logps/rejected": -490.33734130859375, + "loss": 0.4331, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2209317982196808, + "rewards/margins": 1.3591325283050537, + "rewards/rejected": -1.1382006406784058, + "step": 7284 + }, + { + "epoch": 0.84, + "learning_rate": 4.877677630808849e-08, + "logits/chosen": -3.3877601623535156, + "logits/rejected": -3.4637575149536133, + "logps/chosen": -232.30972290039062, + "logps/rejected": -279.6500244140625, + "loss": 0.7722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35711830854415894, + "rewards/margins": 0.843306303024292, + "rewards/rejected": -1.2004246711730957, + "step": 7285 + }, + { + "epoch": 0.84, + "learning_rate": 4.8741659838464244e-08, + "logits/chosen": -3.00357723236084, + "logits/rejected": -3.234877586364746, + "logps/chosen": -200.8505859375, + "logps/rejected": -272.5226745605469, + "loss": 0.3513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1436670422554016, + "rewards/margins": 1.5940016508102417, + "rewards/rejected": -1.7376686334609985, + "step": 7286 + }, + { + "epoch": 0.84, + "learning_rate": 4.8706543368839985e-08, + "logits/chosen": -3.370555877685547, + "logits/rejected": -3.378140926361084, + "logps/chosen": -176.77455139160156, + "logps/rejected": -184.7077178955078, + "loss": 0.425, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16141854226589203, + "rewards/margins": 1.2648391723632812, + "rewards/rejected": -1.426257848739624, + "step": 7287 + }, + { + "epoch": 0.84, + "learning_rate": 4.867142689921573e-08, + "logits/chosen": -3.794935941696167, + "logits/rejected": -3.626936912536621, + "logps/chosen": -440.64837646484375, + "logps/rejected": -316.0798034667969, + "loss": 0.491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13842487335205078, + "rewards/margins": 1.6401281356811523, + "rewards/rejected": -1.7785531282424927, + "step": 7288 + }, + { + "epoch": 0.84, + "learning_rate": 4.863631042959148e-08, + "logits/chosen": -3.341630697250366, + "logits/rejected": -3.200620651245117, + "logps/chosen": -169.20860290527344, + "logps/rejected": -151.58544921875, + "loss": 0.3727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3285248279571533, + "rewards/margins": 2.000237464904785, + "rewards/rejected": -2.3287620544433594, + "step": 7289 + }, + { + "epoch": 0.84, + "learning_rate": 4.860119395996722e-08, + "logits/chosen": -2.6627156734466553, + "logits/rejected": -2.53596568107605, + "logps/chosen": -468.583251953125, + "logps/rejected": -315.4981689453125, + "loss": 0.1697, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08667278289794922, + "rewards/margins": 2.6280128955841064, + "rewards/rejected": -2.5413401126861572, + "step": 7290 + }, + { + "epoch": 0.84, + "learning_rate": 4.856607749034297e-08, + "logits/chosen": -2.7861886024475098, + "logits/rejected": -2.581777572631836, + "logps/chosen": -432.36114501953125, + "logps/rejected": -369.5299987792969, + "loss": 0.3895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.167967289686203, + "rewards/margins": 1.6322658061981201, + "rewards/rejected": -1.800233006477356, + "step": 7291 + }, + { + "epoch": 0.84, + "learning_rate": 4.8530961020718714e-08, + "logits/chosen": -2.9562318325042725, + "logits/rejected": -2.994007110595703, + "logps/chosen": -248.79071044921875, + "logps/rejected": -306.63031005859375, + "loss": 0.1022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9452049732208252, + "rewards/margins": 3.174410581588745, + "rewards/rejected": -2.22920560836792, + "step": 7292 + }, + { + "epoch": 0.84, + "learning_rate": 4.849584455109446e-08, + "logits/chosen": -2.2344038486480713, + "logits/rejected": -2.411007881164551, + "logps/chosen": -436.69000244140625, + "logps/rejected": -193.89605712890625, + "loss": 0.2749, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.032880473881959915, + "rewards/margins": 2.6431002616882324, + "rewards/rejected": -2.675980806350708, + "step": 7293 + }, + { + "epoch": 0.84, + "learning_rate": 4.84607280814702e-08, + "logits/chosen": -2.9807276725769043, + "logits/rejected": -3.4131040573120117, + "logps/chosen": -138.2904510498047, + "logps/rejected": -371.0430908203125, + "loss": 0.4197, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21740078926086426, + "rewards/margins": 2.3061039447784424, + "rewards/rejected": -2.5235044956207275, + "step": 7294 + }, + { + "epoch": 0.84, + "learning_rate": 4.8425611611845956e-08, + "logits/chosen": -3.3203837871551514, + "logits/rejected": -3.5049076080322266, + "logps/chosen": -315.8813781738281, + "logps/rejected": -333.078369140625, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7096199989318848, + "rewards/margins": 3.622614860534668, + "rewards/rejected": -2.912994861602783, + "step": 7295 + }, + { + "epoch": 0.84, + "learning_rate": 4.8390495142221696e-08, + "logits/chosen": -3.440485715866089, + "logits/rejected": -3.521178722381592, + "logps/chosen": -335.42938232421875, + "logps/rejected": -489.822021484375, + "loss": 0.1528, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08731476217508316, + "rewards/margins": 4.07585334777832, + "rewards/rejected": -3.9885382652282715, + "step": 7296 + }, + { + "epoch": 0.84, + "learning_rate": 4.8355378672597444e-08, + "logits/chosen": -3.420790195465088, + "logits/rejected": -3.415738105773926, + "logps/chosen": -440.493408203125, + "logps/rejected": -245.51412963867188, + "loss": 0.3192, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08056928217411041, + "rewards/margins": 2.0284018516540527, + "rewards/rejected": -1.947832465171814, + "step": 7297 + }, + { + "epoch": 0.84, + "learning_rate": 4.832026220297319e-08, + "logits/chosen": -3.185253381729126, + "logits/rejected": -2.7340140342712402, + "logps/chosen": -226.18995666503906, + "logps/rejected": -158.39584350585938, + "loss": 0.3169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.198471799492836, + "rewards/margins": 1.2369322776794434, + "rewards/rejected": -1.4354041814804077, + "step": 7298 + }, + { + "epoch": 0.84, + "learning_rate": 4.828514573334894e-08, + "logits/chosen": -3.8645191192626953, + "logits/rejected": -3.8011977672576904, + "logps/chosen": -237.42343139648438, + "logps/rejected": -187.81903076171875, + "loss": 0.7474, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7465457916259766, + "rewards/margins": 0.8199043273925781, + "rewards/rejected": -1.5664501190185547, + "step": 7299 + }, + { + "epoch": 0.84, + "learning_rate": 4.825002926372468e-08, + "logits/chosen": -3.102614402770996, + "logits/rejected": -2.942016363143921, + "logps/chosen": -221.24737548828125, + "logps/rejected": -381.0291442871094, + "loss": 0.3752, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28374022245407104, + "rewards/margins": 3.445960521697998, + "rewards/rejected": -3.7297005653381348, + "step": 7300 + }, + { + "epoch": 0.84, + "learning_rate": 4.821491279410043e-08, + "logits/chosen": -3.061697006225586, + "logits/rejected": -3.1062002182006836, + "logps/chosen": -297.1098937988281, + "logps/rejected": -318.64300537109375, + "loss": 0.3203, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6267428994178772, + "rewards/margins": 2.014457941055298, + "rewards/rejected": -2.641200542449951, + "step": 7301 + }, + { + "epoch": 0.84, + "learning_rate": 4.817979632447617e-08, + "logits/chosen": -2.8429322242736816, + "logits/rejected": -2.9778459072113037, + "logps/chosen": -142.3765106201172, + "logps/rejected": -174.58303833007812, + "loss": 0.6632, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6912792325019836, + "rewards/margins": 0.4836684465408325, + "rewards/rejected": -1.174947738647461, + "step": 7302 + }, + { + "epoch": 0.84, + "learning_rate": 4.814467985485193e-08, + "logits/chosen": -2.9730589389801025, + "logits/rejected": -3.104862689971924, + "logps/chosen": -320.78143310546875, + "logps/rejected": -291.5182800292969, + "loss": 0.2316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03268962353467941, + "rewards/margins": 2.9554500579833984, + "rewards/rejected": -2.922760486602783, + "step": 7303 + }, + { + "epoch": 0.84, + "learning_rate": 4.810956338522767e-08, + "logits/chosen": -3.9333295822143555, + "logits/rejected": -3.93192195892334, + "logps/chosen": -461.83172607421875, + "logps/rejected": -414.620361328125, + "loss": 0.5247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48872315883636475, + "rewards/margins": 1.8870216608047485, + "rewards/rejected": -2.3757448196411133, + "step": 7304 + }, + { + "epoch": 0.84, + "learning_rate": 4.8074446915603415e-08, + "logits/chosen": -3.002988815307617, + "logits/rejected": -3.426597833633423, + "logps/chosen": -287.8871765136719, + "logps/rejected": -379.8193054199219, + "loss": 0.1929, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.281061053276062, + "rewards/margins": 4.356834411621094, + "rewards/rejected": -4.0757737159729, + "step": 7305 + }, + { + "epoch": 0.84, + "learning_rate": 4.803933044597916e-08, + "logits/chosen": -3.575083017349243, + "logits/rejected": -3.5665464401245117, + "logps/chosen": -159.91993713378906, + "logps/rejected": -201.2017059326172, + "loss": 0.2467, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5654672384262085, + "rewards/margins": 2.040109634399414, + "rewards/rejected": -1.4746425151824951, + "step": 7306 + }, + { + "epoch": 0.84, + "learning_rate": 4.800421397635491e-08, + "logits/chosen": -2.8793303966522217, + "logits/rejected": -3.108724594116211, + "logps/chosen": -243.35684204101562, + "logps/rejected": -205.84352111816406, + "loss": 0.2111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1350824385881424, + "rewards/margins": 2.2993600368499756, + "rewards/rejected": -2.4344425201416016, + "step": 7307 + }, + { + "epoch": 0.84, + "learning_rate": 4.796909750673065e-08, + "logits/chosen": -3.401247262954712, + "logits/rejected": -3.3550868034362793, + "logps/chosen": -128.42762756347656, + "logps/rejected": -343.7008056640625, + "loss": 0.3172, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06420469284057617, + "rewards/margins": 3.055263042449951, + "rewards/rejected": -2.991058349609375, + "step": 7308 + }, + { + "epoch": 0.84, + "learning_rate": 4.7933981037106404e-08, + "logits/chosen": -3.4297101497650146, + "logits/rejected": -3.5788333415985107, + "logps/chosen": -197.015869140625, + "logps/rejected": -186.0310821533203, + "loss": 0.4897, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25712335109710693, + "rewards/margins": 1.947446346282959, + "rewards/rejected": -2.2045698165893555, + "step": 7309 + }, + { + "epoch": 0.84, + "learning_rate": 4.7898864567482145e-08, + "logits/chosen": -3.027949571609497, + "logits/rejected": -3.2580952644348145, + "logps/chosen": -182.4051055908203, + "logps/rejected": -169.75430297851562, + "loss": 0.3924, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4217607080936432, + "rewards/margins": 1.1751235723495483, + "rewards/rejected": -0.753362774848938, + "step": 7310 + }, + { + "epoch": 0.84, + "learning_rate": 4.786374809785789e-08, + "logits/chosen": -4.005672931671143, + "logits/rejected": -3.927506923675537, + "logps/chosen": -179.89671325683594, + "logps/rejected": -259.1134338378906, + "loss": 0.7326, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4258078634738922, + "rewards/margins": 0.3502572178840637, + "rewards/rejected": -0.7760651111602783, + "step": 7311 + }, + { + "epoch": 0.84, + "learning_rate": 4.782863162823364e-08, + "logits/chosen": -3.840287208557129, + "logits/rejected": -3.5483999252319336, + "logps/chosen": -169.66378784179688, + "logps/rejected": -139.97280883789062, + "loss": 0.3329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12246038019657135, + "rewards/margins": 1.8799288272857666, + "rewards/rejected": -2.0023891925811768, + "step": 7312 + }, + { + "epoch": 0.84, + "learning_rate": 4.7793515158609386e-08, + "logits/chosen": -3.4007012844085693, + "logits/rejected": -3.5846152305603027, + "logps/chosen": -122.65921020507812, + "logps/rejected": -274.78289794921875, + "loss": 0.2426, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20147854089736938, + "rewards/margins": 2.8531172275543213, + "rewards/rejected": -3.054595947265625, + "step": 7313 + }, + { + "epoch": 0.84, + "learning_rate": 4.775839868898513e-08, + "logits/chosen": -2.6804275512695312, + "logits/rejected": -2.589238405227661, + "logps/chosen": -202.95384216308594, + "logps/rejected": -221.0663299560547, + "loss": 0.5247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16295230388641357, + "rewards/margins": 0.8561410903930664, + "rewards/rejected": -1.01909339427948, + "step": 7314 + }, + { + "epoch": 0.84, + "learning_rate": 4.772328221936088e-08, + "logits/chosen": -2.763871431350708, + "logits/rejected": -2.8507299423217773, + "logps/chosen": -180.61026000976562, + "logps/rejected": -250.99374389648438, + "loss": 0.2902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09512703120708466, + "rewards/margins": 3.4204747676849365, + "rewards/rejected": -3.515601634979248, + "step": 7315 + }, + { + "epoch": 0.84, + "learning_rate": 4.768816574973662e-08, + "logits/chosen": -3.0319623947143555, + "logits/rejected": -2.837456226348877, + "logps/chosen": -338.229248046875, + "logps/rejected": -396.53668212890625, + "loss": 0.4465, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10290517657995224, + "rewards/margins": 1.1387763023376465, + "rewards/rejected": -1.0358712673187256, + "step": 7316 + }, + { + "epoch": 0.84, + "learning_rate": 4.7653049280112375e-08, + "logits/chosen": -3.8190183639526367, + "logits/rejected": -3.68247652053833, + "logps/chosen": -212.0687255859375, + "logps/rejected": -266.50396728515625, + "loss": 0.4491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11737871170043945, + "rewards/margins": 2.275092601776123, + "rewards/rejected": -2.3924717903137207, + "step": 7317 + }, + { + "epoch": 0.84, + "learning_rate": 4.7617932810488116e-08, + "logits/chosen": -2.8385801315307617, + "logits/rejected": -2.957085371017456, + "logps/chosen": -278.64447021484375, + "logps/rejected": -184.35968017578125, + "loss": 0.6205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6117500066757202, + "rewards/margins": 0.6758273243904114, + "rewards/rejected": -1.2875773906707764, + "step": 7318 + }, + { + "epoch": 0.84, + "learning_rate": 4.758281634086386e-08, + "logits/chosen": -3.0211052894592285, + "logits/rejected": -3.087735176086426, + "logps/chosen": -105.45380401611328, + "logps/rejected": -150.23008728027344, + "loss": 0.5139, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1231069341301918, + "rewards/margins": 0.9202936291694641, + "rewards/rejected": -1.043400526046753, + "step": 7319 + }, + { + "epoch": 0.84, + "learning_rate": 4.754769987123961e-08, + "logits/chosen": -3.249913215637207, + "logits/rejected": -2.991976261138916, + "logps/chosen": -324.0323486328125, + "logps/rejected": -385.9041748046875, + "loss": 0.5114, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45750170946121216, + "rewards/margins": 1.5237070322036743, + "rewards/rejected": -1.0662052631378174, + "step": 7320 + }, + { + "epoch": 0.84, + "learning_rate": 4.751258340161536e-08, + "logits/chosen": -3.225053071975708, + "logits/rejected": -3.1735119819641113, + "logps/chosen": -186.2451171875, + "logps/rejected": -172.99380493164062, + "loss": 0.3687, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17560851573944092, + "rewards/margins": 1.128713846206665, + "rewards/rejected": -1.304322361946106, + "step": 7321 + }, + { + "epoch": 0.84, + "learning_rate": 4.74774669319911e-08, + "logits/chosen": -2.422070264816284, + "logits/rejected": -2.704610824584961, + "logps/chosen": -254.8570556640625, + "logps/rejected": -208.01219177246094, + "loss": 0.5933, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2295866161584854, + "rewards/margins": 0.40061384439468384, + "rewards/rejected": -0.6302005052566528, + "step": 7322 + }, + { + "epoch": 0.84, + "learning_rate": 4.744235046236685e-08, + "logits/chosen": -2.7399778366088867, + "logits/rejected": -2.8843202590942383, + "logps/chosen": -195.7682647705078, + "logps/rejected": -245.11338806152344, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24796563386917114, + "rewards/margins": 1.8631551265716553, + "rewards/rejected": -2.1111207008361816, + "step": 7323 + }, + { + "epoch": 0.84, + "learning_rate": 4.740723399274259e-08, + "logits/chosen": -3.283493757247925, + "logits/rejected": -3.4087436199188232, + "logps/chosen": -104.8115234375, + "logps/rejected": -119.67205810546875, + "loss": 0.2859, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3326238989830017, + "rewards/margins": 1.4431123733520508, + "rewards/rejected": -1.1104885339736938, + "step": 7324 + }, + { + "epoch": 0.84, + "learning_rate": 4.7372117523118347e-08, + "logits/chosen": -3.4501538276672363, + "logits/rejected": -2.9166085720062256, + "logps/chosen": -423.3934020996094, + "logps/rejected": -302.6856689453125, + "loss": 0.3584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28087443113327026, + "rewards/margins": 2.6533474922180176, + "rewards/rejected": -2.3724732398986816, + "step": 7325 + }, + { + "epoch": 0.84, + "learning_rate": 4.733700105349409e-08, + "logits/chosen": -3.491476535797119, + "logits/rejected": -3.1719436645507812, + "logps/chosen": -219.1077880859375, + "logps/rejected": -207.60372924804688, + "loss": 0.6018, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4429379105567932, + "rewards/margins": 1.3127529621124268, + "rewards/rejected": -1.7556910514831543, + "step": 7326 + }, + { + "epoch": 0.84, + "learning_rate": 4.7301884583869834e-08, + "logits/chosen": -2.8877086639404297, + "logits/rejected": -2.5626325607299805, + "logps/chosen": -231.3386993408203, + "logps/rejected": -170.88784790039062, + "loss": 0.2943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08883452415466309, + "rewards/margins": 1.6548713445663452, + "rewards/rejected": -1.7437058687210083, + "step": 7327 + }, + { + "epoch": 0.84, + "learning_rate": 4.7266768114245575e-08, + "logits/chosen": -3.6558258533477783, + "logits/rejected": -3.944817304611206, + "logps/chosen": -114.43352508544922, + "logps/rejected": -219.16766357421875, + "loss": 0.1722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1752062737941742, + "rewards/margins": 2.490837812423706, + "rewards/rejected": -2.6660444736480713, + "step": 7328 + }, + { + "epoch": 0.84, + "learning_rate": 4.723165164462133e-08, + "logits/chosen": -3.315973997116089, + "logits/rejected": -3.5065174102783203, + "logps/chosen": -278.18060302734375, + "logps/rejected": -295.3574523925781, + "loss": 0.3742, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29532575607299805, + "rewards/margins": 1.7133129835128784, + "rewards/rejected": -2.008638858795166, + "step": 7329 + }, + { + "epoch": 0.85, + "learning_rate": 4.719653517499707e-08, + "logits/chosen": -3.7902326583862305, + "logits/rejected": -3.832307815551758, + "logps/chosen": -271.6027526855469, + "logps/rejected": -314.26898193359375, + "loss": 0.2584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16282454133033752, + "rewards/margins": 3.438626527786255, + "rewards/rejected": -3.2758021354675293, + "step": 7330 + }, + { + "epoch": 0.85, + "learning_rate": 4.7161418705372823e-08, + "logits/chosen": -3.0126850605010986, + "logits/rejected": -3.073497772216797, + "logps/chosen": -359.3127746582031, + "logps/rejected": -323.47991943359375, + "loss": 0.2408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5288572311401367, + "rewards/margins": 1.8935892581939697, + "rewards/rejected": -1.364732027053833, + "step": 7331 + }, + { + "epoch": 0.85, + "learning_rate": 4.7126302235748564e-08, + "logits/chosen": -3.708963394165039, + "logits/rejected": -3.5330350399017334, + "logps/chosen": -256.1639099121094, + "logps/rejected": -335.654296875, + "loss": 0.3016, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23040857911109924, + "rewards/margins": 1.8226168155670166, + "rewards/rejected": -1.5922081470489502, + "step": 7332 + }, + { + "epoch": 0.85, + "learning_rate": 4.709118576612431e-08, + "logits/chosen": -2.91880464553833, + "logits/rejected": -3.4834585189819336, + "logps/chosen": -191.68170166015625, + "logps/rejected": -208.81973266601562, + "loss": 0.7786, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8515530824661255, + "rewards/margins": 0.48204243183135986, + "rewards/rejected": -1.3335955142974854, + "step": 7333 + }, + { + "epoch": 0.85, + "learning_rate": 4.705606929650006e-08, + "logits/chosen": -2.8397560119628906, + "logits/rejected": -3.1333260536193848, + "logps/chosen": -126.1759033203125, + "logps/rejected": -219.0943603515625, + "loss": 0.4748, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3072376847267151, + "rewards/margins": 1.2899103164672852, + "rewards/rejected": -1.5971479415893555, + "step": 7334 + }, + { + "epoch": 0.85, + "learning_rate": 4.7020952826875806e-08, + "logits/chosen": -2.8610587120056152, + "logits/rejected": -3.2174344062805176, + "logps/chosen": -317.0050048828125, + "logps/rejected": -224.5142822265625, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6875356435775757, + "rewards/margins": 1.741040825843811, + "rewards/rejected": -1.0535050630569458, + "step": 7335 + }, + { + "epoch": 0.85, + "learning_rate": 4.6985836357251546e-08, + "logits/chosen": -2.957298994064331, + "logits/rejected": -3.1131653785705566, + "logps/chosen": -213.8180694580078, + "logps/rejected": -217.9388427734375, + "loss": 0.4836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0843716412782669, + "rewards/margins": 1.8703190088272095, + "rewards/rejected": -1.95469069480896, + "step": 7336 + }, + { + "epoch": 0.85, + "learning_rate": 4.69507198876273e-08, + "logits/chosen": -3.2588839530944824, + "logits/rejected": -3.5889248847961426, + "logps/chosen": -254.72283935546875, + "logps/rejected": -355.9941101074219, + "loss": 0.2524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30633866786956787, + "rewards/margins": 2.3448753356933594, + "rewards/rejected": -2.6512138843536377, + "step": 7337 + }, + { + "epoch": 0.85, + "learning_rate": 4.691560341800304e-08, + "logits/chosen": -2.3245861530303955, + "logits/rejected": -2.4936394691467285, + "logps/chosen": -371.86859130859375, + "logps/rejected": -280.9288330078125, + "loss": 0.5391, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3915713429450989, + "rewards/margins": 1.0539168119430542, + "rewards/rejected": -0.6623454689979553, + "step": 7338 + }, + { + "epoch": 0.85, + "learning_rate": 4.688048694837878e-08, + "logits/chosen": -2.890493392944336, + "logits/rejected": -2.829853057861328, + "logps/chosen": -244.71060180664062, + "logps/rejected": -300.8856201171875, + "loss": 0.1957, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11171610653400421, + "rewards/margins": 3.6254050731658936, + "rewards/rejected": -3.5136890411376953, + "step": 7339 + }, + { + "epoch": 0.85, + "learning_rate": 4.6845370478754535e-08, + "logits/chosen": -3.243692398071289, + "logits/rejected": -3.4760990142822266, + "logps/chosen": -164.147216796875, + "logps/rejected": -215.63995361328125, + "loss": 0.2067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10448744148015976, + "rewards/margins": 4.02479362487793, + "rewards/rejected": -3.9203059673309326, + "step": 7340 + }, + { + "epoch": 0.85, + "learning_rate": 4.6810254009130276e-08, + "logits/chosen": -3.750148296356201, + "logits/rejected": -3.697678327560425, + "logps/chosen": -204.79827880859375, + "logps/rejected": -261.7188720703125, + "loss": 0.6503, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.047427773475647, + "rewards/margins": 1.7234405279159546, + "rewards/rejected": -2.7708683013916016, + "step": 7341 + }, + { + "epoch": 0.85, + "learning_rate": 4.677513753950603e-08, + "logits/chosen": -2.7344017028808594, + "logits/rejected": -2.74267578125, + "logps/chosen": -344.7205505371094, + "logps/rejected": -223.904296875, + "loss": 0.2918, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.018908776342868805, + "rewards/margins": 2.380125045776367, + "rewards/rejected": -2.399034023284912, + "step": 7342 + }, + { + "epoch": 0.85, + "learning_rate": 4.674002106988177e-08, + "logits/chosen": -2.784703254699707, + "logits/rejected": -2.731060743331909, + "logps/chosen": -372.99859619140625, + "logps/rejected": -297.0814208984375, + "loss": 0.5189, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05833778530359268, + "rewards/margins": 0.9888304471969604, + "rewards/rejected": -0.930492639541626, + "step": 7343 + }, + { + "epoch": 0.85, + "learning_rate": 4.670490460025752e-08, + "logits/chosen": -2.8519275188446045, + "logits/rejected": -2.8314380645751953, + "logps/chosen": -416.91717529296875, + "logps/rejected": -377.5380554199219, + "loss": 0.4659, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5273405909538269, + "rewards/margins": 1.565872073173523, + "rewards/rejected": -2.093212366104126, + "step": 7344 + }, + { + "epoch": 0.85, + "learning_rate": 4.666978813063326e-08, + "logits/chosen": -4.028273582458496, + "logits/rejected": -3.8425886631011963, + "logps/chosen": -302.6734313964844, + "logps/rejected": -279.5216979980469, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.578534722328186, + "rewards/margins": 2.5576696395874023, + "rewards/rejected": -1.9791351556777954, + "step": 7345 + }, + { + "epoch": 0.85, + "learning_rate": 4.663467166100901e-08, + "logits/chosen": -3.1002674102783203, + "logits/rejected": -3.0070433616638184, + "logps/chosen": -386.769775390625, + "logps/rejected": -298.45989990234375, + "loss": 0.2962, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29456162452697754, + "rewards/margins": 2.590243101119995, + "rewards/rejected": -2.2956814765930176, + "step": 7346 + }, + { + "epoch": 0.85, + "learning_rate": 4.659955519138475e-08, + "logits/chosen": -2.9068028926849365, + "logits/rejected": -2.9619979858398438, + "logps/chosen": -329.574951171875, + "logps/rejected": -329.071044921875, + "loss": 0.3597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04635202884674072, + "rewards/margins": 2.993783712387085, + "rewards/rejected": -3.040135622024536, + "step": 7347 + }, + { + "epoch": 0.85, + "learning_rate": 4.6564438721760507e-08, + "logits/chosen": -3.2784523963928223, + "logits/rejected": -3.2948689460754395, + "logps/chosen": -205.94784545898438, + "logps/rejected": -214.25070190429688, + "loss": 0.605, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16859325766563416, + "rewards/margins": 1.6817902326583862, + "rewards/rejected": -1.8503834009170532, + "step": 7348 + }, + { + "epoch": 0.85, + "learning_rate": 4.652932225213625e-08, + "logits/chosen": -3.048021078109741, + "logits/rejected": -3.0928821563720703, + "logps/chosen": -296.2345886230469, + "logps/rejected": -255.06875610351562, + "loss": 0.7382, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29918408393859863, + "rewards/margins": 0.9553760290145874, + "rewards/rejected": -1.254560112953186, + "step": 7349 + }, + { + "epoch": 0.85, + "learning_rate": 4.6494205782511994e-08, + "logits/chosen": -3.6084914207458496, + "logits/rejected": -3.455488443374634, + "logps/chosen": -266.482666015625, + "logps/rejected": -268.52020263671875, + "loss": 0.3066, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7531477212905884, + "rewards/margins": 1.7420415878295898, + "rewards/rejected": -2.4951891899108887, + "step": 7350 + }, + { + "epoch": 0.85, + "learning_rate": 4.645908931288774e-08, + "logits/chosen": -2.858302116394043, + "logits/rejected": -2.5553460121154785, + "logps/chosen": -237.10000610351562, + "logps/rejected": -196.5335235595703, + "loss": 0.3574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12628306448459625, + "rewards/margins": 0.9533749222755432, + "rewards/rejected": -1.0796579122543335, + "step": 7351 + }, + { + "epoch": 0.85, + "learning_rate": 4.642397284326349e-08, + "logits/chosen": -2.5157790184020996, + "logits/rejected": -2.3960089683532715, + "logps/chosen": -240.721435546875, + "logps/rejected": -247.38058471679688, + "loss": 0.4024, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21461085975170135, + "rewards/margins": 1.7407556772232056, + "rewards/rejected": -1.955366611480713, + "step": 7352 + }, + { + "epoch": 0.85, + "learning_rate": 4.638885637363923e-08, + "logits/chosen": -3.074939250946045, + "logits/rejected": -3.361172676086426, + "logps/chosen": -200.81396484375, + "logps/rejected": -309.76910400390625, + "loss": 0.19, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023403527215123177, + "rewards/margins": 2.3370542526245117, + "rewards/rejected": -2.3136508464813232, + "step": 7353 + }, + { + "epoch": 0.85, + "learning_rate": 4.6353739904014983e-08, + "logits/chosen": -3.2743964195251465, + "logits/rejected": -3.385770797729492, + "logps/chosen": -251.82229614257812, + "logps/rejected": -214.83837890625, + "loss": 0.2196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3394489288330078, + "rewards/margins": 2.7553133964538574, + "rewards/rejected": -3.0947623252868652, + "step": 7354 + }, + { + "epoch": 0.85, + "learning_rate": 4.6318623434390724e-08, + "logits/chosen": -3.495187997817993, + "logits/rejected": -3.1620311737060547, + "logps/chosen": -171.62982177734375, + "logps/rejected": -202.3713836669922, + "loss": 0.9129, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13207721710205078, + "rewards/margins": 0.032562822103500366, + "rewards/rejected": -0.16464009881019592, + "step": 7355 + }, + { + "epoch": 0.85, + "learning_rate": 4.628350696476648e-08, + "logits/chosen": -2.7904837131500244, + "logits/rejected": -2.564824342727661, + "logps/chosen": -348.6330261230469, + "logps/rejected": -349.6911315917969, + "loss": 0.3207, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16057689487934113, + "rewards/margins": 2.106289863586426, + "rewards/rejected": -1.945712924003601, + "step": 7356 + }, + { + "epoch": 0.85, + "learning_rate": 4.624839049514222e-08, + "logits/chosen": -3.273210287094116, + "logits/rejected": -3.158374786376953, + "logps/chosen": -366.6065368652344, + "logps/rejected": -287.60882568359375, + "loss": 0.5256, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5399941205978394, + "rewards/margins": 1.22239351272583, + "rewards/rejected": -1.762387752532959, + "step": 7357 + }, + { + "epoch": 0.85, + "learning_rate": 4.6213274025517966e-08, + "logits/chosen": -3.065751075744629, + "logits/rejected": -3.1093544960021973, + "logps/chosen": -180.89016723632812, + "logps/rejected": -137.0840606689453, + "loss": 0.5285, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4979577958583832, + "rewards/margins": 0.5182552337646484, + "rewards/rejected": -1.016213059425354, + "step": 7358 + }, + { + "epoch": 0.85, + "learning_rate": 4.617815755589371e-08, + "logits/chosen": -2.9162228107452393, + "logits/rejected": -2.9554502964019775, + "logps/chosen": -417.62969970703125, + "logps/rejected": -360.80120849609375, + "loss": 0.9655, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8676888942718506, + "rewards/margins": 1.8249750137329102, + "rewards/rejected": -2.6926639080047607, + "step": 7359 + }, + { + "epoch": 0.85, + "learning_rate": 4.614304108626946e-08, + "logits/chosen": -3.9769082069396973, + "logits/rejected": -3.8023223876953125, + "logps/chosen": -180.43881225585938, + "logps/rejected": -184.5065460205078, + "loss": 0.2377, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.223817840218544, + "rewards/margins": 2.0683250427246094, + "rewards/rejected": -1.8445074558258057, + "step": 7360 + }, + { + "epoch": 0.85, + "learning_rate": 4.61079246166452e-08, + "logits/chosen": -2.9157416820526123, + "logits/rejected": -2.7604269981384277, + "logps/chosen": -425.509521484375, + "logps/rejected": -443.06561279296875, + "loss": 0.2839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4441467225551605, + "rewards/margins": 2.7436764240264893, + "rewards/rejected": -3.1878232955932617, + "step": 7361 + }, + { + "epoch": 0.85, + "learning_rate": 4.6072808147020955e-08, + "logits/chosen": -3.153310537338257, + "logits/rejected": -3.0169992446899414, + "logps/chosen": -336.7825622558594, + "logps/rejected": -254.8670196533203, + "loss": 0.2862, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43875330686569214, + "rewards/margins": 1.9437536001205444, + "rewards/rejected": -1.505000352859497, + "step": 7362 + }, + { + "epoch": 0.85, + "learning_rate": 4.6037691677396695e-08, + "logits/chosen": -3.0551371574401855, + "logits/rejected": -3.0391879081726074, + "logps/chosen": -361.33734130859375, + "logps/rejected": -315.07598876953125, + "loss": 0.1707, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07322511076927185, + "rewards/margins": 2.002021074295044, + "rewards/rejected": -2.0752463340759277, + "step": 7363 + }, + { + "epoch": 0.85, + "learning_rate": 4.600257520777244e-08, + "logits/chosen": -3.6549954414367676, + "logits/rejected": -3.5781373977661133, + "logps/chosen": -207.0986785888672, + "logps/rejected": -173.01397705078125, + "loss": 0.3797, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3333570063114166, + "rewards/margins": 2.0400099754333496, + "rewards/rejected": -2.3733668327331543, + "step": 7364 + }, + { + "epoch": 0.85, + "learning_rate": 4.596745873814819e-08, + "logits/chosen": -2.390343189239502, + "logits/rejected": -2.6015326976776123, + "logps/chosen": -340.1717529296875, + "logps/rejected": -299.9361572265625, + "loss": 0.3007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4012141227722168, + "rewards/margins": 1.6989376544952393, + "rewards/rejected": -2.100151777267456, + "step": 7365 + }, + { + "epoch": 0.85, + "learning_rate": 4.593234226852394e-08, + "logits/chosen": -3.4621875286102295, + "logits/rejected": -3.280602216720581, + "logps/chosen": -296.4013671875, + "logps/rejected": -230.3800506591797, + "loss": 0.2827, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6775476932525635, + "rewards/margins": 1.8550424575805664, + "rewards/rejected": -1.177494764328003, + "step": 7366 + }, + { + "epoch": 0.85, + "learning_rate": 4.589722579889968e-08, + "logits/chosen": -3.122526168823242, + "logits/rejected": -3.040761709213257, + "logps/chosen": -190.0733184814453, + "logps/rejected": -252.55596923828125, + "loss": 0.4569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09075909852981567, + "rewards/margins": 1.0809311866760254, + "rewards/rejected": -1.1716903448104858, + "step": 7367 + }, + { + "epoch": 0.85, + "learning_rate": 4.586210932927543e-08, + "logits/chosen": -3.335434675216675, + "logits/rejected": -3.025513172149658, + "logps/chosen": -303.6552734375, + "logps/rejected": -215.19613647460938, + "loss": 0.5531, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07645189762115479, + "rewards/margins": 1.9050986766815186, + "rewards/rejected": -1.8286468982696533, + "step": 7368 + }, + { + "epoch": 0.85, + "learning_rate": 4.582699285965117e-08, + "logits/chosen": -3.5892951488494873, + "logits/rejected": -3.3400070667266846, + "logps/chosen": -153.0694580078125, + "logps/rejected": -196.88465881347656, + "loss": 0.4944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17493481934070587, + "rewards/margins": 1.3493777513504028, + "rewards/rejected": -1.5243126153945923, + "step": 7369 + }, + { + "epoch": 0.85, + "learning_rate": 4.5791876390026926e-08, + "logits/chosen": -2.626343250274658, + "logits/rejected": -3.0521528720855713, + "logps/chosen": -166.386962890625, + "logps/rejected": -288.65350341796875, + "loss": 0.4311, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.691758930683136, + "rewards/margins": 2.1181459426879883, + "rewards/rejected": -1.426387071609497, + "step": 7370 + }, + { + "epoch": 0.85, + "learning_rate": 4.5756759920402667e-08, + "logits/chosen": -3.447619676589966, + "logits/rejected": -3.3275046348571777, + "logps/chosen": -333.9166259765625, + "logps/rejected": -212.23712158203125, + "loss": 0.4619, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.67537522315979, + "rewards/margins": 1.3704421520233154, + "rewards/rejected": -2.0458173751831055, + "step": 7371 + }, + { + "epoch": 0.85, + "learning_rate": 4.5721643450778414e-08, + "logits/chosen": -3.1989216804504395, + "logits/rejected": -2.735759973526001, + "logps/chosen": -207.4794921875, + "logps/rejected": -224.9835205078125, + "loss": 0.0753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5257019400596619, + "rewards/margins": 2.663910388946533, + "rewards/rejected": -2.1382083892822266, + "step": 7372 + }, + { + "epoch": 0.85, + "learning_rate": 4.568652698115416e-08, + "logits/chosen": -3.432699680328369, + "logits/rejected": -3.2786669731140137, + "logps/chosen": -127.67857360839844, + "logps/rejected": -164.77244567871094, + "loss": 0.282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25780928134918213, + "rewards/margins": 2.0598089694976807, + "rewards/rejected": -1.8019996881484985, + "step": 7373 + }, + { + "epoch": 0.85, + "learning_rate": 4.565141051152991e-08, + "logits/chosen": -3.2788288593292236, + "logits/rejected": -3.306389808654785, + "logps/chosen": -225.0753173828125, + "logps/rejected": -337.6608581542969, + "loss": 0.247, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08364330977201462, + "rewards/margins": 2.512240171432495, + "rewards/rejected": -2.59588360786438, + "step": 7374 + }, + { + "epoch": 0.85, + "learning_rate": 4.561629404190565e-08, + "logits/chosen": -2.615952730178833, + "logits/rejected": -3.077463150024414, + "logps/chosen": -159.70187377929688, + "logps/rejected": -324.16229248046875, + "loss": 0.1169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12438494712114334, + "rewards/margins": 4.684408187866211, + "rewards/rejected": -4.560022830963135, + "step": 7375 + }, + { + "epoch": 0.85, + "learning_rate": 4.55811775722814e-08, + "logits/chosen": -3.578324556350708, + "logits/rejected": -3.5736541748046875, + "logps/chosen": -246.40606689453125, + "logps/rejected": -295.82696533203125, + "loss": 0.1832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25619590282440186, + "rewards/margins": 3.3249778747558594, + "rewards/rejected": -3.5811736583709717, + "step": 7376 + }, + { + "epoch": 0.85, + "learning_rate": 4.5546061102657143e-08, + "logits/chosen": -3.2314066886901855, + "logits/rejected": -3.0887951850891113, + "logps/chosen": -214.36293029785156, + "logps/rejected": -292.63922119140625, + "loss": 0.2487, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06424164772033691, + "rewards/margins": 2.1364798545837402, + "rewards/rejected": -2.2007217407226562, + "step": 7377 + }, + { + "epoch": 0.85, + "learning_rate": 4.55109446330329e-08, + "logits/chosen": -3.5547783374786377, + "logits/rejected": -3.8678107261657715, + "logps/chosen": -264.2967224121094, + "logps/rejected": -400.1763610839844, + "loss": 0.6633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24900075793266296, + "rewards/margins": 0.6543028950691223, + "rewards/rejected": -0.9033036231994629, + "step": 7378 + }, + { + "epoch": 0.85, + "learning_rate": 4.547582816340864e-08, + "logits/chosen": -3.590613603591919, + "logits/rejected": -3.8312416076660156, + "logps/chosen": -112.12344360351562, + "logps/rejected": -234.7897186279297, + "loss": 0.2583, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6900643706321716, + "rewards/margins": 3.0393528938293457, + "rewards/rejected": -2.3492884635925293, + "step": 7379 + }, + { + "epoch": 0.85, + "learning_rate": 4.5440711693784385e-08, + "logits/chosen": -3.241446018218994, + "logits/rejected": -3.127260208129883, + "logps/chosen": -318.18878173828125, + "logps/rejected": -264.37451171875, + "loss": 0.1985, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24911613762378693, + "rewards/margins": 2.3555798530578613, + "rewards/rejected": -2.10646390914917, + "step": 7380 + }, + { + "epoch": 0.85, + "learning_rate": 4.5405595224160126e-08, + "logits/chosen": -2.711111545562744, + "logits/rejected": -2.7783877849578857, + "logps/chosen": -282.016845703125, + "logps/rejected": -269.02197265625, + "loss": 0.2801, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32066312432289124, + "rewards/margins": 2.2598624229431152, + "rewards/rejected": -1.9391992092132568, + "step": 7381 + }, + { + "epoch": 0.85, + "learning_rate": 4.537047875453588e-08, + "logits/chosen": -2.589491844177246, + "logits/rejected": -2.4451112747192383, + "logps/chosen": -215.72955322265625, + "logps/rejected": -361.7466735839844, + "loss": 0.3281, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0880458801984787, + "rewards/margins": 1.8985483646392822, + "rewards/rejected": -1.9865942001342773, + "step": 7382 + }, + { + "epoch": 0.85, + "learning_rate": 4.533536228491162e-08, + "logits/chosen": -2.672924518585205, + "logits/rejected": -2.4827873706817627, + "logps/chosen": -209.29473876953125, + "logps/rejected": -381.988037109375, + "loss": 0.2951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5538262128829956, + "rewards/margins": 2.0753672122955322, + "rewards/rejected": -2.6291933059692383, + "step": 7383 + }, + { + "epoch": 0.85, + "learning_rate": 4.5300245815287374e-08, + "logits/chosen": -3.3280417919158936, + "logits/rejected": -2.8145813941955566, + "logps/chosen": -485.22564697265625, + "logps/rejected": -286.356201171875, + "loss": 0.2291, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36938369274139404, + "rewards/margins": 2.40694522857666, + "rewards/rejected": -2.0375614166259766, + "step": 7384 + }, + { + "epoch": 0.85, + "learning_rate": 4.5265129345663115e-08, + "logits/chosen": -3.1267619132995605, + "logits/rejected": -3.0390665531158447, + "logps/chosen": -236.42581176757812, + "logps/rejected": -399.0465087890625, + "loss": 0.2059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20961330831050873, + "rewards/margins": 2.5146102905273438, + "rewards/rejected": -2.7242238521575928, + "step": 7385 + }, + { + "epoch": 0.85, + "learning_rate": 4.5230012876038855e-08, + "logits/chosen": -2.743500232696533, + "logits/rejected": -2.599937915802002, + "logps/chosen": -684.5462646484375, + "logps/rejected": -273.0282897949219, + "loss": 0.2849, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37285763025283813, + "rewards/margins": 1.521550178527832, + "rewards/rejected": -1.8944077491760254, + "step": 7386 + }, + { + "epoch": 0.85, + "learning_rate": 4.519489640641461e-08, + "logits/chosen": -2.5405452251434326, + "logits/rejected": -2.197251081466675, + "logps/chosen": -327.2315979003906, + "logps/rejected": -294.61297607421875, + "loss": 0.317, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13133585453033447, + "rewards/margins": 2.1776015758514404, + "rewards/rejected": -2.0462658405303955, + "step": 7387 + }, + { + "epoch": 0.85, + "learning_rate": 4.515977993679035e-08, + "logits/chosen": -3.529749870300293, + "logits/rejected": -3.489616870880127, + "logps/chosen": -293.01055908203125, + "logps/rejected": -245.04010009765625, + "loss": 0.1952, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21759814023971558, + "rewards/margins": 2.4155449867248535, + "rewards/rejected": -2.197946548461914, + "step": 7388 + }, + { + "epoch": 0.85, + "learning_rate": 4.51246634671661e-08, + "logits/chosen": -3.1082262992858887, + "logits/rejected": -2.7512238025665283, + "logps/chosen": -144.06973266601562, + "logps/rejected": -167.82850646972656, + "loss": 0.4601, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1762596070766449, + "rewards/margins": 1.1656489372253418, + "rewards/rejected": -1.3419084548950195, + "step": 7389 + }, + { + "epoch": 0.85, + "learning_rate": 4.5089546997541844e-08, + "logits/chosen": -2.5238072872161865, + "logits/rejected": -2.6004087924957275, + "logps/chosen": -284.48199462890625, + "logps/rejected": -167.90647888183594, + "loss": 0.5149, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38004207611083984, + "rewards/margins": 1.6261969804763794, + "rewards/rejected": -1.246155023574829, + "step": 7390 + }, + { + "epoch": 0.85, + "learning_rate": 4.505443052791759e-08, + "logits/chosen": -2.6539084911346436, + "logits/rejected": -2.9682154655456543, + "logps/chosen": -358.566162109375, + "logps/rejected": -235.00042724609375, + "loss": 0.9853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.748590350151062, + "rewards/margins": 0.1703180968761444, + "rewards/rejected": -0.9189084768295288, + "step": 7391 + }, + { + "epoch": 0.85, + "learning_rate": 4.501931405829333e-08, + "logits/chosen": -2.4079558849334717, + "logits/rejected": -2.7242348194122314, + "logps/chosen": -337.7750549316406, + "logps/rejected": -307.7074890136719, + "loss": 0.548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22043690085411072, + "rewards/margins": 1.6949173212051392, + "rewards/rejected": -1.9153542518615723, + "step": 7392 + }, + { + "epoch": 0.85, + "learning_rate": 4.4984197588669086e-08, + "logits/chosen": -3.0514755249023438, + "logits/rejected": -3.217343330383301, + "logps/chosen": -255.41229248046875, + "logps/rejected": -232.87998962402344, + "loss": 0.4195, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.725545346736908, + "rewards/margins": 1.4105339050292969, + "rewards/rejected": -2.1360793113708496, + "step": 7393 + }, + { + "epoch": 0.85, + "learning_rate": 4.4949081119044826e-08, + "logits/chosen": -2.8487937450408936, + "logits/rejected": -3.308960437774658, + "logps/chosen": -211.62986755371094, + "logps/rejected": -400.3833923339844, + "loss": 0.3996, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5617678761482239, + "rewards/margins": 2.8462960720062256, + "rewards/rejected": -3.4080638885498047, + "step": 7394 + }, + { + "epoch": 0.85, + "learning_rate": 4.491396464942058e-08, + "logits/chosen": -3.2647128105163574, + "logits/rejected": -3.1874845027923584, + "logps/chosen": -269.5652160644531, + "logps/rejected": -176.9139404296875, + "loss": 0.5296, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4171907305717468, + "rewards/margins": 1.238340973854065, + "rewards/rejected": -1.655531644821167, + "step": 7395 + }, + { + "epoch": 0.85, + "learning_rate": 4.487884817979632e-08, + "logits/chosen": -3.0367066860198975, + "logits/rejected": -3.0103440284729004, + "logps/chosen": -292.23028564453125, + "logps/rejected": -309.94158935546875, + "loss": 0.3348, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04001212120056152, + "rewards/margins": 1.2863304615020752, + "rewards/rejected": -1.2463182210922241, + "step": 7396 + }, + { + "epoch": 0.85, + "learning_rate": 4.484373171017207e-08, + "logits/chosen": -3.1643362045288086, + "logits/rejected": -3.305773973464966, + "logps/chosen": -204.40487670898438, + "logps/rejected": -253.90499877929688, + "loss": 0.2737, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1974877566099167, + "rewards/margins": 2.6436800956726074, + "rewards/rejected": -2.4461922645568848, + "step": 7397 + }, + { + "epoch": 0.85, + "learning_rate": 4.480861524054781e-08, + "logits/chosen": -2.712146759033203, + "logits/rejected": -2.7704479694366455, + "logps/chosen": -518.696533203125, + "logps/rejected": -312.71044921875, + "loss": 0.6575, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3837507963180542, + "rewards/margins": 0.5743777751922607, + "rewards/rejected": -0.9581286311149597, + "step": 7398 + }, + { + "epoch": 0.85, + "learning_rate": 4.477349877092356e-08, + "logits/chosen": -2.5693016052246094, + "logits/rejected": -2.5571699142456055, + "logps/chosen": -310.2842102050781, + "logps/rejected": -345.7493896484375, + "loss": 0.3753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47799158096313477, + "rewards/margins": 1.6927895545959473, + "rewards/rejected": -2.170781135559082, + "step": 7399 + }, + { + "epoch": 0.85, + "learning_rate": 4.47383823012993e-08, + "logits/chosen": -2.4264109134674072, + "logits/rejected": -2.09572172164917, + "logps/chosen": -432.15863037109375, + "logps/rejected": -356.7404479980469, + "loss": 0.2741, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2651725709438324, + "rewards/margins": 2.40944766998291, + "rewards/rejected": -2.6746201515197754, + "step": 7400 + }, + { + "epoch": 0.85, + "learning_rate": 4.470326583167506e-08, + "logits/chosen": -2.9492177963256836, + "logits/rejected": -3.3205671310424805, + "logps/chosen": -197.17459106445312, + "logps/rejected": -294.949951171875, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03179468959569931, + "rewards/margins": 4.291082859039307, + "rewards/rejected": -4.322877407073975, + "step": 7401 + }, + { + "epoch": 0.85, + "learning_rate": 4.46681493620508e-08, + "logits/chosen": -3.1109910011291504, + "logits/rejected": -2.9824752807617188, + "logps/chosen": -189.6608428955078, + "logps/rejected": -201.69338989257812, + "loss": 0.4808, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20206981897354126, + "rewards/margins": 1.4606117010116577, + "rewards/rejected": -1.6626814603805542, + "step": 7402 + }, + { + "epoch": 0.85, + "learning_rate": 4.4633032892426545e-08, + "logits/chosen": -2.262728214263916, + "logits/rejected": -2.52956223487854, + "logps/chosen": -267.68048095703125, + "logps/rejected": -249.97149658203125, + "loss": 0.4842, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39689165353775024, + "rewards/margins": 1.348945140838623, + "rewards/rejected": -0.952053427696228, + "step": 7403 + }, + { + "epoch": 0.85, + "learning_rate": 4.459791642280229e-08, + "logits/chosen": -3.336676597595215, + "logits/rejected": -3.2782764434814453, + "logps/chosen": -132.73280334472656, + "logps/rejected": -177.72657775878906, + "loss": 0.2869, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0846312940120697, + "rewards/margins": 1.5497756004333496, + "rewards/rejected": -1.465144395828247, + "step": 7404 + }, + { + "epoch": 0.85, + "learning_rate": 4.456279995317804e-08, + "logits/chosen": -2.343111038208008, + "logits/rejected": -2.3094818592071533, + "logps/chosen": -331.24420166015625, + "logps/rejected": -218.10244750976562, + "loss": 0.7811, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4097417891025543, + "rewards/margins": 0.3874269425868988, + "rewards/rejected": -0.7971687316894531, + "step": 7405 + }, + { + "epoch": 0.85, + "learning_rate": 4.452768348355378e-08, + "logits/chosen": -3.1395747661590576, + "logits/rejected": -3.019679307937622, + "logps/chosen": -318.7430419921875, + "logps/rejected": -286.90472412109375, + "loss": 0.2352, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20777207612991333, + "rewards/margins": 2.0578627586364746, + "rewards/rejected": -1.850090742111206, + "step": 7406 + }, + { + "epoch": 0.85, + "learning_rate": 4.4492567013929534e-08, + "logits/chosen": -3.555464267730713, + "logits/rejected": -3.4460277557373047, + "logps/chosen": -232.82144165039062, + "logps/rejected": -372.8388671875, + "loss": 0.1843, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0011692345142364502, + "rewards/margins": 2.610886335372925, + "rewards/rejected": -2.60971736907959, + "step": 7407 + }, + { + "epoch": 0.85, + "learning_rate": 4.4457450544305275e-08, + "logits/chosen": -2.878420352935791, + "logits/rejected": -2.613579750061035, + "logps/chosen": -206.58566284179688, + "logps/rejected": -177.5654296875, + "loss": 0.5588, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05352276563644409, + "rewards/margins": 0.4265248775482178, + "rewards/rejected": -0.48004770278930664, + "step": 7408 + }, + { + "epoch": 0.85, + "learning_rate": 4.442233407468103e-08, + "logits/chosen": -2.9768404960632324, + "logits/rejected": -2.965655565261841, + "logps/chosen": -176.04010009765625, + "logps/rejected": -515.2606201171875, + "loss": 0.0935, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06351907551288605, + "rewards/margins": 3.183511257171631, + "rewards/rejected": -3.119992256164551, + "step": 7409 + }, + { + "epoch": 0.85, + "learning_rate": 4.438721760505677e-08, + "logits/chosen": -2.86220121383667, + "logits/rejected": -2.861907958984375, + "logps/chosen": -242.9217987060547, + "logps/rejected": -194.4593505859375, + "loss": 0.3899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2799178957939148, + "rewards/margins": 1.1459925174713135, + "rewards/rejected": -1.425910234451294, + "step": 7410 + }, + { + "epoch": 0.85, + "learning_rate": 4.4352101135432516e-08, + "logits/chosen": -3.199709415435791, + "logits/rejected": -3.2326958179473877, + "logps/chosen": -418.71923828125, + "logps/rejected": -317.1479797363281, + "loss": 0.2341, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6093025207519531, + "rewards/margins": 2.0537467002868652, + "rewards/rejected": -1.444444179534912, + "step": 7411 + }, + { + "epoch": 0.85, + "learning_rate": 4.4316984665808264e-08, + "logits/chosen": -3.2416892051696777, + "logits/rejected": -2.8967254161834717, + "logps/chosen": -304.15472412109375, + "logps/rejected": -192.35365295410156, + "loss": 0.4707, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37001144886016846, + "rewards/margins": 1.7227500677108765, + "rewards/rejected": -2.092761516571045, + "step": 7412 + }, + { + "epoch": 0.85, + "learning_rate": 4.428186819618401e-08, + "logits/chosen": -2.635300636291504, + "logits/rejected": -2.6542248725891113, + "logps/chosen": -245.8482666015625, + "logps/rejected": -227.01280212402344, + "loss": 0.439, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4488365650177002, + "rewards/margins": 1.2716432809829712, + "rewards/rejected": -0.8228066563606262, + "step": 7413 + }, + { + "epoch": 0.85, + "learning_rate": 4.424675172655975e-08, + "logits/chosen": -3.0005171298980713, + "logits/rejected": -2.995628595352173, + "logps/chosen": -381.68048095703125, + "logps/rejected": -245.37933349609375, + "loss": 0.208, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4776992201805115, + "rewards/margins": 2.2707571983337402, + "rewards/rejected": -1.793057918548584, + "step": 7414 + }, + { + "epoch": 0.85, + "learning_rate": 4.4211635256935505e-08, + "logits/chosen": -2.3998003005981445, + "logits/rejected": -2.5123462677001953, + "logps/chosen": -184.46530151367188, + "logps/rejected": -145.26663208007812, + "loss": 0.8506, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.435015469789505, + "rewards/margins": -0.1642000377178192, + "rewards/rejected": -0.2708154320716858, + "step": 7415 + }, + { + "epoch": 0.85, + "learning_rate": 4.4176518787311246e-08, + "logits/chosen": -3.207338809967041, + "logits/rejected": -3.021148681640625, + "logps/chosen": -328.3614501953125, + "logps/rejected": -269.80859375, + "loss": 0.4364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4673064947128296, + "rewards/margins": 1.484609842300415, + "rewards/rejected": -1.951916217803955, + "step": 7416 + }, + { + "epoch": 0.86, + "learning_rate": 4.414140231768699e-08, + "logits/chosen": -3.6868019104003906, + "logits/rejected": -3.7610180377960205, + "logps/chosen": -142.55615234375, + "logps/rejected": -175.34609985351562, + "loss": 0.2302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27270764112472534, + "rewards/margins": 2.208280086517334, + "rewards/rejected": -1.9355722665786743, + "step": 7417 + }, + { + "epoch": 0.86, + "learning_rate": 4.410628584806274e-08, + "logits/chosen": -3.2152209281921387, + "logits/rejected": -3.026519775390625, + "logps/chosen": -315.69024658203125, + "logps/rejected": -258.5008544921875, + "loss": 0.392, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05569286644458771, + "rewards/margins": 1.4244757890701294, + "rewards/rejected": -1.4801685810089111, + "step": 7418 + }, + { + "epoch": 0.86, + "learning_rate": 4.407116937843849e-08, + "logits/chosen": -3.266249179840088, + "logits/rejected": -3.2555222511291504, + "logps/chosen": -241.1493682861328, + "logps/rejected": -216.04225158691406, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8808154463768005, + "rewards/margins": 2.6715993881225586, + "rewards/rejected": -1.7907840013504028, + "step": 7419 + }, + { + "epoch": 0.86, + "learning_rate": 4.403605290881423e-08, + "logits/chosen": -3.2370264530181885, + "logits/rejected": -3.2843141555786133, + "logps/chosen": -108.50077819824219, + "logps/rejected": -204.26417541503906, + "loss": 0.5156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2773430049419403, + "rewards/margins": 1.2045574188232422, + "rewards/rejected": -1.4819004535675049, + "step": 7420 + }, + { + "epoch": 0.86, + "learning_rate": 4.400093643918998e-08, + "logits/chosen": -3.469696283340454, + "logits/rejected": -3.1000804901123047, + "logps/chosen": -314.03216552734375, + "logps/rejected": -293.0509948730469, + "loss": 0.2632, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01044502854347229, + "rewards/margins": 1.960181713104248, + "rewards/rejected": -1.9497368335723877, + "step": 7421 + }, + { + "epoch": 0.86, + "learning_rate": 4.396581996956572e-08, + "logits/chosen": -3.441765308380127, + "logits/rejected": -3.424752712249756, + "logps/chosen": -208.7914581298828, + "logps/rejected": -191.16683959960938, + "loss": 0.401, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8385730981826782, + "rewards/margins": 1.5056648254394531, + "rewards/rejected": -2.344237804412842, + "step": 7422 + }, + { + "epoch": 0.86, + "learning_rate": 4.3930703499941477e-08, + "logits/chosen": -2.813591718673706, + "logits/rejected": -2.729159116744995, + "logps/chosen": -249.24620056152344, + "logps/rejected": -151.74623107910156, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04918160289525986, + "rewards/margins": 1.787903904914856, + "rewards/rejected": -1.738722324371338, + "step": 7423 + }, + { + "epoch": 0.86, + "learning_rate": 4.389558703031722e-08, + "logits/chosen": -3.12668514251709, + "logits/rejected": -3.4513092041015625, + "logps/chosen": -693.62646484375, + "logps/rejected": -389.7574462890625, + "loss": 0.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14287053048610687, + "rewards/margins": 4.53176212310791, + "rewards/rejected": -4.674632549285889, + "step": 7424 + }, + { + "epoch": 0.86, + "learning_rate": 4.3860470560692964e-08, + "logits/chosen": -2.908275842666626, + "logits/rejected": -2.8644351959228516, + "logps/chosen": -441.10308837890625, + "logps/rejected": -255.0076446533203, + "loss": 0.9223, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7453814148902893, + "rewards/margins": 0.5440059900283813, + "rewards/rejected": -1.2893874645233154, + "step": 7425 + }, + { + "epoch": 0.86, + "learning_rate": 4.382535409106871e-08, + "logits/chosen": -2.8634865283966064, + "logits/rejected": -2.9170637130737305, + "logps/chosen": -315.28106689453125, + "logps/rejected": -325.6986083984375, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06391158699989319, + "rewards/margins": 1.8343620300292969, + "rewards/rejected": -1.8982734680175781, + "step": 7426 + }, + { + "epoch": 0.86, + "learning_rate": 4.379023762144446e-08, + "logits/chosen": -3.280503273010254, + "logits/rejected": -3.394077777862549, + "logps/chosen": -370.7622375488281, + "logps/rejected": -373.8385925292969, + "loss": 0.3761, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3408520221710205, + "rewards/margins": 2.0996592044830322, + "rewards/rejected": -1.7588070631027222, + "step": 7427 + }, + { + "epoch": 0.86, + "learning_rate": 4.37551211518202e-08, + "logits/chosen": -3.0971603393554688, + "logits/rejected": -2.995231866836548, + "logps/chosen": -270.17974853515625, + "logps/rejected": -250.91070556640625, + "loss": 0.4489, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18312904238700867, + "rewards/margins": 2.2842912673950195, + "rewards/rejected": -2.4674201011657715, + "step": 7428 + }, + { + "epoch": 0.86, + "learning_rate": 4.3720004682195953e-08, + "logits/chosen": -2.7729389667510986, + "logits/rejected": -2.5030312538146973, + "logps/chosen": -257.87652587890625, + "logps/rejected": -470.9156494140625, + "loss": 0.8878, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16276590526103973, + "rewards/margins": 1.0994665622711182, + "rewards/rejected": -1.2622325420379639, + "step": 7429 + }, + { + "epoch": 0.86, + "learning_rate": 4.3684888212571694e-08, + "logits/chosen": -3.775862216949463, + "logits/rejected": -3.2605857849121094, + "logps/chosen": -539.2446899414062, + "logps/rejected": -240.41888427734375, + "loss": 0.2115, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41220584511756897, + "rewards/margins": 2.4417471885681152, + "rewards/rejected": -2.8539528846740723, + "step": 7430 + }, + { + "epoch": 0.86, + "learning_rate": 4.364977174294744e-08, + "logits/chosen": -2.8562934398651123, + "logits/rejected": -3.0854148864746094, + "logps/chosen": -360.49859619140625, + "logps/rejected": -357.86651611328125, + "loss": 0.6366, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6102811694145203, + "rewards/margins": 0.2912939190864563, + "rewards/rejected": -0.9015750885009766, + "step": 7431 + }, + { + "epoch": 0.86, + "learning_rate": 4.361465527332319e-08, + "logits/chosen": -3.2375054359436035, + "logits/rejected": -3.2511098384857178, + "logps/chosen": -119.41746520996094, + "logps/rejected": -250.4125518798828, + "loss": 0.3508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06495543569326401, + "rewards/margins": 2.3460142612457275, + "rewards/rejected": -2.4109697341918945, + "step": 7432 + }, + { + "epoch": 0.86, + "learning_rate": 4.3579538803698936e-08, + "logits/chosen": -2.597503662109375, + "logits/rejected": -2.5739548206329346, + "logps/chosen": -166.81658935546875, + "logps/rejected": -232.29730224609375, + "loss": 0.6304, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2544320821762085, + "rewards/margins": 0.6507646441459656, + "rewards/rejected": -0.9051968455314636, + "step": 7433 + }, + { + "epoch": 0.86, + "learning_rate": 4.3544422334074676e-08, + "logits/chosen": -3.5229570865631104, + "logits/rejected": -3.1696155071258545, + "logps/chosen": -410.3159484863281, + "logps/rejected": -251.21609497070312, + "loss": 0.3764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15014207363128662, + "rewards/margins": 1.4456764459609985, + "rewards/rejected": -1.5958185195922852, + "step": 7434 + }, + { + "epoch": 0.86, + "learning_rate": 4.3509305864450424e-08, + "logits/chosen": -2.6682639122009277, + "logits/rejected": -2.5484607219696045, + "logps/chosen": -256.0222473144531, + "logps/rejected": -241.33807373046875, + "loss": 0.2605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09968234598636627, + "rewards/margins": 1.6386700868606567, + "rewards/rejected": -1.7383522987365723, + "step": 7435 + }, + { + "epoch": 0.86, + "learning_rate": 4.347418939482617e-08, + "logits/chosen": -2.960486888885498, + "logits/rejected": -3.165278196334839, + "logps/chosen": -122.10794830322266, + "logps/rejected": -161.94488525390625, + "loss": 0.3525, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6823269128799438, + "rewards/margins": 1.7886614799499512, + "rewards/rejected": -2.4709885120391846, + "step": 7436 + }, + { + "epoch": 0.86, + "learning_rate": 4.343907292520191e-08, + "logits/chosen": -2.917811393737793, + "logits/rejected": -2.833866596221924, + "logps/chosen": -280.2016906738281, + "logps/rejected": -369.5550537109375, + "loss": 0.6243, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44674086570739746, + "rewards/margins": 0.7635760307312012, + "rewards/rejected": -1.2103168964385986, + "step": 7437 + }, + { + "epoch": 0.86, + "learning_rate": 4.3403956455577665e-08, + "logits/chosen": -2.701106071472168, + "logits/rejected": -2.6974923610687256, + "logps/chosen": -159.37461853027344, + "logps/rejected": -319.3586120605469, + "loss": 0.5298, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40425658226013184, + "rewards/margins": 1.308295726776123, + "rewards/rejected": -0.904039204120636, + "step": 7438 + }, + { + "epoch": 0.86, + "learning_rate": 4.3368839985953406e-08, + "logits/chosen": -3.196037530899048, + "logits/rejected": -3.5752949714660645, + "logps/chosen": -233.58392333984375, + "logps/rejected": -207.5394744873047, + "loss": 0.5965, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36246296763420105, + "rewards/margins": 1.1036020517349243, + "rewards/rejected": -1.4660649299621582, + "step": 7439 + }, + { + "epoch": 0.86, + "learning_rate": 4.333372351632916e-08, + "logits/chosen": -3.2297093868255615, + "logits/rejected": -3.354759693145752, + "logps/chosen": -163.96890258789062, + "logps/rejected": -265.93804931640625, + "loss": 0.5298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2608449459075928, + "rewards/margins": 1.3344228267669678, + "rewards/rejected": -1.595267653465271, + "step": 7440 + }, + { + "epoch": 0.86, + "learning_rate": 4.32986070467049e-08, + "logits/chosen": -3.4136037826538086, + "logits/rejected": -3.527000904083252, + "logps/chosen": -275.0611877441406, + "logps/rejected": -514.3787231445312, + "loss": 0.3599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10396989434957504, + "rewards/margins": 2.384913444519043, + "rewards/rejected": -2.4888834953308105, + "step": 7441 + }, + { + "epoch": 0.86, + "learning_rate": 4.326349057708065e-08, + "logits/chosen": -3.108804702758789, + "logits/rejected": -3.1495018005371094, + "logps/chosen": -327.5858459472656, + "logps/rejected": -328.162353515625, + "loss": 0.4506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05809244513511658, + "rewards/margins": 1.346487045288086, + "rewards/rejected": -1.4045796394348145, + "step": 7442 + }, + { + "epoch": 0.86, + "learning_rate": 4.3228374107456395e-08, + "logits/chosen": -3.224019765853882, + "logits/rejected": -3.375354766845703, + "logps/chosen": -136.7716064453125, + "logps/rejected": -168.18142700195312, + "loss": 0.3838, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38671931624412537, + "rewards/margins": 2.322164535522461, + "rewards/rejected": -1.9354451894760132, + "step": 7443 + }, + { + "epoch": 0.86, + "learning_rate": 4.319325763783214e-08, + "logits/chosen": -2.3665990829467773, + "logits/rejected": -2.3921420574188232, + "logps/chosen": -442.6899719238281, + "logps/rejected": -412.4638977050781, + "loss": 0.2839, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2534828186035156, + "rewards/margins": 2.2681589126586914, + "rewards/rejected": -2.014676094055176, + "step": 7444 + }, + { + "epoch": 0.86, + "learning_rate": 4.315814116820788e-08, + "logits/chosen": -3.2274246215820312, + "logits/rejected": -3.620516777038574, + "logps/chosen": -283.6339111328125, + "logps/rejected": -267.26873779296875, + "loss": 0.2167, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06238704174757004, + "rewards/margins": 3.1818408966064453, + "rewards/rejected": -3.1194536685943604, + "step": 7445 + }, + { + "epoch": 0.86, + "learning_rate": 4.3123024698583637e-08, + "logits/chosen": -3.584656238555908, + "logits/rejected": -3.330528736114502, + "logps/chosen": -118.53046417236328, + "logps/rejected": -90.88018798828125, + "loss": 0.7956, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23851390182971954, + "rewards/margins": 0.48337507247924805, + "rewards/rejected": -0.7218888998031616, + "step": 7446 + }, + { + "epoch": 0.86, + "learning_rate": 4.308790822895938e-08, + "logits/chosen": -2.9882686138153076, + "logits/rejected": -3.0370326042175293, + "logps/chosen": -213.92327880859375, + "logps/rejected": -206.5742645263672, + "loss": 0.7907, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.17497240006923676, + "rewards/margins": 0.7127736806869507, + "rewards/rejected": -0.537801206111908, + "step": 7447 + }, + { + "epoch": 0.86, + "learning_rate": 4.3052791759335124e-08, + "logits/chosen": -2.874072551727295, + "logits/rejected": -2.9871110916137695, + "logps/chosen": -287.23486328125, + "logps/rejected": -264.3133544921875, + "loss": 0.3816, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05959977209568024, + "rewards/margins": 1.2330803871154785, + "rewards/rejected": -1.1734806299209595, + "step": 7448 + }, + { + "epoch": 0.86, + "learning_rate": 4.301767528971087e-08, + "logits/chosen": -3.453993797302246, + "logits/rejected": -3.5075936317443848, + "logps/chosen": -384.8571472167969, + "logps/rejected": -300.44427490234375, + "loss": 0.3232, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30325180292129517, + "rewards/margins": 2.3086206912994385, + "rewards/rejected": -2.611872434616089, + "step": 7449 + }, + { + "epoch": 0.86, + "learning_rate": 4.298255882008662e-08, + "logits/chosen": -3.5823540687561035, + "logits/rejected": -3.8320541381835938, + "logps/chosen": -183.92926025390625, + "logps/rejected": -242.208251953125, + "loss": 0.2209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007891088724136353, + "rewards/margins": 2.080946683883667, + "rewards/rejected": -2.0888378620147705, + "step": 7450 + }, + { + "epoch": 0.86, + "learning_rate": 4.294744235046236e-08, + "logits/chosen": -2.886143684387207, + "logits/rejected": -2.966383695602417, + "logps/chosen": -327.53594970703125, + "logps/rejected": -299.966552734375, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11361183971166611, + "rewards/margins": 1.9461798667907715, + "rewards/rejected": -2.0597920417785645, + "step": 7451 + }, + { + "epoch": 0.86, + "learning_rate": 4.2912325880838113e-08, + "logits/chosen": -3.3536858558654785, + "logits/rejected": -3.3786818981170654, + "logps/chosen": -287.83367919921875, + "logps/rejected": -240.60699462890625, + "loss": 0.2803, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033266451209783554, + "rewards/margins": 1.7147939205169678, + "rewards/rejected": -1.7480604648590088, + "step": 7452 + }, + { + "epoch": 0.86, + "learning_rate": 4.2877209411213854e-08, + "logits/chosen": -2.844278573989868, + "logits/rejected": -2.793128490447998, + "logps/chosen": -119.99345397949219, + "logps/rejected": -410.7127685546875, + "loss": 0.5345, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4307541847229004, + "rewards/margins": 1.6598570346832275, + "rewards/rejected": -3.090610980987549, + "step": 7453 + }, + { + "epoch": 0.86, + "learning_rate": 4.284209294158961e-08, + "logits/chosen": -3.179813861846924, + "logits/rejected": -3.3362083435058594, + "logps/chosen": -204.34750366210938, + "logps/rejected": -345.2425231933594, + "loss": 0.4562, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10708782076835632, + "rewards/margins": 3.1033618450164795, + "rewards/rejected": -2.99627423286438, + "step": 7454 + }, + { + "epoch": 0.86, + "learning_rate": 4.280697647196535e-08, + "logits/chosen": -3.684847354888916, + "logits/rejected": -3.5177319049835205, + "logps/chosen": -177.5576629638672, + "logps/rejected": -160.0171661376953, + "loss": 0.2672, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15750280022621155, + "rewards/margins": 1.802829623222351, + "rewards/rejected": -1.9603323936462402, + "step": 7455 + }, + { + "epoch": 0.86, + "learning_rate": 4.2771860002341096e-08, + "logits/chosen": -3.132546901702881, + "logits/rejected": -2.879006862640381, + "logps/chosen": -275.4530944824219, + "logps/rejected": -275.2625427246094, + "loss": 0.3477, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46610745787620544, + "rewards/margins": 1.7877979278564453, + "rewards/rejected": -1.321690320968628, + "step": 7456 + }, + { + "epoch": 0.86, + "learning_rate": 4.273674353271684e-08, + "logits/chosen": -2.4907331466674805, + "logits/rejected": -2.356266498565674, + "logps/chosen": -304.0224914550781, + "logps/rejected": -284.18243408203125, + "loss": 0.7237, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10557231307029724, + "rewards/margins": 1.4116867780685425, + "rewards/rejected": -1.3061144351959229, + "step": 7457 + }, + { + "epoch": 0.86, + "learning_rate": 4.270162706309259e-08, + "logits/chosen": -2.6891605854034424, + "logits/rejected": -3.0345568656921387, + "logps/chosen": -271.20416259765625, + "logps/rejected": -241.65138244628906, + "loss": 0.3497, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5012956261634827, + "rewards/margins": 1.2683813571929932, + "rewards/rejected": -1.7696770429611206, + "step": 7458 + }, + { + "epoch": 0.86, + "learning_rate": 4.266651059346833e-08, + "logits/chosen": -3.034236431121826, + "logits/rejected": -2.9846014976501465, + "logps/chosen": -338.949951171875, + "logps/rejected": -300.1862487792969, + "loss": 0.2965, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2846730947494507, + "rewards/margins": 2.5205001831054688, + "rewards/rejected": -2.80517315864563, + "step": 7459 + }, + { + "epoch": 0.86, + "learning_rate": 4.2631394123844085e-08, + "logits/chosen": -3.2985048294067383, + "logits/rejected": -3.3573484420776367, + "logps/chosen": -347.2588806152344, + "logps/rejected": -239.73162841796875, + "loss": 0.1661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32636868953704834, + "rewards/margins": 3.2175326347351074, + "rewards/rejected": -3.5439016819000244, + "step": 7460 + }, + { + "epoch": 0.86, + "learning_rate": 4.2596277654219825e-08, + "logits/chosen": -3.7688989639282227, + "logits/rejected": -3.655292510986328, + "logps/chosen": -335.7353820800781, + "logps/rejected": -334.39312744140625, + "loss": 0.5587, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6973587870597839, + "rewards/margins": 1.3293330669403076, + "rewards/rejected": -2.0266916751861572, + "step": 7461 + }, + { + "epoch": 0.86, + "learning_rate": 4.256116118459558e-08, + "logits/chosen": -2.362692356109619, + "logits/rejected": -2.668851375579834, + "logps/chosen": -341.8357849121094, + "logps/rejected": -315.2247619628906, + "loss": 1.1995, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8725191950798035, + "rewards/margins": 0.3874571919441223, + "rewards/rejected": -1.2599763870239258, + "step": 7462 + }, + { + "epoch": 0.86, + "learning_rate": 4.252604471497132e-08, + "logits/chosen": -2.983443021774292, + "logits/rejected": -2.673900842666626, + "logps/chosen": -276.78863525390625, + "logps/rejected": -220.83103942871094, + "loss": 0.3586, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2632012963294983, + "rewards/margins": 1.620740532875061, + "rewards/rejected": -1.357539176940918, + "step": 7463 + }, + { + "epoch": 0.86, + "learning_rate": 4.249092824534707e-08, + "logits/chosen": -3.424574375152588, + "logits/rejected": -3.274334192276001, + "logps/chosen": -324.7206115722656, + "logps/rejected": -227.92291259765625, + "loss": 0.3883, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41919419169425964, + "rewards/margins": 2.2573773860931396, + "rewards/rejected": -1.8381831645965576, + "step": 7464 + }, + { + "epoch": 0.86, + "learning_rate": 4.245581177572281e-08, + "logits/chosen": -3.573768138885498, + "logits/rejected": -3.5019264221191406, + "logps/chosen": -244.967529296875, + "logps/rejected": -179.06808471679688, + "loss": 0.3743, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.759601354598999, + "rewards/margins": 1.6167445182800293, + "rewards/rejected": -2.3763458728790283, + "step": 7465 + }, + { + "epoch": 0.86, + "learning_rate": 4.242069530609856e-08, + "logits/chosen": -3.2868311405181885, + "logits/rejected": -2.9286670684814453, + "logps/chosen": -280.3487854003906, + "logps/rejected": -325.92730712890625, + "loss": 0.5011, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9654921293258667, + "rewards/margins": 1.4948618412017822, + "rewards/rejected": -2.4603538513183594, + "step": 7466 + }, + { + "epoch": 0.86, + "learning_rate": 4.23855788364743e-08, + "logits/chosen": -3.278658866882324, + "logits/rejected": -2.9004158973693848, + "logps/chosen": -260.00482177734375, + "logps/rejected": -203.32496643066406, + "loss": 0.2063, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24211648106575012, + "rewards/margins": 2.291813373565674, + "rewards/rejected": -2.049696922302246, + "step": 7467 + }, + { + "epoch": 0.86, + "learning_rate": 4.2350462366850056e-08, + "logits/chosen": -2.5139119625091553, + "logits/rejected": -2.918936014175415, + "logps/chosen": -278.2324523925781, + "logps/rejected": -265.64056396484375, + "loss": 0.2222, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008336499333381653, + "rewards/margins": 3.565913677215576, + "rewards/rejected": -3.557577133178711, + "step": 7468 + }, + { + "epoch": 0.86, + "learning_rate": 4.2315345897225797e-08, + "logits/chosen": -3.3000893592834473, + "logits/rejected": -3.266570806503296, + "logps/chosen": -252.02493286132812, + "logps/rejected": -229.1265411376953, + "loss": 0.5732, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1088513657450676, + "rewards/margins": 1.0406867265701294, + "rewards/rejected": -0.931835412979126, + "step": 7469 + }, + { + "epoch": 0.86, + "learning_rate": 4.2280229427601544e-08, + "logits/chosen": -2.84344482421875, + "logits/rejected": -2.956928014755249, + "logps/chosen": -547.812255859375, + "logps/rejected": -253.36886596679688, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21478500962257385, + "rewards/margins": 2.367927074432373, + "rewards/rejected": -2.582712173461914, + "step": 7470 + }, + { + "epoch": 0.86, + "learning_rate": 4.224511295797729e-08, + "logits/chosen": -3.685360908508301, + "logits/rejected": -3.652024984359741, + "logps/chosen": -196.8019256591797, + "logps/rejected": -166.2613525390625, + "loss": 0.3634, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04689069092273712, + "rewards/margins": 1.3754754066467285, + "rewards/rejected": -1.4223662614822388, + "step": 7471 + }, + { + "epoch": 0.86, + "learning_rate": 4.220999648835304e-08, + "logits/chosen": -3.6516458988189697, + "logits/rejected": -3.679670810699463, + "logps/chosen": -365.682861328125, + "logps/rejected": -321.343994140625, + "loss": 0.5659, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6450765132904053, + "rewards/margins": 3.3201582431793213, + "rewards/rejected": -3.9652347564697266, + "step": 7472 + }, + { + "epoch": 0.86, + "learning_rate": 4.217488001872878e-08, + "logits/chosen": -2.4299967288970947, + "logits/rejected": -2.47464656829834, + "logps/chosen": -228.9652862548828, + "logps/rejected": -282.2402648925781, + "loss": 0.3287, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35253578424453735, + "rewards/margins": 2.2304391860961914, + "rewards/rejected": -1.877903699874878, + "step": 7473 + }, + { + "epoch": 0.86, + "learning_rate": 4.213976354910453e-08, + "logits/chosen": -3.2873289585113525, + "logits/rejected": -3.3333115577697754, + "logps/chosen": -163.8485107421875, + "logps/rejected": -237.61489868164062, + "loss": 0.2501, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5246586799621582, + "rewards/margins": 2.243171453475952, + "rewards/rejected": -1.718512773513794, + "step": 7474 + }, + { + "epoch": 0.86, + "learning_rate": 4.2104647079480273e-08, + "logits/chosen": -2.3115234375, + "logits/rejected": -2.742823362350464, + "logps/chosen": -277.1209411621094, + "logps/rejected": -258.4232177734375, + "loss": 0.4136, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029480498284101486, + "rewards/margins": 1.3066778182983398, + "rewards/rejected": -1.3361583948135376, + "step": 7475 + }, + { + "epoch": 0.86, + "learning_rate": 4.206953060985603e-08, + "logits/chosen": -3.0496387481689453, + "logits/rejected": -2.8698458671569824, + "logps/chosen": -350.2351989746094, + "logps/rejected": -324.0829772949219, + "loss": 0.0812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7080439329147339, + "rewards/margins": 3.947234869003296, + "rewards/rejected": -3.2391910552978516, + "step": 7476 + }, + { + "epoch": 0.86, + "learning_rate": 4.203441414023177e-08, + "logits/chosen": -3.299542188644409, + "logits/rejected": -3.1708617210388184, + "logps/chosen": -339.5414733886719, + "logps/rejected": -347.48199462890625, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2525096833705902, + "rewards/margins": 2.4536845684051514, + "rewards/rejected": -2.7061944007873535, + "step": 7477 + }, + { + "epoch": 0.86, + "learning_rate": 4.1999297670607515e-08, + "logits/chosen": -3.6724610328674316, + "logits/rejected": -3.360605239868164, + "logps/chosen": -248.93826293945312, + "logps/rejected": -267.2522888183594, + "loss": 0.6627, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6572730541229248, + "rewards/margins": 1.0041768550872803, + "rewards/rejected": -1.6614497900009155, + "step": 7478 + }, + { + "epoch": 0.86, + "learning_rate": 4.196418120098326e-08, + "logits/chosen": -3.0134902000427246, + "logits/rejected": -3.123918056488037, + "logps/chosen": -389.127685546875, + "logps/rejected": -264.9132080078125, + "loss": 0.1804, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14779280126094818, + "rewards/margins": 2.4338390827178955, + "rewards/rejected": -2.286046266555786, + "step": 7479 + }, + { + "epoch": 0.86, + "learning_rate": 4.192906473135901e-08, + "logits/chosen": -3.0954551696777344, + "logits/rejected": -3.039759397506714, + "logps/chosen": -435.6393127441406, + "logps/rejected": -275.943603515625, + "loss": 0.2179, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7951943874359131, + "rewards/margins": 2.22257399559021, + "rewards/rejected": -1.4273797273635864, + "step": 7480 + }, + { + "epoch": 0.86, + "learning_rate": 4.189394826173475e-08, + "logits/chosen": -3.071462869644165, + "logits/rejected": -2.8943090438842773, + "logps/chosen": -262.9268798828125, + "logps/rejected": -265.0702209472656, + "loss": 0.2635, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11157265305519104, + "rewards/margins": 2.0797624588012695, + "rewards/rejected": -1.9681898355484009, + "step": 7481 + }, + { + "epoch": 0.86, + "learning_rate": 4.185883179211049e-08, + "logits/chosen": -3.4534237384796143, + "logits/rejected": -3.446528911590576, + "logps/chosen": -318.72967529296875, + "logps/rejected": -313.9820556640625, + "loss": 0.3996, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24750715494155884, + "rewards/margins": 1.840911865234375, + "rewards/rejected": -1.5934045314788818, + "step": 7482 + }, + { + "epoch": 0.86, + "learning_rate": 4.1823715322486245e-08, + "logits/chosen": -3.7844297885894775, + "logits/rejected": -3.8877103328704834, + "logps/chosen": -244.54388427734375, + "logps/rejected": -232.22509765625, + "loss": 0.263, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08492715656757355, + "rewards/margins": 3.0766897201538086, + "rewards/rejected": -3.1616170406341553, + "step": 7483 + }, + { + "epoch": 0.86, + "learning_rate": 4.1788598852861985e-08, + "logits/chosen": -2.9961469173431396, + "logits/rejected": -2.7090096473693848, + "logps/chosen": -260.53289794921875, + "logps/rejected": -286.3724365234375, + "loss": 0.1086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26659414172172546, + "rewards/margins": 2.78131103515625, + "rewards/rejected": -2.5147171020507812, + "step": 7484 + }, + { + "epoch": 0.86, + "learning_rate": 4.175348238323774e-08, + "logits/chosen": -2.8050222396850586, + "logits/rejected": -2.820755958557129, + "logps/chosen": -152.08262634277344, + "logps/rejected": -117.54035949707031, + "loss": 0.408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0013656988739967346, + "rewards/margins": 1.194110631942749, + "rewards/rejected": -1.1954764127731323, + "step": 7485 + }, + { + "epoch": 0.86, + "learning_rate": 4.171836591361348e-08, + "logits/chosen": -3.3114819526672363, + "logits/rejected": -2.991631031036377, + "logps/chosen": -315.25103759765625, + "logps/rejected": -169.7267303466797, + "loss": 0.5911, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09069804847240448, + "rewards/margins": 1.2933461666107178, + "rewards/rejected": -1.3840441703796387, + "step": 7486 + }, + { + "epoch": 0.86, + "learning_rate": 4.168324944398923e-08, + "logits/chosen": -3.181676149368286, + "logits/rejected": -2.9140467643737793, + "logps/chosen": -418.30523681640625, + "logps/rejected": -312.2193603515625, + "loss": 0.2735, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40348368883132935, + "rewards/margins": 2.4174458980560303, + "rewards/rejected": -2.820929527282715, + "step": 7487 + }, + { + "epoch": 0.86, + "learning_rate": 4.1648132974364974e-08, + "logits/chosen": -3.116424560546875, + "logits/rejected": -2.909649610519409, + "logps/chosen": -275.90972900390625, + "logps/rejected": -311.2938232421875, + "loss": 0.3671, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32641926407814026, + "rewards/margins": 1.2905763387680054, + "rewards/rejected": -0.9641571044921875, + "step": 7488 + }, + { + "epoch": 0.86, + "learning_rate": 4.161301650474072e-08, + "logits/chosen": -3.090491771697998, + "logits/rejected": -3.1310951709747314, + "logps/chosen": -254.7110595703125, + "logps/rejected": -272.84161376953125, + "loss": 0.6334, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47798240184783936, + "rewards/margins": 1.0938502550125122, + "rewards/rejected": -1.5718326568603516, + "step": 7489 + }, + { + "epoch": 0.86, + "learning_rate": 4.157790003511646e-08, + "logits/chosen": -3.623589038848877, + "logits/rejected": -2.9990170001983643, + "logps/chosen": -568.4430541992188, + "logps/rejected": -363.9608459472656, + "loss": 0.3357, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9769412279129028, + "rewards/margins": 2.153909683227539, + "rewards/rejected": -3.1308507919311523, + "step": 7490 + }, + { + "epoch": 0.86, + "learning_rate": 4.1542783565492216e-08, + "logits/chosen": -2.6676650047302246, + "logits/rejected": -2.611044406890869, + "logps/chosen": -230.71136474609375, + "logps/rejected": -247.94361877441406, + "loss": 0.5683, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5342506170272827, + "rewards/margins": 0.5014493465423584, + "rewards/rejected": -1.0356999635696411, + "step": 7491 + }, + { + "epoch": 0.86, + "learning_rate": 4.1507667095867957e-08, + "logits/chosen": -2.454514265060425, + "logits/rejected": -2.389425039291382, + "logps/chosen": -384.9090576171875, + "logps/rejected": -474.991455078125, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16366291046142578, + "rewards/margins": 1.14713716506958, + "rewards/rejected": -1.3108000755310059, + "step": 7492 + }, + { + "epoch": 0.86, + "learning_rate": 4.147255062624371e-08, + "logits/chosen": -2.606959581375122, + "logits/rejected": -2.4475574493408203, + "logps/chosen": -324.0587158203125, + "logps/rejected": -186.76583862304688, + "loss": 0.8865, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2293229103088379, + "rewards/margins": 0.16705404222011566, + "rewards/rejected": -0.39637693762779236, + "step": 7493 + }, + { + "epoch": 0.86, + "learning_rate": 4.143743415661945e-08, + "logits/chosen": -3.3281681537628174, + "logits/rejected": -3.3591434955596924, + "logps/chosen": -164.9707794189453, + "logps/rejected": -180.0811309814453, + "loss": 0.3258, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05662701278924942, + "rewards/margins": 1.291383981704712, + "rewards/rejected": -1.2347568273544312, + "step": 7494 + }, + { + "epoch": 0.86, + "learning_rate": 4.14023176869952e-08, + "logits/chosen": -3.0743939876556396, + "logits/rejected": -3.1544923782348633, + "logps/chosen": -384.84295654296875, + "logps/rejected": -375.42315673828125, + "loss": 0.9013, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21830201148986816, + "rewards/margins": -0.1918463110923767, + "rewards/rejected": -0.02645571529865265, + "step": 7495 + }, + { + "epoch": 0.86, + "learning_rate": 4.1367201217370945e-08, + "logits/chosen": -3.3725266456604004, + "logits/rejected": -3.035233736038208, + "logps/chosen": -420.79559326171875, + "logps/rejected": -261.7130126953125, + "loss": 0.2886, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10130177438259125, + "rewards/margins": 1.6438865661621094, + "rewards/rejected": -1.542584776878357, + "step": 7496 + }, + { + "epoch": 0.86, + "learning_rate": 4.133208474774669e-08, + "logits/chosen": -2.9534573554992676, + "logits/rejected": -2.749403715133667, + "logps/chosen": -191.3076171875, + "logps/rejected": -219.6448974609375, + "loss": 0.5168, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46717843413352966, + "rewards/margins": 2.3963229656219482, + "rewards/rejected": -1.9291445016860962, + "step": 7497 + }, + { + "epoch": 0.86, + "learning_rate": 4.129696827812243e-08, + "logits/chosen": -2.8842101097106934, + "logits/rejected": -3.069355010986328, + "logps/chosen": -185.71743774414062, + "logps/rejected": -290.4529113769531, + "loss": 0.235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1757672131061554, + "rewards/margins": 2.003011703491211, + "rewards/rejected": -1.827244520187378, + "step": 7498 + }, + { + "epoch": 0.86, + "learning_rate": 4.126185180849819e-08, + "logits/chosen": -3.3114445209503174, + "logits/rejected": -3.054583787918091, + "logps/chosen": -405.69580078125, + "logps/rejected": -415.8905029296875, + "loss": 0.4532, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14278483390808105, + "rewards/margins": 1.3095752000808716, + "rewards/rejected": -1.452359914779663, + "step": 7499 + }, + { + "epoch": 0.86, + "learning_rate": 4.122673533887393e-08, + "logits/chosen": -1.9591503143310547, + "logits/rejected": -1.8553916215896606, + "logps/chosen": -246.1662139892578, + "logps/rejected": -344.12335205078125, + "loss": 0.3975, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0316905602812767, + "rewards/margins": 1.7500603199005127, + "rewards/rejected": -1.7817507982254028, + "step": 7500 + }, + { + "epoch": 0.86, + "learning_rate": 4.1191618869249675e-08, + "logits/chosen": -2.514021396636963, + "logits/rejected": -2.9005823135375977, + "logps/chosen": -197.38539123535156, + "logps/rejected": -261.93023681640625, + "loss": 0.373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07319873571395874, + "rewards/margins": 2.0075764656066895, + "rewards/rejected": -2.080775260925293, + "step": 7501 + }, + { + "epoch": 0.86, + "learning_rate": 4.115650239962542e-08, + "logits/chosen": -2.884659767150879, + "logits/rejected": -2.8863797187805176, + "logps/chosen": -175.53038024902344, + "logps/rejected": -196.93392944335938, + "loss": 0.3917, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11727648973464966, + "rewards/margins": 1.8456131219863892, + "rewards/rejected": -1.7283365726470947, + "step": 7502 + }, + { + "epoch": 0.86, + "learning_rate": 4.112138593000117e-08, + "logits/chosen": -3.177049160003662, + "logits/rejected": -3.5217041969299316, + "logps/chosen": -287.2966613769531, + "logps/rejected": -221.2364501953125, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07297545671463013, + "rewards/margins": 1.7879159450531006, + "rewards/rejected": -1.860891342163086, + "step": 7503 + }, + { + "epoch": 0.87, + "learning_rate": 4.108626946037691e-08, + "logits/chosen": -3.1697311401367188, + "logits/rejected": -3.2719602584838867, + "logps/chosen": -231.26246643066406, + "logps/rejected": -236.20660400390625, + "loss": 0.5863, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26350268721580505, + "rewards/margins": 1.2640931606292725, + "rewards/rejected": -1.5275959968566895, + "step": 7504 + }, + { + "epoch": 0.87, + "learning_rate": 4.1051152990752664e-08, + "logits/chosen": -2.5208098888397217, + "logits/rejected": -2.491269588470459, + "logps/chosen": -260.0534362792969, + "logps/rejected": -430.389404296875, + "loss": 0.5539, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3080190420150757, + "rewards/margins": 1.37216317653656, + "rewards/rejected": -1.6801822185516357, + "step": 7505 + }, + { + "epoch": 0.87, + "learning_rate": 4.1016036521128405e-08, + "logits/chosen": -3.1013498306274414, + "logits/rejected": -3.210181474685669, + "logps/chosen": -123.26429748535156, + "logps/rejected": -237.262451171875, + "loss": 0.4469, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6994352340698242, + "rewards/margins": 1.352970004081726, + "rewards/rejected": -2.0524051189422607, + "step": 7506 + }, + { + "epoch": 0.87, + "learning_rate": 4.098092005150416e-08, + "logits/chosen": -3.3366494178771973, + "logits/rejected": -3.5089378356933594, + "logps/chosen": -196.43618774414062, + "logps/rejected": -243.03231811523438, + "loss": 0.4846, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.515826940536499, + "rewards/margins": 1.2663086652755737, + "rewards/rejected": -0.7504817247390747, + "step": 7507 + }, + { + "epoch": 0.87, + "learning_rate": 4.09458035818799e-08, + "logits/chosen": -2.879525899887085, + "logits/rejected": -2.9187309741973877, + "logps/chosen": -379.3719482421875, + "logps/rejected": -393.375244140625, + "loss": 0.4655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27074727416038513, + "rewards/margins": 1.5144315958023071, + "rewards/rejected": -1.785178780555725, + "step": 7508 + }, + { + "epoch": 0.87, + "learning_rate": 4.0910687112255646e-08, + "logits/chosen": -3.0926856994628906, + "logits/rejected": -3.245452880859375, + "logps/chosen": -357.85064697265625, + "logps/rejected": -420.8505554199219, + "loss": 0.7255, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06324917823076248, + "rewards/margins": 0.33078742027282715, + "rewards/rejected": -0.39403659105300903, + "step": 7509 + }, + { + "epoch": 0.87, + "learning_rate": 4.0875570642631394e-08, + "logits/chosen": -3.3347227573394775, + "logits/rejected": -3.516310453414917, + "logps/chosen": -356.01812744140625, + "logps/rejected": -315.1039123535156, + "loss": 0.8123, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6355506181716919, + "rewards/margins": 1.016275405883789, + "rewards/rejected": -1.6518261432647705, + "step": 7510 + }, + { + "epoch": 0.87, + "learning_rate": 4.084045417300714e-08, + "logits/chosen": -3.159053325653076, + "logits/rejected": -3.204591751098633, + "logps/chosen": -108.9964370727539, + "logps/rejected": -346.22528076171875, + "loss": 0.1487, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23483017086982727, + "rewards/margins": 4.039377212524414, + "rewards/rejected": -3.804547071456909, + "step": 7511 + }, + { + "epoch": 0.87, + "learning_rate": 4.080533770338288e-08, + "logits/chosen": -3.118699073791504, + "logits/rejected": -3.27553391456604, + "logps/chosen": -440.9292907714844, + "logps/rejected": -382.7550048828125, + "loss": 0.2174, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19414466619491577, + "rewards/margins": 3.3705883026123047, + "rewards/rejected": -3.564732789993286, + "step": 7512 + }, + { + "epoch": 0.87, + "learning_rate": 4.0770221233758635e-08, + "logits/chosen": -3.103029251098633, + "logits/rejected": -3.4165267944335938, + "logps/chosen": -301.21356201171875, + "logps/rejected": -225.13328552246094, + "loss": 0.6869, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9004427194595337, + "rewards/margins": 0.9406253099441528, + "rewards/rejected": -1.8410680294036865, + "step": 7513 + }, + { + "epoch": 0.87, + "learning_rate": 4.0735104764134376e-08, + "logits/chosen": -3.385509967803955, + "logits/rejected": -3.5735716819763184, + "logps/chosen": -327.01092529296875, + "logps/rejected": -266.5755920410156, + "loss": 0.3032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39524391293525696, + "rewards/margins": 2.1370673179626465, + "rewards/rejected": -1.741823434829712, + "step": 7514 + }, + { + "epoch": 0.87, + "learning_rate": 4.069998829451012e-08, + "logits/chosen": -2.763573169708252, + "logits/rejected": -2.8655359745025635, + "logps/chosen": -379.47528076171875, + "logps/rejected": -254.66253662109375, + "loss": 0.6002, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4422298073768616, + "rewards/margins": 1.366270661354065, + "rewards/rejected": -1.8085005283355713, + "step": 7515 + }, + { + "epoch": 0.87, + "learning_rate": 4.066487182488587e-08, + "logits/chosen": -2.4003007411956787, + "logits/rejected": -2.61470627784729, + "logps/chosen": -375.8123779296875, + "logps/rejected": -239.82846069335938, + "loss": 0.2216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38269561529159546, + "rewards/margins": 2.160046339035034, + "rewards/rejected": -1.7773507833480835, + "step": 7516 + }, + { + "epoch": 0.87, + "learning_rate": 4.062975535526162e-08, + "logits/chosen": -2.7294671535491943, + "logits/rejected": -2.695668935775757, + "logps/chosen": -227.7852020263672, + "logps/rejected": -243.02723693847656, + "loss": 0.2116, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16507235169410706, + "rewards/margins": 2.392014265060425, + "rewards/rejected": -2.2269418239593506, + "step": 7517 + }, + { + "epoch": 0.87, + "learning_rate": 4.059463888563736e-08, + "logits/chosen": -3.5599355697631836, + "logits/rejected": -3.4366002082824707, + "logps/chosen": -222.60365295410156, + "logps/rejected": -233.5117645263672, + "loss": 0.1625, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6110637187957764, + "rewards/margins": 3.060159921646118, + "rewards/rejected": -2.449096441268921, + "step": 7518 + }, + { + "epoch": 0.87, + "learning_rate": 4.055952241601311e-08, + "logits/chosen": -3.574110984802246, + "logits/rejected": -3.231353998184204, + "logps/chosen": -220.97280883789062, + "logps/rejected": -162.0476531982422, + "loss": 0.326, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3030501902103424, + "rewards/margins": 1.923255205154419, + "rewards/rejected": -1.6202049255371094, + "step": 7519 + }, + { + "epoch": 0.87, + "learning_rate": 4.052440594638885e-08, + "logits/chosen": -2.9844348430633545, + "logits/rejected": -3.0909149646759033, + "logps/chosen": -362.608642578125, + "logps/rejected": -183.5186004638672, + "loss": 0.295, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19992396235466003, + "rewards/margins": 1.9291026592254639, + "rewards/rejected": -2.1290266513824463, + "step": 7520 + }, + { + "epoch": 0.87, + "learning_rate": 4.0489289476764607e-08, + "logits/chosen": -3.32062029838562, + "logits/rejected": -2.881941318511963, + "logps/chosen": -190.5640106201172, + "logps/rejected": -229.21377563476562, + "loss": 0.5446, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2819797992706299, + "rewards/margins": 1.2030670642852783, + "rewards/rejected": -1.4850468635559082, + "step": 7521 + }, + { + "epoch": 0.87, + "learning_rate": 4.045417300714035e-08, + "logits/chosen": -2.8962180614471436, + "logits/rejected": -2.7982969284057617, + "logps/chosen": -273.21417236328125, + "logps/rejected": -374.62750244140625, + "loss": 0.211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14779920876026154, + "rewards/margins": 2.698577642440796, + "rewards/rejected": -2.550778388977051, + "step": 7522 + }, + { + "epoch": 0.87, + "learning_rate": 4.0419056537516094e-08, + "logits/chosen": -3.433238983154297, + "logits/rejected": -3.199942111968994, + "logps/chosen": -289.3759765625, + "logps/rejected": -242.98733520507812, + "loss": 0.2824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03205490857362747, + "rewards/margins": 2.0512232780456543, + "rewards/rejected": -2.083278179168701, + "step": 7523 + }, + { + "epoch": 0.87, + "learning_rate": 4.038394006789184e-08, + "logits/chosen": -4.248073577880859, + "logits/rejected": -3.8847827911376953, + "logps/chosen": -257.0155334472656, + "logps/rejected": -194.49642944335938, + "loss": 0.5694, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5786929726600647, + "rewards/margins": 0.7365216016769409, + "rewards/rejected": -1.3152146339416504, + "step": 7524 + }, + { + "epoch": 0.87, + "learning_rate": 4.034882359826759e-08, + "logits/chosen": -3.0310347080230713, + "logits/rejected": -3.1769967079162598, + "logps/chosen": -421.77264404296875, + "logps/rejected": -248.3815155029297, + "loss": 0.6999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5656816959381104, + "rewards/margins": 1.5136704444885254, + "rewards/rejected": -2.0793521404266357, + "step": 7525 + }, + { + "epoch": 0.87, + "learning_rate": 4.031370712864333e-08, + "logits/chosen": -2.5610783100128174, + "logits/rejected": -2.6042449474334717, + "logps/chosen": -148.31878662109375, + "logps/rejected": -257.6938171386719, + "loss": 0.2598, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.029988639056682587, + "rewards/margins": 3.3818955421447754, + "rewards/rejected": -3.3519067764282227, + "step": 7526 + }, + { + "epoch": 0.87, + "learning_rate": 4.0278590659019083e-08, + "logits/chosen": -3.9426746368408203, + "logits/rejected": -3.89273738861084, + "logps/chosen": -197.83651733398438, + "logps/rejected": -188.46759033203125, + "loss": 0.2264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.015062347054481506, + "rewards/margins": 2.5096335411071777, + "rewards/rejected": -2.524695873260498, + "step": 7527 + }, + { + "epoch": 0.87, + "learning_rate": 4.0243474189394824e-08, + "logits/chosen": -3.5719757080078125, + "logits/rejected": -3.159079074859619, + "logps/chosen": -302.1076965332031, + "logps/rejected": -284.2590637207031, + "loss": 0.5131, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4441414773464203, + "rewards/margins": 1.3435890674591064, + "rewards/rejected": -1.7877305746078491, + "step": 7528 + }, + { + "epoch": 0.87, + "learning_rate": 4.0208357719770565e-08, + "logits/chosen": -2.519930839538574, + "logits/rejected": -2.7203004360198975, + "logps/chosen": -362.00946044921875, + "logps/rejected": -297.5560302734375, + "loss": 0.2791, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29538097977638245, + "rewards/margins": 1.7654409408569336, + "rewards/rejected": -2.060822010040283, + "step": 7529 + }, + { + "epoch": 0.87, + "learning_rate": 4.017324125014632e-08, + "logits/chosen": -3.2347335815429688, + "logits/rejected": -3.497551202774048, + "logps/chosen": -320.5330810546875, + "logps/rejected": -349.33648681640625, + "loss": 0.6814, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04571977257728577, + "rewards/margins": 1.0934126377105713, + "rewards/rejected": -1.1391323804855347, + "step": 7530 + }, + { + "epoch": 0.87, + "learning_rate": 4.013812478052206e-08, + "logits/chosen": -3.31465482711792, + "logits/rejected": -3.561979293823242, + "logps/chosen": -316.0699462890625, + "logps/rejected": -204.8778076171875, + "loss": 0.3241, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24360236525535583, + "rewards/margins": 1.8318272829055786, + "rewards/rejected": -2.075429677963257, + "step": 7531 + }, + { + "epoch": 0.87, + "learning_rate": 4.0103008310897806e-08, + "logits/chosen": -3.580631732940674, + "logits/rejected": -3.8744168281555176, + "logps/chosen": -189.57240295410156, + "logps/rejected": -237.7109375, + "loss": 0.3708, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36793458461761475, + "rewards/margins": 1.5106291770935059, + "rewards/rejected": -1.8785638809204102, + "step": 7532 + }, + { + "epoch": 0.87, + "learning_rate": 4.0067891841273554e-08, + "logits/chosen": -2.4814884662628174, + "logits/rejected": -2.549510955810547, + "logps/chosen": -223.60247802734375, + "logps/rejected": -226.96255493164062, + "loss": 0.3174, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40669548511505127, + "rewards/margins": 1.7812680006027222, + "rewards/rejected": -2.1879634857177734, + "step": 7533 + }, + { + "epoch": 0.87, + "learning_rate": 4.00327753716493e-08, + "logits/chosen": -2.2782785892486572, + "logits/rejected": -2.2890071868896484, + "logps/chosen": -482.6736145019531, + "logps/rejected": -409.7596740722656, + "loss": 0.4736, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.263846755027771, + "rewards/margins": 2.0601847171783447, + "rewards/rejected": -2.324031352996826, + "step": 7534 + }, + { + "epoch": 0.87, + "learning_rate": 3.999765890202504e-08, + "logits/chosen": -3.291430711746216, + "logits/rejected": -3.0477380752563477, + "logps/chosen": -160.39877319335938, + "logps/rejected": -149.31155395507812, + "loss": 0.5069, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.298551470041275, + "rewards/margins": 0.9882910847663879, + "rewards/rejected": -0.6897397041320801, + "step": 7535 + }, + { + "epoch": 0.87, + "learning_rate": 3.9962542432400795e-08, + "logits/chosen": -3.5985238552093506, + "logits/rejected": -3.789578914642334, + "logps/chosen": -300.0981140136719, + "logps/rejected": -290.34930419921875, + "loss": 0.1317, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28566309809684753, + "rewards/margins": 2.4673538208007812, + "rewards/rejected": -2.1816906929016113, + "step": 7536 + }, + { + "epoch": 0.87, + "learning_rate": 3.9927425962776536e-08, + "logits/chosen": -3.0996973514556885, + "logits/rejected": -3.2523012161254883, + "logps/chosen": -93.00965118408203, + "logps/rejected": -167.6415252685547, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1543428897857666, + "rewards/margins": 2.309688091278076, + "rewards/rejected": -2.1553452014923096, + "step": 7537 + }, + { + "epoch": 0.87, + "learning_rate": 3.989230949315229e-08, + "logits/chosen": -3.614081382751465, + "logits/rejected": -3.0137524604797363, + "logps/chosen": -420.4374694824219, + "logps/rejected": -291.822509765625, + "loss": 0.2361, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0666956752538681, + "rewards/margins": 2.596250295639038, + "rewards/rejected": -2.529554843902588, + "step": 7538 + }, + { + "epoch": 0.87, + "learning_rate": 3.985719302352803e-08, + "logits/chosen": -2.2222652435302734, + "logits/rejected": -2.005113124847412, + "logps/chosen": -150.3624267578125, + "logps/rejected": -277.3478088378906, + "loss": 0.2177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22399689257144928, + "rewards/margins": 1.8131697177886963, + "rewards/rejected": -1.5891728401184082, + "step": 7539 + }, + { + "epoch": 0.87, + "learning_rate": 3.982207655390378e-08, + "logits/chosen": -2.625220775604248, + "logits/rejected": -2.5450353622436523, + "logps/chosen": -577.5968017578125, + "logps/rejected": -337.7035217285156, + "loss": 0.2396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17025962471961975, + "rewards/margins": 1.8822531700134277, + "rewards/rejected": -2.0525126457214355, + "step": 7540 + }, + { + "epoch": 0.87, + "learning_rate": 3.9786960084279525e-08, + "logits/chosen": -2.866905689239502, + "logits/rejected": -2.8613975048065186, + "logps/chosen": -262.69256591796875, + "logps/rejected": -273.4716796875, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6296974420547485, + "rewards/margins": 1.670243501663208, + "rewards/rejected": -1.0405460596084595, + "step": 7541 + }, + { + "epoch": 0.87, + "learning_rate": 3.975184361465527e-08, + "logits/chosen": -2.6864123344421387, + "logits/rejected": -2.748539447784424, + "logps/chosen": -211.3208770751953, + "logps/rejected": -346.57586669921875, + "loss": 0.1486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5239019989967346, + "rewards/margins": 2.248486042022705, + "rewards/rejected": -1.7245843410491943, + "step": 7542 + }, + { + "epoch": 0.87, + "learning_rate": 3.971672714503101e-08, + "logits/chosen": -3.701113224029541, + "logits/rejected": -3.444645881652832, + "logps/chosen": -307.62066650390625, + "logps/rejected": -141.19769287109375, + "loss": 0.3325, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0024237819015979767, + "rewards/margins": 1.8553813695907593, + "rewards/rejected": -1.8578052520751953, + "step": 7543 + }, + { + "epoch": 0.87, + "learning_rate": 3.9681610675406767e-08, + "logits/chosen": -3.069775104522705, + "logits/rejected": -3.1007938385009766, + "logps/chosen": -233.069091796875, + "logps/rejected": -181.84300231933594, + "loss": 0.4102, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2961309254169464, + "rewards/margins": 1.0707422494888306, + "rewards/rejected": -1.3668732643127441, + "step": 7544 + }, + { + "epoch": 0.87, + "learning_rate": 3.964649420578251e-08, + "logits/chosen": -3.6050350666046143, + "logits/rejected": -3.361511468887329, + "logps/chosen": -189.4403533935547, + "logps/rejected": -212.89736938476562, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17795979976654053, + "rewards/margins": 1.754941463470459, + "rewards/rejected": -1.932901382446289, + "step": 7545 + }, + { + "epoch": 0.87, + "learning_rate": 3.961137773615826e-08, + "logits/chosen": -3.1898300647735596, + "logits/rejected": -2.81356143951416, + "logps/chosen": -215.05674743652344, + "logps/rejected": -209.92745971679688, + "loss": 0.2356, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2028280794620514, + "rewards/margins": 1.6892858743667603, + "rewards/rejected": -1.4864578247070312, + "step": 7546 + }, + { + "epoch": 0.87, + "learning_rate": 3.9576261266534e-08, + "logits/chosen": -3.5576388835906982, + "logits/rejected": -3.5651230812072754, + "logps/chosen": -428.13818359375, + "logps/rejected": -263.2501220703125, + "loss": 0.3199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11587333679199219, + "rewards/margins": 1.9149330854415894, + "rewards/rejected": -2.030806303024292, + "step": 7547 + }, + { + "epoch": 0.87, + "learning_rate": 3.954114479690975e-08, + "logits/chosen": -2.6316440105438232, + "logits/rejected": -3.162478446960449, + "logps/chosen": -251.7567901611328, + "logps/rejected": -264.64935302734375, + "loss": 0.65, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11990400403738022, + "rewards/margins": 1.2034716606140137, + "rewards/rejected": -1.3233757019042969, + "step": 7548 + }, + { + "epoch": 0.87, + "learning_rate": 3.950602832728549e-08, + "logits/chosen": -3.2327375411987305, + "logits/rejected": -3.2179059982299805, + "logps/chosen": -221.9896240234375, + "logps/rejected": -221.55279541015625, + "loss": 0.3705, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41803479194641113, + "rewards/margins": 1.3247987031936646, + "rewards/rejected": -1.7428333759307861, + "step": 7549 + }, + { + "epoch": 0.87, + "learning_rate": 3.9470911857661243e-08, + "logits/chosen": -2.3771135807037354, + "logits/rejected": -2.471571445465088, + "logps/chosen": -329.99822998046875, + "logps/rejected": -350.55645751953125, + "loss": 0.3153, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.046194493770599365, + "rewards/margins": 1.6235947608947754, + "rewards/rejected": -1.5774002075195312, + "step": 7550 + }, + { + "epoch": 0.87, + "learning_rate": 3.9435795388036984e-08, + "logits/chosen": -3.066906213760376, + "logits/rejected": -2.8141098022460938, + "logps/chosen": -396.1695556640625, + "logps/rejected": -274.3712158203125, + "loss": 0.3162, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4541410207748413, + "rewards/margins": 1.4618275165557861, + "rewards/rejected": -1.915968656539917, + "step": 7551 + }, + { + "epoch": 0.87, + "learning_rate": 3.940067891841274e-08, + "logits/chosen": -2.9881432056427, + "logits/rejected": -2.752713680267334, + "logps/chosen": -182.09982299804688, + "logps/rejected": -306.08251953125, + "loss": 0.4522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32706135511398315, + "rewards/margins": 1.5259308815002441, + "rewards/rejected": -1.8529921770095825, + "step": 7552 + }, + { + "epoch": 0.87, + "learning_rate": 3.936556244878848e-08, + "logits/chosen": -3.5800466537475586, + "logits/rejected": -3.587484121322632, + "logps/chosen": -291.1708679199219, + "logps/rejected": -208.8103485107422, + "loss": 0.2404, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4191213548183441, + "rewards/margins": 1.6652395725250244, + "rewards/rejected": -1.2461183071136475, + "step": 7553 + }, + { + "epoch": 0.87, + "learning_rate": 3.9330445979164226e-08, + "logits/chosen": -2.3640952110290527, + "logits/rejected": -2.346482276916504, + "logps/chosen": -120.52156829833984, + "logps/rejected": -206.947998046875, + "loss": 0.4608, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6944410800933838, + "rewards/margins": 1.7416150569915771, + "rewards/rejected": -2.43605637550354, + "step": 7554 + }, + { + "epoch": 0.87, + "learning_rate": 3.929532950953997e-08, + "logits/chosen": -3.842179775238037, + "logits/rejected": -3.9236738681793213, + "logps/chosen": -162.48321533203125, + "logps/rejected": -280.4861755371094, + "loss": 0.2041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04581344500184059, + "rewards/margins": 2.558535575866699, + "rewards/rejected": -2.5127224922180176, + "step": 7555 + }, + { + "epoch": 0.87, + "learning_rate": 3.926021303991572e-08, + "logits/chosen": -2.7926151752471924, + "logits/rejected": -2.5377161502838135, + "logps/chosen": -257.3371276855469, + "logps/rejected": -339.5315246582031, + "loss": 0.1766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3628157675266266, + "rewards/margins": 2.4191765785217285, + "rewards/rejected": -2.7819924354553223, + "step": 7556 + }, + { + "epoch": 0.87, + "learning_rate": 3.922509657029146e-08, + "logits/chosen": -3.9925270080566406, + "logits/rejected": -4.044466972351074, + "logps/chosen": -209.60350036621094, + "logps/rejected": -239.74606323242188, + "loss": 0.3202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5925271511077881, + "rewards/margins": 2.387394428253174, + "rewards/rejected": -2.979921579360962, + "step": 7557 + }, + { + "epoch": 0.87, + "learning_rate": 3.9189980100667215e-08, + "logits/chosen": -2.9197874069213867, + "logits/rejected": -2.8317458629608154, + "logps/chosen": -258.7553405761719, + "logps/rejected": -380.9085693359375, + "loss": 0.2855, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08691045641899109, + "rewards/margins": 2.3651578426361084, + "rewards/rejected": -2.4520680904388428, + "step": 7558 + }, + { + "epoch": 0.87, + "learning_rate": 3.9154863631042955e-08, + "logits/chosen": -3.186643600463867, + "logits/rejected": -3.0660080909729004, + "logps/chosen": -307.81243896484375, + "logps/rejected": -316.4818115234375, + "loss": 0.1864, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2185508757829666, + "rewards/margins": 2.727076768875122, + "rewards/rejected": -2.9456276893615723, + "step": 7559 + }, + { + "epoch": 0.87, + "learning_rate": 3.911974716141871e-08, + "logits/chosen": -3.603273630142212, + "logits/rejected": -3.1034016609191895, + "logps/chosen": -219.39370727539062, + "logps/rejected": -177.79420471191406, + "loss": 0.4163, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4160585403442383, + "rewards/margins": 1.7824475765228271, + "rewards/rejected": -1.3663891553878784, + "step": 7560 + }, + { + "epoch": 0.87, + "learning_rate": 3.908463069179445e-08, + "logits/chosen": -3.111633777618408, + "logits/rejected": -2.7252566814422607, + "logps/chosen": -306.180908203125, + "logps/rejected": -138.95387268066406, + "loss": 0.3558, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09783172607421875, + "rewards/margins": 1.4430997371673584, + "rewards/rejected": -1.5409314632415771, + "step": 7561 + }, + { + "epoch": 0.87, + "learning_rate": 3.90495142221702e-08, + "logits/chosen": -4.150136947631836, + "logits/rejected": -3.5667905807495117, + "logps/chosen": -231.4912567138672, + "logps/rejected": -138.6046142578125, + "loss": 0.2631, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20945841073989868, + "rewards/margins": 1.8365755081176758, + "rewards/rejected": -2.0460338592529297, + "step": 7562 + }, + { + "epoch": 0.87, + "learning_rate": 3.9014397752545944e-08, + "logits/chosen": -2.8923513889312744, + "logits/rejected": -2.9932138919830322, + "logps/chosen": -242.42477416992188, + "logps/rejected": -232.68936157226562, + "loss": 0.2204, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4024260640144348, + "rewards/margins": 2.592386245727539, + "rewards/rejected": -2.189960241317749, + "step": 7563 + }, + { + "epoch": 0.87, + "learning_rate": 3.897928128292169e-08, + "logits/chosen": -3.0339345932006836, + "logits/rejected": -2.7042338848114014, + "logps/chosen": -338.98541259765625, + "logps/rejected": -379.7603759765625, + "loss": 0.1312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06723837554454803, + "rewards/margins": 2.9604616165161133, + "rewards/rejected": -2.893223285675049, + "step": 7564 + }, + { + "epoch": 0.87, + "learning_rate": 3.894416481329743e-08, + "logits/chosen": -3.0354435443878174, + "logits/rejected": -2.9515721797943115, + "logps/chosen": -371.57928466796875, + "logps/rejected": -256.2343444824219, + "loss": 0.8974, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45775118470191956, + "rewards/margins": 1.490973949432373, + "rewards/rejected": -1.9487251043319702, + "step": 7565 + }, + { + "epoch": 0.87, + "learning_rate": 3.8909048343673186e-08, + "logits/chosen": -4.086225509643555, + "logits/rejected": -3.9285669326782227, + "logps/chosen": -410.01763916015625, + "logps/rejected": -297.49639892578125, + "loss": 0.3326, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5513190031051636, + "rewards/margins": 2.1904683113098145, + "rewards/rejected": -2.7417869567871094, + "step": 7566 + }, + { + "epoch": 0.87, + "learning_rate": 3.8873931874048927e-08, + "logits/chosen": -2.459522247314453, + "logits/rejected": -2.658433675765991, + "logps/chosen": -353.91546630859375, + "logps/rejected": -315.06787109375, + "loss": 0.635, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1849309802055359, + "rewards/margins": 0.7666946053504944, + "rewards/rejected": -0.9516257047653198, + "step": 7567 + }, + { + "epoch": 0.87, + "learning_rate": 3.8838815404424674e-08, + "logits/chosen": -3.2379226684570312, + "logits/rejected": -3.097601890563965, + "logps/chosen": -373.33892822265625, + "logps/rejected": -292.4755554199219, + "loss": 0.341, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07962757349014282, + "rewards/margins": 1.3106374740600586, + "rewards/rejected": -1.2310099601745605, + "step": 7568 + }, + { + "epoch": 0.87, + "learning_rate": 3.880369893480042e-08, + "logits/chosen": -3.161777973175049, + "logits/rejected": -3.441592216491699, + "logps/chosen": -155.29885864257812, + "logps/rejected": -253.94686889648438, + "loss": 0.2253, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5492926836013794, + "rewards/margins": 2.668752908706665, + "rewards/rejected": -2.119460344314575, + "step": 7569 + }, + { + "epoch": 0.87, + "learning_rate": 3.876858246517617e-08, + "logits/chosen": -3.053536891937256, + "logits/rejected": -3.3107030391693115, + "logps/chosen": -193.4838409423828, + "logps/rejected": -202.07131958007812, + "loss": 0.2168, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23331770300865173, + "rewards/margins": 2.748387336730957, + "rewards/rejected": -2.9817051887512207, + "step": 7570 + }, + { + "epoch": 0.87, + "learning_rate": 3.873346599555191e-08, + "logits/chosen": -3.2025482654571533, + "logits/rejected": -3.029066801071167, + "logps/chosen": -371.6539001464844, + "logps/rejected": -311.97564697265625, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08835163712501526, + "rewards/margins": 3.016402006149292, + "rewards/rejected": -3.1047537326812744, + "step": 7571 + }, + { + "epoch": 0.87, + "learning_rate": 3.869834952592766e-08, + "logits/chosen": -3.24029278755188, + "logits/rejected": -3.1274194717407227, + "logps/chosen": -269.87481689453125, + "logps/rejected": -286.5357666015625, + "loss": 0.2428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5753228068351746, + "rewards/margins": 2.4233646392822266, + "rewards/rejected": -2.998687267303467, + "step": 7572 + }, + { + "epoch": 0.87, + "learning_rate": 3.8663233056303403e-08, + "logits/chosen": -2.3796653747558594, + "logits/rejected": -2.275763511657715, + "logps/chosen": -323.4744873046875, + "logps/rejected": -242.3963623046875, + "loss": 0.7146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36813944578170776, + "rewards/margins": 1.3301877975463867, + "rewards/rejected": -1.6983273029327393, + "step": 7573 + }, + { + "epoch": 0.87, + "learning_rate": 3.862811658667916e-08, + "logits/chosen": -2.8156120777130127, + "logits/rejected": -2.7153561115264893, + "logps/chosen": -255.17005920410156, + "logps/rejected": -279.2387390136719, + "loss": 0.3094, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2533290684223175, + "rewards/margins": 1.449665904045105, + "rewards/rejected": -1.1963369846343994, + "step": 7574 + }, + { + "epoch": 0.87, + "learning_rate": 3.85930001170549e-08, + "logits/chosen": -2.6669154167175293, + "logits/rejected": -2.656661033630371, + "logps/chosen": -162.4687042236328, + "logps/rejected": -215.538330078125, + "loss": 0.2655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03997776657342911, + "rewards/margins": 1.9633609056472778, + "rewards/rejected": -1.923383116722107, + "step": 7575 + }, + { + "epoch": 0.87, + "learning_rate": 3.8557883647430645e-08, + "logits/chosen": -3.3245697021484375, + "logits/rejected": -3.808011531829834, + "logps/chosen": -81.61366271972656, + "logps/rejected": -234.8203887939453, + "loss": 0.4729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4588702619075775, + "rewards/margins": 1.1712242364883423, + "rewards/rejected": -1.6300945281982422, + "step": 7576 + }, + { + "epoch": 0.87, + "learning_rate": 3.852276717780639e-08, + "logits/chosen": -3.17464542388916, + "logits/rejected": -3.3284831047058105, + "logps/chosen": -291.9651794433594, + "logps/rejected": -345.1766052246094, + "loss": 0.2416, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0014204084873199463, + "rewards/margins": 2.152723789215088, + "rewards/rejected": -2.151303291320801, + "step": 7577 + }, + { + "epoch": 0.87, + "learning_rate": 3.848765070818213e-08, + "logits/chosen": -2.635668992996216, + "logits/rejected": -2.9738638401031494, + "logps/chosen": -480.68389892578125, + "logps/rejected": -364.9833984375, + "loss": 0.299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2666846513748169, + "rewards/margins": 1.2803875207901, + "rewards/rejected": -1.5470722913742065, + "step": 7578 + }, + { + "epoch": 0.87, + "learning_rate": 3.845253423855788e-08, + "logits/chosen": -2.5546014308929443, + "logits/rejected": -2.7725882530212402, + "logps/chosen": -252.08096313476562, + "logps/rejected": -322.1826171875, + "loss": 0.3792, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00039318203926086426, + "rewards/margins": 1.8391757011413574, + "rewards/rejected": -1.838782548904419, + "step": 7579 + }, + { + "epoch": 0.87, + "learning_rate": 3.841741776893363e-08, + "logits/chosen": -2.9973411560058594, + "logits/rejected": -2.77644944190979, + "logps/chosen": -222.830322265625, + "logps/rejected": -238.18362426757812, + "loss": 0.4535, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2909637689590454, + "rewards/margins": 1.4511042833328247, + "rewards/rejected": -1.7420680522918701, + "step": 7580 + }, + { + "epoch": 0.87, + "learning_rate": 3.8382301299309375e-08, + "logits/chosen": -2.3000221252441406, + "logits/rejected": -2.6768991947174072, + "logps/chosen": -123.8421630859375, + "logps/rejected": -216.54408264160156, + "loss": 0.285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2440255880355835, + "rewards/margins": 1.600690245628357, + "rewards/rejected": -1.8447158336639404, + "step": 7581 + }, + { + "epoch": 0.87, + "learning_rate": 3.8347184829685115e-08, + "logits/chosen": -3.424807548522949, + "logits/rejected": -3.3455495834350586, + "logps/chosen": -343.9700622558594, + "logps/rejected": -202.4750213623047, + "loss": 0.4227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5537621974945068, + "rewards/margins": 1.5196900367736816, + "rewards/rejected": -2.0734522342681885, + "step": 7582 + }, + { + "epoch": 0.87, + "learning_rate": 3.831206836006087e-08, + "logits/chosen": -3.428264617919922, + "logits/rejected": -3.482574462890625, + "logps/chosen": -319.1234436035156, + "logps/rejected": -322.68170166015625, + "loss": 0.4837, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.108221635222435, + "rewards/margins": 1.870319128036499, + "rewards/rejected": -1.7620974779129028, + "step": 7583 + }, + { + "epoch": 0.87, + "learning_rate": 3.827695189043661e-08, + "logits/chosen": -3.701219320297241, + "logits/rejected": -3.929344415664673, + "logps/chosen": -155.2252197265625, + "logps/rejected": -202.30233764648438, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08364749699831009, + "rewards/margins": 2.0329108238220215, + "rewards/rejected": -1.9492634534835815, + "step": 7584 + }, + { + "epoch": 0.87, + "learning_rate": 3.824183542081236e-08, + "logits/chosen": -3.935368061065674, + "logits/rejected": -3.663639783859253, + "logps/chosen": -267.15283203125, + "logps/rejected": -199.788330078125, + "loss": 0.4969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16736631095409393, + "rewards/margins": 1.8326650857925415, + "rewards/rejected": -2.0000314712524414, + "step": 7585 + }, + { + "epoch": 0.87, + "learning_rate": 3.8206718951188104e-08, + "logits/chosen": -3.275580406188965, + "logits/rejected": -3.2906947135925293, + "logps/chosen": -240.33999633789062, + "logps/rejected": -149.70059204101562, + "loss": 0.4238, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06640158593654633, + "rewards/margins": 1.2977986335754395, + "rewards/rejected": -1.3642001152038574, + "step": 7586 + }, + { + "epoch": 0.87, + "learning_rate": 3.817160248156385e-08, + "logits/chosen": -2.5057356357574463, + "logits/rejected": -2.960049867630005, + "logps/chosen": -398.403564453125, + "logps/rejected": -229.0504150390625, + "loss": 0.6163, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12105005979537964, + "rewards/margins": 1.1103798151016235, + "rewards/rejected": -1.231429934501648, + "step": 7587 + }, + { + "epoch": 0.87, + "learning_rate": 3.813648601193959e-08, + "logits/chosen": -2.437084674835205, + "logits/rejected": -2.447450876235962, + "logps/chosen": -282.7270202636719, + "logps/rejected": -288.4638977050781, + "loss": 0.226, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12493991106748581, + "rewards/margins": 2.42681622505188, + "rewards/rejected": -2.3018765449523926, + "step": 7588 + }, + { + "epoch": 0.87, + "learning_rate": 3.8101369542315346e-08, + "logits/chosen": -3.584993362426758, + "logits/rejected": -3.843242883682251, + "logps/chosen": -182.67523193359375, + "logps/rejected": -224.857177734375, + "loss": 0.3774, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2901533842086792, + "rewards/margins": 2.3085780143737793, + "rewards/rejected": -2.598731517791748, + "step": 7589 + }, + { + "epoch": 0.87, + "learning_rate": 3.8066253072691087e-08, + "logits/chosen": -2.957814931869507, + "logits/rejected": -2.907405376434326, + "logps/chosen": -221.808837890625, + "logps/rejected": -221.50595092773438, + "loss": 0.6174, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2720318138599396, + "rewards/margins": 1.164910078048706, + "rewards/rejected": -1.4369418621063232, + "step": 7590 + }, + { + "epoch": 0.88, + "learning_rate": 3.803113660306684e-08, + "logits/chosen": -2.4351108074188232, + "logits/rejected": -2.674781084060669, + "logps/chosen": -235.7695770263672, + "logps/rejected": -147.9312286376953, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21141257882118225, + "rewards/margins": 1.2517539262771606, + "rewards/rejected": -1.4631664752960205, + "step": 7591 + }, + { + "epoch": 0.88, + "learning_rate": 3.799602013344258e-08, + "logits/chosen": -2.7036170959472656, + "logits/rejected": -2.686844825744629, + "logps/chosen": -319.20123291015625, + "logps/rejected": -152.3541259765625, + "loss": 1.3517, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1489957571029663, + "rewards/margins": -0.30185508728027344, + "rewards/rejected": -0.8471405506134033, + "step": 7592 + }, + { + "epoch": 0.88, + "learning_rate": 3.796090366381833e-08, + "logits/chosen": -3.054304361343384, + "logits/rejected": -3.220294952392578, + "logps/chosen": -396.071044921875, + "logps/rejected": -399.5506896972656, + "loss": 0.2371, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2405589371919632, + "rewards/margins": 1.6677930355072021, + "rewards/rejected": -1.427234172821045, + "step": 7593 + }, + { + "epoch": 0.88, + "learning_rate": 3.7925787194194075e-08, + "logits/chosen": -2.644350528717041, + "logits/rejected": -2.925177574157715, + "logps/chosen": -378.0043640136719, + "logps/rejected": -374.9029541015625, + "loss": 0.9014, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3612551987171173, + "rewards/margins": 0.03571963310241699, + "rewards/rejected": -0.3969747722148895, + "step": 7594 + }, + { + "epoch": 0.88, + "learning_rate": 3.789067072456982e-08, + "logits/chosen": -3.098341941833496, + "logits/rejected": -3.181070566177368, + "logps/chosen": -311.84527587890625, + "logps/rejected": -307.4010314941406, + "loss": 0.3698, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37557315826416016, + "rewards/margins": 1.9505267143249512, + "rewards/rejected": -2.3261001110076904, + "step": 7595 + }, + { + "epoch": 0.88, + "learning_rate": 3.7855554254945563e-08, + "logits/chosen": -2.5498299598693848, + "logits/rejected": -2.679361343383789, + "logps/chosen": -349.10028076171875, + "logps/rejected": -453.55096435546875, + "loss": 0.1053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7441035509109497, + "rewards/margins": 3.977997303009033, + "rewards/rejected": -3.233893632888794, + "step": 7596 + }, + { + "epoch": 0.88, + "learning_rate": 3.782043778532132e-08, + "logits/chosen": -3.1346843242645264, + "logits/rejected": -3.606557846069336, + "logps/chosen": -237.14112854003906, + "logps/rejected": -385.85699462890625, + "loss": 0.312, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8163938522338867, + "rewards/margins": 1.717664361000061, + "rewards/rejected": -0.90127032995224, + "step": 7597 + }, + { + "epoch": 0.88, + "learning_rate": 3.778532131569706e-08, + "logits/chosen": -3.0892677307128906, + "logits/rejected": -2.6711299419403076, + "logps/chosen": -288.348876953125, + "logps/rejected": -311.7647705078125, + "loss": 0.2924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23230576515197754, + "rewards/margins": 1.903717279434204, + "rewards/rejected": -2.1360228061676025, + "step": 7598 + }, + { + "epoch": 0.88, + "learning_rate": 3.775020484607281e-08, + "logits/chosen": -2.796632766723633, + "logits/rejected": -2.7059755325317383, + "logps/chosen": -360.93011474609375, + "logps/rejected": -440.08282470703125, + "loss": 0.2664, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10496583580970764, + "rewards/margins": 3.869910478591919, + "rewards/rejected": -3.9748764038085938, + "step": 7599 + }, + { + "epoch": 0.88, + "learning_rate": 3.771508837644855e-08, + "logits/chosen": -3.407113552093506, + "logits/rejected": -2.955613136291504, + "logps/chosen": -444.0251770019531, + "logps/rejected": -285.7989501953125, + "loss": 0.4418, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4099489748477936, + "rewards/margins": 1.5719212293624878, + "rewards/rejected": -1.1619722843170166, + "step": 7600 + }, + { + "epoch": 0.88, + "learning_rate": 3.76799719068243e-08, + "logits/chosen": -2.7645602226257324, + "logits/rejected": -2.7802939414978027, + "logps/chosen": -413.627685546875, + "logps/rejected": -250.46575927734375, + "loss": 0.4125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33420008420944214, + "rewards/margins": 1.5374984741210938, + "rewards/rejected": -1.8716986179351807, + "step": 7601 + }, + { + "epoch": 0.88, + "learning_rate": 3.764485543720004e-08, + "logits/chosen": -3.3080568313598633, + "logits/rejected": -3.3107478618621826, + "logps/chosen": -118.25067138671875, + "logps/rejected": -268.0151062011719, + "loss": 0.3948, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16873550415039062, + "rewards/margins": 2.0823585987091064, + "rewards/rejected": -1.9136230945587158, + "step": 7602 + }, + { + "epoch": 0.88, + "learning_rate": 3.7609738967575794e-08, + "logits/chosen": -3.1791627407073975, + "logits/rejected": -2.57861065864563, + "logps/chosen": -464.30419921875, + "logps/rejected": -372.3885192871094, + "loss": 0.5702, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9690279364585876, + "rewards/margins": 1.6413335800170898, + "rewards/rejected": -2.6103615760803223, + "step": 7603 + }, + { + "epoch": 0.88, + "learning_rate": 3.7574622497951535e-08, + "logits/chosen": -2.9340481758117676, + "logits/rejected": -2.7933003902435303, + "logps/chosen": -306.62030029296875, + "logps/rejected": -263.33294677734375, + "loss": 0.8562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7862238883972168, + "rewards/margins": 0.17646723985671997, + "rewards/rejected": -0.9626911878585815, + "step": 7604 + }, + { + "epoch": 0.88, + "learning_rate": 3.753950602832729e-08, + "logits/chosen": -3.69677734375, + "logits/rejected": -3.8206772804260254, + "logps/chosen": -137.2847900390625, + "logps/rejected": -176.9290771484375, + "loss": 0.6402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7195086479187012, + "rewards/margins": 1.0663621425628662, + "rewards/rejected": -1.7858707904815674, + "step": 7605 + }, + { + "epoch": 0.88, + "learning_rate": 3.750438955870303e-08, + "logits/chosen": -2.784344434738159, + "logits/rejected": -2.7706947326660156, + "logps/chosen": -165.04498291015625, + "logps/rejected": -183.4173583984375, + "loss": 0.8607, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2199465036392212, + "rewards/margins": 0.13225476443767548, + "rewards/rejected": -0.3522012531757355, + "step": 7606 + }, + { + "epoch": 0.88, + "learning_rate": 3.7469273089078776e-08, + "logits/chosen": -2.6172454357147217, + "logits/rejected": -2.9715640544891357, + "logps/chosen": -134.66146850585938, + "logps/rejected": -111.86105346679688, + "loss": 0.4465, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0568179190158844, + "rewards/margins": 1.4647725820541382, + "rewards/rejected": -1.5215904712677002, + "step": 7607 + }, + { + "epoch": 0.88, + "learning_rate": 3.7434156619454524e-08, + "logits/chosen": -2.736809492111206, + "logits/rejected": -3.0143697261810303, + "logps/chosen": -200.12159729003906, + "logps/rejected": -273.3205871582031, + "loss": 0.3598, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.057573266327381134, + "rewards/margins": 2.292672634124756, + "rewards/rejected": -2.2350993156433105, + "step": 7608 + }, + { + "epoch": 0.88, + "learning_rate": 3.7399040149830264e-08, + "logits/chosen": -3.541539430618286, + "logits/rejected": -3.480531930923462, + "logps/chosen": -271.50872802734375, + "logps/rejected": -173.95388793945312, + "loss": 0.4854, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2878613770008087, + "rewards/margins": 0.8165280222892761, + "rewards/rejected": -0.5286666750907898, + "step": 7609 + }, + { + "epoch": 0.88, + "learning_rate": 3.736392368020601e-08, + "logits/chosen": -2.619772434234619, + "logits/rejected": -2.6740944385528564, + "logps/chosen": -223.5157470703125, + "logps/rejected": -168.71484375, + "loss": 0.5366, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18076805770397186, + "rewards/margins": 0.9080381393432617, + "rewards/rejected": -1.08880615234375, + "step": 7610 + }, + { + "epoch": 0.88, + "learning_rate": 3.732880721058176e-08, + "logits/chosen": -3.4996025562286377, + "logits/rejected": -3.363617420196533, + "logps/chosen": -289.63201904296875, + "logps/rejected": -260.6901550292969, + "loss": 0.4496, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014582201838493347, + "rewards/margins": 1.2935256958007812, + "rewards/rejected": -1.278943419456482, + "step": 7611 + }, + { + "epoch": 0.88, + "learning_rate": 3.7293690740957506e-08, + "logits/chosen": -3.1556713581085205, + "logits/rejected": -3.169811725616455, + "logps/chosen": -162.7407684326172, + "logps/rejected": -106.66816711425781, + "loss": 0.7193, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22439280152320862, + "rewards/margins": 0.17966556549072266, + "rewards/rejected": -0.4040583372116089, + "step": 7612 + }, + { + "epoch": 0.88, + "learning_rate": 3.725857427133325e-08, + "logits/chosen": -3.2529654502868652, + "logits/rejected": -3.5669078826904297, + "logps/chosen": -219.0803985595703, + "logps/rejected": -178.70904541015625, + "loss": 0.2699, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23684369027614594, + "rewards/margins": 2.1413044929504395, + "rewards/rejected": -1.9044607877731323, + "step": 7613 + }, + { + "epoch": 0.88, + "learning_rate": 3.7223457801709e-08, + "logits/chosen": -3.594912528991699, + "logits/rejected": -3.6971917152404785, + "logps/chosen": -153.599853515625, + "logps/rejected": -310.6912536621094, + "loss": 0.6387, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17991191148757935, + "rewards/margins": 2.041037082672119, + "rewards/rejected": -2.220949172973633, + "step": 7614 + }, + { + "epoch": 0.88, + "learning_rate": 3.718834133208475e-08, + "logits/chosen": -3.022644519805908, + "logits/rejected": -2.781877279281616, + "logps/chosen": -257.0694885253906, + "logps/rejected": -268.97760009765625, + "loss": 0.3084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08323192596435547, + "rewards/margins": 1.7235392332077026, + "rewards/rejected": -1.8067710399627686, + "step": 7615 + }, + { + "epoch": 0.88, + "learning_rate": 3.7153224862460495e-08, + "logits/chosen": -2.7521159648895264, + "logits/rejected": -2.734151840209961, + "logps/chosen": -174.47068786621094, + "logps/rejected": -238.9608154296875, + "loss": 0.3379, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09667423367500305, + "rewards/margins": 1.8496005535125732, + "rewards/rejected": -1.7529263496398926, + "step": 7616 + }, + { + "epoch": 0.88, + "learning_rate": 3.7118108392836235e-08, + "logits/chosen": -2.5079736709594727, + "logits/rejected": -2.9383625984191895, + "logps/chosen": -197.1796875, + "logps/rejected": -196.0464324951172, + "loss": 0.3859, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09908384084701538, + "rewards/margins": 2.128351926803589, + "rewards/rejected": -2.0292680263519287, + "step": 7617 + }, + { + "epoch": 0.88, + "learning_rate": 3.708299192321198e-08, + "logits/chosen": -2.2869584560394287, + "logits/rejected": -2.3304355144500732, + "logps/chosen": -132.51458740234375, + "logps/rejected": -211.7092742919922, + "loss": 0.3483, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.392471045255661, + "rewards/margins": 1.4465982913970947, + "rewards/rejected": -1.839069128036499, + "step": 7618 + }, + { + "epoch": 0.88, + "learning_rate": 3.704787545358773e-08, + "logits/chosen": -2.9152591228485107, + "logits/rejected": -2.8448293209075928, + "logps/chosen": -204.47763061523438, + "logps/rejected": -201.3577880859375, + "loss": 0.3134, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02714618667960167, + "rewards/margins": 1.7272515296936035, + "rewards/rejected": -1.7001054286956787, + "step": 7619 + }, + { + "epoch": 0.88, + "learning_rate": 3.701275898396348e-08, + "logits/chosen": -2.2620391845703125, + "logits/rejected": -2.6744260787963867, + "logps/chosen": -458.10479736328125, + "logps/rejected": -211.65757751464844, + "loss": 0.1486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7492223381996155, + "rewards/margins": 2.045100212097168, + "rewards/rejected": -1.2958779335021973, + "step": 7620 + }, + { + "epoch": 0.88, + "learning_rate": 3.6977642514339224e-08, + "logits/chosen": -2.875605583190918, + "logits/rejected": -2.869070529937744, + "logps/chosen": -206.992431640625, + "logps/rejected": -226.53341674804688, + "loss": 0.5937, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.605523943901062, + "rewards/margins": 0.6339443325996399, + "rewards/rejected": -1.2394683361053467, + "step": 7621 + }, + { + "epoch": 0.88, + "learning_rate": 3.694252604471497e-08, + "logits/chosen": -3.081188917160034, + "logits/rejected": -3.0427112579345703, + "logps/chosen": -285.08203125, + "logps/rejected": -346.0024719238281, + "loss": 0.21, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25914713740348816, + "rewards/margins": 2.268519878387451, + "rewards/rejected": -2.0093727111816406, + "step": 7622 + }, + { + "epoch": 0.88, + "learning_rate": 3.690740957509072e-08, + "logits/chosen": -3.2641797065734863, + "logits/rejected": -3.328871488571167, + "logps/chosen": -313.53192138671875, + "logps/rejected": -280.05340576171875, + "loss": 0.2644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06294530630111694, + "rewards/margins": 1.6746585369110107, + "rewards/rejected": -1.7376036643981934, + "step": 7623 + }, + { + "epoch": 0.88, + "learning_rate": 3.687229310546646e-08, + "logits/chosen": -3.3563408851623535, + "logits/rejected": -3.3440425395965576, + "logps/chosen": -289.53363037109375, + "logps/rejected": -203.98892211914062, + "loss": 0.4174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4824787378311157, + "rewards/margins": 2.3691229820251465, + "rewards/rejected": -2.8516016006469727, + "step": 7624 + }, + { + "epoch": 0.88, + "learning_rate": 3.683717663584221e-08, + "logits/chosen": -3.2121639251708984, + "logits/rejected": -3.3572726249694824, + "logps/chosen": -218.2339630126953, + "logps/rejected": -153.7510223388672, + "loss": 0.2932, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14481651782989502, + "rewards/margins": 2.0227060317993164, + "rewards/rejected": -2.167522430419922, + "step": 7625 + }, + { + "epoch": 0.88, + "learning_rate": 3.6802060166217954e-08, + "logits/chosen": -2.7336974143981934, + "logits/rejected": -2.971919298171997, + "logps/chosen": -274.9068603515625, + "logps/rejected": -358.610107421875, + "loss": 0.6866, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15624390542507172, + "rewards/margins": 0.8141628503799438, + "rewards/rejected": -0.9704067707061768, + "step": 7626 + }, + { + "epoch": 0.88, + "learning_rate": 3.67669436965937e-08, + "logits/chosen": -2.7857155799865723, + "logits/rejected": -2.771305799484253, + "logps/chosen": -174.1531982421875, + "logps/rejected": -138.58377075195312, + "loss": 0.4414, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5070219039916992, + "rewards/margins": 0.7728464603424072, + "rewards/rejected": -1.2798683643341064, + "step": 7627 + }, + { + "epoch": 0.88, + "learning_rate": 3.673182722696945e-08, + "logits/chosen": -3.3698015213012695, + "logits/rejected": -2.99528431892395, + "logps/chosen": -293.1990051269531, + "logps/rejected": -226.3348388671875, + "loss": 0.4878, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21373769640922546, + "rewards/margins": 1.865426778793335, + "rewards/rejected": -2.079164505004883, + "step": 7628 + }, + { + "epoch": 0.88, + "learning_rate": 3.6696710757345196e-08, + "logits/chosen": -2.5424275398254395, + "logits/rejected": -2.4286184310913086, + "logps/chosen": -358.4206848144531, + "logps/rejected": -345.48876953125, + "loss": 0.3504, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5109260082244873, + "rewards/margins": 1.3053237199783325, + "rewards/rejected": -1.8162497282028198, + "step": 7629 + }, + { + "epoch": 0.88, + "learning_rate": 3.666159428772094e-08, + "logits/chosen": -2.4932146072387695, + "logits/rejected": -2.403775691986084, + "logps/chosen": -233.6435089111328, + "logps/rejected": -218.89508056640625, + "loss": 0.4124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3113442063331604, + "rewards/margins": 1.247554063796997, + "rewards/rejected": -1.5588980913162231, + "step": 7630 + }, + { + "epoch": 0.88, + "learning_rate": 3.6626477818096684e-08, + "logits/chosen": -3.1626033782958984, + "logits/rejected": -3.1247336864471436, + "logps/chosen": -352.94781494140625, + "logps/rejected": -310.75421142578125, + "loss": 0.2941, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1739061176776886, + "rewards/margins": 1.859586477279663, + "rewards/rejected": -1.6856803894042969, + "step": 7631 + }, + { + "epoch": 0.88, + "learning_rate": 3.659136134847243e-08, + "logits/chosen": -3.2817039489746094, + "logits/rejected": -3.4136295318603516, + "logps/chosen": -206.05455017089844, + "logps/rejected": -168.4808349609375, + "loss": 0.342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03652048483490944, + "rewards/margins": 1.6448055505752563, + "rewards/rejected": -1.6082849502563477, + "step": 7632 + }, + { + "epoch": 0.88, + "learning_rate": 3.655624487884818e-08, + "logits/chosen": -3.799051284790039, + "logits/rejected": -3.178656816482544, + "logps/chosen": -357.99871826171875, + "logps/rejected": -162.61441040039062, + "loss": 0.4786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23966602981090546, + "rewards/margins": 1.1368972063064575, + "rewards/rejected": -1.376563310623169, + "step": 7633 + }, + { + "epoch": 0.88, + "learning_rate": 3.6521128409223925e-08, + "logits/chosen": -2.6555280685424805, + "logits/rejected": -2.6765482425689697, + "logps/chosen": -301.66339111328125, + "logps/rejected": -309.6047058105469, + "loss": 0.4045, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5260014533996582, + "rewards/margins": 1.3505544662475586, + "rewards/rejected": -1.8765559196472168, + "step": 7634 + }, + { + "epoch": 0.88, + "learning_rate": 3.648601193959967e-08, + "logits/chosen": -3.1763036251068115, + "logits/rejected": -3.192868232727051, + "logps/chosen": -355.01123046875, + "logps/rejected": -241.92239379882812, + "loss": 0.3658, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17121285200119019, + "rewards/margins": 1.2839034795761108, + "rewards/rejected": -1.1126905679702759, + "step": 7635 + }, + { + "epoch": 0.88, + "learning_rate": 3.645089546997542e-08, + "logits/chosen": -3.876673460006714, + "logits/rejected": -3.7684497833251953, + "logps/chosen": -229.6480712890625, + "logps/rejected": -230.7838134765625, + "loss": 0.2233, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27507105469703674, + "rewards/margins": 2.1472644805908203, + "rewards/rejected": -1.872193455696106, + "step": 7636 + }, + { + "epoch": 0.88, + "learning_rate": 3.641577900035117e-08, + "logits/chosen": -3.1206889152526855, + "logits/rejected": -3.1701934337615967, + "logps/chosen": -503.32061767578125, + "logps/rejected": -327.8870849609375, + "loss": 0.6893, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2947362959384918, + "rewards/margins": 0.3740427494049072, + "rewards/rejected": -0.6687790751457214, + "step": 7637 + }, + { + "epoch": 0.88, + "learning_rate": 3.638066253072691e-08, + "logits/chosen": -3.3647868633270264, + "logits/rejected": -3.4685096740722656, + "logps/chosen": -168.15115356445312, + "logps/rejected": -170.8046875, + "loss": 0.4434, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21816954016685486, + "rewards/margins": 1.3796720504760742, + "rewards/rejected": -1.5978416204452515, + "step": 7638 + }, + { + "epoch": 0.88, + "learning_rate": 3.6345546061102655e-08, + "logits/chosen": -3.338784694671631, + "logits/rejected": -3.405172824859619, + "logps/chosen": -149.04666137695312, + "logps/rejected": -120.28192138671875, + "loss": 0.4129, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12702493369579315, + "rewards/margins": 1.815659999847412, + "rewards/rejected": -1.688634991645813, + "step": 7639 + }, + { + "epoch": 0.88, + "learning_rate": 3.63104295914784e-08, + "logits/chosen": -2.7295384407043457, + "logits/rejected": -2.7153818607330322, + "logps/chosen": -237.40707397460938, + "logps/rejected": -150.3667755126953, + "loss": 0.1655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6186214685440063, + "rewards/margins": 2.6845149993896484, + "rewards/rejected": -2.0658936500549316, + "step": 7640 + }, + { + "epoch": 0.88, + "learning_rate": 3.627531312185415e-08, + "logits/chosen": -2.750901699066162, + "logits/rejected": -2.518700361251831, + "logps/chosen": -482.34442138671875, + "logps/rejected": -415.9822998046875, + "loss": 0.4956, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2535858750343323, + "rewards/margins": 1.9383552074432373, + "rewards/rejected": -2.191941261291504, + "step": 7641 + }, + { + "epoch": 0.88, + "learning_rate": 3.6240196652229897e-08, + "logits/chosen": -2.933825969696045, + "logits/rejected": -2.9700405597686768, + "logps/chosen": -185.8231201171875, + "logps/rejected": -218.3321990966797, + "loss": 0.2981, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05574329197406769, + "rewards/margins": 1.755366563796997, + "rewards/rejected": -1.6996231079101562, + "step": 7642 + }, + { + "epoch": 0.88, + "learning_rate": 3.6205080182605644e-08, + "logits/chosen": -2.606166124343872, + "logits/rejected": -2.5084586143493652, + "logps/chosen": -388.26019287109375, + "logps/rejected": -336.9754638671875, + "loss": 0.5517, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17042848467826843, + "rewards/margins": 1.6970205307006836, + "rewards/rejected": -1.8674489259719849, + "step": 7643 + }, + { + "epoch": 0.88, + "learning_rate": 3.616996371298139e-08, + "logits/chosen": -3.3378026485443115, + "logits/rejected": -3.2633228302001953, + "logps/chosen": -506.8536376953125, + "logps/rejected": -268.85943603515625, + "loss": 0.9576, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.619388222694397, + "rewards/margins": 0.6217590570449829, + "rewards/rejected": -1.2411472797393799, + "step": 7644 + }, + { + "epoch": 0.88, + "learning_rate": 3.613484724335713e-08, + "logits/chosen": -4.131060600280762, + "logits/rejected": -4.020941734313965, + "logps/chosen": -165.30917358398438, + "logps/rejected": -176.06236267089844, + "loss": 0.5372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1151377260684967, + "rewards/margins": 0.8534390926361084, + "rewards/rejected": -0.9685768485069275, + "step": 7645 + }, + { + "epoch": 0.88, + "learning_rate": 3.609973077373288e-08, + "logits/chosen": -2.8859474658966064, + "logits/rejected": -2.969754457473755, + "logps/chosen": -331.26263427734375, + "logps/rejected": -243.60330200195312, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06554195284843445, + "rewards/margins": 1.0498462915420532, + "rewards/rejected": -0.9843042492866516, + "step": 7646 + }, + { + "epoch": 0.88, + "learning_rate": 3.6064614304108626e-08, + "logits/chosen": -2.2014708518981934, + "logits/rejected": -2.310743570327759, + "logps/chosen": -154.4513702392578, + "logps/rejected": -232.7261962890625, + "loss": 0.385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4110721945762634, + "rewards/margins": 1.7894057035446167, + "rewards/rejected": -1.3783334493637085, + "step": 7647 + }, + { + "epoch": 0.88, + "learning_rate": 3.602949783448437e-08, + "logits/chosen": -2.59954833984375, + "logits/rejected": -2.5684280395507812, + "logps/chosen": -610.4935913085938, + "logps/rejected": -398.55609130859375, + "loss": 0.2282, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5761045217514038, + "rewards/margins": 2.4265780448913574, + "rewards/rejected": -1.8504736423492432, + "step": 7648 + }, + { + "epoch": 0.88, + "learning_rate": 3.5994381364860114e-08, + "logits/chosen": -4.148447513580322, + "logits/rejected": -4.053508281707764, + "logps/chosen": -178.102294921875, + "logps/rejected": -165.68148803710938, + "loss": 0.3541, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22258982062339783, + "rewards/margins": 2.0176453590393066, + "rewards/rejected": -2.2402350902557373, + "step": 7649 + }, + { + "epoch": 0.88, + "learning_rate": 3.595926489523586e-08, + "logits/chosen": -3.127763509750366, + "logits/rejected": -3.190213918685913, + "logps/chosen": -203.63058471679688, + "logps/rejected": -200.00582885742188, + "loss": 0.3159, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2161267250776291, + "rewards/margins": 3.2045531272888184, + "rewards/rejected": -2.988426446914673, + "step": 7650 + }, + { + "epoch": 0.88, + "learning_rate": 3.592414842561161e-08, + "logits/chosen": -3.004331588745117, + "logits/rejected": -3.208406686782837, + "logps/chosen": -230.68695068359375, + "logps/rejected": -287.5965881347656, + "loss": 0.3823, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.048517100512981415, + "rewards/margins": 2.1586720943450928, + "rewards/rejected": -2.110154867172241, + "step": 7651 + }, + { + "epoch": 0.88, + "learning_rate": 3.5889031955987356e-08, + "logits/chosen": -3.659144878387451, + "logits/rejected": -3.471808433532715, + "logps/chosen": -197.71914672851562, + "logps/rejected": -227.12258911132812, + "loss": 0.3044, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5203490853309631, + "rewards/margins": 2.3945536613464355, + "rewards/rejected": -1.8742048740386963, + "step": 7652 + }, + { + "epoch": 0.88, + "learning_rate": 3.58539154863631e-08, + "logits/chosen": -3.1716010570526123, + "logits/rejected": -3.0795986652374268, + "logps/chosen": -270.70849609375, + "logps/rejected": -404.6064147949219, + "loss": 0.1384, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.008125629276037216, + "rewards/margins": 2.973641872406006, + "rewards/rejected": -2.9655165672302246, + "step": 7653 + }, + { + "epoch": 0.88, + "learning_rate": 3.581879901673885e-08, + "logits/chosen": -3.29257869720459, + "logits/rejected": -2.8480136394500732, + "logps/chosen": -195.81829833984375, + "logps/rejected": -261.26654052734375, + "loss": 0.3753, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2969158887863159, + "rewards/margins": 1.8440520763397217, + "rewards/rejected": -1.5471360683441162, + "step": 7654 + }, + { + "epoch": 0.88, + "learning_rate": 3.578368254711459e-08, + "logits/chosen": -2.788461208343506, + "logits/rejected": -2.635265350341797, + "logps/chosen": -268.42724609375, + "logps/rejected": -266.407470703125, + "loss": 0.3061, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04917724430561066, + "rewards/margins": 2.7990050315856934, + "rewards/rejected": -2.848182201385498, + "step": 7655 + }, + { + "epoch": 0.88, + "learning_rate": 3.574856607749034e-08, + "logits/chosen": -2.817654848098755, + "logits/rejected": -3.0996322631835938, + "logps/chosen": -245.56982421875, + "logps/rejected": -233.31753540039062, + "loss": 0.5579, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2723575234413147, + "rewards/margins": 1.7695564031600952, + "rewards/rejected": -1.4971989393234253, + "step": 7656 + }, + { + "epoch": 0.88, + "learning_rate": 3.5713449607866085e-08, + "logits/chosen": -3.146768093109131, + "logits/rejected": -3.093116283416748, + "logps/chosen": -163.6436767578125, + "logps/rejected": -163.0171661376953, + "loss": 0.6741, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.163284033536911, + "rewards/margins": 1.4121973514556885, + "rewards/rejected": -1.5754815340042114, + "step": 7657 + }, + { + "epoch": 0.88, + "learning_rate": 3.567833313824183e-08, + "logits/chosen": -2.3784000873565674, + "logits/rejected": -2.441859006881714, + "logps/chosen": -194.76861572265625, + "logps/rejected": -327.80328369140625, + "loss": 0.2998, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15238747000694275, + "rewards/margins": 3.249593734741211, + "rewards/rejected": -3.0972063541412354, + "step": 7658 + }, + { + "epoch": 0.88, + "learning_rate": 3.564321666861758e-08, + "logits/chosen": -3.2394089698791504, + "logits/rejected": -3.1864938735961914, + "logps/chosen": -152.1769256591797, + "logps/rejected": -134.38693237304688, + "loss": 0.2922, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46608826518058777, + "rewards/margins": 1.2863248586654663, + "rewards/rejected": -0.8202365636825562, + "step": 7659 + }, + { + "epoch": 0.88, + "learning_rate": 3.560810019899333e-08, + "logits/chosen": -3.240140676498413, + "logits/rejected": -3.018044948577881, + "logps/chosen": -217.94944763183594, + "logps/rejected": -315.2206115722656, + "loss": 0.2427, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.004435107111930847, + "rewards/margins": 2.5326125621795654, + "rewards/rejected": -2.5281777381896973, + "step": 7660 + }, + { + "epoch": 0.88, + "learning_rate": 3.5572983729369074e-08, + "logits/chosen": -3.0184788703918457, + "logits/rejected": -3.1693787574768066, + "logps/chosen": -370.0528869628906, + "logps/rejected": -230.92337036132812, + "loss": 0.2771, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6506674289703369, + "rewards/margins": 2.628783941268921, + "rewards/rejected": -1.978116512298584, + "step": 7661 + }, + { + "epoch": 0.88, + "learning_rate": 3.5537867259744815e-08, + "logits/chosen": -2.548046588897705, + "logits/rejected": -2.775184154510498, + "logps/chosen": -242.05828857421875, + "logps/rejected": -223.63812255859375, + "loss": 0.5132, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009656044654548168, + "rewards/margins": 1.3965058326721191, + "rewards/rejected": -1.3868498802185059, + "step": 7662 + }, + { + "epoch": 0.88, + "learning_rate": 3.550275079012056e-08, + "logits/chosen": -2.9915943145751953, + "logits/rejected": -3.12161922454834, + "logps/chosen": -228.07391357421875, + "logps/rejected": -168.74832153320312, + "loss": 0.523, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04166325926780701, + "rewards/margins": 0.5615637302398682, + "rewards/rejected": -0.6032271385192871, + "step": 7663 + }, + { + "epoch": 0.88, + "learning_rate": 3.546763432049631e-08, + "logits/chosen": -2.7289435863494873, + "logits/rejected": -2.622244119644165, + "logps/chosen": -227.98858642578125, + "logps/rejected": -322.22003173828125, + "loss": 0.6126, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15746495127677917, + "rewards/margins": 1.462770700454712, + "rewards/rejected": -1.3053056001663208, + "step": 7664 + }, + { + "epoch": 0.88, + "learning_rate": 3.5432517850872057e-08, + "logits/chosen": -3.438560724258423, + "logits/rejected": -3.215437173843384, + "logps/chosen": -166.12098693847656, + "logps/rejected": -294.1054992675781, + "loss": 0.5871, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9853675961494446, + "rewards/margins": 0.8093535304069519, + "rewards/rejected": -1.794721007347107, + "step": 7665 + }, + { + "epoch": 0.88, + "learning_rate": 3.5397401381247804e-08, + "logits/chosen": -2.976301670074463, + "logits/rejected": -3.130638599395752, + "logps/chosen": -363.5771789550781, + "logps/rejected": -366.8040466308594, + "loss": 0.1533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3144974112510681, + "rewards/margins": 2.9902987480163574, + "rewards/rejected": -2.6758012771606445, + "step": 7666 + }, + { + "epoch": 0.88, + "learning_rate": 3.536228491162355e-08, + "logits/chosen": -3.571976900100708, + "logits/rejected": -3.6370620727539062, + "logps/chosen": -258.1890869140625, + "logps/rejected": -304.0965881347656, + "loss": 0.423, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10948070883750916, + "rewards/margins": 1.7805047035217285, + "rewards/rejected": -1.8899853229522705, + "step": 7667 + }, + { + "epoch": 0.88, + "learning_rate": 3.53271684419993e-08, + "logits/chosen": -3.19193696975708, + "logits/rejected": -3.133286952972412, + "logps/chosen": -204.53427124023438, + "logps/rejected": -147.36268615722656, + "loss": 0.427, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.029847070574760437, + "rewards/margins": 1.1351768970489502, + "rewards/rejected": -1.165023922920227, + "step": 7668 + }, + { + "epoch": 0.88, + "learning_rate": 3.529205197237504e-08, + "logits/chosen": -3.725642442703247, + "logits/rejected": -3.502023935317993, + "logps/chosen": -275.0013122558594, + "logps/rejected": -263.19573974609375, + "loss": 0.5994, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.862059473991394, + "rewards/margins": 0.8998991847038269, + "rewards/rejected": -1.7619587182998657, + "step": 7669 + }, + { + "epoch": 0.88, + "learning_rate": 3.5256935502750786e-08, + "logits/chosen": -3.3947958946228027, + "logits/rejected": -3.4377174377441406, + "logps/chosen": -215.1112060546875, + "logps/rejected": -291.4317626953125, + "loss": 0.6478, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8465834856033325, + "rewards/margins": 1.993584394454956, + "rewards/rejected": -2.840167760848999, + "step": 7670 + }, + { + "epoch": 0.88, + "learning_rate": 3.5221819033126533e-08, + "logits/chosen": -3.039644956588745, + "logits/rejected": -3.1727547645568848, + "logps/chosen": -181.00599670410156, + "logps/rejected": -321.14288330078125, + "loss": 0.5312, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36281317472457886, + "rewards/margins": 1.0977845191955566, + "rewards/rejected": -0.7349714040756226, + "step": 7671 + }, + { + "epoch": 0.88, + "learning_rate": 3.518670256350228e-08, + "logits/chosen": -3.6386072635650635, + "logits/rejected": -3.569944143295288, + "logps/chosen": -257.1064453125, + "logps/rejected": -128.11773681640625, + "loss": 0.4543, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11363162100315094, + "rewards/margins": 1.1544733047485352, + "rewards/rejected": -1.268104910850525, + "step": 7672 + }, + { + "epoch": 0.88, + "learning_rate": 3.515158609387803e-08, + "logits/chosen": -3.0936381816864014, + "logits/rejected": -3.1023905277252197, + "logps/chosen": -428.01275634765625, + "logps/rejected": -414.23046875, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.059980347752571106, + "rewards/margins": 3.504026412963867, + "rewards/rejected": -3.5640065670013428, + "step": 7673 + }, + { + "epoch": 0.88, + "learning_rate": 3.5116469624253775e-08, + "logits/chosen": -3.3131322860717773, + "logits/rejected": -3.35331130027771, + "logps/chosen": -161.24205017089844, + "logps/rejected": -186.045166015625, + "loss": 0.2803, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3432813286781311, + "rewards/margins": 2.2728075981140137, + "rewards/rejected": -2.6160888671875, + "step": 7674 + }, + { + "epoch": 0.88, + "learning_rate": 3.508135315462952e-08, + "logits/chosen": -3.2668557167053223, + "logits/rejected": -3.5880250930786133, + "logps/chosen": -187.70578002929688, + "logps/rejected": -245.8788604736328, + "loss": 0.2825, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.255094051361084, + "rewards/margins": 2.678262948989868, + "rewards/rejected": -2.423168659210205, + "step": 7675 + }, + { + "epoch": 0.88, + "learning_rate": 3.504623668500527e-08, + "logits/chosen": -2.6756019592285156, + "logits/rejected": -2.725548267364502, + "logps/chosen": -154.49266052246094, + "logps/rejected": -260.20599365234375, + "loss": 0.2794, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13877670466899872, + "rewards/margins": 1.4353692531585693, + "rewards/rejected": -1.2965924739837646, + "step": 7676 + }, + { + "epoch": 0.89, + "learning_rate": 3.501112021538101e-08, + "logits/chosen": -4.036628723144531, + "logits/rejected": -3.5835819244384766, + "logps/chosen": -203.0177764892578, + "logps/rejected": -171.12765502929688, + "loss": 0.1479, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29513877630233765, + "rewards/margins": 2.7936410903930664, + "rewards/rejected": -2.498502254486084, + "step": 7677 + }, + { + "epoch": 0.89, + "learning_rate": 3.497600374575676e-08, + "logits/chosen": -3.1263158321380615, + "logits/rejected": -3.221679925918579, + "logps/chosen": -202.86936950683594, + "logps/rejected": -291.65582275390625, + "loss": 0.2576, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6727031469345093, + "rewards/margins": 2.5051164627075195, + "rewards/rejected": -3.1778194904327393, + "step": 7678 + }, + { + "epoch": 0.89, + "learning_rate": 3.4940887276132505e-08, + "logits/chosen": -3.058842182159424, + "logits/rejected": -3.042689800262451, + "logps/chosen": -203.01675415039062, + "logps/rejected": -164.68716430664062, + "loss": 0.4322, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16362354159355164, + "rewards/margins": 1.509131669998169, + "rewards/rejected": -1.6727551221847534, + "step": 7679 + }, + { + "epoch": 0.89, + "learning_rate": 3.490577080650825e-08, + "logits/chosen": -2.768169403076172, + "logits/rejected": -2.804692506790161, + "logps/chosen": -278.6617431640625, + "logps/rejected": -233.3551025390625, + "loss": 0.4326, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2474874258041382, + "rewards/margins": 1.1916913986206055, + "rewards/rejected": -2.439178943634033, + "step": 7680 + }, + { + "epoch": 0.89, + "learning_rate": 3.4870654336884e-08, + "logits/chosen": -2.9182651042938232, + "logits/rejected": -3.090418577194214, + "logps/chosen": -200.34136962890625, + "logps/rejected": -155.178955078125, + "loss": 0.2708, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0705111175775528, + "rewards/margins": 1.7049411535263062, + "rewards/rejected": -1.634429931640625, + "step": 7681 + }, + { + "epoch": 0.89, + "learning_rate": 3.4835537867259746e-08, + "logits/chosen": -3.857264995574951, + "logits/rejected": -3.583944320678711, + "logps/chosen": -435.1050109863281, + "logps/rejected": -370.1891174316406, + "loss": 0.3204, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35307058691978455, + "rewards/margins": 2.765441656112671, + "rewards/rejected": -3.1185121536254883, + "step": 7682 + }, + { + "epoch": 0.89, + "learning_rate": 3.4800421397635494e-08, + "logits/chosen": -2.8667235374450684, + "logits/rejected": -3.0640289783477783, + "logps/chosen": -190.1137237548828, + "logps/rejected": -322.49176025390625, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3227255642414093, + "rewards/margins": 2.0898518562316895, + "rewards/rejected": -1.767126202583313, + "step": 7683 + }, + { + "epoch": 0.89, + "learning_rate": 3.4765304928011234e-08, + "logits/chosen": -3.5781478881835938, + "logits/rejected": -3.4235925674438477, + "logps/chosen": -277.4344177246094, + "logps/rejected": -310.1117858886719, + "loss": 0.3243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.421318918466568, + "rewards/margins": 2.414865732192993, + "rewards/rejected": -2.8361847400665283, + "step": 7684 + }, + { + "epoch": 0.89, + "learning_rate": 3.473018845838698e-08, + "logits/chosen": -2.9035568237304688, + "logits/rejected": -2.4842984676361084, + "logps/chosen": -273.3330383300781, + "logps/rejected": -308.31298828125, + "loss": 0.9169, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9008941054344177, + "rewards/margins": -0.07152464985847473, + "rewards/rejected": -0.8293695449829102, + "step": 7685 + }, + { + "epoch": 0.89, + "learning_rate": 3.469507198876273e-08, + "logits/chosen": -2.750377893447876, + "logits/rejected": -2.6009764671325684, + "logps/chosen": -220.80908203125, + "logps/rejected": -245.847900390625, + "loss": 0.7389, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4926799535751343, + "rewards/margins": 0.658049464225769, + "rewards/rejected": -1.1507294178009033, + "step": 7686 + }, + { + "epoch": 0.89, + "learning_rate": 3.4659955519138476e-08, + "logits/chosen": -3.151965379714966, + "logits/rejected": -2.9210424423217773, + "logps/chosen": -172.95755004882812, + "logps/rejected": -165.51785278320312, + "loss": 0.6492, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.857921302318573, + "rewards/margins": 0.6993834972381592, + "rewards/rejected": -1.5573047399520874, + "step": 7687 + }, + { + "epoch": 0.89, + "learning_rate": 3.462483904951422e-08, + "logits/chosen": -3.8976378440856934, + "logits/rejected": -4.207592964172363, + "logps/chosen": -212.37823486328125, + "logps/rejected": -242.76702880859375, + "loss": 0.6889, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1867302656173706, + "rewards/margins": 0.661041796207428, + "rewards/rejected": -0.8477720022201538, + "step": 7688 + }, + { + "epoch": 0.89, + "learning_rate": 3.458972257988997e-08, + "logits/chosen": -2.5930685997009277, + "logits/rejected": -2.50842022895813, + "logps/chosen": -323.29888916015625, + "logps/rejected": -330.74169921875, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1770612597465515, + "rewards/margins": 1.6394681930541992, + "rewards/rejected": -1.4624069929122925, + "step": 7689 + }, + { + "epoch": 0.89, + "learning_rate": 3.455460611026572e-08, + "logits/chosen": -2.9935970306396484, + "logits/rejected": -3.141474962234497, + "logps/chosen": -283.3449401855469, + "logps/rejected": -254.9655303955078, + "loss": 0.5054, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15016943216323853, + "rewards/margins": 1.7433760166168213, + "rewards/rejected": -1.593206763267517, + "step": 7690 + }, + { + "epoch": 0.89, + "learning_rate": 3.451948964064146e-08, + "logits/chosen": -3.860302209854126, + "logits/rejected": -3.7988839149475098, + "logps/chosen": -315.24615478515625, + "logps/rejected": -312.4653015136719, + "loss": 0.2754, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06173545867204666, + "rewards/margins": 1.9881621599197388, + "rewards/rejected": -1.926426649093628, + "step": 7691 + }, + { + "epoch": 0.89, + "learning_rate": 3.4484373171017205e-08, + "logits/chosen": -2.917102336883545, + "logits/rejected": -2.9034225940704346, + "logps/chosen": -252.02310180664062, + "logps/rejected": -461.7735900878906, + "loss": 0.4117, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4548414349555969, + "rewards/margins": 2.1772797107696533, + "rewards/rejected": -1.7224382162094116, + "step": 7692 + }, + { + "epoch": 0.89, + "learning_rate": 3.444925670139295e-08, + "logits/chosen": -3.5589683055877686, + "logits/rejected": -3.5374221801757812, + "logps/chosen": -262.6549072265625, + "logps/rejected": -251.53465270996094, + "loss": 0.2011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5966962575912476, + "rewards/margins": 2.180335521697998, + "rewards/rejected": -1.5836392641067505, + "step": 7693 + }, + { + "epoch": 0.89, + "learning_rate": 3.44141402317687e-08, + "logits/chosen": -3.2782812118530273, + "logits/rejected": -3.133056163787842, + "logps/chosen": -148.0978546142578, + "logps/rejected": -141.48760986328125, + "loss": 0.2803, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16870620846748352, + "rewards/margins": 1.94362211227417, + "rewards/rejected": -1.7749156951904297, + "step": 7694 + }, + { + "epoch": 0.89, + "learning_rate": 3.437902376214445e-08, + "logits/chosen": -2.9960389137268066, + "logits/rejected": -2.5571725368499756, + "logps/chosen": -365.5172119140625, + "logps/rejected": -356.5035400390625, + "loss": 0.5127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22979021072387695, + "rewards/margins": 1.2422047853469849, + "rewards/rejected": -1.4719949960708618, + "step": 7695 + }, + { + "epoch": 0.89, + "learning_rate": 3.434390729252019e-08, + "logits/chosen": -3.0020368099212646, + "logits/rejected": -3.1969475746154785, + "logps/chosen": -377.54302978515625, + "logps/rejected": -286.8892822265625, + "loss": 0.9296, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5243821144104004, + "rewards/margins": 0.2713039517402649, + "rewards/rejected": -0.7956860661506653, + "step": 7696 + }, + { + "epoch": 0.89, + "learning_rate": 3.4308790822895935e-08, + "logits/chosen": -2.771672010421753, + "logits/rejected": -3.262355089187622, + "logps/chosen": -234.47991943359375, + "logps/rejected": -348.1251220703125, + "loss": 0.307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3008919656276703, + "rewards/margins": 3.379920482635498, + "rewards/rejected": -3.079028844833374, + "step": 7697 + }, + { + "epoch": 0.89, + "learning_rate": 3.427367435327168e-08, + "logits/chosen": -2.8321290016174316, + "logits/rejected": -2.996061086654663, + "logps/chosen": -180.8538818359375, + "logps/rejected": -217.2718505859375, + "loss": 0.4374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04240456223487854, + "rewards/margins": 2.6338579654693604, + "rewards/rejected": -2.676262617111206, + "step": 7698 + }, + { + "epoch": 0.89, + "learning_rate": 3.423855788364743e-08, + "logits/chosen": -3.767836570739746, + "logits/rejected": -3.559028387069702, + "logps/chosen": -412.3526611328125, + "logps/rejected": -294.35540771484375, + "loss": 0.2104, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.114081472158432, + "rewards/margins": 2.3415002822875977, + "rewards/rejected": -2.227418899536133, + "step": 7699 + }, + { + "epoch": 0.89, + "learning_rate": 3.420344141402318e-08, + "logits/chosen": -3.1921725273132324, + "logits/rejected": -3.4911887645721436, + "logps/chosen": -294.6708984375, + "logps/rejected": -408.4051208496094, + "loss": 0.6008, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0305671691894531, + "rewards/margins": 1.9120299816131592, + "rewards/rejected": -2.9425971508026123, + "step": 7700 + }, + { + "epoch": 0.89, + "learning_rate": 3.416832494439892e-08, + "logits/chosen": -3.119098663330078, + "logits/rejected": -2.9794468879699707, + "logps/chosen": -217.44573974609375, + "logps/rejected": -259.6300048828125, + "loss": 0.156, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1777142435312271, + "rewards/margins": 2.241927146911621, + "rewards/rejected": -2.0642130374908447, + "step": 7701 + }, + { + "epoch": 0.89, + "learning_rate": 3.4133208474774665e-08, + "logits/chosen": -2.860567092895508, + "logits/rejected": -2.8718223571777344, + "logps/chosen": -472.2198791503906, + "logps/rejected": -569.9971313476562, + "loss": 0.5037, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3044637441635132, + "rewards/margins": 1.4361369609832764, + "rewards/rejected": -1.7406007051467896, + "step": 7702 + }, + { + "epoch": 0.89, + "learning_rate": 3.409809200515041e-08, + "logits/chosen": -3.4763262271881104, + "logits/rejected": -3.383606433868408, + "logps/chosen": -151.1147003173828, + "logps/rejected": -144.41392517089844, + "loss": 0.3205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23783788084983826, + "rewards/margins": 1.6689621210098267, + "rewards/rejected": -1.9068000316619873, + "step": 7703 + }, + { + "epoch": 0.89, + "learning_rate": 3.406297553552616e-08, + "logits/chosen": -3.863996982574463, + "logits/rejected": -3.3563127517700195, + "logps/chosen": -243.33673095703125, + "logps/rejected": -205.6649169921875, + "loss": 0.4968, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9303748607635498, + "rewards/margins": 1.1627581119537354, + "rewards/rejected": -2.093132734298706, + "step": 7704 + }, + { + "epoch": 0.89, + "learning_rate": 3.4027859065901906e-08, + "logits/chosen": -1.9601452350616455, + "logits/rejected": -2.0156188011169434, + "logps/chosen": -223.96942138671875, + "logps/rejected": -189.8973388671875, + "loss": 0.4333, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19601355493068695, + "rewards/margins": 1.0412909984588623, + "rewards/rejected": -0.8452774286270142, + "step": 7705 + }, + { + "epoch": 0.89, + "learning_rate": 3.3992742596277654e-08, + "logits/chosen": -3.083670139312744, + "logits/rejected": -2.813535451889038, + "logps/chosen": -235.2729949951172, + "logps/rejected": -249.4950714111328, + "loss": 0.1618, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42884957790374756, + "rewards/margins": 2.9381141662597656, + "rewards/rejected": -2.5092647075653076, + "step": 7706 + }, + { + "epoch": 0.89, + "learning_rate": 3.39576261266534e-08, + "logits/chosen": -2.3362138271331787, + "logits/rejected": -2.835983991622925, + "logps/chosen": -335.38690185546875, + "logps/rejected": -266.859130859375, + "loss": 0.6267, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9145666360855103, + "rewards/margins": 0.516122579574585, + "rewards/rejected": -1.4306893348693848, + "step": 7707 + }, + { + "epoch": 0.89, + "learning_rate": 3.392250965702914e-08, + "logits/chosen": -2.9312806129455566, + "logits/rejected": -2.785050392150879, + "logps/chosen": -255.94207763671875, + "logps/rejected": -373.8504638671875, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28805339336395264, + "rewards/margins": 3.512237787246704, + "rewards/rejected": -3.224184513092041, + "step": 7708 + }, + { + "epoch": 0.89, + "learning_rate": 3.388739318740489e-08, + "logits/chosen": -3.373201847076416, + "logits/rejected": -3.2293548583984375, + "logps/chosen": -286.82916259765625, + "logps/rejected": -246.7964324951172, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6285721063613892, + "rewards/margins": 1.687580943107605, + "rewards/rejected": -2.316153049468994, + "step": 7709 + }, + { + "epoch": 0.89, + "learning_rate": 3.3852276717780636e-08, + "logits/chosen": -3.210542678833008, + "logits/rejected": -3.318963050842285, + "logps/chosen": -136.4557647705078, + "logps/rejected": -158.8125762939453, + "loss": 0.7626, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5925025939941406, + "rewards/margins": 1.3613091707229614, + "rewards/rejected": -1.9538116455078125, + "step": 7710 + }, + { + "epoch": 0.89, + "learning_rate": 3.381716024815638e-08, + "logits/chosen": -2.4188497066497803, + "logits/rejected": -2.806673765182495, + "logps/chosen": -330.8678283691406, + "logps/rejected": -233.90020751953125, + "loss": 0.4428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4553423523902893, + "rewards/margins": 1.1576119661331177, + "rewards/rejected": -1.6129543781280518, + "step": 7711 + }, + { + "epoch": 0.89, + "learning_rate": 3.378204377853213e-08, + "logits/chosen": -2.8287100791931152, + "logits/rejected": -2.4348599910736084, + "logps/chosen": -313.26263427734375, + "logps/rejected": -298.743896484375, + "loss": 0.2984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34500542283058167, + "rewards/margins": 1.7418721914291382, + "rewards/rejected": -2.0868775844573975, + "step": 7712 + }, + { + "epoch": 0.89, + "learning_rate": 3.374692730890788e-08, + "logits/chosen": -3.153202533721924, + "logits/rejected": -3.0617928504943848, + "logps/chosen": -344.0357666015625, + "logps/rejected": -366.5128479003906, + "loss": 0.1381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05073484778404236, + "rewards/margins": 3.105733633041382, + "rewards/rejected": -3.156468391418457, + "step": 7713 + }, + { + "epoch": 0.89, + "learning_rate": 3.3711810839283625e-08, + "logits/chosen": -3.1959409713745117, + "logits/rejected": -2.352337121963501, + "logps/chosen": -213.7987823486328, + "logps/rejected": -178.318603515625, + "loss": 0.2848, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016226936131715775, + "rewards/margins": 1.5404002666473389, + "rewards/rejected": -1.5241732597351074, + "step": 7714 + }, + { + "epoch": 0.89, + "learning_rate": 3.3676694369659365e-08, + "logits/chosen": -3.040203809738159, + "logits/rejected": -3.0771360397338867, + "logps/chosen": -304.8036804199219, + "logps/rejected": -257.4837646484375, + "loss": 0.2657, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6961950659751892, + "rewards/margins": 1.9363155364990234, + "rewards/rejected": -1.240120530128479, + "step": 7715 + }, + { + "epoch": 0.89, + "learning_rate": 3.364157790003511e-08, + "logits/chosen": -3.3186709880828857, + "logits/rejected": -3.1841440200805664, + "logps/chosen": -177.72259521484375, + "logps/rejected": -185.0321807861328, + "loss": 0.5347, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40844160318374634, + "rewards/margins": 1.7803547382354736, + "rewards/rejected": -1.371912956237793, + "step": 7716 + }, + { + "epoch": 0.89, + "learning_rate": 3.360646143041086e-08, + "logits/chosen": -3.1322526931762695, + "logits/rejected": -2.8662362098693848, + "logps/chosen": -308.20556640625, + "logps/rejected": -240.45701599121094, + "loss": 0.2534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06625162065029144, + "rewards/margins": 1.884826898574829, + "rewards/rejected": -1.818575143814087, + "step": 7717 + }, + { + "epoch": 0.89, + "learning_rate": 3.357134496078661e-08, + "logits/chosen": -3.414032459259033, + "logits/rejected": -3.501295328140259, + "logps/chosen": -332.8255310058594, + "logps/rejected": -240.69912719726562, + "loss": 0.4712, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46190088987350464, + "rewards/margins": 1.2377114295959473, + "rewards/rejected": -1.6996124982833862, + "step": 7718 + }, + { + "epoch": 0.89, + "learning_rate": 3.3536228491162354e-08, + "logits/chosen": -3.0624935626983643, + "logits/rejected": -2.562715768814087, + "logps/chosen": -390.570556640625, + "logps/rejected": -290.3553771972656, + "loss": 0.2371, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2983400523662567, + "rewards/margins": 2.422008991241455, + "rewards/rejected": -2.123668670654297, + "step": 7719 + }, + { + "epoch": 0.89, + "learning_rate": 3.35011120215381e-08, + "logits/chosen": -2.645045042037964, + "logits/rejected": -2.576686143875122, + "logps/chosen": -155.7657928466797, + "logps/rejected": -185.26316833496094, + "loss": 1.2071, + "rewards/accuracies": 0.125, + "rewards/chosen": -1.1706604957580566, + "rewards/margins": -0.7531743049621582, + "rewards/rejected": -0.4174861013889313, + "step": 7720 + }, + { + "epoch": 0.89, + "learning_rate": 3.346599555191385e-08, + "logits/chosen": -3.4267921447753906, + "logits/rejected": -3.5135669708251953, + "logps/chosen": -346.3662414550781, + "logps/rejected": -291.49468994140625, + "loss": 0.367, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11909550428390503, + "rewards/margins": 1.9072282314300537, + "rewards/rejected": -2.0263237953186035, + "step": 7721 + }, + { + "epoch": 0.89, + "learning_rate": 3.343087908228959e-08, + "logits/chosen": -3.817856550216675, + "logits/rejected": -3.513181447982788, + "logps/chosen": -149.0838623046875, + "logps/rejected": -104.24884033203125, + "loss": 0.6428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.506304919719696, + "rewards/margins": 1.18089759349823, + "rewards/rejected": -1.6872024536132812, + "step": 7722 + }, + { + "epoch": 0.89, + "learning_rate": 3.339576261266534e-08, + "logits/chosen": -2.756549119949341, + "logits/rejected": -2.644789695739746, + "logps/chosen": -179.9594268798828, + "logps/rejected": -264.8524475097656, + "loss": 0.395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3373478055000305, + "rewards/margins": 1.2538487911224365, + "rewards/rejected": -1.5911965370178223, + "step": 7723 + }, + { + "epoch": 0.89, + "learning_rate": 3.3360646143041084e-08, + "logits/chosen": -2.7818658351898193, + "logits/rejected": -2.5825419425964355, + "logps/chosen": -427.6488342285156, + "logps/rejected": -282.6341857910156, + "loss": 0.4357, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03854234516620636, + "rewards/margins": 1.1265050172805786, + "rewards/rejected": -1.0879626274108887, + "step": 7724 + }, + { + "epoch": 0.89, + "learning_rate": 3.332552967341683e-08, + "logits/chosen": -3.342698097229004, + "logits/rejected": -3.284973621368408, + "logps/chosen": -112.23065185546875, + "logps/rejected": -171.39584350585938, + "loss": 0.5206, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17175208032131195, + "rewards/margins": 1.4090499877929688, + "rewards/rejected": -1.5808022022247314, + "step": 7725 + }, + { + "epoch": 0.89, + "learning_rate": 3.329041320379258e-08, + "logits/chosen": -3.694077730178833, + "logits/rejected": -4.031009674072266, + "logps/chosen": -150.16653442382812, + "logps/rejected": -342.7575988769531, + "loss": 0.268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2047451138496399, + "rewards/margins": 4.190674304962158, + "rewards/rejected": -4.395419120788574, + "step": 7726 + }, + { + "epoch": 0.89, + "learning_rate": 3.3255296734168326e-08, + "logits/chosen": -2.6517434120178223, + "logits/rejected": -3.2167017459869385, + "logps/chosen": -286.5120849609375, + "logps/rejected": -224.70755004882812, + "loss": 0.2669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10657999664545059, + "rewards/margins": 1.6953599452972412, + "rewards/rejected": -1.801939845085144, + "step": 7727 + }, + { + "epoch": 0.89, + "learning_rate": 3.322018026454407e-08, + "logits/chosen": -3.2978157997131348, + "logits/rejected": -3.5656328201293945, + "logps/chosen": -198.2893524169922, + "logps/rejected": -272.68560791015625, + "loss": 0.5613, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15021464228630066, + "rewards/margins": 0.7418673038482666, + "rewards/rejected": -0.8920819163322449, + "step": 7728 + }, + { + "epoch": 0.89, + "learning_rate": 3.3185063794919814e-08, + "logits/chosen": -3.5045554637908936, + "logits/rejected": -3.215005874633789, + "logps/chosen": -119.87413787841797, + "logps/rejected": -155.06973266601562, + "loss": 0.5794, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06424407660961151, + "rewards/margins": 1.2437567710876465, + "rewards/rejected": -1.179512619972229, + "step": 7729 + }, + { + "epoch": 0.89, + "learning_rate": 3.314994732529556e-08, + "logits/chosen": -3.4658117294311523, + "logits/rejected": -3.2018866539001465, + "logps/chosen": -171.7327880859375, + "logps/rejected": -150.44287109375, + "loss": 0.397, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.030296653509140015, + "rewards/margins": 1.606359839439392, + "rewards/rejected": -1.5760631561279297, + "step": 7730 + }, + { + "epoch": 0.89, + "learning_rate": 3.311483085567131e-08, + "logits/chosen": -2.007868766784668, + "logits/rejected": -1.9814270734786987, + "logps/chosen": -481.23516845703125, + "logps/rejected": -420.2603759765625, + "loss": 0.6195, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9627728462219238, + "rewards/margins": 0.4949665069580078, + "rewards/rejected": -1.4577393531799316, + "step": 7731 + }, + { + "epoch": 0.89, + "learning_rate": 3.3079714386047055e-08, + "logits/chosen": -2.501795768737793, + "logits/rejected": -2.6003057956695557, + "logps/chosen": -256.10491943359375, + "logps/rejected": -201.0556640625, + "loss": 0.3907, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48919951915740967, + "rewards/margins": 1.5641649961471558, + "rewards/rejected": -2.0533645153045654, + "step": 7732 + }, + { + "epoch": 0.89, + "learning_rate": 3.30445979164228e-08, + "logits/chosen": -3.5034420490264893, + "logits/rejected": -3.714585065841675, + "logps/chosen": -191.09793090820312, + "logps/rejected": -233.52337646484375, + "loss": 0.4621, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3328261971473694, + "rewards/margins": 2.3278915882110596, + "rewards/rejected": -2.660717725753784, + "step": 7733 + }, + { + "epoch": 0.89, + "learning_rate": 3.300948144679855e-08, + "logits/chosen": -2.7627954483032227, + "logits/rejected": -2.987563371658325, + "logps/chosen": -533.5601806640625, + "logps/rejected": -204.4390869140625, + "loss": 0.6347, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02906123921275139, + "rewards/margins": 0.45422714948654175, + "rewards/rejected": -0.42516595125198364, + "step": 7734 + }, + { + "epoch": 0.89, + "learning_rate": 3.29743649771743e-08, + "logits/chosen": -2.655792236328125, + "logits/rejected": -2.351867914199829, + "logps/chosen": -373.5713195800781, + "logps/rejected": -277.4924011230469, + "loss": 0.3761, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06853579729795456, + "rewards/margins": 1.9429473876953125, + "rewards/rejected": -2.0114831924438477, + "step": 7735 + }, + { + "epoch": 0.89, + "learning_rate": 3.2939248507550044e-08, + "logits/chosen": -3.1493935585021973, + "logits/rejected": -3.3003406524658203, + "logps/chosen": -135.2838897705078, + "logps/rejected": -248.7139892578125, + "loss": 0.3609, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2205415964126587, + "rewards/margins": 3.1494359970092773, + "rewards/rejected": -2.92889404296875, + "step": 7736 + }, + { + "epoch": 0.89, + "learning_rate": 3.2904132037925785e-08, + "logits/chosen": -3.2338480949401855, + "logits/rejected": -3.3445420265197754, + "logps/chosen": -296.6545104980469, + "logps/rejected": -322.02734375, + "loss": 0.5596, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18018876016139984, + "rewards/margins": 2.422504425048828, + "rewards/rejected": -2.2423157691955566, + "step": 7737 + }, + { + "epoch": 0.89, + "learning_rate": 3.286901556830153e-08, + "logits/chosen": -3.0029428005218506, + "logits/rejected": -3.278355598449707, + "logps/chosen": -186.03042602539062, + "logps/rejected": -133.41319274902344, + "loss": 0.4302, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35790660977363586, + "rewards/margins": 0.9843102693557739, + "rewards/rejected": -1.3422167301177979, + "step": 7738 + }, + { + "epoch": 0.89, + "learning_rate": 3.283389909867728e-08, + "logits/chosen": -3.8233304023742676, + "logits/rejected": -3.791384220123291, + "logps/chosen": -210.23963928222656, + "logps/rejected": -273.9632568359375, + "loss": 0.2483, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4137992262840271, + "rewards/margins": 2.452509641647339, + "rewards/rejected": -2.8663089275360107, + "step": 7739 + }, + { + "epoch": 0.89, + "learning_rate": 3.2798782629053027e-08, + "logits/chosen": -3.4914257526397705, + "logits/rejected": -3.6795010566711426, + "logps/chosen": -254.245361328125, + "logps/rejected": -258.84564208984375, + "loss": 0.241, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3799264430999756, + "rewards/margins": 2.4329731464385986, + "rewards/rejected": -2.812899589538574, + "step": 7740 + }, + { + "epoch": 0.89, + "learning_rate": 3.2763666159428774e-08, + "logits/chosen": -3.259256601333618, + "logits/rejected": -3.0790326595306396, + "logps/chosen": -450.0965576171875, + "logps/rejected": -168.41812133789062, + "loss": 0.6268, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5453364849090576, + "rewards/margins": 0.955579936504364, + "rewards/rejected": -1.5009163618087769, + "step": 7741 + }, + { + "epoch": 0.89, + "learning_rate": 3.272854968980452e-08, + "logits/chosen": -3.0606257915496826, + "logits/rejected": -3.109020709991455, + "logps/chosen": -198.93539428710938, + "logps/rejected": -217.19345092773438, + "loss": 0.2347, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47578954696655273, + "rewards/margins": 2.02500581741333, + "rewards/rejected": -1.5492162704467773, + "step": 7742 + }, + { + "epoch": 0.89, + "learning_rate": 3.269343322018027e-08, + "logits/chosen": -3.003836154937744, + "logits/rejected": -2.9604852199554443, + "logps/chosen": -172.08767700195312, + "logps/rejected": -113.27359008789062, + "loss": 1.2276, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2401279211044312, + "rewards/margins": -0.5364943742752075, + "rewards/rejected": -0.7036334276199341, + "step": 7743 + }, + { + "epoch": 0.89, + "learning_rate": 3.265831675055601e-08, + "logits/chosen": -3.250786542892456, + "logits/rejected": -3.308811902999878, + "logps/chosen": -212.99813842773438, + "logps/rejected": -182.38314819335938, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.021382782608270645, + "rewards/margins": 1.591247320175171, + "rewards/rejected": -1.612630009651184, + "step": 7744 + }, + { + "epoch": 0.89, + "learning_rate": 3.2623200280931756e-08, + "logits/chosen": -2.9065022468566895, + "logits/rejected": -2.874143123626709, + "logps/chosen": -293.54095458984375, + "logps/rejected": -268.6265869140625, + "loss": 0.2529, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2599712908267975, + "rewards/margins": 1.8759708404541016, + "rewards/rejected": -1.615999460220337, + "step": 7745 + }, + { + "epoch": 0.89, + "learning_rate": 3.25880838113075e-08, + "logits/chosen": -2.663416862487793, + "logits/rejected": -2.8530025482177734, + "logps/chosen": -336.1132507324219, + "logps/rejected": -329.21356201171875, + "loss": 0.2386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17912523448467255, + "rewards/margins": 1.881790280342102, + "rewards/rejected": -2.060915470123291, + "step": 7746 + }, + { + "epoch": 0.89, + "learning_rate": 3.2552967341683244e-08, + "logits/chosen": -3.1978254318237305, + "logits/rejected": -3.250913619995117, + "logps/chosen": -175.27389526367188, + "logps/rejected": -261.72235107421875, + "loss": 0.4642, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4060221016407013, + "rewards/margins": 1.4670917987823486, + "rewards/rejected": -1.8731138706207275, + "step": 7747 + }, + { + "epoch": 0.89, + "learning_rate": 3.251785087205899e-08, + "logits/chosen": -3.715944766998291, + "logits/rejected": -3.562786340713501, + "logps/chosen": -213.31846618652344, + "logps/rejected": -180.7615203857422, + "loss": 0.2309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.025960978120565414, + "rewards/margins": 1.9359104633331299, + "rewards/rejected": -1.9618713855743408, + "step": 7748 + }, + { + "epoch": 0.89, + "learning_rate": 3.248273440243474e-08, + "logits/chosen": -2.2066471576690674, + "logits/rejected": -2.1416239738464355, + "logps/chosen": -447.7305908203125, + "logps/rejected": -240.54151916503906, + "loss": 0.2902, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14992377161979675, + "rewards/margins": 1.9970163106918335, + "rewards/rejected": -1.8470923900604248, + "step": 7749 + }, + { + "epoch": 0.89, + "learning_rate": 3.2447617932810486e-08, + "logits/chosen": -3.320030927658081, + "logits/rejected": -3.197328805923462, + "logps/chosen": -233.27261352539062, + "logps/rejected": -207.77139282226562, + "loss": 0.4448, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2951366901397705, + "rewards/margins": 1.2184524536132812, + "rewards/rejected": -1.5135891437530518, + "step": 7750 + }, + { + "epoch": 0.89, + "learning_rate": 3.241250146318623e-08, + "logits/chosen": -2.609611749649048, + "logits/rejected": -2.888796329498291, + "logps/chosen": -276.3360595703125, + "logps/rejected": -261.4114990234375, + "loss": 0.5219, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1111648827791214, + "rewards/margins": 1.113781213760376, + "rewards/rejected": -1.0026164054870605, + "step": 7751 + }, + { + "epoch": 0.89, + "learning_rate": 3.237738499356198e-08, + "logits/chosen": -3.259030818939209, + "logits/rejected": -3.3541276454925537, + "logps/chosen": -308.83740234375, + "logps/rejected": -252.72882080078125, + "loss": 0.1962, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4137037992477417, + "rewards/margins": 2.6914455890655518, + "rewards/rejected": -2.2777419090270996, + "step": 7752 + }, + { + "epoch": 0.89, + "learning_rate": 3.234226852393773e-08, + "logits/chosen": -2.74841570854187, + "logits/rejected": -2.617208957672119, + "logps/chosen": -341.6676025390625, + "logps/rejected": -391.48089599609375, + "loss": 0.3499, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24446898698806763, + "rewards/margins": 1.6166731119155884, + "rewards/rejected": -1.372204065322876, + "step": 7753 + }, + { + "epoch": 0.89, + "learning_rate": 3.230715205431347e-08, + "logits/chosen": -3.451307773590088, + "logits/rejected": -3.385662794113159, + "logps/chosen": -117.44454193115234, + "logps/rejected": -143.98745727539062, + "loss": 0.3658, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11785197257995605, + "rewards/margins": 1.8357584476470947, + "rewards/rejected": -1.7179064750671387, + "step": 7754 + }, + { + "epoch": 0.89, + "learning_rate": 3.2272035584689215e-08, + "logits/chosen": -2.618408679962158, + "logits/rejected": -2.82509446144104, + "logps/chosen": -278.5928955078125, + "logps/rejected": -273.00006103515625, + "loss": 0.3632, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12050643563270569, + "rewards/margins": 1.4834814071655273, + "rewards/rejected": -1.6039879322052002, + "step": 7755 + }, + { + "epoch": 0.89, + "learning_rate": 3.223691911506496e-08, + "logits/chosen": -2.839402914047241, + "logits/rejected": -3.096262216567993, + "logps/chosen": -254.1485595703125, + "logps/rejected": -217.7930450439453, + "loss": 0.2863, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18049967288970947, + "rewards/margins": 1.8717021942138672, + "rewards/rejected": -1.6912025213241577, + "step": 7756 + }, + { + "epoch": 0.89, + "learning_rate": 3.220180264544071e-08, + "logits/chosen": -3.6699399948120117, + "logits/rejected": -3.4313488006591797, + "logps/chosen": -116.4141845703125, + "logps/rejected": -128.45945739746094, + "loss": 0.5258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9416418671607971, + "rewards/margins": 1.3592443466186523, + "rewards/rejected": -2.3008861541748047, + "step": 7757 + }, + { + "epoch": 0.89, + "learning_rate": 3.216668617581646e-08, + "logits/chosen": -3.0435125827789307, + "logits/rejected": -3.1494460105895996, + "logps/chosen": -288.0492248535156, + "logps/rejected": -336.267333984375, + "loss": 0.3926, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17485564947128296, + "rewards/margins": 2.082815170288086, + "rewards/rejected": -1.9079595804214478, + "step": 7758 + }, + { + "epoch": 0.89, + "learning_rate": 3.2131569706192204e-08, + "logits/chosen": -2.776538848876953, + "logits/rejected": -2.926887273788452, + "logps/chosen": -497.0159606933594, + "logps/rejected": -382.6397399902344, + "loss": 0.854, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20331935584545135, + "rewards/margins": 0.7104790210723877, + "rewards/rejected": -0.5071597099304199, + "step": 7759 + }, + { + "epoch": 0.89, + "learning_rate": 3.209645323656795e-08, + "logits/chosen": -2.823354482650757, + "logits/rejected": -2.928422451019287, + "logps/chosen": -506.94903564453125, + "logps/rejected": -423.1087341308594, + "loss": 0.4125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10480528324842453, + "rewards/margins": 2.150928020477295, + "rewards/rejected": -2.2557332515716553, + "step": 7760 + }, + { + "epoch": 0.89, + "learning_rate": 3.206133676694369e-08, + "logits/chosen": -3.31866717338562, + "logits/rejected": -3.1086699962615967, + "logps/chosen": -267.66571044921875, + "logps/rejected": -221.0437774658203, + "loss": 0.4744, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09812498092651367, + "rewards/margins": 1.027414083480835, + "rewards/rejected": -0.9292891025543213, + "step": 7761 + }, + { + "epoch": 0.89, + "learning_rate": 3.202622029731944e-08, + "logits/chosen": -2.9873719215393066, + "logits/rejected": -3.056243658065796, + "logps/chosen": -137.45883178710938, + "logps/rejected": -155.4303741455078, + "loss": 0.4329, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3647722601890564, + "rewards/margins": 1.3844400644302368, + "rewards/rejected": -1.749212384223938, + "step": 7762 + }, + { + "epoch": 0.89, + "learning_rate": 3.1991103827695187e-08, + "logits/chosen": -3.31479811668396, + "logits/rejected": -3.318800210952759, + "logps/chosen": -286.53875732421875, + "logps/rejected": -258.2714538574219, + "loss": 0.242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14727306365966797, + "rewards/margins": 2.5947091579437256, + "rewards/rejected": -2.7419822216033936, + "step": 7763 + }, + { + "epoch": 0.9, + "learning_rate": 3.1955987358070934e-08, + "logits/chosen": -2.8956830501556396, + "logits/rejected": -2.9655487537384033, + "logps/chosen": -297.3709716796875, + "logps/rejected": -264.487548828125, + "loss": 0.2554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3373333215713501, + "rewards/margins": 1.7924246788024902, + "rewards/rejected": -2.12975811958313, + "step": 7764 + }, + { + "epoch": 0.9, + "learning_rate": 3.192087088844668e-08, + "logits/chosen": -3.184483528137207, + "logits/rejected": -3.110083818435669, + "logps/chosen": -138.47952270507812, + "logps/rejected": -251.5528564453125, + "loss": 0.3499, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05075228959321976, + "rewards/margins": 3.2091825008392334, + "rewards/rejected": -3.259934902191162, + "step": 7765 + }, + { + "epoch": 0.9, + "learning_rate": 3.188575441882243e-08, + "logits/chosen": -3.733704090118408, + "logits/rejected": -3.7118403911590576, + "logps/chosen": -197.01849365234375, + "logps/rejected": -183.10275268554688, + "loss": 0.2847, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13157439231872559, + "rewards/margins": 2.3602962493896484, + "rewards/rejected": -2.228721857070923, + "step": 7766 + }, + { + "epoch": 0.9, + "learning_rate": 3.1850637949198175e-08, + "logits/chosen": -3.5294461250305176, + "logits/rejected": -3.3068795204162598, + "logps/chosen": -382.39642333984375, + "logps/rejected": -225.06381225585938, + "loss": 0.2532, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.047955721616744995, + "rewards/margins": 2.3628242015838623, + "rewards/rejected": -2.314868450164795, + "step": 7767 + }, + { + "epoch": 0.9, + "learning_rate": 3.1815521479573916e-08, + "logits/chosen": -3.6067750453948975, + "logits/rejected": -3.645845890045166, + "logps/chosen": -220.69253540039062, + "logps/rejected": -225.7537078857422, + "loss": 0.6013, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4500094950199127, + "rewards/margins": 0.6037793159484863, + "rewards/rejected": -1.0537887811660767, + "step": 7768 + }, + { + "epoch": 0.9, + "learning_rate": 3.1780405009949663e-08, + "logits/chosen": -3.099208354949951, + "logits/rejected": -3.1625473499298096, + "logps/chosen": -222.03497314453125, + "logps/rejected": -251.26455688476562, + "loss": 0.5618, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4104800820350647, + "rewards/margins": 0.5272287130355835, + "rewards/rejected": -0.937708854675293, + "step": 7769 + }, + { + "epoch": 0.9, + "learning_rate": 3.174528854032541e-08, + "logits/chosen": -3.031942844390869, + "logits/rejected": -3.3723647594451904, + "logps/chosen": -210.31263732910156, + "logps/rejected": -329.3812255859375, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2958024740219116, + "rewards/margins": 2.5137486457824707, + "rewards/rejected": -2.809551239013672, + "step": 7770 + }, + { + "epoch": 0.9, + "learning_rate": 3.171017207070116e-08, + "logits/chosen": -2.6717443466186523, + "logits/rejected": -3.0957465171813965, + "logps/chosen": -201.66534423828125, + "logps/rejected": -286.53790283203125, + "loss": 0.2718, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.007530152797698975, + "rewards/margins": 3.7942147254943848, + "rewards/rejected": -3.786684513092041, + "step": 7771 + }, + { + "epoch": 0.9, + "learning_rate": 3.1675055601076905e-08, + "logits/chosen": -2.70711088180542, + "logits/rejected": -3.074040174484253, + "logps/chosen": -158.69796752929688, + "logps/rejected": -113.05538940429688, + "loss": 0.5226, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21361547708511353, + "rewards/margins": 0.684147834777832, + "rewards/rejected": -0.8977632522583008, + "step": 7772 + }, + { + "epoch": 0.9, + "learning_rate": 3.163993913145265e-08, + "logits/chosen": -2.316713333129883, + "logits/rejected": -2.244729518890381, + "logps/chosen": -242.96124267578125, + "logps/rejected": -278.79638671875, + "loss": 0.4086, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16444630920886993, + "rewards/margins": 1.1293755769729614, + "rewards/rejected": -0.9649292230606079, + "step": 7773 + }, + { + "epoch": 0.9, + "learning_rate": 3.16048226618284e-08, + "logits/chosen": -2.748243570327759, + "logits/rejected": -3.004669666290283, + "logps/chosen": -289.43267822265625, + "logps/rejected": -368.7463684082031, + "loss": 0.2814, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21640148758888245, + "rewards/margins": 2.043475389480591, + "rewards/rejected": -2.2598769664764404, + "step": 7774 + }, + { + "epoch": 0.9, + "learning_rate": 3.156970619220414e-08, + "logits/chosen": -3.3452210426330566, + "logits/rejected": -3.42846417427063, + "logps/chosen": -235.58432006835938, + "logps/rejected": -225.80458068847656, + "loss": 0.4027, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0666767805814743, + "rewards/margins": 1.915247917175293, + "rewards/rejected": -1.8485711812973022, + "step": 7775 + }, + { + "epoch": 0.9, + "learning_rate": 3.153458972257989e-08, + "logits/chosen": -2.774590253829956, + "logits/rejected": -2.874406576156616, + "logps/chosen": -310.28759765625, + "logps/rejected": -301.9608154296875, + "loss": 0.1856, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15753909945487976, + "rewards/margins": 3.1475067138671875, + "rewards/rejected": -2.9899673461914062, + "step": 7776 + }, + { + "epoch": 0.9, + "learning_rate": 3.1499473252955635e-08, + "logits/chosen": -2.264498710632324, + "logits/rejected": -2.3836610317230225, + "logps/chosen": -284.7121887207031, + "logps/rejected": -241.90484619140625, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25553351640701294, + "rewards/margins": 1.3291255235671997, + "rewards/rejected": -1.073591947555542, + "step": 7777 + }, + { + "epoch": 0.9, + "learning_rate": 3.146435678333138e-08, + "logits/chosen": -3.0895509719848633, + "logits/rejected": -2.704430103302002, + "logps/chosen": -266.28118896484375, + "logps/rejected": -179.96063232421875, + "loss": 0.3082, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.055748552083969116, + "rewards/margins": 1.9948947429656982, + "rewards/rejected": -1.9391462802886963, + "step": 7778 + }, + { + "epoch": 0.9, + "learning_rate": 3.142924031370713e-08, + "logits/chosen": -2.9867701530456543, + "logits/rejected": -2.99653959274292, + "logps/chosen": -331.7909851074219, + "logps/rejected": -199.41397094726562, + "loss": 0.2601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24757876992225647, + "rewards/margins": 1.4736449718475342, + "rewards/rejected": -1.2260662317276, + "step": 7779 + }, + { + "epoch": 0.9, + "learning_rate": 3.1394123844082876e-08, + "logits/chosen": -2.8842945098876953, + "logits/rejected": -2.9293980598449707, + "logps/chosen": -265.8250732421875, + "logps/rejected": -167.9994659423828, + "loss": 0.4786, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.424845427274704, + "rewards/margins": 1.3890703916549683, + "rewards/rejected": -1.813915729522705, + "step": 7780 + }, + { + "epoch": 0.9, + "learning_rate": 3.1359007374458624e-08, + "logits/chosen": -2.7005465030670166, + "logits/rejected": -2.6682260036468506, + "logps/chosen": -413.13348388671875, + "logps/rejected": -268.87249755859375, + "loss": 0.5184, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10275600850582123, + "rewards/margins": 1.3545186519622803, + "rewards/rejected": -1.4572747945785522, + "step": 7781 + }, + { + "epoch": 0.9, + "learning_rate": 3.1323890904834364e-08, + "logits/chosen": -2.602792739868164, + "logits/rejected": -2.6643996238708496, + "logps/chosen": -287.07562255859375, + "logps/rejected": -310.04608154296875, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.008390024304389954, + "rewards/margins": 2.3074727058410645, + "rewards/rejected": -2.2990827560424805, + "step": 7782 + }, + { + "epoch": 0.9, + "learning_rate": 3.128877443521011e-08, + "logits/chosen": -3.18046236038208, + "logits/rejected": -3.3441717624664307, + "logps/chosen": -114.50439453125, + "logps/rejected": -211.90164184570312, + "loss": 0.2869, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0009965971112251282, + "rewards/margins": 2.2106752395629883, + "rewards/rejected": -2.2096786499023438, + "step": 7783 + }, + { + "epoch": 0.9, + "learning_rate": 3.125365796558586e-08, + "logits/chosen": -3.045872688293457, + "logits/rejected": -2.9902167320251465, + "logps/chosen": -254.2425537109375, + "logps/rejected": -261.32513427734375, + "loss": 0.3934, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08585759997367859, + "rewards/margins": 1.0988686084747314, + "rewards/rejected": -1.1847261190414429, + "step": 7784 + }, + { + "epoch": 0.9, + "learning_rate": 3.1218541495961606e-08, + "logits/chosen": -3.0430636405944824, + "logits/rejected": -2.8303310871124268, + "logps/chosen": -207.12075805664062, + "logps/rejected": -246.49264526367188, + "loss": 0.5089, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6795649528503418, + "rewards/margins": 1.8767845630645752, + "rewards/rejected": -2.556349277496338, + "step": 7785 + }, + { + "epoch": 0.9, + "learning_rate": 3.118342502633735e-08, + "logits/chosen": -2.804896831512451, + "logits/rejected": -3.017408847808838, + "logps/chosen": -204.60910034179688, + "logps/rejected": -195.4331817626953, + "loss": 0.3945, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5684860944747925, + "rewards/margins": 1.8028619289398193, + "rewards/rejected": -2.3713481426239014, + "step": 7786 + }, + { + "epoch": 0.9, + "learning_rate": 3.11483085567131e-08, + "logits/chosen": -3.0154473781585693, + "logits/rejected": -3.374267578125, + "logps/chosen": -396.72540283203125, + "logps/rejected": -368.5937194824219, + "loss": 0.5768, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.546332597732544, + "rewards/margins": 1.6459972858428955, + "rewards/rejected": -2.1923298835754395, + "step": 7787 + }, + { + "epoch": 0.9, + "learning_rate": 3.111319208708885e-08, + "logits/chosen": -3.213884115219116, + "logits/rejected": -3.232037305831909, + "logps/chosen": -190.18780517578125, + "logps/rejected": -119.09656524658203, + "loss": 0.5027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19126199185848236, + "rewards/margins": 1.449885368347168, + "rewards/rejected": -1.6411473751068115, + "step": 7788 + }, + { + "epoch": 0.9, + "learning_rate": 3.107807561746459e-08, + "logits/chosen": -2.7316083908081055, + "logits/rejected": -2.5449788570404053, + "logps/chosen": -251.4188232421875, + "logps/rejected": -233.43865966796875, + "loss": 0.1883, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3227115571498871, + "rewards/margins": 2.0322113037109375, + "rewards/rejected": -1.7094998359680176, + "step": 7789 + }, + { + "epoch": 0.9, + "learning_rate": 3.1042959147840335e-08, + "logits/chosen": -3.3227922916412354, + "logits/rejected": -2.8386964797973633, + "logps/chosen": -254.95440673828125, + "logps/rejected": -259.3973388671875, + "loss": 0.3221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3963538110256195, + "rewards/margins": 1.5987297296524048, + "rewards/rejected": -1.9950834512710571, + "step": 7790 + }, + { + "epoch": 0.9, + "learning_rate": 3.100784267821608e-08, + "logits/chosen": -2.4748153686523438, + "logits/rejected": -2.434777021408081, + "logps/chosen": -263.6317138671875, + "logps/rejected": -151.89529418945312, + "loss": 0.3675, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1286814659833908, + "rewards/margins": 1.5517648458480835, + "rewards/rejected": -1.6804462671279907, + "step": 7791 + }, + { + "epoch": 0.9, + "learning_rate": 3.0972726208591823e-08, + "logits/chosen": -2.6453375816345215, + "logits/rejected": -2.490499496459961, + "logps/chosen": -162.45098876953125, + "logps/rejected": -252.7466583251953, + "loss": 0.4845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3012300431728363, + "rewards/margins": 1.5879521369934082, + "rewards/rejected": -1.8891820907592773, + "step": 7792 + }, + { + "epoch": 0.9, + "learning_rate": 3.093760973896757e-08, + "logits/chosen": -2.5269856452941895, + "logits/rejected": -2.5486247539520264, + "logps/chosen": -288.6363830566406, + "logps/rejected": -170.61572265625, + "loss": 0.4178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30026501417160034, + "rewards/margins": 0.8973388075828552, + "rewards/rejected": -1.1976038217544556, + "step": 7793 + }, + { + "epoch": 0.9, + "learning_rate": 3.090249326934332e-08, + "logits/chosen": -3.04618501663208, + "logits/rejected": -3.4729700088500977, + "logps/chosen": -253.50790405273438, + "logps/rejected": -146.75161743164062, + "loss": 0.587, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29081982374191284, + "rewards/margins": 0.5287672281265259, + "rewards/rejected": -0.8195871114730835, + "step": 7794 + }, + { + "epoch": 0.9, + "learning_rate": 3.0867376799719065e-08, + "logits/chosen": -2.771589756011963, + "logits/rejected": -2.701406955718994, + "logps/chosen": -207.8033447265625, + "logps/rejected": -298.49871826171875, + "loss": 0.4703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8988261222839355, + "rewards/margins": 2.344109058380127, + "rewards/rejected": -3.2429351806640625, + "step": 7795 + }, + { + "epoch": 0.9, + "learning_rate": 3.083226033009481e-08, + "logits/chosen": -3.302011251449585, + "logits/rejected": -2.896148204803467, + "logps/chosen": -408.2320861816406, + "logps/rejected": -246.6964874267578, + "loss": 0.4095, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22144433856010437, + "rewards/margins": 1.5304447412490845, + "rewards/rejected": -1.7518889904022217, + "step": 7796 + }, + { + "epoch": 0.9, + "learning_rate": 3.079714386047056e-08, + "logits/chosen": -3.557800769805908, + "logits/rejected": -3.7130162715911865, + "logps/chosen": -154.16366577148438, + "logps/rejected": -218.97866821289062, + "loss": 0.1427, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33592191338539124, + "rewards/margins": 2.598933219909668, + "rewards/rejected": -2.2630114555358887, + "step": 7797 + }, + { + "epoch": 0.9, + "learning_rate": 3.076202739084631e-08, + "logits/chosen": -3.2626242637634277, + "logits/rejected": -3.311753511428833, + "logps/chosen": -118.90097045898438, + "logps/rejected": -212.41732788085938, + "loss": 0.3115, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38824307918548584, + "rewards/margins": 1.7958723306655884, + "rewards/rejected": -2.184115171432495, + "step": 7798 + }, + { + "epoch": 0.9, + "learning_rate": 3.072691092122205e-08, + "logits/chosen": -3.6691017150878906, + "logits/rejected": -3.487724781036377, + "logps/chosen": -201.18531799316406, + "logps/rejected": -172.29026794433594, + "loss": 0.8268, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3854404389858246, + "rewards/margins": 0.47551342844963074, + "rewards/rejected": -0.8609538078308105, + "step": 7799 + }, + { + "epoch": 0.9, + "learning_rate": 3.0691794451597795e-08, + "logits/chosen": -3.098599910736084, + "logits/rejected": -3.1018755435943604, + "logps/chosen": -225.54949951171875, + "logps/rejected": -370.26690673828125, + "loss": 0.2393, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1196582019329071, + "rewards/margins": 2.7772297859191895, + "rewards/rejected": -2.65757155418396, + "step": 7800 + }, + { + "epoch": 0.9, + "learning_rate": 3.065667798197354e-08, + "logits/chosen": -2.8333261013031006, + "logits/rejected": -3.092324733734131, + "logps/chosen": -140.63760375976562, + "logps/rejected": -196.13363647460938, + "loss": 0.1865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09240604192018509, + "rewards/margins": 2.437268018722534, + "rewards/rejected": -2.5296740531921387, + "step": 7801 + }, + { + "epoch": 0.9, + "learning_rate": 3.062156151234929e-08, + "logits/chosen": -2.8085546493530273, + "logits/rejected": -2.9746334552764893, + "logps/chosen": -182.75848388671875, + "logps/rejected": -182.48367309570312, + "loss": 0.513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4765058159828186, + "rewards/margins": 1.1118367910385132, + "rewards/rejected": -1.5883426666259766, + "step": 7802 + }, + { + "epoch": 0.9, + "learning_rate": 3.0586445042725036e-08, + "logits/chosen": -3.2692179679870605, + "logits/rejected": -3.3894081115722656, + "logps/chosen": -284.3343200683594, + "logps/rejected": -218.2522735595703, + "loss": 0.4335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1991048902273178, + "rewards/margins": 1.4168310165405273, + "rewards/rejected": -1.615936040878296, + "step": 7803 + }, + { + "epoch": 0.9, + "learning_rate": 3.0551328573100784e-08, + "logits/chosen": -3.9299569129943848, + "logits/rejected": -3.853154182434082, + "logps/chosen": -332.6553649902344, + "logps/rejected": -303.6148986816406, + "loss": 0.5125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08901393413543701, + "rewards/margins": 1.2583497762680054, + "rewards/rejected": -1.3473635911941528, + "step": 7804 + }, + { + "epoch": 0.9, + "learning_rate": 3.051621210347653e-08, + "logits/chosen": -3.4354915618896484, + "logits/rejected": -3.329554557800293, + "logps/chosen": -179.05979919433594, + "logps/rejected": -210.20828247070312, + "loss": 0.3799, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6075061559677124, + "rewards/margins": 1.6012015342712402, + "rewards/rejected": -0.9936953783035278, + "step": 7805 + }, + { + "epoch": 0.9, + "learning_rate": 3.048109563385227e-08, + "logits/chosen": -3.693774938583374, + "logits/rejected": -3.6042940616607666, + "logps/chosen": -311.261474609375, + "logps/rejected": -208.22079467773438, + "loss": 0.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05439196527004242, + "rewards/margins": 2.8182196617126465, + "rewards/rejected": -2.763827323913574, + "step": 7806 + }, + { + "epoch": 0.9, + "learning_rate": 3.044597916422802e-08, + "logits/chosen": -3.107422351837158, + "logits/rejected": -3.048698663711548, + "logps/chosen": -245.4534149169922, + "logps/rejected": -251.77638244628906, + "loss": 0.6014, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15875597298145294, + "rewards/margins": 0.4813253879547119, + "rewards/rejected": -0.6400814652442932, + "step": 7807 + }, + { + "epoch": 0.9, + "learning_rate": 3.0410862694603766e-08, + "logits/chosen": -2.9945740699768066, + "logits/rejected": -2.993337869644165, + "logps/chosen": -168.14230346679688, + "logps/rejected": -160.8536376953125, + "loss": 0.7263, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05640730261802673, + "rewards/margins": 0.7278283834457397, + "rewards/rejected": -0.7842357158660889, + "step": 7808 + }, + { + "epoch": 0.9, + "learning_rate": 3.037574622497951e-08, + "logits/chosen": -3.0626254081726074, + "logits/rejected": -3.1552796363830566, + "logps/chosen": -439.6769104003906, + "logps/rejected": -254.32872009277344, + "loss": 0.2691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.498441219329834, + "rewards/margins": 2.452948570251465, + "rewards/rejected": -2.951389789581299, + "step": 7809 + }, + { + "epoch": 0.9, + "learning_rate": 3.034062975535526e-08, + "logits/chosen": -3.800781726837158, + "logits/rejected": -3.650975227355957, + "logps/chosen": -346.2925109863281, + "logps/rejected": -284.1558532714844, + "loss": 0.2709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13335317373275757, + "rewards/margins": 3.2688796520233154, + "rewards/rejected": -3.4022326469421387, + "step": 7810 + }, + { + "epoch": 0.9, + "learning_rate": 3.030551328573101e-08, + "logits/chosen": -3.189026117324829, + "logits/rejected": -3.3325459957122803, + "logps/chosen": -232.50543212890625, + "logps/rejected": -255.28515625, + "loss": 0.6001, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5139163136482239, + "rewards/margins": 0.289531409740448, + "rewards/rejected": -0.8034477233886719, + "step": 7811 + }, + { + "epoch": 0.9, + "learning_rate": 3.0270396816106755e-08, + "logits/chosen": -3.1647489070892334, + "logits/rejected": -3.0846970081329346, + "logps/chosen": -350.3502502441406, + "logps/rejected": -295.945556640625, + "loss": 0.2083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44555580615997314, + "rewards/margins": 2.2534523010253906, + "rewards/rejected": -1.807896614074707, + "step": 7812 + }, + { + "epoch": 0.9, + "learning_rate": 3.0235280346482495e-08, + "logits/chosen": -3.0270113945007324, + "logits/rejected": -2.7543015480041504, + "logps/chosen": -226.32632446289062, + "logps/rejected": -251.8033447265625, + "loss": 0.2044, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.271738737821579, + "rewards/margins": 2.424967050552368, + "rewards/rejected": -2.153228282928467, + "step": 7813 + }, + { + "epoch": 0.9, + "learning_rate": 3.020016387685824e-08, + "logits/chosen": -2.6890745162963867, + "logits/rejected": -2.9330434799194336, + "logps/chosen": -343.00531005859375, + "logps/rejected": -320.06005859375, + "loss": 0.3648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07912154495716095, + "rewards/margins": 1.6670434474945068, + "rewards/rejected": -1.7461650371551514, + "step": 7814 + }, + { + "epoch": 0.9, + "learning_rate": 3.016504740723399e-08, + "logits/chosen": -3.3053715229034424, + "logits/rejected": -3.0547518730163574, + "logps/chosen": -320.659912109375, + "logps/rejected": -359.66436767578125, + "loss": 0.3261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7376658320426941, + "rewards/margins": 2.6159353256225586, + "rewards/rejected": -3.3536014556884766, + "step": 7815 + }, + { + "epoch": 0.9, + "learning_rate": 3.012993093760974e-08, + "logits/chosen": -3.113060712814331, + "logits/rejected": -3.3458147048950195, + "logps/chosen": -295.1081848144531, + "logps/rejected": -203.1611328125, + "loss": 0.3126, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23938781023025513, + "rewards/margins": 2.1724114418029785, + "rewards/rejected": -1.9330233335494995, + "step": 7816 + }, + { + "epoch": 0.9, + "learning_rate": 3.0094814467985484e-08, + "logits/chosen": -2.7694664001464844, + "logits/rejected": -2.5070791244506836, + "logps/chosen": -281.05206298828125, + "logps/rejected": -274.0117492675781, + "loss": 0.333, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05018618330359459, + "rewards/margins": 1.284712791442871, + "rewards/rejected": -1.3348989486694336, + "step": 7817 + }, + { + "epoch": 0.9, + "learning_rate": 3.005969799836123e-08, + "logits/chosen": -3.1946730613708496, + "logits/rejected": -3.1040821075439453, + "logps/chosen": -207.4013671875, + "logps/rejected": -353.5582275390625, + "loss": 0.594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6908109188079834, + "rewards/margins": 2.6344292163848877, + "rewards/rejected": -3.325240135192871, + "step": 7818 + }, + { + "epoch": 0.9, + "learning_rate": 3.002458152873698e-08, + "logits/chosen": -3.066459894180298, + "logits/rejected": -2.616295099258423, + "logps/chosen": -293.3780822753906, + "logps/rejected": -181.04324340820312, + "loss": 0.3147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3053177297115326, + "rewards/margins": 1.344879150390625, + "rewards/rejected": -1.6501967906951904, + "step": 7819 + }, + { + "epoch": 0.9, + "learning_rate": 2.9989465059112726e-08, + "logits/chosen": -3.898989677429199, + "logits/rejected": -3.645484447479248, + "logps/chosen": -162.3035888671875, + "logps/rejected": -274.24371337890625, + "loss": 0.4812, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5253479480743408, + "rewards/margins": 2.02262020111084, + "rewards/rejected": -2.5479681491851807, + "step": 7820 + }, + { + "epoch": 0.9, + "learning_rate": 2.995434858948847e-08, + "logits/chosen": -3.4251952171325684, + "logits/rejected": -2.895747661590576, + "logps/chosen": -358.1121520996094, + "logps/rejected": -261.7951965332031, + "loss": 0.9835, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.378661036491394, + "rewards/margins": 0.4363572597503662, + "rewards/rejected": -1.8150184154510498, + "step": 7821 + }, + { + "epoch": 0.9, + "learning_rate": 2.9919232119864214e-08, + "logits/chosen": -2.9101791381835938, + "logits/rejected": -2.8709945678710938, + "logps/chosen": -301.5269775390625, + "logps/rejected": -333.7005310058594, + "loss": 0.3811, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5649310350418091, + "rewards/margins": 1.8188796043395996, + "rewards/rejected": -1.253948450088501, + "step": 7822 + }, + { + "epoch": 0.9, + "learning_rate": 2.988411565023996e-08, + "logits/chosen": -3.5448057651519775, + "logits/rejected": -3.659106731414795, + "logps/chosen": -362.90533447265625, + "logps/rejected": -356.2138671875, + "loss": 0.4378, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27571913599967957, + "rewards/margins": 1.5644207000732422, + "rewards/rejected": -1.8401397466659546, + "step": 7823 + }, + { + "epoch": 0.9, + "learning_rate": 2.984899918061571e-08, + "logits/chosen": -3.238248825073242, + "logits/rejected": -3.2354540824890137, + "logps/chosen": -375.36669921875, + "logps/rejected": -256.2217712402344, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18133734166622162, + "rewards/margins": 2.0289831161499023, + "rewards/rejected": -1.847645878791809, + "step": 7824 + }, + { + "epoch": 0.9, + "learning_rate": 2.9813882710991456e-08, + "logits/chosen": -3.1292054653167725, + "logits/rejected": -3.1935925483703613, + "logps/chosen": -115.11970520019531, + "logps/rejected": -224.84814453125, + "loss": 0.4173, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19043001532554626, + "rewards/margins": 1.879460096359253, + "rewards/rejected": -1.6890300512313843, + "step": 7825 + }, + { + "epoch": 0.9, + "learning_rate": 2.97787662413672e-08, + "logits/chosen": -3.032172918319702, + "logits/rejected": -3.0575919151306152, + "logps/chosen": -370.8744201660156, + "logps/rejected": -321.0774230957031, + "loss": 0.1891, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3495209813117981, + "rewards/margins": 2.4869134426116943, + "rewards/rejected": -2.8364343643188477, + "step": 7826 + }, + { + "epoch": 0.9, + "learning_rate": 2.9743649771742947e-08, + "logits/chosen": -2.802924871444702, + "logits/rejected": -2.7647507190704346, + "logps/chosen": -262.8232421875, + "logps/rejected": -316.03314208984375, + "loss": 0.3145, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36372318863868713, + "rewards/margins": 1.626491665840149, + "rewards/rejected": -1.9902148246765137, + "step": 7827 + }, + { + "epoch": 0.9, + "learning_rate": 2.9708533302118694e-08, + "logits/chosen": -2.503469228744507, + "logits/rejected": -2.802560329437256, + "logps/chosen": -420.9840393066406, + "logps/rejected": -272.2008972167969, + "loss": 0.3954, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5190492868423462, + "rewards/margins": 1.7978394031524658, + "rewards/rejected": -2.3168883323669434, + "step": 7828 + }, + { + "epoch": 0.9, + "learning_rate": 2.967341683249444e-08, + "logits/chosen": -2.409254789352417, + "logits/rejected": -2.4816765785217285, + "logps/chosen": -399.53436279296875, + "logps/rejected": -298.1506652832031, + "loss": 0.2963, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33370909094810486, + "rewards/margins": 1.6671545505523682, + "rewards/rejected": -1.3334453105926514, + "step": 7829 + }, + { + "epoch": 0.9, + "learning_rate": 2.9638300362870185e-08, + "logits/chosen": -2.5328409671783447, + "logits/rejected": -2.4208667278289795, + "logps/chosen": -388.964111328125, + "logps/rejected": -290.7350769042969, + "loss": 0.5049, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2631969153881073, + "rewards/margins": 1.0116833448410034, + "rewards/rejected": -1.274880290031433, + "step": 7830 + }, + { + "epoch": 0.9, + "learning_rate": 2.9603183893245932e-08, + "logits/chosen": -3.321373701095581, + "logits/rejected": -3.497218608856201, + "logps/chosen": -222.80738830566406, + "logps/rejected": -335.01605224609375, + "loss": 0.3757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2063494473695755, + "rewards/margins": 2.531726837158203, + "rewards/rejected": -2.7380762100219727, + "step": 7831 + }, + { + "epoch": 0.9, + "learning_rate": 2.956806742362168e-08, + "logits/chosen": -3.524317741394043, + "logits/rejected": -3.445833683013916, + "logps/chosen": -574.4561157226562, + "logps/rejected": -350.5565185546875, + "loss": 0.4258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2087143510580063, + "rewards/margins": 1.3181647062301636, + "rewards/rejected": -1.526879072189331, + "step": 7832 + }, + { + "epoch": 0.9, + "learning_rate": 2.9532950953997424e-08, + "logits/chosen": -2.504621982574463, + "logits/rejected": -2.484931707382202, + "logps/chosen": -425.83721923828125, + "logps/rejected": -292.1983947753906, + "loss": 0.3904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05825109779834747, + "rewards/margins": 1.7424185276031494, + "rewards/rejected": -1.6841673851013184, + "step": 7833 + }, + { + "epoch": 0.9, + "learning_rate": 2.949783448437317e-08, + "logits/chosen": -3.0492172241210938, + "logits/rejected": -3.0419390201568604, + "logps/chosen": -478.5390625, + "logps/rejected": -313.32073974609375, + "loss": 0.4581, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33983314037323, + "rewards/margins": 1.7003750801086426, + "rewards/rejected": -2.040208339691162, + "step": 7834 + }, + { + "epoch": 0.9, + "learning_rate": 2.9462718014748918e-08, + "logits/chosen": -3.1191282272338867, + "logits/rejected": -3.164759635925293, + "logps/chosen": -168.68727111816406, + "logps/rejected": -170.16348266601562, + "loss": 0.3316, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3404485583305359, + "rewards/margins": 1.8260252475738525, + "rewards/rejected": -1.4855767488479614, + "step": 7835 + }, + { + "epoch": 0.9, + "learning_rate": 2.9427601545124665e-08, + "logits/chosen": -3.1438021659851074, + "logits/rejected": -2.7998111248016357, + "logps/chosen": -303.9043884277344, + "logps/rejected": -259.82720947265625, + "loss": 0.3347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33572208881378174, + "rewards/margins": 1.7771189212799072, + "rewards/rejected": -2.1128411293029785, + "step": 7836 + }, + { + "epoch": 0.9, + "learning_rate": 2.939248507550041e-08, + "logits/chosen": -4.099872589111328, + "logits/rejected": -3.6996560096740723, + "logps/chosen": -359.8845520019531, + "logps/rejected": -254.70347595214844, + "loss": 0.161, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11736652255058289, + "rewards/margins": 1.9536365270614624, + "rewards/rejected": -1.8362699747085571, + "step": 7837 + }, + { + "epoch": 0.9, + "learning_rate": 2.9357368605876157e-08, + "logits/chosen": -3.941861629486084, + "logits/rejected": -3.7202041149139404, + "logps/chosen": -256.68109130859375, + "logps/rejected": -159.70281982421875, + "loss": 0.5399, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0575445182621479, + "rewards/margins": 1.7847795486450195, + "rewards/rejected": -1.7272350788116455, + "step": 7838 + }, + { + "epoch": 0.9, + "learning_rate": 2.9322252136251897e-08, + "logits/chosen": -3.0995984077453613, + "logits/rejected": -3.135566234588623, + "logps/chosen": -232.6947021484375, + "logps/rejected": -255.1054229736328, + "loss": 0.6986, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5742395520210266, + "rewards/margins": 2.5648908615112305, + "rewards/rejected": -3.1391305923461914, + "step": 7839 + }, + { + "epoch": 0.9, + "learning_rate": 2.9287135666627644e-08, + "logits/chosen": -3.3552074432373047, + "logits/rejected": -3.2976255416870117, + "logps/chosen": -170.05648803710938, + "logps/rejected": -216.1123504638672, + "loss": 0.4424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27471908926963806, + "rewards/margins": 2.084280490875244, + "rewards/rejected": -2.358999729156494, + "step": 7840 + }, + { + "epoch": 0.9, + "learning_rate": 2.925201919700339e-08, + "logits/chosen": -2.9034247398376465, + "logits/rejected": -2.7599687576293945, + "logps/chosen": -557.7071533203125, + "logps/rejected": -326.9872741699219, + "loss": 1.4051, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8963369131088257, + "rewards/margins": 0.130376935005188, + "rewards/rejected": -1.0267138481140137, + "step": 7841 + }, + { + "epoch": 0.9, + "learning_rate": 2.921690272737914e-08, + "logits/chosen": -2.371365547180176, + "logits/rejected": -2.701007604598999, + "logps/chosen": -327.18231201171875, + "logps/rejected": -284.3662414550781, + "loss": 0.5018, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.035202257335186005, + "rewards/margins": 0.7481095194816589, + "rewards/rejected": -0.7129073143005371, + "step": 7842 + }, + { + "epoch": 0.9, + "learning_rate": 2.9181786257754883e-08, + "logits/chosen": -3.454371452331543, + "logits/rejected": -3.445422887802124, + "logps/chosen": -152.30752563476562, + "logps/rejected": -257.1273498535156, + "loss": 0.3576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3617950677871704, + "rewards/margins": 1.118908405303955, + "rewards/rejected": -1.4807034730911255, + "step": 7843 + }, + { + "epoch": 0.9, + "learning_rate": 2.914666978813063e-08, + "logits/chosen": -2.805938720703125, + "logits/rejected": -2.7990875244140625, + "logps/chosen": -168.53256225585938, + "logps/rejected": -186.8749542236328, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.052128519862890244, + "rewards/margins": 1.8649131059646606, + "rewards/rejected": -1.8127846717834473, + "step": 7844 + }, + { + "epoch": 0.9, + "learning_rate": 2.9111553318506377e-08, + "logits/chosen": -3.9102749824523926, + "logits/rejected": -4.012794017791748, + "logps/chosen": -261.440185546875, + "logps/rejected": -223.09341430664062, + "loss": 0.2039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36316484212875366, + "rewards/margins": 1.8029118776321411, + "rewards/rejected": -1.4397470951080322, + "step": 7845 + }, + { + "epoch": 0.9, + "learning_rate": 2.9076436848882125e-08, + "logits/chosen": -3.4473016262054443, + "logits/rejected": -3.1729891300201416, + "logps/chosen": -285.378662109375, + "logps/rejected": -275.57110595703125, + "loss": 0.2634, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5412144660949707, + "rewards/margins": 2.2048141956329346, + "rewards/rejected": -2.7460289001464844, + "step": 7846 + }, + { + "epoch": 0.9, + "learning_rate": 2.904132037925787e-08, + "logits/chosen": -2.8767664432525635, + "logits/rejected": -3.064741373062134, + "logps/chosen": -348.1396179199219, + "logps/rejected": -257.42633056640625, + "loss": 0.2331, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1464529037475586, + "rewards/margins": 2.8991146087646484, + "rewards/rejected": -2.752661943435669, + "step": 7847 + }, + { + "epoch": 0.9, + "learning_rate": 2.9006203909633616e-08, + "logits/chosen": -2.8237881660461426, + "logits/rejected": -2.948032855987549, + "logps/chosen": -182.38177490234375, + "logps/rejected": -325.1276550292969, + "loss": 0.3576, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18169422447681427, + "rewards/margins": 1.8261866569519043, + "rewards/rejected": -1.6444923877716064, + "step": 7848 + }, + { + "epoch": 0.9, + "learning_rate": 2.8971087440009363e-08, + "logits/chosen": -3.353363275527954, + "logits/rejected": -3.3154592514038086, + "logps/chosen": -322.64532470703125, + "logps/rejected": -378.2384033203125, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.004413112998008728, + "rewards/margins": 1.684146523475647, + "rewards/rejected": -1.688559651374817, + "step": 7849 + }, + { + "epoch": 0.9, + "learning_rate": 2.8935970970385107e-08, + "logits/chosen": -2.93969464302063, + "logits/rejected": -3.0608925819396973, + "logps/chosen": -256.9266357421875, + "logps/rejected": -186.99636840820312, + "loss": 0.3266, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.551852822303772, + "rewards/margins": 2.3994431495666504, + "rewards/rejected": -1.8475900888442993, + "step": 7850 + }, + { + "epoch": 0.91, + "learning_rate": 2.8900854500760854e-08, + "logits/chosen": -3.2338733673095703, + "logits/rejected": -3.0821585655212402, + "logps/chosen": -298.9503173828125, + "logps/rejected": -307.85833740234375, + "loss": 0.3268, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2982020080089569, + "rewards/margins": 1.93660569190979, + "rewards/rejected": -1.6384036540985107, + "step": 7851 + }, + { + "epoch": 0.91, + "learning_rate": 2.88657380311366e-08, + "logits/chosen": -2.7936437129974365, + "logits/rejected": -3.171645164489746, + "logps/chosen": -361.306640625, + "logps/rejected": -232.93421936035156, + "loss": 0.7276, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6817001104354858, + "rewards/margins": 1.026941180229187, + "rewards/rejected": -1.7086412906646729, + "step": 7852 + }, + { + "epoch": 0.91, + "learning_rate": 2.883062156151235e-08, + "logits/chosen": -3.438666343688965, + "logits/rejected": -3.2109484672546387, + "logps/chosen": -319.41796875, + "logps/rejected": -420.9537658691406, + "loss": 0.446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11063627153635025, + "rewards/margins": 1.7314003705978394, + "rewards/rejected": -1.842036485671997, + "step": 7853 + }, + { + "epoch": 0.91, + "learning_rate": 2.8795505091888092e-08, + "logits/chosen": -3.3926024436950684, + "logits/rejected": -3.1557838916778564, + "logps/chosen": -253.06130981445312, + "logps/rejected": -201.3717803955078, + "loss": 0.4079, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14810988306999207, + "rewards/margins": 1.2894110679626465, + "rewards/rejected": -1.437520980834961, + "step": 7854 + }, + { + "epoch": 0.91, + "learning_rate": 2.876038862226384e-08, + "logits/chosen": -3.380932331085205, + "logits/rejected": -3.9840683937072754, + "logps/chosen": -91.88964080810547, + "logps/rejected": -224.34095764160156, + "loss": 0.1713, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2748192846775055, + "rewards/margins": 3.480587959289551, + "rewards/rejected": -3.2057688236236572, + "step": 7855 + }, + { + "epoch": 0.91, + "learning_rate": 2.8725272152639587e-08, + "logits/chosen": -2.2590932846069336, + "logits/rejected": -2.3274283409118652, + "logps/chosen": -303.5101318359375, + "logps/rejected": -297.15191650390625, + "loss": 0.391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4728449881076813, + "rewards/margins": 1.832222819328308, + "rewards/rejected": -2.305067777633667, + "step": 7856 + }, + { + "epoch": 0.91, + "learning_rate": 2.869015568301533e-08, + "logits/chosen": -2.801412582397461, + "logits/rejected": -2.5191047191619873, + "logps/chosen": -203.75830078125, + "logps/rejected": -176.73727416992188, + "loss": 0.2634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04368014633655548, + "rewards/margins": 1.4589707851409912, + "rewards/rejected": -1.5026509761810303, + "step": 7857 + }, + { + "epoch": 0.91, + "learning_rate": 2.8655039213391078e-08, + "logits/chosen": -2.69659423828125, + "logits/rejected": -3.0463767051696777, + "logps/chosen": -192.69546508789062, + "logps/rejected": -172.5802459716797, + "loss": 0.4869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6653135418891907, + "rewards/margins": 1.454699993133545, + "rewards/rejected": -2.120013475418091, + "step": 7858 + }, + { + "epoch": 0.91, + "learning_rate": 2.8619922743766825e-08, + "logits/chosen": -2.970149040222168, + "logits/rejected": -2.948598861694336, + "logps/chosen": -199.190185546875, + "logps/rejected": -179.20370483398438, + "loss": 0.3944, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21554023027420044, + "rewards/margins": 2.0987606048583984, + "rewards/rejected": -1.8832203149795532, + "step": 7859 + }, + { + "epoch": 0.91, + "learning_rate": 2.8584806274142573e-08, + "logits/chosen": -2.7225728034973145, + "logits/rejected": -2.8144214153289795, + "logps/chosen": -216.82403564453125, + "logps/rejected": -301.126708984375, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6521344184875488, + "rewards/margins": 1.967595100402832, + "rewards/rejected": -2.6197292804718018, + "step": 7860 + }, + { + "epoch": 0.91, + "learning_rate": 2.8549689804518317e-08, + "logits/chosen": -3.2249643802642822, + "logits/rejected": -3.240175485610962, + "logps/chosen": -266.2273254394531, + "logps/rejected": -271.55987548828125, + "loss": 0.3854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24593773484230042, + "rewards/margins": 1.7364988327026367, + "rewards/rejected": -1.9824365377426147, + "step": 7861 + }, + { + "epoch": 0.91, + "learning_rate": 2.8514573334894064e-08, + "logits/chosen": -3.0681686401367188, + "logits/rejected": -3.340385675430298, + "logps/chosen": -89.01580047607422, + "logps/rejected": -234.29681396484375, + "loss": 0.3287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5759316086769104, + "rewards/margins": 2.196342706680298, + "rewards/rejected": -2.7722742557525635, + "step": 7862 + }, + { + "epoch": 0.91, + "learning_rate": 2.847945686526981e-08, + "logits/chosen": -2.6855578422546387, + "logits/rejected": -2.8005542755126953, + "logps/chosen": -171.07489013671875, + "logps/rejected": -170.78277587890625, + "loss": 0.52, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5896208882331848, + "rewards/margins": 1.089625597000122, + "rewards/rejected": -1.6792463064193726, + "step": 7863 + }, + { + "epoch": 0.91, + "learning_rate": 2.8444340395645558e-08, + "logits/chosen": -3.261836528778076, + "logits/rejected": -3.0541462898254395, + "logps/chosen": -275.8838195800781, + "logps/rejected": -213.26828002929688, + "loss": 0.3057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11604300141334534, + "rewards/margins": 2.511101722717285, + "rewards/rejected": -2.6271448135375977, + "step": 7864 + }, + { + "epoch": 0.91, + "learning_rate": 2.8409223926021302e-08, + "logits/chosen": -2.9331276416778564, + "logits/rejected": -3.182896852493286, + "logps/chosen": -312.85699462890625, + "logps/rejected": -384.49169921875, + "loss": 0.3113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.664020299911499, + "rewards/margins": 2.1916065216064453, + "rewards/rejected": -2.8556270599365234, + "step": 7865 + }, + { + "epoch": 0.91, + "learning_rate": 2.837410745639705e-08, + "logits/chosen": -3.4103686809539795, + "logits/rejected": -3.410764455795288, + "logps/chosen": -231.1832275390625, + "logps/rejected": -271.01727294921875, + "loss": 0.2596, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2503363788127899, + "rewards/margins": 2.4205029010772705, + "rewards/rejected": -2.170166492462158, + "step": 7866 + }, + { + "epoch": 0.91, + "learning_rate": 2.8338990986772797e-08, + "logits/chosen": -2.976512908935547, + "logits/rejected": -3.441013813018799, + "logps/chosen": -282.9111328125, + "logps/rejected": -303.06060791015625, + "loss": 0.5314, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15608254075050354, + "rewards/margins": 1.527730107307434, + "rewards/rejected": -1.6838124990463257, + "step": 7867 + }, + { + "epoch": 0.91, + "learning_rate": 2.830387451714854e-08, + "logits/chosen": -2.2706379890441895, + "logits/rejected": -2.304987907409668, + "logps/chosen": -223.05523681640625, + "logps/rejected": -255.6548614501953, + "loss": 0.3925, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2698545455932617, + "rewards/margins": 2.0810041427612305, + "rewards/rejected": -2.350858449935913, + "step": 7868 + }, + { + "epoch": 0.91, + "learning_rate": 2.8268758047524288e-08, + "logits/chosen": -3.187497615814209, + "logits/rejected": -2.9333109855651855, + "logps/chosen": -155.69381713867188, + "logps/rejected": -190.18092346191406, + "loss": 0.432, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15649613738059998, + "rewards/margins": 1.3555160760879517, + "rewards/rejected": -1.1990197896957397, + "step": 7869 + }, + { + "epoch": 0.91, + "learning_rate": 2.8233641577900035e-08, + "logits/chosen": -3.616450071334839, + "logits/rejected": -3.5422778129577637, + "logps/chosen": -257.33001708984375, + "logps/rejected": -404.42913818359375, + "loss": 0.3182, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22646081447601318, + "rewards/margins": 2.4391775131225586, + "rewards/rejected": -2.6656386852264404, + "step": 7870 + }, + { + "epoch": 0.91, + "learning_rate": 2.8198525108275782e-08, + "logits/chosen": -3.0948143005371094, + "logits/rejected": -3.099029541015625, + "logps/chosen": -182.31103515625, + "logps/rejected": -218.478515625, + "loss": 0.4845, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.056755825877189636, + "rewards/margins": 0.9922435879707336, + "rewards/rejected": -1.0489994287490845, + "step": 7871 + }, + { + "epoch": 0.91, + "learning_rate": 2.8163408638651526e-08, + "logits/chosen": -2.631380081176758, + "logits/rejected": -2.7525432109832764, + "logps/chosen": -227.43600463867188, + "logps/rejected": -293.2517395019531, + "loss": 0.5307, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8211767077445984, + "rewards/margins": 2.4203782081604004, + "rewards/rejected": -3.2415549755096436, + "step": 7872 + }, + { + "epoch": 0.91, + "learning_rate": 2.8128292169027273e-08, + "logits/chosen": -3.1462223529815674, + "logits/rejected": -2.664015293121338, + "logps/chosen": -197.06451416015625, + "logps/rejected": -236.86895751953125, + "loss": 0.2916, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25734424591064453, + "rewards/margins": 1.530692458152771, + "rewards/rejected": -1.7880367040634155, + "step": 7873 + }, + { + "epoch": 0.91, + "learning_rate": 2.809317569940302e-08, + "logits/chosen": -3.772977352142334, + "logits/rejected": -3.680572509765625, + "logps/chosen": -233.95852661132812, + "logps/rejected": -252.161865234375, + "loss": 0.7992, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7417940497398376, + "rewards/margins": 0.8490232229232788, + "rewards/rejected": -1.5908172130584717, + "step": 7874 + }, + { + "epoch": 0.91, + "learning_rate": 2.8058059229778765e-08, + "logits/chosen": -3.300304412841797, + "logits/rejected": -3.079775810241699, + "logps/chosen": -151.00369262695312, + "logps/rejected": -285.8630065917969, + "loss": 0.5848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49417024850845337, + "rewards/margins": 0.948479413986206, + "rewards/rejected": -1.4426497220993042, + "step": 7875 + }, + { + "epoch": 0.91, + "learning_rate": 2.8022942760154512e-08, + "logits/chosen": -3.6560425758361816, + "logits/rejected": -3.9088521003723145, + "logps/chosen": -202.6317138671875, + "logps/rejected": -333.2917175292969, + "loss": 0.5155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30315718054771423, + "rewards/margins": 3.923092842102051, + "rewards/rejected": -4.226250648498535, + "step": 7876 + }, + { + "epoch": 0.91, + "learning_rate": 2.798782629053026e-08, + "logits/chosen": -2.489370584487915, + "logits/rejected": -2.486433982849121, + "logps/chosen": -263.4953918457031, + "logps/rejected": -313.1733093261719, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2138318121433258, + "rewards/margins": 2.683227300643921, + "rewards/rejected": -2.469395637512207, + "step": 7877 + }, + { + "epoch": 0.91, + "learning_rate": 2.7952709820906006e-08, + "logits/chosen": -2.634549856185913, + "logits/rejected": -3.066575765609741, + "logps/chosen": -480.6684875488281, + "logps/rejected": -320.5803527832031, + "loss": 0.3566, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01476326584815979, + "rewards/margins": 1.9138613939285278, + "rewards/rejected": -1.8990981578826904, + "step": 7878 + }, + { + "epoch": 0.91, + "learning_rate": 2.791759335128175e-08, + "logits/chosen": -3.1958718299865723, + "logits/rejected": -2.8537871837615967, + "logps/chosen": -303.78094482421875, + "logps/rejected": -272.33685302734375, + "loss": 0.2241, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19052666425704956, + "rewards/margins": 2.699345111846924, + "rewards/rejected": -2.5088186264038086, + "step": 7879 + }, + { + "epoch": 0.91, + "learning_rate": 2.7882476881657498e-08, + "logits/chosen": -3.307584762573242, + "logits/rejected": -3.6234841346740723, + "logps/chosen": -127.79502868652344, + "logps/rejected": -328.2816162109375, + "loss": 0.1319, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019870907068252563, + "rewards/margins": 4.384125232696533, + "rewards/rejected": -4.364254474639893, + "step": 7880 + }, + { + "epoch": 0.91, + "learning_rate": 2.7847360412033245e-08, + "logits/chosen": -3.267671823501587, + "logits/rejected": -3.2538702487945557, + "logps/chosen": -296.01837158203125, + "logps/rejected": -295.0340270996094, + "loss": 0.9895, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38569048047065735, + "rewards/margins": 0.42762237787246704, + "rewards/rejected": -0.8133128881454468, + "step": 7881 + }, + { + "epoch": 0.91, + "learning_rate": 2.781224394240899e-08, + "logits/chosen": -3.6124510765075684, + "logits/rejected": -3.346214771270752, + "logps/chosen": -449.00482177734375, + "logps/rejected": -227.291748046875, + "loss": 0.3481, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26432085037231445, + "rewards/margins": 2.1947319507598877, + "rewards/rejected": -1.9304108619689941, + "step": 7882 + }, + { + "epoch": 0.91, + "learning_rate": 2.7777127472784736e-08, + "logits/chosen": -2.936983346939087, + "logits/rejected": -2.981520652770996, + "logps/chosen": -175.75306701660156, + "logps/rejected": -174.70826721191406, + "loss": 0.3671, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0986546128988266, + "rewards/margins": 1.3175843954086304, + "rewards/rejected": -1.2189298868179321, + "step": 7883 + }, + { + "epoch": 0.91, + "learning_rate": 2.7742011003160483e-08, + "logits/chosen": -2.891658306121826, + "logits/rejected": -2.9772136211395264, + "logps/chosen": -268.66192626953125, + "logps/rejected": -497.1708068847656, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.352385938167572, + "rewards/margins": 2.681187391281128, + "rewards/rejected": -2.3288016319274902, + "step": 7884 + }, + { + "epoch": 0.91, + "learning_rate": 2.770689453353623e-08, + "logits/chosen": -3.0808560848236084, + "logits/rejected": -2.975882053375244, + "logps/chosen": -271.6749572753906, + "logps/rejected": -260.585693359375, + "loss": 0.7898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.574550986289978, + "rewards/margins": 0.7112786769866943, + "rewards/rejected": -1.2858296632766724, + "step": 7885 + }, + { + "epoch": 0.91, + "learning_rate": 2.7671778063911974e-08, + "logits/chosen": -3.112577199935913, + "logits/rejected": -3.4017016887664795, + "logps/chosen": -243.94732666015625, + "logps/rejected": -322.32257080078125, + "loss": 0.2678, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.009979836642742157, + "rewards/margins": 2.3615102767944336, + "rewards/rejected": -2.3515305519104004, + "step": 7886 + }, + { + "epoch": 0.91, + "learning_rate": 2.7636661594287718e-08, + "logits/chosen": -2.8162436485290527, + "logits/rejected": -2.987809181213379, + "logps/chosen": -258.6773986816406, + "logps/rejected": -158.58470153808594, + "loss": 0.4371, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05953305959701538, + "rewards/margins": 1.2956877946853638, + "rewards/rejected": -1.2361546754837036, + "step": 7887 + }, + { + "epoch": 0.91, + "learning_rate": 2.7601545124663465e-08, + "logits/chosen": -2.730985403060913, + "logits/rejected": -2.709660530090332, + "logps/chosen": -235.2060546875, + "logps/rejected": -319.4970397949219, + "loss": 0.4298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21528835594654083, + "rewards/margins": 1.7959524393081665, + "rewards/rejected": -2.0112407207489014, + "step": 7888 + }, + { + "epoch": 0.91, + "learning_rate": 2.756642865503921e-08, + "logits/chosen": -2.9345836639404297, + "logits/rejected": -2.687929153442383, + "logps/chosen": -219.58438110351562, + "logps/rejected": -206.67825317382812, + "loss": 0.2475, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7437154054641724, + "rewards/margins": 2.4127626419067383, + "rewards/rejected": -1.6690471172332764, + "step": 7889 + }, + { + "epoch": 0.91, + "learning_rate": 2.7531312185414957e-08, + "logits/chosen": -3.2612295150756836, + "logits/rejected": -3.310053825378418, + "logps/chosen": -144.12173461914062, + "logps/rejected": -181.5188751220703, + "loss": 0.5019, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13782274723052979, + "rewards/margins": 0.6889849305152893, + "rewards/rejected": -0.5511621832847595, + "step": 7890 + }, + { + "epoch": 0.91, + "learning_rate": 2.7496195715790704e-08, + "logits/chosen": -2.817446708679199, + "logits/rejected": -2.868762493133545, + "logps/chosen": -220.50408935546875, + "logps/rejected": -274.2353515625, + "loss": 0.3007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12293568253517151, + "rewards/margins": 1.6589994430541992, + "rewards/rejected": -1.7819350957870483, + "step": 7891 + }, + { + "epoch": 0.91, + "learning_rate": 2.7461079246166448e-08, + "logits/chosen": -2.9165406227111816, + "logits/rejected": -3.0869510173797607, + "logps/chosen": -313.349853515625, + "logps/rejected": -240.09695434570312, + "loss": 0.3508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09044202417135239, + "rewards/margins": 1.8668770790100098, + "rewards/rejected": -1.9573192596435547, + "step": 7892 + }, + { + "epoch": 0.91, + "learning_rate": 2.7425962776542195e-08, + "logits/chosen": -2.560563087463379, + "logits/rejected": -2.470278739929199, + "logps/chosen": -244.05540466308594, + "logps/rejected": -181.59225463867188, + "loss": 0.2634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015650849789381027, + "rewards/margins": 1.3314273357391357, + "rewards/rejected": -1.3470782041549683, + "step": 7893 + }, + { + "epoch": 0.91, + "learning_rate": 2.7390846306917942e-08, + "logits/chosen": -3.3210389614105225, + "logits/rejected": -3.37764310836792, + "logps/chosen": -246.11795043945312, + "logps/rejected": -286.30517578125, + "loss": 0.1285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3584965169429779, + "rewards/margins": 2.90488338470459, + "rewards/rejected": -2.54638671875, + "step": 7894 + }, + { + "epoch": 0.91, + "learning_rate": 2.735572983729369e-08, + "logits/chosen": -3.4864871501922607, + "logits/rejected": -3.3597018718719482, + "logps/chosen": -171.92633056640625, + "logps/rejected": -192.95787048339844, + "loss": 0.4126, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07447750866413116, + "rewards/margins": 1.434267282485962, + "rewards/rejected": -1.3597898483276367, + "step": 7895 + }, + { + "epoch": 0.91, + "learning_rate": 2.7320613367669433e-08, + "logits/chosen": -3.5664775371551514, + "logits/rejected": -3.6361324787139893, + "logps/chosen": -321.92657470703125, + "logps/rejected": -283.3940124511719, + "loss": 0.5194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2843543589115143, + "rewards/margins": 1.7879265546798706, + "rewards/rejected": -2.0722808837890625, + "step": 7896 + }, + { + "epoch": 0.91, + "learning_rate": 2.728549689804518e-08, + "logits/chosen": -2.695937395095825, + "logits/rejected": -2.810237407684326, + "logps/chosen": -323.49530029296875, + "logps/rejected": -237.6997528076172, + "loss": 0.3613, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11599382013082504, + "rewards/margins": 1.2745610475540161, + "rewards/rejected": -1.1585673093795776, + "step": 7897 + }, + { + "epoch": 0.91, + "learning_rate": 2.7250380428420928e-08, + "logits/chosen": -3.2684009075164795, + "logits/rejected": -3.454017400741577, + "logps/chosen": -269.02105712890625, + "logps/rejected": -129.60650634765625, + "loss": 0.2993, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.051874756813049316, + "rewards/margins": 1.8590662479400635, + "rewards/rejected": -1.8071916103363037, + "step": 7898 + }, + { + "epoch": 0.91, + "learning_rate": 2.7215263958796672e-08, + "logits/chosen": -3.5919623374938965, + "logits/rejected": -3.2112557888031006, + "logps/chosen": -230.43789672851562, + "logps/rejected": -209.43055725097656, + "loss": 0.6966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5634943842887878, + "rewards/margins": 0.733788013458252, + "rewards/rejected": -1.2972824573516846, + "step": 7899 + }, + { + "epoch": 0.91, + "learning_rate": 2.718014748917242e-08, + "logits/chosen": -4.027463912963867, + "logits/rejected": -3.604475259780884, + "logps/chosen": -212.96249389648438, + "logps/rejected": -147.77073669433594, + "loss": 0.4185, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03131598234176636, + "rewards/margins": 1.8702412843704224, + "rewards/rejected": -1.8389253616333008, + "step": 7900 + }, + { + "epoch": 0.91, + "learning_rate": 2.7145031019548166e-08, + "logits/chosen": -2.8121705055236816, + "logits/rejected": -3.056072235107422, + "logps/chosen": -302.20208740234375, + "logps/rejected": -316.0823974609375, + "loss": 0.3486, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23209147155284882, + "rewards/margins": 2.0709238052368164, + "rewards/rejected": -2.3030154705047607, + "step": 7901 + }, + { + "epoch": 0.91, + "learning_rate": 2.7109914549923914e-08, + "logits/chosen": -3.1160337924957275, + "logits/rejected": -2.970256805419922, + "logps/chosen": -260.2726745605469, + "logps/rejected": -256.4873962402344, + "loss": 0.2167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5281132459640503, + "rewards/margins": 1.973752737045288, + "rewards/rejected": -2.501866102218628, + "step": 7902 + }, + { + "epoch": 0.91, + "learning_rate": 2.7074798080299657e-08, + "logits/chosen": -3.4578540325164795, + "logits/rejected": -3.471820592880249, + "logps/chosen": -274.9444274902344, + "logps/rejected": -230.9774627685547, + "loss": 0.4355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22982528805732727, + "rewards/margins": 2.7048535346984863, + "rewards/rejected": -2.934678792953491, + "step": 7903 + }, + { + "epoch": 0.91, + "learning_rate": 2.7039681610675405e-08, + "logits/chosen": -2.650062322616577, + "logits/rejected": -2.726612091064453, + "logps/chosen": -394.8077697753906, + "logps/rejected": -352.046875, + "loss": 0.3408, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47377437353134155, + "rewards/margins": 2.282484531402588, + "rewards/rejected": -1.8087100982666016, + "step": 7904 + }, + { + "epoch": 0.91, + "learning_rate": 2.7004565141051152e-08, + "logits/chosen": -3.3896634578704834, + "logits/rejected": -3.118875026702881, + "logps/chosen": -254.33847045898438, + "logps/rejected": -256.6885681152344, + "loss": 0.358, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3688364624977112, + "rewards/margins": 1.7750738859176636, + "rewards/rejected": -1.4062374830245972, + "step": 7905 + }, + { + "epoch": 0.91, + "learning_rate": 2.69694486714269e-08, + "logits/chosen": -3.078901767730713, + "logits/rejected": -3.3975675106048584, + "logps/chosen": -275.5174560546875, + "logps/rejected": -150.14845275878906, + "loss": 0.6842, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5812190175056458, + "rewards/margins": 0.6251519322395325, + "rewards/rejected": -1.2063709497451782, + "step": 7906 + }, + { + "epoch": 0.91, + "learning_rate": 2.6934332201802643e-08, + "logits/chosen": -3.1360511779785156, + "logits/rejected": -3.3958992958068848, + "logps/chosen": -385.2325439453125, + "logps/rejected": -250.38668823242188, + "loss": 0.6845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10780245810747147, + "rewards/margins": 0.6432649493217468, + "rewards/rejected": -0.7510673999786377, + "step": 7907 + }, + { + "epoch": 0.91, + "learning_rate": 2.689921573217839e-08, + "logits/chosen": -3.8994977474212646, + "logits/rejected": -3.429996967315674, + "logps/chosen": -276.8229675292969, + "logps/rejected": -216.77377319335938, + "loss": 0.2155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12388042360544205, + "rewards/margins": 2.059345006942749, + "rewards/rejected": -1.93546462059021, + "step": 7908 + }, + { + "epoch": 0.91, + "learning_rate": 2.6864099262554138e-08, + "logits/chosen": -2.5549445152282715, + "logits/rejected": -2.5712807178497314, + "logps/chosen": -260.4879150390625, + "logps/rejected": -192.489013671875, + "loss": 0.5344, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19378383457660675, + "rewards/margins": 0.7703691720962524, + "rewards/rejected": -0.5765852332115173, + "step": 7909 + }, + { + "epoch": 0.91, + "learning_rate": 2.682898279292988e-08, + "logits/chosen": -2.973583698272705, + "logits/rejected": -3.012664318084717, + "logps/chosen": -154.38900756835938, + "logps/rejected": -270.981201171875, + "loss": 0.3534, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.422580748796463, + "rewards/margins": 1.9436957836151123, + "rewards/rejected": -2.366276502609253, + "step": 7910 + }, + { + "epoch": 0.91, + "learning_rate": 2.679386632330563e-08, + "logits/chosen": -2.9126954078674316, + "logits/rejected": -2.9284260272979736, + "logps/chosen": -132.50607299804688, + "logps/rejected": -240.022705078125, + "loss": 0.5689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40902048349380493, + "rewards/margins": 1.454905390739441, + "rewards/rejected": -1.8639259338378906, + "step": 7911 + }, + { + "epoch": 0.91, + "learning_rate": 2.6758749853681376e-08, + "logits/chosen": -3.243450164794922, + "logits/rejected": -3.0728838443756104, + "logps/chosen": -154.16836547851562, + "logps/rejected": -214.31362915039062, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.003006555140018463, + "rewards/margins": 1.80141282081604, + "rewards/rejected": -1.7984063625335693, + "step": 7912 + }, + { + "epoch": 0.91, + "learning_rate": 2.6723633384057123e-08, + "logits/chosen": -3.4315383434295654, + "logits/rejected": -3.5121498107910156, + "logps/chosen": -465.10430908203125, + "logps/rejected": -407.72296142578125, + "loss": 0.2166, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3533813953399658, + "rewards/margins": 2.2711703777313232, + "rewards/rejected": -1.9177889823913574, + "step": 7913 + }, + { + "epoch": 0.91, + "learning_rate": 2.6688516914432867e-08, + "logits/chosen": -2.97613787651062, + "logits/rejected": -3.162426710128784, + "logps/chosen": -623.1483764648438, + "logps/rejected": -288.71356201171875, + "loss": 0.54, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6821424961090088, + "rewards/margins": 0.8123636245727539, + "rewards/rejected": -1.4945061206817627, + "step": 7914 + }, + { + "epoch": 0.91, + "learning_rate": 2.6653400444808614e-08, + "logits/chosen": -3.4631102085113525, + "logits/rejected": -3.1367695331573486, + "logps/chosen": -246.68331909179688, + "logps/rejected": -260.56341552734375, + "loss": 0.1488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.197942852973938, + "rewards/margins": 3.3274903297424316, + "rewards/rejected": -3.129547595977783, + "step": 7915 + }, + { + "epoch": 0.91, + "learning_rate": 2.661828397518436e-08, + "logits/chosen": -3.1834874153137207, + "logits/rejected": -3.4566903114318848, + "logps/chosen": -265.06097412109375, + "logps/rejected": -244.05780029296875, + "loss": 0.3483, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10610383749008179, + "rewards/margins": 1.797206163406372, + "rewards/rejected": -1.9033100605010986, + "step": 7916 + }, + { + "epoch": 0.91, + "learning_rate": 2.6583167505560106e-08, + "logits/chosen": -3.790099859237671, + "logits/rejected": -3.383934259414673, + "logps/chosen": -330.03240966796875, + "logps/rejected": -313.65985107421875, + "loss": 0.2171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024107974022626877, + "rewards/margins": 3.2243809700012207, + "rewards/rejected": -3.248488664627075, + "step": 7917 + }, + { + "epoch": 0.91, + "learning_rate": 2.6548051035935853e-08, + "logits/chosen": -3.014209270477295, + "logits/rejected": -2.916205406188965, + "logps/chosen": -265.4150390625, + "logps/rejected": -230.60015869140625, + "loss": 0.3307, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16199035942554474, + "rewards/margins": 2.5398664474487305, + "rewards/rejected": -2.3778762817382812, + "step": 7918 + }, + { + "epoch": 0.91, + "learning_rate": 2.65129345663116e-08, + "logits/chosen": -2.4960522651672363, + "logits/rejected": -2.6684045791625977, + "logps/chosen": -321.713134765625, + "logps/rejected": -290.3084716796875, + "loss": 0.1681, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38368362188339233, + "rewards/margins": 2.7201507091522217, + "rewards/rejected": -2.3364672660827637, + "step": 7919 + }, + { + "epoch": 0.91, + "learning_rate": 2.6477818096687347e-08, + "logits/chosen": -2.906826972961426, + "logits/rejected": -2.8093433380126953, + "logps/chosen": -350.14129638671875, + "logps/rejected": -262.18115234375, + "loss": 0.2332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0791972205042839, + "rewards/margins": 2.1768746376037598, + "rewards/rejected": -2.2560718059539795, + "step": 7920 + }, + { + "epoch": 0.91, + "learning_rate": 2.644270162706309e-08, + "logits/chosen": -3.1663944721221924, + "logits/rejected": -3.010119915008545, + "logps/chosen": -218.75100708007812, + "logps/rejected": -225.37091064453125, + "loss": 0.3221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22143548727035522, + "rewards/margins": 2.568321704864502, + "rewards/rejected": -2.789757251739502, + "step": 7921 + }, + { + "epoch": 0.91, + "learning_rate": 2.640758515743884e-08, + "logits/chosen": -2.877117872238159, + "logits/rejected": -3.0283703804016113, + "logps/chosen": -129.39622497558594, + "logps/rejected": -255.99874877929688, + "loss": 0.691, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5295476913452148, + "rewards/margins": 0.6724177002906799, + "rewards/rejected": -1.2019654512405396, + "step": 7922 + }, + { + "epoch": 0.91, + "learning_rate": 2.6372468687814586e-08, + "logits/chosen": -2.7801549434661865, + "logits/rejected": -2.735271692276001, + "logps/chosen": -249.81961059570312, + "logps/rejected": -183.65936279296875, + "loss": 0.1418, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3297932744026184, + "rewards/margins": 2.4324135780334473, + "rewards/rejected": -2.1026203632354736, + "step": 7923 + }, + { + "epoch": 0.91, + "learning_rate": 2.633735221819033e-08, + "logits/chosen": -2.4594147205352783, + "logits/rejected": -2.4480416774749756, + "logps/chosen": -386.9161376953125, + "logps/rejected": -285.9794616699219, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2700170874595642, + "rewards/margins": 1.8158276081085205, + "rewards/rejected": -2.0858447551727295, + "step": 7924 + }, + { + "epoch": 0.91, + "learning_rate": 2.6302235748566077e-08, + "logits/chosen": -3.1329729557037354, + "logits/rejected": -3.4244697093963623, + "logps/chosen": -399.0824890136719, + "logps/rejected": -232.18731689453125, + "loss": 0.1938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5136888027191162, + "rewards/margins": 2.299452543258667, + "rewards/rejected": -1.7857637405395508, + "step": 7925 + }, + { + "epoch": 0.91, + "learning_rate": 2.6267119278941824e-08, + "logits/chosen": -3.303642749786377, + "logits/rejected": -3.2839174270629883, + "logps/chosen": -151.54495239257812, + "logps/rejected": -199.93118286132812, + "loss": 0.1854, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6612294316291809, + "rewards/margins": 3.2654836177825928, + "rewards/rejected": -2.6042542457580566, + "step": 7926 + }, + { + "epoch": 0.91, + "learning_rate": 2.623200280931757e-08, + "logits/chosen": -2.9629931449890137, + "logits/rejected": -2.4939796924591064, + "logps/chosen": -366.44586181640625, + "logps/rejected": -331.01690673828125, + "loss": 0.4422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5571781396865845, + "rewards/margins": 1.5492804050445557, + "rewards/rejected": -2.1064586639404297, + "step": 7927 + }, + { + "epoch": 0.91, + "learning_rate": 2.6196886339693315e-08, + "logits/chosen": -2.859267234802246, + "logits/rejected": -2.9977216720581055, + "logps/chosen": -460.93475341796875, + "logps/rejected": -161.92727661132812, + "loss": 0.8193, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.33396315574645996, + "rewards/margins": 0.6467481851577759, + "rewards/rejected": -0.9807113409042358, + "step": 7928 + }, + { + "epoch": 0.91, + "learning_rate": 2.6161769870069063e-08, + "logits/chosen": -2.709243059158325, + "logits/rejected": -2.981123924255371, + "logps/chosen": -266.0618896484375, + "logps/rejected": -301.1204528808594, + "loss": 0.4513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2805801331996918, + "rewards/margins": 1.4517219066619873, + "rewards/rejected": -1.732301950454712, + "step": 7929 + }, + { + "epoch": 0.91, + "learning_rate": 2.612665340044481e-08, + "logits/chosen": -2.61173415184021, + "logits/rejected": -2.776883125305176, + "logps/chosen": -481.8065185546875, + "logps/rejected": -298.9688415527344, + "loss": 0.2557, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2324690818786621, + "rewards/margins": 3.3635215759277344, + "rewards/rejected": -3.1310524940490723, + "step": 7930 + }, + { + "epoch": 0.91, + "learning_rate": 2.6091536930820557e-08, + "logits/chosen": -3.152933120727539, + "logits/rejected": -3.1133902072906494, + "logps/chosen": -351.9236145019531, + "logps/rejected": -251.25054931640625, + "loss": 0.4405, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.282807856798172, + "rewards/margins": 1.6621642112731934, + "rewards/rejected": -1.3793562650680542, + "step": 7931 + }, + { + "epoch": 0.91, + "learning_rate": 2.60564204611963e-08, + "logits/chosen": -3.7819626331329346, + "logits/rejected": -3.4273838996887207, + "logps/chosen": -441.37396240234375, + "logps/rejected": -303.412109375, + "loss": 0.3585, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4512269198894501, + "rewards/margins": 2.3583033084869385, + "rewards/rejected": -2.809530258178711, + "step": 7932 + }, + { + "epoch": 0.91, + "learning_rate": 2.6021303991572048e-08, + "logits/chosen": -3.1561684608459473, + "logits/rejected": -2.8332066535949707, + "logps/chosen": -435.2096252441406, + "logps/rejected": -362.68121337890625, + "loss": 0.185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37212514877319336, + "rewards/margins": 2.327362537384033, + "rewards/rejected": -1.9552375078201294, + "step": 7933 + }, + { + "epoch": 0.91, + "learning_rate": 2.5986187521947795e-08, + "logits/chosen": -3.251254081726074, + "logits/rejected": -3.2636234760284424, + "logps/chosen": -163.11953735351562, + "logps/rejected": -212.18124389648438, + "loss": 0.3818, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0008269846439361572, + "rewards/margins": 2.405949354171753, + "rewards/rejected": -2.4051222801208496, + "step": 7934 + }, + { + "epoch": 0.91, + "learning_rate": 2.5951071052323536e-08, + "logits/chosen": -3.4906299114227295, + "logits/rejected": -3.493117094039917, + "logps/chosen": -211.20407104492188, + "logps/rejected": -162.78024291992188, + "loss": 0.344, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33041495084762573, + "rewards/margins": 1.8704743385314941, + "rewards/rejected": -1.5400593280792236, + "step": 7935 + }, + { + "epoch": 0.91, + "learning_rate": 2.5915954582699283e-08, + "logits/chosen": -3.649817943572998, + "logits/rejected": -3.2017147541046143, + "logps/chosen": -300.1605224609375, + "logps/rejected": -173.2865447998047, + "loss": 0.3426, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6705621480941772, + "rewards/margins": 2.455692768096924, + "rewards/rejected": -1.785130500793457, + "step": 7936 + }, + { + "epoch": 0.91, + "learning_rate": 2.588083811307503e-08, + "logits/chosen": -3.5021328926086426, + "logits/rejected": -3.28826904296875, + "logps/chosen": -303.6654357910156, + "logps/rejected": -160.97396850585938, + "loss": 0.3252, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28051722049713135, + "rewards/margins": 1.4995503425598145, + "rewards/rejected": -1.2190332412719727, + "step": 7937 + }, + { + "epoch": 0.92, + "learning_rate": 2.5845721643450774e-08, + "logits/chosen": -3.2079715728759766, + "logits/rejected": -2.870375156402588, + "logps/chosen": -261.8691101074219, + "logps/rejected": -229.22161865234375, + "loss": 0.5288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8226915597915649, + "rewards/margins": 0.711804986000061, + "rewards/rejected": -1.534496545791626, + "step": 7938 + }, + { + "epoch": 0.92, + "learning_rate": 2.581060517382652e-08, + "logits/chosen": -3.7844393253326416, + "logits/rejected": -3.6793339252471924, + "logps/chosen": -128.09764099121094, + "logps/rejected": -192.2578582763672, + "loss": 0.3379, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7517159581184387, + "rewards/margins": 1.8626763820648193, + "rewards/rejected": -2.6143925189971924, + "step": 7939 + }, + { + "epoch": 0.92, + "learning_rate": 2.577548870420227e-08, + "logits/chosen": -3.0112953186035156, + "logits/rejected": -2.7731447219848633, + "logps/chosen": -261.6444091796875, + "logps/rejected": -354.0215148925781, + "loss": 0.1826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10158685594797134, + "rewards/margins": 3.353416919708252, + "rewards/rejected": -3.4550042152404785, + "step": 7940 + }, + { + "epoch": 0.92, + "learning_rate": 2.5740372234578013e-08, + "logits/chosen": -3.112687826156616, + "logits/rejected": -2.903775930404663, + "logps/chosen": -229.68714904785156, + "logps/rejected": -347.0059509277344, + "loss": 0.3166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47899460792541504, + "rewards/margins": 1.352095127105713, + "rewards/rejected": -0.8731005191802979, + "step": 7941 + }, + { + "epoch": 0.92, + "learning_rate": 2.570525576495376e-08, + "logits/chosen": -3.2223713397979736, + "logits/rejected": -3.1203532218933105, + "logps/chosen": -312.038330078125, + "logps/rejected": -299.94873046875, + "loss": 0.2391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16063238680362701, + "rewards/margins": 2.8604888916015625, + "rewards/rejected": -3.0211212635040283, + "step": 7942 + }, + { + "epoch": 0.92, + "learning_rate": 2.5670139295329507e-08, + "logits/chosen": -3.619633674621582, + "logits/rejected": -3.680840492248535, + "logps/chosen": -353.4227294921875, + "logps/rejected": -286.24713134765625, + "loss": 0.2338, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12400294095277786, + "rewards/margins": 2.145894765853882, + "rewards/rejected": -2.0218915939331055, + "step": 7943 + }, + { + "epoch": 0.92, + "learning_rate": 2.5635022825705255e-08, + "logits/chosen": -2.846874237060547, + "logits/rejected": -3.0241894721984863, + "logps/chosen": -282.9507751464844, + "logps/rejected": -448.26727294921875, + "loss": 0.1688, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08168859779834747, + "rewards/margins": 2.887544631958008, + "rewards/rejected": -2.805856227874756, + "step": 7944 + }, + { + "epoch": 0.92, + "learning_rate": 2.5599906356081e-08, + "logits/chosen": -2.7325515747070312, + "logits/rejected": -2.809171199798584, + "logps/chosen": -258.73052978515625, + "logps/rejected": -252.80880737304688, + "loss": 0.6399, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9340596795082092, + "rewards/margins": 2.030808210372925, + "rewards/rejected": -2.9648678302764893, + "step": 7945 + }, + { + "epoch": 0.92, + "learning_rate": 2.5564789886456746e-08, + "logits/chosen": -2.9240994453430176, + "logits/rejected": -2.8437981605529785, + "logps/chosen": -261.47528076171875, + "logps/rejected": -274.9072570800781, + "loss": 0.3744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15589338541030884, + "rewards/margins": 1.4081629514694214, + "rewards/rejected": -1.564056396484375, + "step": 7946 + }, + { + "epoch": 0.92, + "learning_rate": 2.5529673416832493e-08, + "logits/chosen": -2.975680112838745, + "logits/rejected": -3.029426097869873, + "logps/chosen": -323.5231018066406, + "logps/rejected": -208.8004150390625, + "loss": 0.4295, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2919239401817322, + "rewards/margins": 1.6094872951507568, + "rewards/rejected": -1.9014112949371338, + "step": 7947 + }, + { + "epoch": 0.92, + "learning_rate": 2.549455694720824e-08, + "logits/chosen": -2.8188319206237793, + "logits/rejected": -2.863424301147461, + "logps/chosen": -251.12132263183594, + "logps/rejected": -243.0237579345703, + "loss": 0.6118, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07408449798822403, + "rewards/margins": 0.7702306509017944, + "rewards/rejected": -0.8443151712417603, + "step": 7948 + }, + { + "epoch": 0.92, + "learning_rate": 2.5459440477583984e-08, + "logits/chosen": -2.7839198112487793, + "logits/rejected": -3.020292043685913, + "logps/chosen": -101.5988540649414, + "logps/rejected": -95.53126525878906, + "loss": 0.3847, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.006349533796310425, + "rewards/margins": 1.4300590753555298, + "rewards/rejected": -1.4364086389541626, + "step": 7949 + }, + { + "epoch": 0.92, + "learning_rate": 2.542432400795973e-08, + "logits/chosen": -2.3427493572235107, + "logits/rejected": -2.6410818099975586, + "logps/chosen": -484.6178894042969, + "logps/rejected": -396.8139343261719, + "loss": 0.4795, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21733418107032776, + "rewards/margins": 1.455613374710083, + "rewards/rejected": -1.6729474067687988, + "step": 7950 + }, + { + "epoch": 0.92, + "learning_rate": 2.538920753833548e-08, + "logits/chosen": -3.341948986053467, + "logits/rejected": -2.842869281768799, + "logps/chosen": -307.70416259765625, + "logps/rejected": -291.4599304199219, + "loss": 0.6554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12628404796123505, + "rewards/margins": 1.4938112497329712, + "rewards/rejected": -1.6200952529907227, + "step": 7951 + }, + { + "epoch": 0.92, + "learning_rate": 2.5354091068711222e-08, + "logits/chosen": -2.6168055534362793, + "logits/rejected": -2.5283749103546143, + "logps/chosen": -288.567138671875, + "logps/rejected": -361.37353515625, + "loss": 0.1747, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.314666748046875, + "rewards/margins": 3.100090265274048, + "rewards/rejected": -2.785423755645752, + "step": 7952 + }, + { + "epoch": 0.92, + "learning_rate": 2.531897459908697e-08, + "logits/chosen": -3.1522672176361084, + "logits/rejected": -3.123307228088379, + "logps/chosen": -206.51446533203125, + "logps/rejected": -211.52706909179688, + "loss": 0.8763, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44226834177970886, + "rewards/margins": 0.3003653287887573, + "rewards/rejected": -0.7426337003707886, + "step": 7953 + }, + { + "epoch": 0.92, + "learning_rate": 2.5283858129462717e-08, + "logits/chosen": -3.1563916206359863, + "logits/rejected": -3.405907392501831, + "logps/chosen": -170.27017211914062, + "logps/rejected": -213.7570037841797, + "loss": 0.5951, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26567983627319336, + "rewards/margins": 1.003861427307129, + "rewards/rejected": -1.2695412635803223, + "step": 7954 + }, + { + "epoch": 0.92, + "learning_rate": 2.5248741659838464e-08, + "logits/chosen": -4.2164530754089355, + "logits/rejected": -3.8874258995056152, + "logps/chosen": -713.1478881835938, + "logps/rejected": -444.2315673828125, + "loss": 0.1111, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006337761878967285, + "rewards/margins": 3.0571205615997314, + "rewards/rejected": -3.0634584426879883, + "step": 7955 + }, + { + "epoch": 0.92, + "learning_rate": 2.5213625190214208e-08, + "logits/chosen": -2.900765895843506, + "logits/rejected": -2.9156994819641113, + "logps/chosen": -169.92568969726562, + "logps/rejected": -287.5469970703125, + "loss": 0.1925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1827513575553894, + "rewards/margins": 3.9212982654571533, + "rewards/rejected": -4.1040496826171875, + "step": 7956 + }, + { + "epoch": 0.92, + "learning_rate": 2.5178508720589955e-08, + "logits/chosen": -3.8262407779693604, + "logits/rejected": -3.6091339588165283, + "logps/chosen": -354.7514343261719, + "logps/rejected": -263.23968505859375, + "loss": 0.4057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.016332536935806274, + "rewards/margins": 1.1269266605377197, + "rewards/rejected": -1.1432591676712036, + "step": 7957 + }, + { + "epoch": 0.92, + "learning_rate": 2.5143392250965703e-08, + "logits/chosen": -3.5838255882263184, + "logits/rejected": -3.1417033672332764, + "logps/chosen": -298.9044494628906, + "logps/rejected": -256.43865966796875, + "loss": 0.119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30571049451828003, + "rewards/margins": 3.4428939819335938, + "rewards/rejected": -3.13718318939209, + "step": 7958 + }, + { + "epoch": 0.92, + "learning_rate": 2.5108275781341447e-08, + "logits/chosen": -2.2856366634368896, + "logits/rejected": -2.7725045680999756, + "logps/chosen": -245.8197021484375, + "logps/rejected": -316.387451171875, + "loss": 0.13, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20989608764648438, + "rewards/margins": 2.463804006576538, + "rewards/rejected": -2.2539079189300537, + "step": 7959 + }, + { + "epoch": 0.92, + "learning_rate": 2.5073159311717194e-08, + "logits/chosen": -3.3323707580566406, + "logits/rejected": -3.363924741744995, + "logps/chosen": -192.63897705078125, + "logps/rejected": -204.76016235351562, + "loss": 0.7221, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.297797441482544, + "rewards/margins": 0.5280490517616272, + "rewards/rejected": -1.8258464336395264, + "step": 7960 + }, + { + "epoch": 0.92, + "learning_rate": 2.503804284209294e-08, + "logits/chosen": -2.785418748855591, + "logits/rejected": -2.6745078563690186, + "logps/chosen": -452.9596252441406, + "logps/rejected": -364.5073547363281, + "loss": 0.4622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3088122606277466, + "rewards/margins": 1.1348011493682861, + "rewards/rejected": -1.4436134099960327, + "step": 7961 + }, + { + "epoch": 0.92, + "learning_rate": 2.5002926372468688e-08, + "logits/chosen": -3.2402262687683105, + "logits/rejected": -3.0054268836975098, + "logps/chosen": -231.65277099609375, + "logps/rejected": -295.12261962890625, + "loss": 0.2422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06966429948806763, + "rewards/margins": 2.129422426223755, + "rewards/rejected": -2.059757947921753, + "step": 7962 + }, + { + "epoch": 0.92, + "learning_rate": 2.4967809902844432e-08, + "logits/chosen": -2.662245512008667, + "logits/rejected": -2.740061044692993, + "logps/chosen": -320.7571105957031, + "logps/rejected": -271.4331359863281, + "loss": 0.5355, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35642755031585693, + "rewards/margins": 1.6217944622039795, + "rewards/rejected": -1.9782218933105469, + "step": 7963 + }, + { + "epoch": 0.92, + "learning_rate": 2.493269343322018e-08, + "logits/chosen": -2.7573680877685547, + "logits/rejected": -2.5379068851470947, + "logps/chosen": -261.2873840332031, + "logps/rejected": -280.81829833984375, + "loss": 0.5063, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36269548535346985, + "rewards/margins": 1.0935238599777222, + "rewards/rejected": -0.7308283448219299, + "step": 7964 + }, + { + "epoch": 0.92, + "learning_rate": 2.4897576963595927e-08, + "logits/chosen": -2.955077886581421, + "logits/rejected": -2.870755434036255, + "logps/chosen": -194.1336212158203, + "logps/rejected": -288.6918029785156, + "loss": 0.2323, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42971518635749817, + "rewards/margins": 2.7184877395629883, + "rewards/rejected": -2.2887725830078125, + "step": 7965 + }, + { + "epoch": 0.92, + "learning_rate": 2.4862460493971674e-08, + "logits/chosen": -3.3210458755493164, + "logits/rejected": -3.044792413711548, + "logps/chosen": -198.91635131835938, + "logps/rejected": -238.81149291992188, + "loss": 0.3515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3496764004230499, + "rewards/margins": 2.8651082515716553, + "rewards/rejected": -3.214784622192383, + "step": 7966 + }, + { + "epoch": 0.92, + "learning_rate": 2.4827344024347418e-08, + "logits/chosen": -2.655428886413574, + "logits/rejected": -2.953444242477417, + "logps/chosen": -218.8328857421875, + "logps/rejected": -219.76229858398438, + "loss": 0.3842, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08510073274374008, + "rewards/margins": 2.2906980514526367, + "rewards/rejected": -2.205597400665283, + "step": 7967 + }, + { + "epoch": 0.92, + "learning_rate": 2.4792227554723165e-08, + "logits/chosen": -3.479766368865967, + "logits/rejected": -3.4668102264404297, + "logps/chosen": -241.31056213378906, + "logps/rejected": -283.42034912109375, + "loss": 0.41, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3198361098766327, + "rewards/margins": 2.563183307647705, + "rewards/rejected": -2.243346929550171, + "step": 7968 + }, + { + "epoch": 0.92, + "learning_rate": 2.4757111085098912e-08, + "logits/chosen": -3.948929786682129, + "logits/rejected": -3.797544479370117, + "logps/chosen": -185.3629608154297, + "logps/rejected": -249.7896270751953, + "loss": 0.4294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3182118535041809, + "rewards/margins": 0.8638822436332703, + "rewards/rejected": -1.1820940971374512, + "step": 7969 + }, + { + "epoch": 0.92, + "learning_rate": 2.4721994615474656e-08, + "logits/chosen": -2.9249064922332764, + "logits/rejected": -3.0677285194396973, + "logps/chosen": -287.0469665527344, + "logps/rejected": -299.1763916015625, + "loss": 0.3716, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2654849886894226, + "rewards/margins": 2.8185348510742188, + "rewards/rejected": -3.084019660949707, + "step": 7970 + }, + { + "epoch": 0.92, + "learning_rate": 2.4686878145850403e-08, + "logits/chosen": -2.758213520050049, + "logits/rejected": -2.9135236740112305, + "logps/chosen": -304.6352233886719, + "logps/rejected": -268.0983581542969, + "loss": 0.3049, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13260594010353088, + "rewards/margins": 1.695177674293518, + "rewards/rejected": -1.5625717639923096, + "step": 7971 + }, + { + "epoch": 0.92, + "learning_rate": 2.465176167622615e-08, + "logits/chosen": -2.964590072631836, + "logits/rejected": -2.989288806915283, + "logps/chosen": -249.7349090576172, + "logps/rejected": -193.16195678710938, + "loss": 0.4111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1861923187971115, + "rewards/margins": 1.887162208557129, + "rewards/rejected": -2.073354721069336, + "step": 7972 + }, + { + "epoch": 0.92, + "learning_rate": 2.4616645206601898e-08, + "logits/chosen": -2.770979404449463, + "logits/rejected": -2.7881431579589844, + "logps/chosen": -433.44921875, + "logps/rejected": -349.163330078125, + "loss": 0.1828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47925668954849243, + "rewards/margins": 2.566767930984497, + "rewards/rejected": -3.0460243225097656, + "step": 7973 + }, + { + "epoch": 0.92, + "learning_rate": 2.4581528736977642e-08, + "logits/chosen": -3.7025179862976074, + "logits/rejected": -3.5895352363586426, + "logps/chosen": -320.334228515625, + "logps/rejected": -182.38491821289062, + "loss": 0.3893, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09739457815885544, + "rewards/margins": 2.0112223625183105, + "rewards/rejected": -2.108616828918457, + "step": 7974 + }, + { + "epoch": 0.92, + "learning_rate": 2.454641226735339e-08, + "logits/chosen": -3.7679660320281982, + "logits/rejected": -3.4078102111816406, + "logps/chosen": -352.31549072265625, + "logps/rejected": -270.2923278808594, + "loss": 0.3361, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4060327410697937, + "rewards/margins": 3.0678670406341553, + "rewards/rejected": -2.661834239959717, + "step": 7975 + }, + { + "epoch": 0.92, + "learning_rate": 2.4511295797729136e-08, + "logits/chosen": -3.1979784965515137, + "logits/rejected": -3.0890536308288574, + "logps/chosen": -203.72900390625, + "logps/rejected": -251.32192993164062, + "loss": 0.2217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08005403727293015, + "rewards/margins": 1.859372854232788, + "rewards/rejected": -1.9394267797470093, + "step": 7976 + }, + { + "epoch": 0.92, + "learning_rate": 2.447617932810488e-08, + "logits/chosen": -2.938526153564453, + "logits/rejected": -2.892923593521118, + "logps/chosen": -235.83287048339844, + "logps/rejected": -259.1390686035156, + "loss": 0.8747, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6366168856620789, + "rewards/margins": 0.8398971557617188, + "rewards/rejected": -1.4765139818191528, + "step": 7977 + }, + { + "epoch": 0.92, + "learning_rate": 2.4441062858480628e-08, + "logits/chosen": -3.272444725036621, + "logits/rejected": -3.118973970413208, + "logps/chosen": -266.59722900390625, + "logps/rejected": -224.6721954345703, + "loss": 0.4204, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38437366485595703, + "rewards/margins": 1.6319831609725952, + "rewards/rejected": -2.016356945037842, + "step": 7978 + }, + { + "epoch": 0.92, + "learning_rate": 2.4405946388856375e-08, + "logits/chosen": -3.2520971298217773, + "logits/rejected": -3.2545907497406006, + "logps/chosen": -98.0849380493164, + "logps/rejected": -213.66030883789062, + "loss": 0.5117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32629820704460144, + "rewards/margins": 2.3057079315185547, + "rewards/rejected": -2.6320064067840576, + "step": 7979 + }, + { + "epoch": 0.92, + "learning_rate": 2.4370829919232122e-08, + "logits/chosen": -3.9352660179138184, + "logits/rejected": -3.9203040599823, + "logps/chosen": -385.2655334472656, + "logps/rejected": -351.1178894042969, + "loss": 0.2015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40106192231178284, + "rewards/margins": 1.67147958278656, + "rewards/rejected": -1.2704176902770996, + "step": 7980 + }, + { + "epoch": 0.92, + "learning_rate": 2.4335713449607866e-08, + "logits/chosen": -2.601738691329956, + "logits/rejected": -2.3341872692108154, + "logps/chosen": -256.02923583984375, + "logps/rejected": -234.41331481933594, + "loss": 0.4783, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5755335092544556, + "rewards/margins": 0.8575092554092407, + "rewards/rejected": -1.4330426454544067, + "step": 7981 + }, + { + "epoch": 0.92, + "learning_rate": 2.430059697998361e-08, + "logits/chosen": -2.8481054306030273, + "logits/rejected": -2.89680814743042, + "logps/chosen": -112.02164459228516, + "logps/rejected": -262.2196044921875, + "loss": 0.2213, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22728495299816132, + "rewards/margins": 3.0434162616729736, + "rewards/rejected": -2.816131353378296, + "step": 7982 + }, + { + "epoch": 0.92, + "learning_rate": 2.4265480510359357e-08, + "logits/chosen": -2.837524175643921, + "logits/rejected": -2.8734920024871826, + "logps/chosen": -137.19505310058594, + "logps/rejected": -267.20635986328125, + "loss": 0.5607, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6975009441375732, + "rewards/margins": 1.4645583629608154, + "rewards/rejected": -2.1620593070983887, + "step": 7983 + }, + { + "epoch": 0.92, + "learning_rate": 2.42303640407351e-08, + "logits/chosen": -3.722883462905884, + "logits/rejected": -3.9376885890960693, + "logps/chosen": -156.9109344482422, + "logps/rejected": -213.02793884277344, + "loss": 0.1845, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08633880317211151, + "rewards/margins": 3.663761615753174, + "rewards/rejected": -3.577422857284546, + "step": 7984 + }, + { + "epoch": 0.92, + "learning_rate": 2.4195247571110848e-08, + "logits/chosen": -3.7101268768310547, + "logits/rejected": -3.563126802444458, + "logps/chosen": -268.6546630859375, + "logps/rejected": -207.15771484375, + "loss": 0.3225, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2198849320411682, + "rewards/margins": 1.6496437788009644, + "rewards/rejected": -1.4297587871551514, + "step": 7985 + }, + { + "epoch": 0.92, + "learning_rate": 2.4160131101486595e-08, + "logits/chosen": -2.9743704795837402, + "logits/rejected": -3.19992733001709, + "logps/chosen": -155.3919219970703, + "logps/rejected": -369.80303955078125, + "loss": 0.4091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07224000245332718, + "rewards/margins": 2.241482734680176, + "rewards/rejected": -2.313723087310791, + "step": 7986 + }, + { + "epoch": 0.92, + "learning_rate": 2.412501463186234e-08, + "logits/chosen": -2.914078950881958, + "logits/rejected": -3.123157024383545, + "logps/chosen": -348.9054260253906, + "logps/rejected": -231.1888427734375, + "loss": 0.2336, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023736730217933655, + "rewards/margins": 1.9931539297103882, + "rewards/rejected": -1.9694173336029053, + "step": 7987 + }, + { + "epoch": 0.92, + "learning_rate": 2.4089898162238087e-08, + "logits/chosen": -2.8585307598114014, + "logits/rejected": -3.3647897243499756, + "logps/chosen": -404.0294494628906, + "logps/rejected": -353.5616149902344, + "loss": 0.1346, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47464534640312195, + "rewards/margins": 2.4419124126434326, + "rewards/rejected": -1.9672671556472778, + "step": 7988 + }, + { + "epoch": 0.92, + "learning_rate": 2.4054781692613834e-08, + "logits/chosen": -3.0357375144958496, + "logits/rejected": -3.2840394973754883, + "logps/chosen": -166.8461456298828, + "logps/rejected": -311.505615234375, + "loss": 0.4262, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5038833618164062, + "rewards/margins": 2.238184690475464, + "rewards/rejected": -2.74206805229187, + "step": 7989 + }, + { + "epoch": 0.92, + "learning_rate": 2.401966522298958e-08, + "logits/chosen": -3.1495141983032227, + "logits/rejected": -3.386413335800171, + "logps/chosen": -289.9006652832031, + "logps/rejected": -412.4649963378906, + "loss": 0.0792, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19903138279914856, + "rewards/margins": 4.108625888824463, + "rewards/rejected": -3.9095945358276367, + "step": 7990 + }, + { + "epoch": 0.92, + "learning_rate": 2.3984548753365325e-08, + "logits/chosen": -3.7406835556030273, + "logits/rejected": -3.7937631607055664, + "logps/chosen": -189.83917236328125, + "logps/rejected": -292.11456298828125, + "loss": 0.5547, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40531522035598755, + "rewards/margins": 2.033177614212036, + "rewards/rejected": -2.4384925365448, + "step": 7991 + }, + { + "epoch": 0.92, + "learning_rate": 2.3949432283741072e-08, + "logits/chosen": -3.2771506309509277, + "logits/rejected": -2.954336404800415, + "logps/chosen": -474.3189697265625, + "logps/rejected": -353.4359436035156, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3095828890800476, + "rewards/margins": 2.832674741744995, + "rewards/rejected": -3.1422576904296875, + "step": 7992 + }, + { + "epoch": 0.92, + "learning_rate": 2.391431581411682e-08, + "logits/chosen": -2.9199979305267334, + "logits/rejected": -3.2120838165283203, + "logps/chosen": -310.13336181640625, + "logps/rejected": -375.8575134277344, + "loss": 0.4055, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6926270127296448, + "rewards/margins": 2.3751206398010254, + "rewards/rejected": -3.0677478313446045, + "step": 7993 + }, + { + "epoch": 0.92, + "learning_rate": 2.3879199344492563e-08, + "logits/chosen": -3.2166967391967773, + "logits/rejected": -2.9573628902435303, + "logps/chosen": -256.3385314941406, + "logps/rejected": -257.9768371582031, + "loss": 0.1887, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4053897261619568, + "rewards/margins": 2.7715206146240234, + "rewards/rejected": -2.366130828857422, + "step": 7994 + }, + { + "epoch": 0.92, + "learning_rate": 2.384408287486831e-08, + "logits/chosen": -2.755082130432129, + "logits/rejected": -2.5797276496887207, + "logps/chosen": -163.77833557128906, + "logps/rejected": -325.8861389160156, + "loss": 0.3245, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09722520411014557, + "rewards/margins": 2.0924129486083984, + "rewards/rejected": -2.1896378993988037, + "step": 7995 + }, + { + "epoch": 0.92, + "learning_rate": 2.3808966405244058e-08, + "logits/chosen": -3.03595232963562, + "logits/rejected": -2.835731267929077, + "logps/chosen": -184.3993377685547, + "logps/rejected": -202.08932495117188, + "loss": 0.4698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37215837836265564, + "rewards/margins": 1.0346730947494507, + "rewards/rejected": -1.4068315029144287, + "step": 7996 + }, + { + "epoch": 0.92, + "learning_rate": 2.3773849935619805e-08, + "logits/chosen": -2.7571933269500732, + "logits/rejected": -2.89835262298584, + "logps/chosen": -257.661376953125, + "logps/rejected": -204.52627563476562, + "loss": 0.2315, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08006398379802704, + "rewards/margins": 1.9561642408370972, + "rewards/rejected": -1.8761003017425537, + "step": 7997 + }, + { + "epoch": 0.92, + "learning_rate": 2.373873346599555e-08, + "logits/chosen": -2.554165840148926, + "logits/rejected": -2.679136276245117, + "logps/chosen": -221.43060302734375, + "logps/rejected": -214.6878204345703, + "loss": 0.6098, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20120036602020264, + "rewards/margins": 1.3186014890670776, + "rewards/rejected": -1.117401123046875, + "step": 7998 + }, + { + "epoch": 0.92, + "learning_rate": 2.3703616996371296e-08, + "logits/chosen": -2.6791343688964844, + "logits/rejected": -2.845362901687622, + "logps/chosen": -276.14117431640625, + "logps/rejected": -132.26437377929688, + "loss": 0.4684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1979052722454071, + "rewards/margins": 1.0924736261367798, + "rewards/rejected": -1.2903789281845093, + "step": 7999 + }, + { + "epoch": 0.92, + "learning_rate": 2.3668500526747044e-08, + "logits/chosen": -2.697489023208618, + "logits/rejected": -2.844036817550659, + "logps/chosen": -372.4266357421875, + "logps/rejected": -385.4359130859375, + "loss": 0.0921, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6623721718788147, + "rewards/margins": 3.4289071559906006, + "rewards/rejected": -2.7665348052978516, + "step": 8000 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -2.835569143295288, + "eval_logits/rejected": -2.797374725341797, + "eval_logps/chosen": -293.9241638183594, + "eval_logps/rejected": -237.81394958496094, + "eval_loss": 0.4283079504966736, + "eval_rewards/accuracies": 0.8142856955528259, + "eval_rewards/chosen": 0.013080031611025333, + "eval_rewards/margins": 1.3602137565612793, + "eval_rewards/rejected": -1.347133755683899, + "eval_runtime": 32.5613, + "eval_samples_per_second": 2.15, + "eval_steps_per_second": 1.075, + "step": 8000 + }, + { + "epoch": 0.92, + "learning_rate": 2.3633384057122787e-08, + "logits/chosen": -2.1724233627319336, + "logits/rejected": -2.342683792114258, + "logps/chosen": -391.67315673828125, + "logps/rejected": -212.12399291992188, + "loss": 0.5792, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3684316873550415, + "rewards/margins": 0.7934242486953735, + "rewards/rejected": -0.42499256134033203, + "step": 8001 + }, + { + "epoch": 0.92, + "learning_rate": 2.3598267587498535e-08, + "logits/chosen": -2.787339210510254, + "logits/rejected": -2.713188409805298, + "logps/chosen": -150.95730590820312, + "logps/rejected": -132.15199279785156, + "loss": 0.2549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09024590998888016, + "rewards/margins": 1.5078314542770386, + "rewards/rejected": -1.5980772972106934, + "step": 8002 + }, + { + "epoch": 0.92, + "learning_rate": 2.3563151117874282e-08, + "logits/chosen": -2.625744104385376, + "logits/rejected": -2.660984754562378, + "logps/chosen": -226.54466247558594, + "logps/rejected": -196.52415466308594, + "loss": 0.4349, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13031230866909027, + "rewards/margins": 1.8259893655776978, + "rewards/rejected": -1.6956771612167358, + "step": 8003 + }, + { + "epoch": 0.92, + "learning_rate": 2.352803464825003e-08, + "logits/chosen": -2.718841075897217, + "logits/rejected": -2.6312150955200195, + "logps/chosen": -168.17881774902344, + "logps/rejected": -175.49191284179688, + "loss": 0.3824, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23947195708751678, + "rewards/margins": 1.799863338470459, + "rewards/rejected": -1.5603913068771362, + "step": 8004 + }, + { + "epoch": 0.92, + "learning_rate": 2.3492918178625773e-08, + "logits/chosen": -3.319441795349121, + "logits/rejected": -3.318993330001831, + "logps/chosen": -218.38894653320312, + "logps/rejected": -277.68115234375, + "loss": 0.2527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43785539269447327, + "rewards/margins": 1.687408447265625, + "rewards/rejected": -2.1252639293670654, + "step": 8005 + }, + { + "epoch": 0.92, + "learning_rate": 2.345780170900152e-08, + "logits/chosen": -2.829005002975464, + "logits/rejected": -3.0559656620025635, + "logps/chosen": -230.8877410888672, + "logps/rejected": -312.5660095214844, + "loss": 0.1981, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4095618724822998, + "rewards/margins": 2.4204623699188232, + "rewards/rejected": -2.830024242401123, + "step": 8006 + }, + { + "epoch": 0.92, + "learning_rate": 2.3422685239377268e-08, + "logits/chosen": -3.584613800048828, + "logits/rejected": -3.0837960243225098, + "logps/chosen": -159.92909240722656, + "logps/rejected": -240.91714477539062, + "loss": 0.2122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01420469582080841, + "rewards/margins": 1.8061156272888184, + "rewards/rejected": -1.8203203678131104, + "step": 8007 + }, + { + "epoch": 0.92, + "learning_rate": 2.3387568769753015e-08, + "logits/chosen": -3.503955841064453, + "logits/rejected": -3.535832405090332, + "logps/chosen": -312.06927490234375, + "logps/rejected": -297.1146240234375, + "loss": 0.632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5955910682678223, + "rewards/margins": 1.3245437145233154, + "rewards/rejected": -1.9201347827911377, + "step": 8008 + }, + { + "epoch": 0.92, + "learning_rate": 2.335245230012876e-08, + "logits/chosen": -2.9475369453430176, + "logits/rejected": -2.8640012741088867, + "logps/chosen": -182.304931640625, + "logps/rejected": -219.90350341796875, + "loss": 0.5339, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25493237376213074, + "rewards/margins": 1.2262051105499268, + "rewards/rejected": -0.9712728261947632, + "step": 8009 + }, + { + "epoch": 0.92, + "learning_rate": 2.3317335830504506e-08, + "logits/chosen": -3.0417327880859375, + "logits/rejected": -2.9444663524627686, + "logps/chosen": -394.12701416015625, + "logps/rejected": -278.0802001953125, + "loss": 0.8724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48388341069221497, + "rewards/margins": 0.43384259939193726, + "rewards/rejected": -0.9177259802818298, + "step": 8010 + }, + { + "epoch": 0.92, + "learning_rate": 2.3282219360880253e-08, + "logits/chosen": -3.563889980316162, + "logits/rejected": -3.7180380821228027, + "logps/chosen": -195.51048278808594, + "logps/rejected": -285.7178039550781, + "loss": 0.2942, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23970991373062134, + "rewards/margins": 2.495819091796875, + "rewards/rejected": -2.2561092376708984, + "step": 8011 + }, + { + "epoch": 0.92, + "learning_rate": 2.3247102891255997e-08, + "logits/chosen": -3.162048816680908, + "logits/rejected": -2.8991172313690186, + "logps/chosen": -367.10064697265625, + "logps/rejected": -264.2179870605469, + "loss": 0.2075, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42170393466949463, + "rewards/margins": 2.7889981269836426, + "rewards/rejected": -2.3672943115234375, + "step": 8012 + }, + { + "epoch": 0.92, + "learning_rate": 2.3211986421631744e-08, + "logits/chosen": -3.4292049407958984, + "logits/rejected": -2.944840908050537, + "logps/chosen": -233.85409545898438, + "logps/rejected": -203.40557861328125, + "loss": 0.4091, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14242830872535706, + "rewards/margins": 1.7398442029953003, + "rewards/rejected": -1.882272481918335, + "step": 8013 + }, + { + "epoch": 0.92, + "learning_rate": 2.3176869952007492e-08, + "logits/chosen": -3.7155160903930664, + "logits/rejected": -3.73345947265625, + "logps/chosen": -248.3722686767578, + "logps/rejected": -198.58419799804688, + "loss": 0.7253, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.763799250125885, + "rewards/margins": 0.9025464057922363, + "rewards/rejected": -1.6663457155227661, + "step": 8014 + }, + { + "epoch": 0.92, + "learning_rate": 2.314175348238324e-08, + "logits/chosen": -3.3617641925811768, + "logits/rejected": -3.1111819744110107, + "logps/chosen": -368.0428466796875, + "logps/rejected": -258.979736328125, + "loss": 0.4685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7069573998451233, + "rewards/margins": 1.0855486392974854, + "rewards/rejected": -1.7925058603286743, + "step": 8015 + }, + { + "epoch": 0.92, + "learning_rate": 2.3106637012758983e-08, + "logits/chosen": -3.7164645195007324, + "logits/rejected": -3.54648494720459, + "logps/chosen": -235.0008544921875, + "logps/rejected": -268.8841552734375, + "loss": 0.5422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6305361986160278, + "rewards/margins": 1.5419895648956299, + "rewards/rejected": -2.1725258827209473, + "step": 8016 + }, + { + "epoch": 0.92, + "learning_rate": 2.307152054313473e-08, + "logits/chosen": -2.90130615234375, + "logits/rejected": -2.9186625480651855, + "logps/chosen": -454.83905029296875, + "logps/rejected": -386.7666015625, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03389720618724823, + "rewards/margins": 2.1285321712493896, + "rewards/rejected": -2.094635009765625, + "step": 8017 + }, + { + "epoch": 0.92, + "learning_rate": 2.3036404073510477e-08, + "logits/chosen": -2.756119728088379, + "logits/rejected": -3.0202603340148926, + "logps/chosen": -213.91641235351562, + "logps/rejected": -235.87088012695312, + "loss": 0.338, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22496476769447327, + "rewards/margins": 2.1691184043884277, + "rewards/rejected": -1.9441534280776978, + "step": 8018 + }, + { + "epoch": 0.92, + "learning_rate": 2.300128760388622e-08, + "logits/chosen": -3.0618300437927246, + "logits/rejected": -2.9044430255889893, + "logps/chosen": -208.32080078125, + "logps/rejected": -268.1991882324219, + "loss": 0.2605, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18411913514137268, + "rewards/margins": 2.12042498588562, + "rewards/rejected": -1.9363057613372803, + "step": 8019 + }, + { + "epoch": 0.92, + "learning_rate": 2.296617113426197e-08, + "logits/chosen": -3.560091018676758, + "logits/rejected": -3.345008373260498, + "logps/chosen": -238.9890594482422, + "logps/rejected": -189.31939697265625, + "loss": 0.7539, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49841228127479553, + "rewards/margins": 0.1434486210346222, + "rewards/rejected": -0.6418609619140625, + "step": 8020 + }, + { + "epoch": 0.92, + "learning_rate": 2.2931054664637716e-08, + "logits/chosen": -2.8054933547973633, + "logits/rejected": -3.2866199016571045, + "logps/chosen": -250.70785522460938, + "logps/rejected": -238.81724548339844, + "loss": 0.224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43123799562454224, + "rewards/margins": 1.810842514038086, + "rewards/rejected": -1.3796045780181885, + "step": 8021 + }, + { + "epoch": 0.92, + "learning_rate": 2.2895938195013463e-08, + "logits/chosen": -3.351930618286133, + "logits/rejected": -2.97041392326355, + "logps/chosen": -282.8058166503906, + "logps/rejected": -162.94004821777344, + "loss": 0.3839, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22391116619110107, + "rewards/margins": 1.2494418621063232, + "rewards/rejected": -1.4733531475067139, + "step": 8022 + }, + { + "epoch": 0.92, + "learning_rate": 2.2860821725389207e-08, + "logits/chosen": -2.666412115097046, + "logits/rejected": -2.5294978618621826, + "logps/chosen": -328.77734375, + "logps/rejected": -241.4656219482422, + "loss": 0.3732, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0010636448860168457, + "rewards/margins": 1.3246134519577026, + "rewards/rejected": -1.323549747467041, + "step": 8023 + }, + { + "epoch": 0.93, + "learning_rate": 2.2825705255764954e-08, + "logits/chosen": -2.7709925174713135, + "logits/rejected": -2.6102023124694824, + "logps/chosen": -219.91946411132812, + "logps/rejected": -303.8999938964844, + "loss": 0.6264, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6625933647155762, + "rewards/margins": 0.41801536083221436, + "rewards/rejected": -1.0806087255477905, + "step": 8024 + }, + { + "epoch": 0.93, + "learning_rate": 2.27905887861407e-08, + "logits/chosen": -3.0734310150146484, + "logits/rejected": -3.0897393226623535, + "logps/chosen": -252.2906951904297, + "logps/rejected": -326.469482421875, + "loss": 0.4862, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19562888145446777, + "rewards/margins": 1.5324032306671143, + "rewards/rejected": -1.336774230003357, + "step": 8025 + }, + { + "epoch": 0.93, + "learning_rate": 2.275547231651645e-08, + "logits/chosen": -3.0480473041534424, + "logits/rejected": -3.040666103363037, + "logps/chosen": -117.35163116455078, + "logps/rejected": -221.1124267578125, + "loss": 0.5414, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3586985468864441, + "rewards/margins": 1.298027753829956, + "rewards/rejected": -1.656726360321045, + "step": 8026 + }, + { + "epoch": 0.93, + "learning_rate": 2.2720355846892193e-08, + "logits/chosen": -2.9954309463500977, + "logits/rejected": -3.0361976623535156, + "logps/chosen": -299.86187744140625, + "logps/rejected": -261.8692321777344, + "loss": 0.2456, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0953797921538353, + "rewards/margins": 2.121105670928955, + "rewards/rejected": -2.025725841522217, + "step": 8027 + }, + { + "epoch": 0.93, + "learning_rate": 2.268523937726794e-08, + "logits/chosen": -3.0802016258239746, + "logits/rejected": -3.1498680114746094, + "logps/chosen": -201.27334594726562, + "logps/rejected": -263.4569091796875, + "loss": 0.6643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20887836813926697, + "rewards/margins": 2.514129400253296, + "rewards/rejected": -2.723007917404175, + "step": 8028 + }, + { + "epoch": 0.93, + "learning_rate": 2.2650122907643687e-08, + "logits/chosen": -2.7074484825134277, + "logits/rejected": -2.3851118087768555, + "logps/chosen": -160.68678283691406, + "logps/rejected": -190.7713165283203, + "loss": 0.4816, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5415672659873962, + "rewards/margins": 0.8958225250244141, + "rewards/rejected": -1.437389850616455, + "step": 8029 + }, + { + "epoch": 0.93, + "learning_rate": 2.2615006438019428e-08, + "logits/chosen": -2.3639674186706543, + "logits/rejected": -2.484079360961914, + "logps/chosen": -432.4842834472656, + "logps/rejected": -277.243896484375, + "loss": 0.3369, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43280091881752014, + "rewards/margins": 1.264365553855896, + "rewards/rejected": -0.8315646648406982, + "step": 8030 + }, + { + "epoch": 0.93, + "learning_rate": 2.2579889968395175e-08, + "logits/chosen": -3.488673686981201, + "logits/rejected": -3.0228750705718994, + "logps/chosen": -225.40988159179688, + "logps/rejected": -141.59994506835938, + "loss": 0.3408, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14891758561134338, + "rewards/margins": 1.5525624752044678, + "rewards/rejected": -1.4036449193954468, + "step": 8031 + }, + { + "epoch": 0.93, + "learning_rate": 2.2544773498770922e-08, + "logits/chosen": -2.9155263900756836, + "logits/rejected": -3.0366387367248535, + "logps/chosen": -96.57861328125, + "logps/rejected": -200.00270080566406, + "loss": 0.3303, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.010015420615673065, + "rewards/margins": 2.8856685161590576, + "rewards/rejected": -2.8756532669067383, + "step": 8032 + }, + { + "epoch": 0.93, + "learning_rate": 2.2509657029146666e-08, + "logits/chosen": -3.1655402183532715, + "logits/rejected": -2.820721387863159, + "logps/chosen": -304.3794250488281, + "logps/rejected": -327.476318359375, + "loss": 0.4273, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.042627573013305664, + "rewards/margins": 2.0305325984954834, + "rewards/rejected": -1.9879051446914673, + "step": 8033 + }, + { + "epoch": 0.93, + "learning_rate": 2.2474540559522413e-08, + "logits/chosen": -2.6612465381622314, + "logits/rejected": -2.5630931854248047, + "logps/chosen": -392.0572204589844, + "logps/rejected": -321.8609619140625, + "loss": 0.5964, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.017085429280996323, + "rewards/margins": 0.6436210870742798, + "rewards/rejected": -0.6607065200805664, + "step": 8034 + }, + { + "epoch": 0.93, + "learning_rate": 2.243942408989816e-08, + "logits/chosen": -3.4162697792053223, + "logits/rejected": -3.5580239295959473, + "logps/chosen": -291.88970947265625, + "logps/rejected": -353.3874816894531, + "loss": 0.2998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.031783148646354675, + "rewards/margins": 2.3157076835632324, + "rewards/rejected": -2.3474907875061035, + "step": 8035 + }, + { + "epoch": 0.93, + "learning_rate": 2.2404307620273904e-08, + "logits/chosen": -2.3144588470458984, + "logits/rejected": -2.3785009384155273, + "logps/chosen": -211.83187866210938, + "logps/rejected": -163.4004669189453, + "loss": 0.5268, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14751967787742615, + "rewards/margins": 0.7623153328895569, + "rewards/rejected": -0.6147956252098083, + "step": 8036 + }, + { + "epoch": 0.93, + "learning_rate": 2.236919115064965e-08, + "logits/chosen": -3.133043050765991, + "logits/rejected": -3.0839285850524902, + "logps/chosen": -192.89895629882812, + "logps/rejected": -229.5494384765625, + "loss": 0.2501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02995244413614273, + "rewards/margins": 1.8407917022705078, + "rewards/rejected": -1.8707441091537476, + "step": 8037 + }, + { + "epoch": 0.93, + "learning_rate": 2.23340746810254e-08, + "logits/chosen": -2.7742860317230225, + "logits/rejected": -2.934805154800415, + "logps/chosen": -291.8312072753906, + "logps/rejected": -278.15216064453125, + "loss": 0.2476, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0674750953912735, + "rewards/margins": 2.666072368621826, + "rewards/rejected": -2.598597288131714, + "step": 8038 + }, + { + "epoch": 0.93, + "learning_rate": 2.2298958211401146e-08, + "logits/chosen": -2.376309871673584, + "logits/rejected": -2.438676118850708, + "logps/chosen": -411.384033203125, + "logps/rejected": -209.94436645507812, + "loss": 0.2775, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05738034099340439, + "rewards/margins": 1.6633398532867432, + "rewards/rejected": -1.7207201719284058, + "step": 8039 + }, + { + "epoch": 0.93, + "learning_rate": 2.226384174177689e-08, + "logits/chosen": -2.9948127269744873, + "logits/rejected": -2.6520228385925293, + "logps/chosen": -196.4091033935547, + "logps/rejected": -182.95570373535156, + "loss": 0.2888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14376959204673767, + "rewards/margins": 1.2831405401229858, + "rewards/rejected": -1.1393709182739258, + "step": 8040 + }, + { + "epoch": 0.93, + "learning_rate": 2.2228725272152637e-08, + "logits/chosen": -3.231226682662964, + "logits/rejected": -2.991774082183838, + "logps/chosen": -365.33984375, + "logps/rejected": -270.27960205078125, + "loss": 0.3149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29492223262786865, + "rewards/margins": 1.255335807800293, + "rewards/rejected": -1.550257921218872, + "step": 8041 + }, + { + "epoch": 0.93, + "learning_rate": 2.2193608802528385e-08, + "logits/chosen": -3.7253496646881104, + "logits/rejected": -3.6466448307037354, + "logps/chosen": -204.97171020507812, + "logps/rejected": -228.76583862304688, + "loss": 0.285, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6465979218482971, + "rewards/margins": 2.571103811264038, + "rewards/rejected": -1.9245059490203857, + "step": 8042 + }, + { + "epoch": 0.93, + "learning_rate": 2.2158492332904132e-08, + "logits/chosen": -2.6136341094970703, + "logits/rejected": -2.774787425994873, + "logps/chosen": -292.97222900390625, + "logps/rejected": -322.89801025390625, + "loss": 0.5204, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4007762670516968, + "rewards/margins": 1.4480220079421997, + "rewards/rejected": -1.8487982749938965, + "step": 8043 + }, + { + "epoch": 0.93, + "learning_rate": 2.2123375863279876e-08, + "logits/chosen": -2.6222429275512695, + "logits/rejected": -2.6073966026306152, + "logps/chosen": -249.88442993164062, + "logps/rejected": -206.62930297851562, + "loss": 0.6687, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47415265440940857, + "rewards/margins": 1.5394659042358398, + "rewards/rejected": -1.0653133392333984, + "step": 8044 + }, + { + "epoch": 0.93, + "learning_rate": 2.2088259393655623e-08, + "logits/chosen": -3.178874969482422, + "logits/rejected": -3.1844992637634277, + "logps/chosen": -269.6679992675781, + "logps/rejected": -370.9991760253906, + "loss": 0.3731, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11180319637060165, + "rewards/margins": 2.7016475200653076, + "rewards/rejected": -2.813450574874878, + "step": 8045 + }, + { + "epoch": 0.93, + "learning_rate": 2.205314292403137e-08, + "logits/chosen": -3.90211820602417, + "logits/rejected": -3.7662229537963867, + "logps/chosen": -168.53150939941406, + "logps/rejected": -203.20297241210938, + "loss": 0.464, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19040226936340332, + "rewards/margins": 1.8774058818817139, + "rewards/rejected": -2.067808151245117, + "step": 8046 + }, + { + "epoch": 0.93, + "learning_rate": 2.2018026454407114e-08, + "logits/chosen": -2.7487173080444336, + "logits/rejected": -2.5938825607299805, + "logps/chosen": -310.8316345214844, + "logps/rejected": -251.44712829589844, + "loss": 0.1372, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4507037401199341, + "rewards/margins": 3.3422939777374268, + "rewards/rejected": -2.8915903568267822, + "step": 8047 + }, + { + "epoch": 0.93, + "learning_rate": 2.198290998478286e-08, + "logits/chosen": -2.4923648834228516, + "logits/rejected": -2.4849119186401367, + "logps/chosen": -249.54673767089844, + "logps/rejected": -371.259521484375, + "loss": 0.3945, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34374839067459106, + "rewards/margins": 1.7309404611587524, + "rewards/rejected": -1.3871921300888062, + "step": 8048 + }, + { + "epoch": 0.93, + "learning_rate": 2.194779351515861e-08, + "logits/chosen": -3.4455666542053223, + "logits/rejected": -3.2057583332061768, + "logps/chosen": -380.2109680175781, + "logps/rejected": -319.1497802734375, + "loss": 0.2471, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.554500937461853, + "rewards/margins": 1.674023985862732, + "rewards/rejected": -1.119523048400879, + "step": 8049 + }, + { + "epoch": 0.93, + "learning_rate": 2.1912677045534356e-08, + "logits/chosen": -3.203096866607666, + "logits/rejected": -2.894627332687378, + "logps/chosen": -220.1932373046875, + "logps/rejected": -253.53416442871094, + "loss": 0.4163, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37024858593940735, + "rewards/margins": 1.0915600061416626, + "rewards/rejected": -1.4618085622787476, + "step": 8050 + }, + { + "epoch": 0.93, + "learning_rate": 2.18775605759101e-08, + "logits/chosen": -3.371891736984253, + "logits/rejected": -3.72292160987854, + "logps/chosen": -145.913330078125, + "logps/rejected": -273.21099853515625, + "loss": 0.5407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5463504791259766, + "rewards/margins": 1.7948492765426636, + "rewards/rejected": -2.3411996364593506, + "step": 8051 + }, + { + "epoch": 0.93, + "learning_rate": 2.1842444106285847e-08, + "logits/chosen": -2.977965831756592, + "logits/rejected": -2.79372239112854, + "logps/chosen": -504.9857177734375, + "logps/rejected": -382.57952880859375, + "loss": 0.6823, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6529496908187866, + "rewards/margins": 0.35635823011398315, + "rewards/rejected": -1.009307861328125, + "step": 8052 + }, + { + "epoch": 0.93, + "learning_rate": 2.1807327636661594e-08, + "logits/chosen": -2.5472304821014404, + "logits/rejected": -2.92918062210083, + "logps/chosen": -306.8487548828125, + "logps/rejected": -284.9100036621094, + "loss": 0.4324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2854974567890167, + "rewards/margins": 1.710404396057129, + "rewards/rejected": -1.9959018230438232, + "step": 8053 + }, + { + "epoch": 0.93, + "learning_rate": 2.1772211167037338e-08, + "logits/chosen": -2.8351669311523438, + "logits/rejected": -2.8565452098846436, + "logps/chosen": -134.96197509765625, + "logps/rejected": -220.5603790283203, + "loss": 0.436, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4804842174053192, + "rewards/margins": 2.1051108837127686, + "rewards/rejected": -1.6246265172958374, + "step": 8054 + }, + { + "epoch": 0.93, + "learning_rate": 2.1737094697413085e-08, + "logits/chosen": -2.892791986465454, + "logits/rejected": -3.168473958969116, + "logps/chosen": -275.5835266113281, + "logps/rejected": -234.11143493652344, + "loss": 0.3464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.362690806388855, + "rewards/margins": 1.894276738166809, + "rewards/rejected": -2.256967544555664, + "step": 8055 + }, + { + "epoch": 0.93, + "learning_rate": 2.1701978227788833e-08, + "logits/chosen": -3.3889451026916504, + "logits/rejected": -3.223766565322876, + "logps/chosen": -291.94903564453125, + "logps/rejected": -216.06309509277344, + "loss": 0.4423, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3739625811576843, + "rewards/margins": 1.0975284576416016, + "rewards/rejected": -1.4714910984039307, + "step": 8056 + }, + { + "epoch": 0.93, + "learning_rate": 2.166686175816458e-08, + "logits/chosen": -3.1827874183654785, + "logits/rejected": -2.876659870147705, + "logps/chosen": -283.4848937988281, + "logps/rejected": -332.50244140625, + "loss": 0.5133, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006913580000400543, + "rewards/margins": 1.38084876537323, + "rewards/rejected": -1.3877623081207275, + "step": 8057 + }, + { + "epoch": 0.93, + "learning_rate": 2.1631745288540324e-08, + "logits/chosen": -3.051818609237671, + "logits/rejected": -3.093229293823242, + "logps/chosen": -206.67462158203125, + "logps/rejected": -335.47442626953125, + "loss": 0.2549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2838868200778961, + "rewards/margins": 3.458343029022217, + "rewards/rejected": -3.1744561195373535, + "step": 8058 + }, + { + "epoch": 0.93, + "learning_rate": 2.159662881891607e-08, + "logits/chosen": -3.3496854305267334, + "logits/rejected": -3.238542079925537, + "logps/chosen": -336.114013671875, + "logps/rejected": -334.67803955078125, + "loss": 0.1295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30358371138572693, + "rewards/margins": 3.1701087951660156, + "rewards/rejected": -3.4736926555633545, + "step": 8059 + }, + { + "epoch": 0.93, + "learning_rate": 2.1561512349291818e-08, + "logits/chosen": -2.9567649364471436, + "logits/rejected": -3.3683412075042725, + "logps/chosen": -187.49916076660156, + "logps/rejected": -240.76626586914062, + "loss": 0.3362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19685910642147064, + "rewards/margins": 2.6336238384246826, + "rewards/rejected": -2.8304831981658936, + "step": 8060 + }, + { + "epoch": 0.93, + "learning_rate": 2.1526395879667562e-08, + "logits/chosen": -2.126779079437256, + "logits/rejected": -2.24544095993042, + "logps/chosen": -414.1973876953125, + "logps/rejected": -291.379150390625, + "loss": 0.2698, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18754692375659943, + "rewards/margins": 2.70690655708313, + "rewards/rejected": -2.519359588623047, + "step": 8061 + }, + { + "epoch": 0.93, + "learning_rate": 2.149127941004331e-08, + "logits/chosen": -3.3584656715393066, + "logits/rejected": -3.216667652130127, + "logps/chosen": -152.94430541992188, + "logps/rejected": -120.81934356689453, + "loss": 0.5034, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14885786175727844, + "rewards/margins": 1.4871370792388916, + "rewards/rejected": -1.6359946727752686, + "step": 8062 + }, + { + "epoch": 0.93, + "learning_rate": 2.1456162940419057e-08, + "logits/chosen": -2.7087745666503906, + "logits/rejected": -2.960291862487793, + "logps/chosen": -152.6139373779297, + "logps/rejected": -309.90594482421875, + "loss": 0.3045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1740807592868805, + "rewards/margins": 1.82393217086792, + "rewards/rejected": -1.6498514413833618, + "step": 8063 + }, + { + "epoch": 0.93, + "learning_rate": 2.1421046470794804e-08, + "logits/chosen": -2.5769567489624023, + "logits/rejected": -2.454758405685425, + "logps/chosen": -323.085693359375, + "logps/rejected": -300.901123046875, + "loss": 0.5496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36756354570388794, + "rewards/margins": 1.2306584119796753, + "rewards/rejected": -1.598221778869629, + "step": 8064 + }, + { + "epoch": 0.93, + "learning_rate": 2.1385930001170548e-08, + "logits/chosen": -2.2873916625976562, + "logits/rejected": -2.2486572265625, + "logps/chosen": -469.1017761230469, + "logps/rejected": -356.34124755859375, + "loss": 0.3791, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014566242694854736, + "rewards/margins": 1.237168550491333, + "rewards/rejected": -1.251734733581543, + "step": 8065 + }, + { + "epoch": 0.93, + "learning_rate": 2.1350813531546295e-08, + "logits/chosen": -2.975848436355591, + "logits/rejected": -2.806783676147461, + "logps/chosen": -143.56488037109375, + "logps/rejected": -147.34429931640625, + "loss": 0.7565, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0422065258026123, + "rewards/margins": 0.36837247014045715, + "rewards/rejected": -1.410578966140747, + "step": 8066 + }, + { + "epoch": 0.93, + "learning_rate": 2.1315697061922042e-08, + "logits/chosen": -2.4749534130096436, + "logits/rejected": -2.714411973953247, + "logps/chosen": -149.89308166503906, + "logps/rejected": -277.4193420410156, + "loss": 0.1987, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6521108746528625, + "rewards/margins": 2.8434157371520996, + "rewards/rejected": -2.1913046836853027, + "step": 8067 + }, + { + "epoch": 0.93, + "learning_rate": 2.128058059229779e-08, + "logits/chosen": -3.4942097663879395, + "logits/rejected": -3.631436347961426, + "logps/chosen": -249.5019073486328, + "logps/rejected": -292.50384521484375, + "loss": 0.2597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.053129106760025024, + "rewards/margins": 2.578902006149292, + "rewards/rejected": -2.632031202316284, + "step": 8068 + }, + { + "epoch": 0.93, + "learning_rate": 2.1245464122673533e-08, + "logits/chosen": -2.4760751724243164, + "logits/rejected": -2.6290132999420166, + "logps/chosen": -194.73138427734375, + "logps/rejected": -325.412353515625, + "loss": 0.3041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030846334993839264, + "rewards/margins": 2.0988078117370605, + "rewards/rejected": -2.1296539306640625, + "step": 8069 + }, + { + "epoch": 0.93, + "learning_rate": 2.121034765304928e-08, + "logits/chosen": -3.968874931335449, + "logits/rejected": -3.7376835346221924, + "logps/chosen": -267.50592041015625, + "logps/rejected": -242.0328369140625, + "loss": 0.3931, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05458257719874382, + "rewards/margins": 2.5381810665130615, + "rewards/rejected": -2.483598470687866, + "step": 8070 + }, + { + "epoch": 0.93, + "learning_rate": 2.1175231183425028e-08, + "logits/chosen": -3.180668830871582, + "logits/rejected": -3.17086124420166, + "logps/chosen": -268.07586669921875, + "logps/rejected": -198.1901092529297, + "loss": 0.5335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34602439403533936, + "rewards/margins": 2.361846446990967, + "rewards/rejected": -2.7078709602355957, + "step": 8071 + }, + { + "epoch": 0.93, + "learning_rate": 2.1140114713800772e-08, + "logits/chosen": -3.199389934539795, + "logits/rejected": -3.101529121398926, + "logps/chosen": -193.34632873535156, + "logps/rejected": -228.3878936767578, + "loss": 0.3745, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02575865387916565, + "rewards/margins": 1.104827880859375, + "rewards/rejected": -1.0790691375732422, + "step": 8072 + }, + { + "epoch": 0.93, + "learning_rate": 2.110499824417652e-08, + "logits/chosen": -3.213878870010376, + "logits/rejected": -3.296269655227661, + "logps/chosen": -147.52151489257812, + "logps/rejected": -208.64244079589844, + "loss": 0.4546, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16033579409122467, + "rewards/margins": 2.320171594619751, + "rewards/rejected": -2.1598358154296875, + "step": 8073 + }, + { + "epoch": 0.93, + "learning_rate": 2.1069881774552266e-08, + "logits/chosen": -3.5929787158966064, + "logits/rejected": -3.459207773208618, + "logps/chosen": -154.98812866210938, + "logps/rejected": -167.06016540527344, + "loss": 0.375, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05450374633073807, + "rewards/margins": 1.8598337173461914, + "rewards/rejected": -1.8053300380706787, + "step": 8074 + }, + { + "epoch": 0.93, + "learning_rate": 2.1034765304928014e-08, + "logits/chosen": -2.794090747833252, + "logits/rejected": -2.7147178649902344, + "logps/chosen": -429.6033630371094, + "logps/rejected": -292.47271728515625, + "loss": 0.2548, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15033668279647827, + "rewards/margins": 2.142580032348633, + "rewards/rejected": -1.9922435283660889, + "step": 8075 + }, + { + "epoch": 0.93, + "learning_rate": 2.0999648835303758e-08, + "logits/chosen": -2.8155558109283447, + "logits/rejected": -2.667402744293213, + "logps/chosen": -436.729736328125, + "logps/rejected": -229.29457092285156, + "loss": 0.3332, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7033935785293579, + "rewards/margins": 1.3620812892913818, + "rewards/rejected": -0.6586877107620239, + "step": 8076 + }, + { + "epoch": 0.93, + "learning_rate": 2.0964532365679505e-08, + "logits/chosen": -3.4112768173217773, + "logits/rejected": -3.194660186767578, + "logps/chosen": -348.2955322265625, + "logps/rejected": -208.45623779296875, + "loss": 0.5913, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29935359954833984, + "rewards/margins": 0.7482244372367859, + "rewards/rejected": -1.0475780963897705, + "step": 8077 + }, + { + "epoch": 0.93, + "learning_rate": 2.0929415896055245e-08, + "logits/chosen": -3.1244821548461914, + "logits/rejected": -3.344791889190674, + "logps/chosen": -226.32232666015625, + "logps/rejected": -245.29757690429688, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0761437863111496, + "rewards/margins": 4.399238586425781, + "rewards/rejected": -4.323094844818115, + "step": 8078 + }, + { + "epoch": 0.93, + "learning_rate": 2.0894299426430993e-08, + "logits/chosen": -3.0453269481658936, + "logits/rejected": -2.6799395084381104, + "logps/chosen": -375.8236083984375, + "logps/rejected": -293.3228759765625, + "loss": 0.4297, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.311745822429657, + "rewards/margins": 1.4517728090286255, + "rewards/rejected": -1.1400269269943237, + "step": 8079 + }, + { + "epoch": 0.93, + "learning_rate": 2.085918295680674e-08, + "logits/chosen": -3.0627384185791016, + "logits/rejected": -3.2640278339385986, + "logps/chosen": -219.43557739257812, + "logps/rejected": -229.38941955566406, + "loss": 0.2537, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7568104267120361, + "rewards/margins": 2.6775851249694824, + "rewards/rejected": -1.9207748174667358, + "step": 8080 + }, + { + "epoch": 0.93, + "learning_rate": 2.0824066487182487e-08, + "logits/chosen": -2.988327741622925, + "logits/rejected": -2.8853816986083984, + "logps/chosen": -246.1224822998047, + "logps/rejected": -261.421630859375, + "loss": 0.3027, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2626824975013733, + "rewards/margins": 1.876063585281372, + "rewards/rejected": -2.1387460231781006, + "step": 8081 + }, + { + "epoch": 0.93, + "learning_rate": 2.078895001755823e-08, + "logits/chosen": -3.0947728157043457, + "logits/rejected": -3.757822036743164, + "logps/chosen": -133.91244506835938, + "logps/rejected": -247.65306091308594, + "loss": 0.3825, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3986765146255493, + "rewards/margins": 3.112192392349243, + "rewards/rejected": -3.510869026184082, + "step": 8082 + }, + { + "epoch": 0.93, + "learning_rate": 2.0753833547933978e-08, + "logits/chosen": -3.8714845180511475, + "logits/rejected": -3.9260966777801514, + "logps/chosen": -84.74301147460938, + "logps/rejected": -111.62326049804688, + "loss": 0.249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.344917356967926, + "rewards/margins": 1.8704187870025635, + "rewards/rejected": -1.5255014896392822, + "step": 8083 + }, + { + "epoch": 0.93, + "learning_rate": 2.0718717078309725e-08, + "logits/chosen": -3.135199546813965, + "logits/rejected": -3.263073682785034, + "logps/chosen": -367.54632568359375, + "logps/rejected": -248.56723022460938, + "loss": 0.6517, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5009723901748657, + "rewards/margins": 0.6522146463394165, + "rewards/rejected": -1.1531870365142822, + "step": 8084 + }, + { + "epoch": 0.93, + "learning_rate": 2.0683600608685473e-08, + "logits/chosen": -2.941527843475342, + "logits/rejected": -2.638857841491699, + "logps/chosen": -297.9062805175781, + "logps/rejected": -212.77609252929688, + "loss": 0.444, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28539276123046875, + "rewards/margins": 0.8133763670921326, + "rewards/rejected": -1.0987690687179565, + "step": 8085 + }, + { + "epoch": 0.93, + "learning_rate": 2.0648484139061217e-08, + "logits/chosen": -2.6733078956604004, + "logits/rejected": -2.697108030319214, + "logps/chosen": -300.8211669921875, + "logps/rejected": -314.420166015625, + "loss": 0.4041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5142174363136292, + "rewards/margins": 1.8323487043380737, + "rewards/rejected": -2.3465662002563477, + "step": 8086 + }, + { + "epoch": 0.93, + "learning_rate": 2.0613367669436964e-08, + "logits/chosen": -3.5505597591400146, + "logits/rejected": -3.102961540222168, + "logps/chosen": -389.4725036621094, + "logps/rejected": -280.70111083984375, + "loss": 0.2601, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14622654020786285, + "rewards/margins": 1.5360004901885986, + "rewards/rejected": -1.3897740840911865, + "step": 8087 + }, + { + "epoch": 0.93, + "learning_rate": 2.057825119981271e-08, + "logits/chosen": -3.521099805831909, + "logits/rejected": -3.9785842895507812, + "logps/chosen": -135.639404296875, + "logps/rejected": -276.3324890136719, + "loss": 0.4127, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05982981622219086, + "rewards/margins": 1.9095255136489868, + "rewards/rejected": -1.8496955633163452, + "step": 8088 + }, + { + "epoch": 0.93, + "learning_rate": 2.0543134730188455e-08, + "logits/chosen": -3.23769211769104, + "logits/rejected": -3.558887004852295, + "logps/chosen": -202.42324829101562, + "logps/rejected": -384.7553405761719, + "loss": 0.2633, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22373290359973907, + "rewards/margins": 2.626112699508667, + "rewards/rejected": -2.4023799896240234, + "step": 8089 + }, + { + "epoch": 0.93, + "learning_rate": 2.0508018260564202e-08, + "logits/chosen": -3.4767751693725586, + "logits/rejected": -3.651686906814575, + "logps/chosen": -171.30287170410156, + "logps/rejected": -231.30938720703125, + "loss": 0.1564, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5225474834442139, + "rewards/margins": 2.7629446983337402, + "rewards/rejected": -2.2403972148895264, + "step": 8090 + }, + { + "epoch": 0.93, + "learning_rate": 2.047290179093995e-08, + "logits/chosen": -3.200641632080078, + "logits/rejected": -3.3357455730438232, + "logps/chosen": -335.6983642578125, + "logps/rejected": -288.73541259765625, + "loss": 0.2827, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22778654098510742, + "rewards/margins": 1.957155704498291, + "rewards/rejected": -2.1849424839019775, + "step": 8091 + }, + { + "epoch": 0.93, + "learning_rate": 2.0437785321315697e-08, + "logits/chosen": -2.8114664554595947, + "logits/rejected": -2.6173765659332275, + "logps/chosen": -153.9854736328125, + "logps/rejected": -288.30682373046875, + "loss": 0.2699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.061629436910152435, + "rewards/margins": 2.3366758823394775, + "rewards/rejected": -2.3983051776885986, + "step": 8092 + }, + { + "epoch": 0.93, + "learning_rate": 2.040266885169144e-08, + "logits/chosen": -3.4999642372131348, + "logits/rejected": -3.2342216968536377, + "logps/chosen": -172.46441650390625, + "logps/rejected": -189.98513793945312, + "loss": 0.3444, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.058913201093673706, + "rewards/margins": 2.567509174346924, + "rewards/rejected": -2.508596181869507, + "step": 8093 + }, + { + "epoch": 0.93, + "learning_rate": 2.0367552382067188e-08, + "logits/chosen": -2.0452213287353516, + "logits/rejected": -2.1029324531555176, + "logps/chosen": -281.57843017578125, + "logps/rejected": -230.03387451171875, + "loss": 0.4449, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12009875476360321, + "rewards/margins": 0.9490899443626404, + "rewards/rejected": -1.0691885948181152, + "step": 8094 + }, + { + "epoch": 0.93, + "learning_rate": 2.0332435912442935e-08, + "logits/chosen": -3.707587242126465, + "logits/rejected": -3.682727575302124, + "logps/chosen": -145.97369384765625, + "logps/rejected": -205.54718017578125, + "loss": 0.2632, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22958174347877502, + "rewards/margins": 2.3507938385009766, + "rewards/rejected": -2.5803754329681396, + "step": 8095 + }, + { + "epoch": 0.93, + "learning_rate": 2.029731944281868e-08, + "logits/chosen": -3.1546547412872314, + "logits/rejected": -3.0216550827026367, + "logps/chosen": -178.43344116210938, + "logps/rejected": -140.3117218017578, + "loss": 0.699, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8672754764556885, + "rewards/margins": 0.5112407207489014, + "rewards/rejected": -1.3785161972045898, + "step": 8096 + }, + { + "epoch": 0.93, + "learning_rate": 2.0262202973194426e-08, + "logits/chosen": -2.9222824573516846, + "logits/rejected": -2.6594746112823486, + "logps/chosen": -402.86126708984375, + "logps/rejected": -351.3917541503906, + "loss": 0.6282, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02136099338531494, + "rewards/margins": 1.064349889755249, + "rewards/rejected": -1.0429890155792236, + "step": 8097 + }, + { + "epoch": 0.93, + "learning_rate": 2.0227086503570174e-08, + "logits/chosen": -3.1137003898620605, + "logits/rejected": -3.1431779861450195, + "logps/chosen": -161.5111541748047, + "logps/rejected": -203.87982177734375, + "loss": 0.4083, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49005863070487976, + "rewards/margins": 3.1748738288879395, + "rewards/rejected": -3.6649322509765625, + "step": 8098 + }, + { + "epoch": 0.93, + "learning_rate": 2.019197003394592e-08, + "logits/chosen": -3.5250937938690186, + "logits/rejected": -3.4417107105255127, + "logps/chosen": -339.6767578125, + "logps/rejected": -265.8301086425781, + "loss": 0.394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48561084270477295, + "rewards/margins": 2.605225086212158, + "rewards/rejected": -3.0908360481262207, + "step": 8099 + }, + { + "epoch": 0.93, + "learning_rate": 2.0156853564321665e-08, + "logits/chosen": -3.436467170715332, + "logits/rejected": -3.380082130432129, + "logps/chosen": -376.41094970703125, + "logps/rejected": -345.2998962402344, + "loss": 0.4091, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22684729099273682, + "rewards/margins": 2.242433547973633, + "rewards/rejected": -2.469280481338501, + "step": 8100 + }, + { + "epoch": 0.93, + "learning_rate": 2.0121737094697412e-08, + "logits/chosen": -3.0203166007995605, + "logits/rejected": -3.010812759399414, + "logps/chosen": -204.81243896484375, + "logps/rejected": -286.08551025390625, + "loss": 0.3138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04367043077945709, + "rewards/margins": 2.403377056121826, + "rewards/rejected": -2.4470479488372803, + "step": 8101 + }, + { + "epoch": 0.93, + "learning_rate": 2.008662062507316e-08, + "logits/chosen": -3.7657790184020996, + "logits/rejected": -3.7112507820129395, + "logps/chosen": -129.5463409423828, + "logps/rejected": -232.66021728515625, + "loss": 0.222, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06256355345249176, + "rewards/margins": 3.1622138023376465, + "rewards/rejected": -3.0996501445770264, + "step": 8102 + }, + { + "epoch": 0.93, + "learning_rate": 2.0051504155448903e-08, + "logits/chosen": -2.3435287475585938, + "logits/rejected": -2.4874625205993652, + "logps/chosen": -339.7044372558594, + "logps/rejected": -233.93223571777344, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1205572783946991, + "rewards/margins": 0.5190112590789795, + "rewards/rejected": -0.639568567276001, + "step": 8103 + }, + { + "epoch": 0.93, + "learning_rate": 2.001638768582465e-08, + "logits/chosen": -3.2479965686798096, + "logits/rejected": -3.1107118129730225, + "logps/chosen": -267.124267578125, + "logps/rejected": -195.2169647216797, + "loss": 0.3014, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23511840403079987, + "rewards/margins": 2.248173475265503, + "rewards/rejected": -2.0130553245544434, + "step": 8104 + }, + { + "epoch": 0.93, + "learning_rate": 1.9981271216200398e-08, + "logits/chosen": -2.973837375640869, + "logits/rejected": -3.0558862686157227, + "logps/chosen": -141.24984741210938, + "logps/rejected": -218.24215698242188, + "loss": 0.3533, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5070230960845947, + "rewards/margins": 1.8132102489471436, + "rewards/rejected": -1.3061871528625488, + "step": 8105 + }, + { + "epoch": 0.93, + "learning_rate": 1.9946154746576145e-08, + "logits/chosen": -3.0318777561187744, + "logits/rejected": -3.1058592796325684, + "logps/chosen": -142.4993438720703, + "logps/rejected": -218.39588928222656, + "loss": 0.2766, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35413381457328796, + "rewards/margins": 2.1445133686065674, + "rewards/rejected": -1.7903796434402466, + "step": 8106 + }, + { + "epoch": 0.93, + "learning_rate": 1.991103827695189e-08, + "logits/chosen": -3.576904773712158, + "logits/rejected": -3.4405999183654785, + "logps/chosen": -249.99359130859375, + "logps/rejected": -247.49388122558594, + "loss": 0.4778, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05463904142379761, + "rewards/margins": 1.3230315446853638, + "rewards/rejected": -1.3776706457138062, + "step": 8107 + }, + { + "epoch": 0.93, + "learning_rate": 1.9875921807327636e-08, + "logits/chosen": -3.0454511642456055, + "logits/rejected": -2.6837806701660156, + "logps/chosen": -199.45028686523438, + "logps/rejected": -241.3609619140625, + "loss": 0.3736, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05857644975185394, + "rewards/margins": 2.8184494972229004, + "rewards/rejected": -2.877026081085205, + "step": 8108 + }, + { + "epoch": 0.93, + "learning_rate": 1.9840805337703383e-08, + "logits/chosen": -3.1020450592041016, + "logits/rejected": -3.0849525928497314, + "logps/chosen": -325.1348876953125, + "logps/rejected": -323.0201416015625, + "loss": 0.1639, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3762098550796509, + "rewards/margins": 2.3277084827423096, + "rewards/rejected": -1.9514985084533691, + "step": 8109 + }, + { + "epoch": 0.93, + "learning_rate": 1.980568886807913e-08, + "logits/chosen": -3.2337746620178223, + "logits/rejected": -2.676320791244507, + "logps/chosen": -261.4246826171875, + "logps/rejected": -211.84219360351562, + "loss": 0.3063, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26545172929763794, + "rewards/margins": 1.704857349395752, + "rewards/rejected": -1.9703090190887451, + "step": 8110 + }, + { + "epoch": 0.94, + "learning_rate": 1.9770572398454874e-08, + "logits/chosen": -3.0332088470458984, + "logits/rejected": -3.1374850273132324, + "logps/chosen": -320.1058654785156, + "logps/rejected": -205.93597412109375, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06201007217168808, + "rewards/margins": 1.3711251020431519, + "rewards/rejected": -1.3091151714324951, + "step": 8111 + }, + { + "epoch": 0.94, + "learning_rate": 1.9735455928830622e-08, + "logits/chosen": -3.255770444869995, + "logits/rejected": -3.0535144805908203, + "logps/chosen": -232.1608123779297, + "logps/rejected": -262.94073486328125, + "loss": 0.6199, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.498000830411911, + "rewards/margins": 1.6027576923370361, + "rewards/rejected": -1.1047568321228027, + "step": 8112 + }, + { + "epoch": 0.94, + "learning_rate": 1.970033945920637e-08, + "logits/chosen": -3.220871686935425, + "logits/rejected": -3.1240410804748535, + "logps/chosen": -248.4283447265625, + "logps/rejected": -339.1689147949219, + "loss": 0.3574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44286996126174927, + "rewards/margins": 2.863872766494751, + "rewards/rejected": -3.3067429065704346, + "step": 8113 + }, + { + "epoch": 0.94, + "learning_rate": 1.9665222989582113e-08, + "logits/chosen": -2.868803024291992, + "logits/rejected": -2.903228282928467, + "logps/chosen": -307.3362731933594, + "logps/rejected": -438.91668701171875, + "loss": 0.1253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.671288013458252, + "rewards/margins": 3.1667098999023438, + "rewards/rejected": -2.495421886444092, + "step": 8114 + }, + { + "epoch": 0.94, + "learning_rate": 1.963010651995786e-08, + "logits/chosen": -3.5724828243255615, + "logits/rejected": -3.648022174835205, + "logps/chosen": -144.06398010253906, + "logps/rejected": -289.49700927734375, + "loss": 0.2776, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.003629133105278015, + "rewards/margins": 4.023458480834961, + "rewards/rejected": -4.019829750061035, + "step": 8115 + }, + { + "epoch": 0.94, + "learning_rate": 1.9594990050333607e-08, + "logits/chosen": -3.0363411903381348, + "logits/rejected": -3.2065887451171875, + "logps/chosen": -280.5263671875, + "logps/rejected": -224.31756591796875, + "loss": 0.4107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25136861205101013, + "rewards/margins": 2.149350643157959, + "rewards/rejected": -2.400719165802002, + "step": 8116 + }, + { + "epoch": 0.94, + "learning_rate": 1.9559873580709355e-08, + "logits/chosen": -2.7716751098632812, + "logits/rejected": -2.7742228507995605, + "logps/chosen": -475.25665283203125, + "logps/rejected": -302.7544860839844, + "loss": 0.2237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11932497471570969, + "rewards/margins": 2.431936502456665, + "rewards/rejected": -2.5512616634368896, + "step": 8117 + }, + { + "epoch": 0.94, + "learning_rate": 1.95247571110851e-08, + "logits/chosen": -2.4218568801879883, + "logits/rejected": -2.6374778747558594, + "logps/chosen": -422.4732971191406, + "logps/rejected": -383.747802734375, + "loss": 0.3806, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6989766359329224, + "rewards/margins": 1.4744174480438232, + "rewards/rejected": -2.173394203186035, + "step": 8118 + }, + { + "epoch": 0.94, + "learning_rate": 1.9489640641460846e-08, + "logits/chosen": -2.6716842651367188, + "logits/rejected": -3.0347440242767334, + "logps/chosen": -186.69271850585938, + "logps/rejected": -417.3408508300781, + "loss": 0.1425, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33973628282546997, + "rewards/margins": 3.867523193359375, + "rewards/rejected": -3.5277867317199707, + "step": 8119 + }, + { + "epoch": 0.94, + "learning_rate": 1.9454524171836593e-08, + "logits/chosen": -2.188133955001831, + "logits/rejected": -2.2462518215179443, + "logps/chosen": -326.8382873535156, + "logps/rejected": -268.86566162109375, + "loss": 0.4467, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09941811114549637, + "rewards/margins": 1.08613920211792, + "rewards/rejected": -0.9867210984230042, + "step": 8120 + }, + { + "epoch": 0.94, + "learning_rate": 1.9419407702212337e-08, + "logits/chosen": -3.0630125999450684, + "logits/rejected": -2.987393617630005, + "logps/chosen": -401.7766418457031, + "logps/rejected": -379.81121826171875, + "loss": 0.3371, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12918640673160553, + "rewards/margins": 1.7517932653427124, + "rewards/rejected": -1.6226069927215576, + "step": 8121 + }, + { + "epoch": 0.94, + "learning_rate": 1.9384291232588084e-08, + "logits/chosen": -2.826364040374756, + "logits/rejected": -2.737515687942505, + "logps/chosen": -348.8150634765625, + "logps/rejected": -386.28607177734375, + "loss": 0.4845, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2681174874305725, + "rewards/margins": 1.8318108320236206, + "rewards/rejected": -2.099928379058838, + "step": 8122 + }, + { + "epoch": 0.94, + "learning_rate": 1.934917476296383e-08, + "logits/chosen": -3.2220664024353027, + "logits/rejected": -3.2804388999938965, + "logps/chosen": -165.6617431640625, + "logps/rejected": -192.40728759765625, + "loss": 0.5457, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7802593111991882, + "rewards/margins": 1.319090723991394, + "rewards/rejected": -2.0993499755859375, + "step": 8123 + }, + { + "epoch": 0.94, + "learning_rate": 1.931405829333958e-08, + "logits/chosen": -3.534693479537964, + "logits/rejected": -3.292240858078003, + "logps/chosen": -218.77340698242188, + "logps/rejected": -265.78814697265625, + "loss": 0.4012, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46384087204933167, + "rewards/margins": 2.0452182292938232, + "rewards/rejected": -2.509059190750122, + "step": 8124 + }, + { + "epoch": 0.94, + "learning_rate": 1.9278941823715323e-08, + "logits/chosen": -3.7640581130981445, + "logits/rejected": -3.467867851257324, + "logps/chosen": -385.540283203125, + "logps/rejected": -228.18069458007812, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10741977393627167, + "rewards/margins": 1.786490559577942, + "rewards/rejected": -1.89391028881073, + "step": 8125 + }, + { + "epoch": 0.94, + "learning_rate": 1.9243825354091066e-08, + "logits/chosen": -3.5181193351745605, + "logits/rejected": -3.5929291248321533, + "logps/chosen": -262.10601806640625, + "logps/rejected": -197.092529296875, + "loss": 0.3322, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31223592162132263, + "rewards/margins": 1.8965249061584473, + "rewards/rejected": -2.2087607383728027, + "step": 8126 + }, + { + "epoch": 0.94, + "learning_rate": 1.9208708884466814e-08, + "logits/chosen": -3.421161651611328, + "logits/rejected": -3.5838851928710938, + "logps/chosen": -170.16221618652344, + "logps/rejected": -255.75076293945312, + "loss": 0.1526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04456467181444168, + "rewards/margins": 3.1398234367370605, + "rewards/rejected": -3.1843879222869873, + "step": 8127 + }, + { + "epoch": 0.94, + "learning_rate": 1.9173592414842558e-08, + "logits/chosen": -2.3176236152648926, + "logits/rejected": -2.1563918590545654, + "logps/chosen": -240.0767822265625, + "logps/rejected": -258.974853515625, + "loss": 0.6513, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2696719169616699, + "rewards/margins": 0.33790165185928345, + "rewards/rejected": -0.6075736284255981, + "step": 8128 + }, + { + "epoch": 0.94, + "learning_rate": 1.9138475945218305e-08, + "logits/chosen": -2.9639925956726074, + "logits/rejected": -2.973402500152588, + "logps/chosen": -116.26839447021484, + "logps/rejected": -141.46078491210938, + "loss": 0.3104, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05447874963283539, + "rewards/margins": 2.318026542663574, + "rewards/rejected": -2.263547897338867, + "step": 8129 + }, + { + "epoch": 0.94, + "learning_rate": 1.9103359475594052e-08, + "logits/chosen": -4.03908634185791, + "logits/rejected": -4.012759685516357, + "logps/chosen": -289.4230651855469, + "logps/rejected": -247.8490447998047, + "loss": 0.7654, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1473051905632019, + "rewards/margins": 0.48383641242980957, + "rewards/rejected": -0.6311416029930115, + "step": 8130 + }, + { + "epoch": 0.94, + "learning_rate": 1.9068243005969796e-08, + "logits/chosen": -2.9633941650390625, + "logits/rejected": -2.961158514022827, + "logps/chosen": -292.8026123046875, + "logps/rejected": -254.13916015625, + "loss": 0.3559, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.768979549407959, + "rewards/margins": 2.1012661457061768, + "rewards/rejected": -2.8702456951141357, + "step": 8131 + }, + { + "epoch": 0.94, + "learning_rate": 1.9033126536345543e-08, + "logits/chosen": -3.38356351852417, + "logits/rejected": -3.5959115028381348, + "logps/chosen": -177.4767608642578, + "logps/rejected": -252.30372619628906, + "loss": 0.1878, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5052100419998169, + "rewards/margins": 3.2442679405212402, + "rewards/rejected": -2.739057779312134, + "step": 8132 + }, + { + "epoch": 0.94, + "learning_rate": 1.899801006672129e-08, + "logits/chosen": -3.400052070617676, + "logits/rejected": -3.1680917739868164, + "logps/chosen": -315.2216491699219, + "logps/rejected": -230.8302764892578, + "loss": 0.3741, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07330694794654846, + "rewards/margins": 1.7744859457015991, + "rewards/rejected": -1.701179027557373, + "step": 8133 + }, + { + "epoch": 0.94, + "learning_rate": 1.8962893597097038e-08, + "logits/chosen": -2.9619884490966797, + "logits/rejected": -2.6542587280273438, + "logps/chosen": -175.0492401123047, + "logps/rejected": -209.59432983398438, + "loss": 0.397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22633367776870728, + "rewards/margins": 1.3600375652313232, + "rewards/rejected": -1.5863711833953857, + "step": 8134 + }, + { + "epoch": 0.94, + "learning_rate": 1.8927777127472782e-08, + "logits/chosen": -2.736616849899292, + "logits/rejected": -2.919780731201172, + "logps/chosen": -358.18084716796875, + "logps/rejected": -241.42141723632812, + "loss": 0.1703, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.43369653820991516, + "rewards/margins": 2.6027069091796875, + "rewards/rejected": -2.1690104007720947, + "step": 8135 + }, + { + "epoch": 0.94, + "learning_rate": 1.889266065784853e-08, + "logits/chosen": -3.7158148288726807, + "logits/rejected": -4.019084453582764, + "logps/chosen": -242.95999145507812, + "logps/rejected": -174.6965789794922, + "loss": 0.4646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42001909017562866, + "rewards/margins": 1.176490306854248, + "rewards/rejected": -1.5965094566345215, + "step": 8136 + }, + { + "epoch": 0.94, + "learning_rate": 1.8857544188224276e-08, + "logits/chosen": -2.791288375854492, + "logits/rejected": -2.7436184883117676, + "logps/chosen": -142.1248779296875, + "logps/rejected": -220.20144653320312, + "loss": 0.2555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0018380675464868546, + "rewards/margins": 2.1187543869018555, + "rewards/rejected": -2.1205928325653076, + "step": 8137 + }, + { + "epoch": 0.94, + "learning_rate": 1.882242771860002e-08, + "logits/chosen": -2.984785318374634, + "logits/rejected": -2.998809814453125, + "logps/chosen": -271.77630615234375, + "logps/rejected": -366.782958984375, + "loss": 0.57, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3985166847705841, + "rewards/margins": 2.210306167602539, + "rewards/rejected": -2.608822822570801, + "step": 8138 + }, + { + "epoch": 0.94, + "learning_rate": 1.8787311248975767e-08, + "logits/chosen": -3.2184832096099854, + "logits/rejected": -3.0143752098083496, + "logps/chosen": -224.8107147216797, + "logps/rejected": -183.1089630126953, + "loss": 0.3903, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11117149144411087, + "rewards/margins": 0.9929934144020081, + "rewards/rejected": -0.881821870803833, + "step": 8139 + }, + { + "epoch": 0.94, + "learning_rate": 1.8752194779351515e-08, + "logits/chosen": -2.6027615070343018, + "logits/rejected": -2.5623464584350586, + "logps/chosen": -261.7636413574219, + "logps/rejected": -230.25262451171875, + "loss": 0.5754, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022464334964752197, + "rewards/margins": 1.288555383682251, + "rewards/rejected": -1.266090989112854, + "step": 8140 + }, + { + "epoch": 0.94, + "learning_rate": 1.8717078309727262e-08, + "logits/chosen": -3.3297548294067383, + "logits/rejected": -3.396092653274536, + "logps/chosen": -207.54168701171875, + "logps/rejected": -185.50167846679688, + "loss": 0.2171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09918709099292755, + "rewards/margins": 2.2100629806518555, + "rewards/rejected": -2.1108756065368652, + "step": 8141 + }, + { + "epoch": 0.94, + "learning_rate": 1.8681961840103006e-08, + "logits/chosen": -2.9688148498535156, + "logits/rejected": -2.9483141899108887, + "logps/chosen": -300.43670654296875, + "logps/rejected": -267.80877685546875, + "loss": 0.2533, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4452897012233734, + "rewards/margins": 2.273124933242798, + "rewards/rejected": -1.827835202217102, + "step": 8142 + }, + { + "epoch": 0.94, + "learning_rate": 1.8646845370478753e-08, + "logits/chosen": -3.6129331588745117, + "logits/rejected": -3.3116116523742676, + "logps/chosen": -294.37054443359375, + "logps/rejected": -216.973876953125, + "loss": 0.1441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43749427795410156, + "rewards/margins": 2.7185287475585938, + "rewards/rejected": -2.281034469604492, + "step": 8143 + }, + { + "epoch": 0.94, + "learning_rate": 1.86117289008545e-08, + "logits/chosen": -2.6731138229370117, + "logits/rejected": -2.5181496143341064, + "logps/chosen": -322.41900634765625, + "logps/rejected": -251.1419677734375, + "loss": 0.5493, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3432113528251648, + "rewards/margins": 1.4536491632461548, + "rewards/rejected": -1.7968604564666748, + "step": 8144 + }, + { + "epoch": 0.94, + "learning_rate": 1.8576612431230247e-08, + "logits/chosen": -2.8262109756469727, + "logits/rejected": -3.0960910320281982, + "logps/chosen": -307.8598327636719, + "logps/rejected": -188.70156860351562, + "loss": 0.2412, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41191503405570984, + "rewards/margins": 1.55869722366333, + "rewards/rejected": -1.1467821598052979, + "step": 8145 + }, + { + "epoch": 0.94, + "learning_rate": 1.854149596160599e-08, + "logits/chosen": -2.7580485343933105, + "logits/rejected": -2.6962392330169678, + "logps/chosen": -466.1018981933594, + "logps/rejected": -453.7894287109375, + "loss": 0.547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17873027920722961, + "rewards/margins": 1.3551915884017944, + "rewards/rejected": -1.5339218378067017, + "step": 8146 + }, + { + "epoch": 0.94, + "learning_rate": 1.850637949198174e-08, + "logits/chosen": -2.7847390174865723, + "logits/rejected": -2.9441752433776855, + "logps/chosen": -596.1390380859375, + "logps/rejected": -404.5362548828125, + "loss": 0.3587, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.058462899178266525, + "rewards/margins": 1.566924810409546, + "rewards/rejected": -1.6253876686096191, + "step": 8147 + }, + { + "epoch": 0.94, + "learning_rate": 1.8471263022357486e-08, + "logits/chosen": -2.7857542037963867, + "logits/rejected": -2.7415571212768555, + "logps/chosen": -279.7179260253906, + "logps/rejected": -203.29251098632812, + "loss": 0.3486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1678428053855896, + "rewards/margins": 1.5028886795043945, + "rewards/rejected": -1.670731544494629, + "step": 8148 + }, + { + "epoch": 0.94, + "learning_rate": 1.843614655273323e-08, + "logits/chosen": -2.673872947692871, + "logits/rejected": -2.44907283782959, + "logps/chosen": -297.9399719238281, + "logps/rejected": -185.8231201171875, + "loss": 0.418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04347944259643555, + "rewards/margins": 1.016410231590271, + "rewards/rejected": -1.0598896741867065, + "step": 8149 + }, + { + "epoch": 0.94, + "learning_rate": 1.8401030083108977e-08, + "logits/chosen": -2.6014344692230225, + "logits/rejected": -2.564140558242798, + "logps/chosen": -155.03762817382812, + "logps/rejected": -134.90716552734375, + "loss": 0.4227, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14859744906425476, + "rewards/margins": 1.2212873697280884, + "rewards/rejected": -1.369884729385376, + "step": 8150 + }, + { + "epoch": 0.94, + "learning_rate": 1.8365913613484724e-08, + "logits/chosen": -3.5034735202789307, + "logits/rejected": -3.323676109313965, + "logps/chosen": -161.03878784179688, + "logps/rejected": -275.11077880859375, + "loss": 0.2628, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07247796654701233, + "rewards/margins": 2.00203800201416, + "rewards/rejected": -1.9295601844787598, + "step": 8151 + }, + { + "epoch": 0.94, + "learning_rate": 1.833079714386047e-08, + "logits/chosen": -2.9243712425231934, + "logits/rejected": -2.9721269607543945, + "logps/chosen": -201.64712524414062, + "logps/rejected": -263.46392822265625, + "loss": 0.8039, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5323455333709717, + "rewards/margins": 0.42537960410118103, + "rewards/rejected": -0.9577252268791199, + "step": 8152 + }, + { + "epoch": 0.94, + "learning_rate": 1.8295680674236215e-08, + "logits/chosen": -2.351823091506958, + "logits/rejected": -2.4209532737731934, + "logps/chosen": -368.16229248046875, + "logps/rejected": -300.0963439941406, + "loss": 0.2317, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40202292799949646, + "rewards/margins": 2.6498379707336426, + "rewards/rejected": -2.247814893722534, + "step": 8153 + }, + { + "epoch": 0.94, + "learning_rate": 1.8260564204611963e-08, + "logits/chosen": -2.4149374961853027, + "logits/rejected": -2.7137088775634766, + "logps/chosen": -394.4599914550781, + "logps/rejected": -231.90199279785156, + "loss": 0.6276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5498688220977783, + "rewards/margins": 0.9547281861305237, + "rewards/rejected": -1.5045969486236572, + "step": 8154 + }, + { + "epoch": 0.94, + "learning_rate": 1.822544773498771e-08, + "logits/chosen": -2.857023239135742, + "logits/rejected": -3.1355719566345215, + "logps/chosen": -342.98846435546875, + "logps/rejected": -254.0294647216797, + "loss": 0.4672, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.559821605682373, + "rewards/margins": 1.3363882303237915, + "rewards/rejected": -0.7765663862228394, + "step": 8155 + }, + { + "epoch": 0.94, + "learning_rate": 1.8190331265363454e-08, + "logits/chosen": -2.92746639251709, + "logits/rejected": -3.1179733276367188, + "logps/chosen": -235.04808044433594, + "logps/rejected": -251.31405639648438, + "loss": 0.856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46951723098754883, + "rewards/margins": 2.4779512882232666, + "rewards/rejected": -2.9474685192108154, + "step": 8156 + }, + { + "epoch": 0.94, + "learning_rate": 1.81552147957392e-08, + "logits/chosen": -2.8115758895874023, + "logits/rejected": -2.718144655227661, + "logps/chosen": -315.5123596191406, + "logps/rejected": -231.58616638183594, + "loss": 0.809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.012514084577560425, + "rewards/margins": 0.11164455115795135, + "rewards/rejected": -0.12415864318609238, + "step": 8157 + }, + { + "epoch": 0.94, + "learning_rate": 1.8120098326114948e-08, + "logits/chosen": -2.4723544120788574, + "logits/rejected": -2.8020567893981934, + "logps/chosen": -151.33200073242188, + "logps/rejected": -347.4915771484375, + "loss": 0.5386, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3404717743396759, + "rewards/margins": 1.9145132303237915, + "rewards/rejected": -2.2549848556518555, + "step": 8158 + }, + { + "epoch": 0.94, + "learning_rate": 1.8084981856490696e-08, + "logits/chosen": -3.5449304580688477, + "logits/rejected": -3.2956647872924805, + "logps/chosen": -178.89576721191406, + "logps/rejected": -149.06817626953125, + "loss": 0.4503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45487621426582336, + "rewards/margins": 0.9965558052062988, + "rewards/rejected": -1.4514319896697998, + "step": 8159 + }, + { + "epoch": 0.94, + "learning_rate": 1.804986538686644e-08, + "logits/chosen": -2.50209641456604, + "logits/rejected": -2.763795852661133, + "logps/chosen": -127.74301147460938, + "logps/rejected": -211.9455108642578, + "loss": 0.3722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17317873239517212, + "rewards/margins": 1.4406728744506836, + "rewards/rejected": -1.2674942016601562, + "step": 8160 + }, + { + "epoch": 0.94, + "learning_rate": 1.8014748917242183e-08, + "logits/chosen": -2.9465103149414062, + "logits/rejected": -3.0236661434173584, + "logps/chosen": -409.89483642578125, + "logps/rejected": -296.695556640625, + "loss": 0.2591, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48401930928230286, + "rewards/margins": 1.836775779724121, + "rewards/rejected": -1.352756381034851, + "step": 8161 + }, + { + "epoch": 0.94, + "learning_rate": 1.797963244761793e-08, + "logits/chosen": -2.897360324859619, + "logits/rejected": -2.8500776290893555, + "logps/chosen": -324.31982421875, + "logps/rejected": -337.29498291015625, + "loss": 0.1885, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6375852823257446, + "rewards/margins": 2.9714462757110596, + "rewards/rejected": -2.3338608741760254, + "step": 8162 + }, + { + "epoch": 0.94, + "learning_rate": 1.7944515977993678e-08, + "logits/chosen": -3.034310817718506, + "logits/rejected": -3.0834858417510986, + "logps/chosen": -110.35882568359375, + "logps/rejected": -210.45535278320312, + "loss": 0.2405, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12174283713102341, + "rewards/margins": 1.8761634826660156, + "rewards/rejected": -1.7544206380844116, + "step": 8163 + }, + { + "epoch": 0.94, + "learning_rate": 1.7909399508369425e-08, + "logits/chosen": -2.7571284770965576, + "logits/rejected": -2.7454986572265625, + "logps/chosen": -326.7891845703125, + "logps/rejected": -257.5616760253906, + "loss": 0.3561, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25400757789611816, + "rewards/margins": 1.0092096328735352, + "rewards/rejected": -1.2632172107696533, + "step": 8164 + }, + { + "epoch": 0.94, + "learning_rate": 1.787428303874517e-08, + "logits/chosen": -3.312713146209717, + "logits/rejected": -3.6067419052124023, + "logps/chosen": -143.99403381347656, + "logps/rejected": -285.6132507324219, + "loss": 0.5473, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5546338558197021, + "rewards/margins": 1.685487151145935, + "rewards/rejected": -2.2401211261749268, + "step": 8165 + }, + { + "epoch": 0.94, + "learning_rate": 1.7839166569120916e-08, + "logits/chosen": -3.53360652923584, + "logits/rejected": -3.6113157272338867, + "logps/chosen": -238.14706420898438, + "logps/rejected": -166.36260986328125, + "loss": 0.3297, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.361843466758728, + "rewards/margins": 1.5868897438049316, + "rewards/rejected": -1.2250462770462036, + "step": 8166 + }, + { + "epoch": 0.94, + "learning_rate": 1.7804050099496663e-08, + "logits/chosen": -2.629732370376587, + "logits/rejected": -3.026935338973999, + "logps/chosen": -219.40087890625, + "logps/rejected": -378.3924865722656, + "loss": 0.3162, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5205529928207397, + "rewards/margins": 2.3323090076446533, + "rewards/rejected": -2.8528618812561035, + "step": 8167 + }, + { + "epoch": 0.94, + "learning_rate": 1.7768933629872407e-08, + "logits/chosen": -3.10290789604187, + "logits/rejected": -3.0232598781585693, + "logps/chosen": -322.5345458984375, + "logps/rejected": -238.12452697753906, + "loss": 0.2803, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18800771236419678, + "rewards/margins": 1.5264841318130493, + "rewards/rejected": -1.3384764194488525, + "step": 8168 + }, + { + "epoch": 0.94, + "learning_rate": 1.7733817160248155e-08, + "logits/chosen": -3.1708736419677734, + "logits/rejected": -3.0764541625976562, + "logps/chosen": -200.05615234375, + "logps/rejected": -261.7193298339844, + "loss": 0.2795, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47360551357269287, + "rewards/margins": 1.803372859954834, + "rewards/rejected": -2.2769784927368164, + "step": 8169 + }, + { + "epoch": 0.94, + "learning_rate": 1.7698700690623902e-08, + "logits/chosen": -2.2533557415008545, + "logits/rejected": -2.1916022300720215, + "logps/chosen": -301.38616943359375, + "logps/rejected": -361.1244201660156, + "loss": 0.2665, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0018864348530769348, + "rewards/margins": 2.1028456687927246, + "rewards/rejected": -2.100959062576294, + "step": 8170 + }, + { + "epoch": 0.94, + "learning_rate": 1.766358422099965e-08, + "logits/chosen": -3.0362954139709473, + "logits/rejected": -2.822458028793335, + "logps/chosen": -392.6993408203125, + "logps/rejected": -269.53594970703125, + "loss": 0.3256, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2884213626384735, + "rewards/margins": 2.2868733406066895, + "rewards/rejected": -1.9984519481658936, + "step": 8171 + }, + { + "epoch": 0.94, + "learning_rate": 1.7628467751375393e-08, + "logits/chosen": -2.7963531017303467, + "logits/rejected": -2.7931082248687744, + "logps/chosen": -383.13519287109375, + "logps/rejected": -346.6846618652344, + "loss": 0.6855, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7914069294929504, + "rewards/margins": 0.9207128286361694, + "rewards/rejected": -1.7121198177337646, + "step": 8172 + }, + { + "epoch": 0.94, + "learning_rate": 1.759335128175114e-08, + "logits/chosen": -3.661818027496338, + "logits/rejected": -3.416332960128784, + "logps/chosen": -185.7296142578125, + "logps/rejected": -202.80609130859375, + "loss": 0.4081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07354666292667389, + "rewards/margins": 1.349813461303711, + "rewards/rejected": -1.4233601093292236, + "step": 8173 + }, + { + "epoch": 0.94, + "learning_rate": 1.7558234812126888e-08, + "logits/chosen": -3.4523391723632812, + "logits/rejected": -3.334061861038208, + "logps/chosen": -574.772705078125, + "logps/rejected": -419.60467529296875, + "loss": 0.3704, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0849202573299408, + "rewards/margins": 1.9473963975906372, + "rewards/rejected": -2.0323164463043213, + "step": 8174 + }, + { + "epoch": 0.94, + "learning_rate": 1.7523118342502635e-08, + "logits/chosen": -3.256793260574341, + "logits/rejected": -2.9841067790985107, + "logps/chosen": -272.07562255859375, + "logps/rejected": -221.69589233398438, + "loss": 0.9939, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.089319109916687, + "rewards/margins": 0.5585801601409912, + "rewards/rejected": -1.6478992700576782, + "step": 8175 + }, + { + "epoch": 0.94, + "learning_rate": 1.748800187287838e-08, + "logits/chosen": -2.5723648071289062, + "logits/rejected": -2.6450517177581787, + "logps/chosen": -347.0587463378906, + "logps/rejected": -333.13818359375, + "loss": 0.3151, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37328076362609863, + "rewards/margins": 1.8505440950393677, + "rewards/rejected": -1.477263331413269, + "step": 8176 + }, + { + "epoch": 0.94, + "learning_rate": 1.7452885403254126e-08, + "logits/chosen": -2.684037446975708, + "logits/rejected": -2.6587769985198975, + "logps/chosen": -500.7362060546875, + "logps/rejected": -431.42626953125, + "loss": 0.2988, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2750774323940277, + "rewards/margins": 2.128095865249634, + "rewards/rejected": -1.8530184030532837, + "step": 8177 + }, + { + "epoch": 0.94, + "learning_rate": 1.7417768933629873e-08, + "logits/chosen": -3.833390235900879, + "logits/rejected": -3.879261016845703, + "logps/chosen": -185.5983428955078, + "logps/rejected": -222.092041015625, + "loss": 0.2668, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41515523195266724, + "rewards/margins": 1.5582764148712158, + "rewards/rejected": -1.1431212425231934, + "step": 8178 + }, + { + "epoch": 0.94, + "learning_rate": 1.7382652464005617e-08, + "logits/chosen": -3.3049867153167725, + "logits/rejected": -3.2110989093780518, + "logps/chosen": -261.9088134765625, + "logps/rejected": -246.26809692382812, + "loss": 0.4917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47261250019073486, + "rewards/margins": 2.517286539077759, + "rewards/rejected": -2.989898920059204, + "step": 8179 + }, + { + "epoch": 0.94, + "learning_rate": 1.7347535994381364e-08, + "logits/chosen": -3.209726095199585, + "logits/rejected": -3.222686767578125, + "logps/chosen": -209.04864501953125, + "logps/rejected": -301.27691650390625, + "loss": 0.5382, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42358940839767456, + "rewards/margins": 1.8804998397827148, + "rewards/rejected": -2.304089307785034, + "step": 8180 + }, + { + "epoch": 0.94, + "learning_rate": 1.731241952475711e-08, + "logits/chosen": -2.595937728881836, + "logits/rejected": -2.6520159244537354, + "logps/chosen": -396.0236511230469, + "logps/rejected": -202.24176025390625, + "loss": 0.7467, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1522451490163803, + "rewards/margins": 0.8624707460403442, + "rewards/rejected": -0.7102255821228027, + "step": 8181 + }, + { + "epoch": 0.94, + "learning_rate": 1.727730305513286e-08, + "logits/chosen": -3.465639352798462, + "logits/rejected": -3.437981128692627, + "logps/chosen": -262.19952392578125, + "logps/rejected": -211.46751403808594, + "loss": 0.1536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.060538604855537415, + "rewards/margins": 2.710240125656128, + "rewards/rejected": -2.7707791328430176, + "step": 8182 + }, + { + "epoch": 0.94, + "learning_rate": 1.7242186585508603e-08, + "logits/chosen": -2.7431929111480713, + "logits/rejected": -3.160226821899414, + "logps/chosen": -133.91766357421875, + "logps/rejected": -138.53933715820312, + "loss": 0.3126, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03880004957318306, + "rewards/margins": 1.5290675163269043, + "rewards/rejected": -1.490267276763916, + "step": 8183 + }, + { + "epoch": 0.94, + "learning_rate": 1.720707011588435e-08, + "logits/chosen": -3.4638404846191406, + "logits/rejected": -3.4181041717529297, + "logps/chosen": -178.20632934570312, + "logps/rejected": -222.76104736328125, + "loss": 0.3095, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14339207112789154, + "rewards/margins": 4.471451759338379, + "rewards/rejected": -4.614843368530273, + "step": 8184 + }, + { + "epoch": 0.94, + "learning_rate": 1.7171953646260094e-08, + "logits/chosen": -2.735933303833008, + "logits/rejected": -2.6350550651550293, + "logps/chosen": -395.56610107421875, + "logps/rejected": -303.74664306640625, + "loss": 0.2445, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4905344247817993, + "rewards/margins": 2.201305389404297, + "rewards/rejected": -1.710770845413208, + "step": 8185 + }, + { + "epoch": 0.94, + "learning_rate": 1.713683717663584e-08, + "logits/chosen": -3.640927791595459, + "logits/rejected": -3.537898063659668, + "logps/chosen": -235.9541015625, + "logps/rejected": -238.00704956054688, + "loss": 0.1763, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3299994170665741, + "rewards/margins": 2.1851584911346436, + "rewards/rejected": -1.855159044265747, + "step": 8186 + }, + { + "epoch": 0.94, + "learning_rate": 1.710172070701159e-08, + "logits/chosen": -3.670579195022583, + "logits/rejected": -3.31243896484375, + "logps/chosen": -400.0976257324219, + "logps/rejected": -166.83251953125, + "loss": 0.2419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15896055102348328, + "rewards/margins": 1.7964537143707275, + "rewards/rejected": -1.9554142951965332, + "step": 8187 + }, + { + "epoch": 0.94, + "learning_rate": 1.7066604237387332e-08, + "logits/chosen": -3.256181478500366, + "logits/rejected": -3.586435317993164, + "logps/chosen": -145.37689208984375, + "logps/rejected": -237.75535583496094, + "loss": 0.3289, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08352944254875183, + "rewards/margins": 3.1627519130706787, + "rewards/rejected": -3.0792226791381836, + "step": 8188 + }, + { + "epoch": 0.94, + "learning_rate": 1.703148776776308e-08, + "logits/chosen": -3.524756669998169, + "logits/rejected": -3.858372688293457, + "logps/chosen": -302.4464416503906, + "logps/rejected": -386.4407653808594, + "loss": 0.8982, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2480573654174805, + "rewards/margins": 2.2425270080566406, + "rewards/rejected": -3.490584373474121, + "step": 8189 + }, + { + "epoch": 0.94, + "learning_rate": 1.6996371298138827e-08, + "logits/chosen": -3.6286473274230957, + "logits/rejected": -3.801705837249756, + "logps/chosen": -443.7227783203125, + "logps/rejected": -249.3665771484375, + "loss": 0.7725, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3813709616661072, + "rewards/margins": 1.7577624320983887, + "rewards/rejected": -2.1391334533691406, + "step": 8190 + }, + { + "epoch": 0.94, + "learning_rate": 1.696125482851457e-08, + "logits/chosen": -3.1533098220825195, + "logits/rejected": -3.043942451477051, + "logps/chosen": -198.21463012695312, + "logps/rejected": -251.66473388671875, + "loss": 0.4345, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.058684855699539185, + "rewards/margins": 1.3391234874725342, + "rewards/rejected": -1.2804384231567383, + "step": 8191 + }, + { + "epoch": 0.94, + "learning_rate": 1.6926138358890318e-08, + "logits/chosen": -3.1368894577026367, + "logits/rejected": -3.2552740573883057, + "logps/chosen": -87.27498626708984, + "logps/rejected": -183.72869873046875, + "loss": 0.7744, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4817124903202057, + "rewards/margins": 0.7488498687744141, + "rewards/rejected": -1.2305623292922974, + "step": 8192 + }, + { + "epoch": 0.94, + "learning_rate": 1.6891021889266065e-08, + "logits/chosen": -3.318396806716919, + "logits/rejected": -3.300485849380493, + "logps/chosen": -301.31329345703125, + "logps/rejected": -219.75338745117188, + "loss": 0.6437, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3271428346633911, + "rewards/margins": 1.127153992652893, + "rewards/rejected": -1.4542968273162842, + "step": 8193 + }, + { + "epoch": 0.94, + "learning_rate": 1.6855905419641812e-08, + "logits/chosen": -3.243210792541504, + "logits/rejected": -3.2204103469848633, + "logps/chosen": -200.63470458984375, + "logps/rejected": -205.8739013671875, + "loss": 0.202, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08668699860572815, + "rewards/margins": 2.152761936187744, + "rewards/rejected": -2.066074848175049, + "step": 8194 + }, + { + "epoch": 0.94, + "learning_rate": 1.6820788950017556e-08, + "logits/chosen": -3.393320322036743, + "logits/rejected": -2.99713134765625, + "logps/chosen": -163.70123291015625, + "logps/rejected": -167.90652465820312, + "loss": 0.4367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3250158727169037, + "rewards/margins": 1.6998326778411865, + "rewards/rejected": -2.024848699569702, + "step": 8195 + }, + { + "epoch": 0.94, + "learning_rate": 1.6785672480393304e-08, + "logits/chosen": -3.2904388904571533, + "logits/rejected": -2.957132339477539, + "logps/chosen": -280.9899597167969, + "logps/rejected": -197.3377227783203, + "loss": 0.4804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47176849842071533, + "rewards/margins": 1.6066287755966187, + "rewards/rejected": -2.078397274017334, + "step": 8196 + }, + { + "epoch": 0.94, + "learning_rate": 1.675055601076905e-08, + "logits/chosen": -2.3939108848571777, + "logits/rejected": -2.9260315895080566, + "logps/chosen": -188.6431884765625, + "logps/rejected": -229.49127197265625, + "loss": 0.1688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5344288349151611, + "rewards/margins": 2.6310200691223145, + "rewards/rejected": -2.096590995788574, + "step": 8197 + }, + { + "epoch": 0.95, + "learning_rate": 1.6715439541144795e-08, + "logits/chosen": -2.4816136360168457, + "logits/rejected": -2.743485689163208, + "logps/chosen": -327.7810974121094, + "logps/rejected": -314.2917785644531, + "loss": 0.3389, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1691344678401947, + "rewards/margins": 2.4031965732574463, + "rewards/rejected": -2.2340621948242188, + "step": 8198 + }, + { + "epoch": 0.95, + "learning_rate": 1.6680323071520542e-08, + "logits/chosen": -2.8620526790618896, + "logits/rejected": -3.2579879760742188, + "logps/chosen": -330.3363342285156, + "logps/rejected": -312.0775146484375, + "loss": 0.2308, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17177648842334747, + "rewards/margins": 3.5783941745758057, + "rewards/rejected": -3.4066176414489746, + "step": 8199 + }, + { + "epoch": 0.95, + "learning_rate": 1.664520660189629e-08, + "logits/chosen": -2.812788724899292, + "logits/rejected": -2.8027877807617188, + "logps/chosen": -82.58345031738281, + "logps/rejected": -134.635986328125, + "loss": 0.4916, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4537752866744995, + "rewards/margins": 0.7132229208946228, + "rewards/rejected": -1.1669981479644775, + "step": 8200 + }, + { + "epoch": 0.95, + "learning_rate": 1.6610090132272036e-08, + "logits/chosen": -3.2601284980773926, + "logits/rejected": -3.1930699348449707, + "logps/chosen": -132.95388793945312, + "logps/rejected": -147.77536010742188, + "loss": 0.2757, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32066333293914795, + "rewards/margins": 1.5845098495483398, + "rewards/rejected": -1.2638466358184814, + "step": 8201 + }, + { + "epoch": 0.95, + "learning_rate": 1.657497366264778e-08, + "logits/chosen": -3.490626811981201, + "logits/rejected": -3.728221893310547, + "logps/chosen": -303.9148864746094, + "logps/rejected": -290.1959228515625, + "loss": 0.4674, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24737754464149475, + "rewards/margins": 1.1534066200256348, + "rewards/rejected": -1.4007842540740967, + "step": 8202 + }, + { + "epoch": 0.95, + "learning_rate": 1.6539857193023528e-08, + "logits/chosen": -3.2871265411376953, + "logits/rejected": -3.294301748275757, + "logps/chosen": -141.0595703125, + "logps/rejected": -173.48016357421875, + "loss": 0.6767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46653860807418823, + "rewards/margins": 1.390503168106079, + "rewards/rejected": -1.857041835784912, + "step": 8203 + }, + { + "epoch": 0.95, + "learning_rate": 1.6504740723399275e-08, + "logits/chosen": -2.704057216644287, + "logits/rejected": -3.1224329471588135, + "logps/chosen": -163.2122802734375, + "logps/rejected": -219.3159637451172, + "loss": 0.5343, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9955257177352905, + "rewards/margins": 2.045006036758423, + "rewards/rejected": -3.040531635284424, + "step": 8204 + }, + { + "epoch": 0.95, + "learning_rate": 1.6469624253775022e-08, + "logits/chosen": -3.3526034355163574, + "logits/rejected": -3.325371742248535, + "logps/chosen": -171.75274658203125, + "logps/rejected": -278.3505554199219, + "loss": 0.3932, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3134555518627167, + "rewards/margins": 2.2649595737457275, + "rewards/rejected": -2.5784149169921875, + "step": 8205 + }, + { + "epoch": 0.95, + "learning_rate": 1.6434507784150766e-08, + "logits/chosen": -3.753615379333496, + "logits/rejected": -3.6616721153259277, + "logps/chosen": -172.57952880859375, + "logps/rejected": -182.27658081054688, + "loss": 0.2312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08016112446784973, + "rewards/margins": 1.9451558589935303, + "rewards/rejected": -2.0253169536590576, + "step": 8206 + }, + { + "epoch": 0.95, + "learning_rate": 1.6399391314526513e-08, + "logits/chosen": -3.644166946411133, + "logits/rejected": -3.450709819793701, + "logps/chosen": -201.61097717285156, + "logps/rejected": -220.2666778564453, + "loss": 0.2283, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05219858139753342, + "rewards/margins": 2.7639944553375244, + "rewards/rejected": -2.711796283721924, + "step": 8207 + }, + { + "epoch": 0.95, + "learning_rate": 1.636427484490226e-08, + "logits/chosen": -3.025402069091797, + "logits/rejected": -3.0377516746520996, + "logps/chosen": -264.8951721191406, + "logps/rejected": -281.4089660644531, + "loss": 0.3123, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18848800659179688, + "rewards/margins": 1.495556354522705, + "rewards/rejected": -1.3070685863494873, + "step": 8208 + }, + { + "epoch": 0.95, + "learning_rate": 1.6329158375278004e-08, + "logits/chosen": -3.087926149368286, + "logits/rejected": -3.294440269470215, + "logps/chosen": -166.95907592773438, + "logps/rejected": -190.9630889892578, + "loss": 0.1973, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06519429385662079, + "rewards/margins": 2.155423641204834, + "rewards/rejected": -2.0902292728424072, + "step": 8209 + }, + { + "epoch": 0.95, + "learning_rate": 1.629404190565375e-08, + "logits/chosen": -3.0046486854553223, + "logits/rejected": -3.112730026245117, + "logps/chosen": -159.9574432373047, + "logps/rejected": -266.94891357421875, + "loss": 0.1826, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14056234061717987, + "rewards/margins": 2.934500217437744, + "rewards/rejected": -2.7939376831054688, + "step": 8210 + }, + { + "epoch": 0.95, + "learning_rate": 1.6258925436029496e-08, + "logits/chosen": -3.131894588470459, + "logits/rejected": -2.923311233520508, + "logps/chosen": -296.8849182128906, + "logps/rejected": -166.84710693359375, + "loss": 0.5148, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41574665904045105, + "rewards/margins": 0.9659462571144104, + "rewards/rejected": -0.5501996278762817, + "step": 8211 + }, + { + "epoch": 0.95, + "learning_rate": 1.6223808966405243e-08, + "logits/chosen": -3.120729923248291, + "logits/rejected": -3.2750959396362305, + "logps/chosen": -105.42279052734375, + "logps/rejected": -158.7264404296875, + "loss": 0.5977, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41063058376312256, + "rewards/margins": 1.134529709815979, + "rewards/rejected": -1.5451604127883911, + "step": 8212 + }, + { + "epoch": 0.95, + "learning_rate": 1.618869249678099e-08, + "logits/chosen": -3.105663776397705, + "logits/rejected": -3.2051539421081543, + "logps/chosen": -244.02313232421875, + "logps/rejected": -297.44482421875, + "loss": 0.2166, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13897782564163208, + "rewards/margins": 2.947629690170288, + "rewards/rejected": -2.808651924133301, + "step": 8213 + }, + { + "epoch": 0.95, + "learning_rate": 1.6153576027156734e-08, + "logits/chosen": -2.5704214572906494, + "logits/rejected": -3.129823923110962, + "logps/chosen": -279.55010986328125, + "logps/rejected": -291.8548583984375, + "loss": 0.4326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40502822399139404, + "rewards/margins": 1.835604190826416, + "rewards/rejected": -2.2406325340270996, + "step": 8214 + }, + { + "epoch": 0.95, + "learning_rate": 1.611845955753248e-08, + "logits/chosen": -2.762256145477295, + "logits/rejected": -2.810194969177246, + "logps/chosen": -173.51962280273438, + "logps/rejected": -203.73468017578125, + "loss": 0.5491, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021342262625694275, + "rewards/margins": 1.500236988067627, + "rewards/rejected": -1.478894829750061, + "step": 8215 + }, + { + "epoch": 0.95, + "learning_rate": 1.608334308790823e-08, + "logits/chosen": -2.7437829971313477, + "logits/rejected": -2.751807689666748, + "logps/chosen": -207.1459503173828, + "logps/rejected": -152.3746337890625, + "loss": 0.2796, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.474897176027298, + "rewards/margins": 2.1351046562194824, + "rewards/rejected": -1.6602072715759277, + "step": 8216 + }, + { + "epoch": 0.95, + "learning_rate": 1.6048226618283976e-08, + "logits/chosen": -2.2935750484466553, + "logits/rejected": -2.3654778003692627, + "logps/chosen": -182.8057403564453, + "logps/rejected": -225.74911499023438, + "loss": 0.5102, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26263201236724854, + "rewards/margins": 1.0653802156448364, + "rewards/rejected": -0.8027482032775879, + "step": 8217 + }, + { + "epoch": 0.95, + "learning_rate": 1.601311014865972e-08, + "logits/chosen": -3.118767261505127, + "logits/rejected": -3.4249792098999023, + "logps/chosen": -172.46945190429688, + "logps/rejected": -174.78443908691406, + "loss": 0.323, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.034323014318943024, + "rewards/margins": 1.9493227005004883, + "rewards/rejected": -1.9836456775665283, + "step": 8218 + }, + { + "epoch": 0.95, + "learning_rate": 1.5977993679035467e-08, + "logits/chosen": -2.883897304534912, + "logits/rejected": -2.988283634185791, + "logps/chosen": -129.30409240722656, + "logps/rejected": -227.33428955078125, + "loss": 0.5017, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5410102605819702, + "rewards/margins": 0.7723912000656128, + "rewards/rejected": -1.3134015798568726, + "step": 8219 + }, + { + "epoch": 0.95, + "learning_rate": 1.5942877209411214e-08, + "logits/chosen": -3.0430986881256104, + "logits/rejected": -3.0488839149475098, + "logps/chosen": -225.2998046875, + "logps/rejected": -224.41421508789062, + "loss": 0.6459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2981608808040619, + "rewards/margins": 0.6007200479507446, + "rewards/rejected": -0.8988809585571289, + "step": 8220 + }, + { + "epoch": 0.95, + "learning_rate": 1.5907760739786958e-08, + "logits/chosen": -2.785719394683838, + "logits/rejected": -2.9434454441070557, + "logps/chosen": -252.40362548828125, + "logps/rejected": -299.7442932128906, + "loss": 0.2048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3780643343925476, + "rewards/margins": 3.18212890625, + "rewards/rejected": -2.8040645122528076, + "step": 8221 + }, + { + "epoch": 0.95, + "learning_rate": 1.5872644270162705e-08, + "logits/chosen": -3.126917839050293, + "logits/rejected": -3.0756540298461914, + "logps/chosen": -236.4794464111328, + "logps/rejected": -270.8515319824219, + "loss": 0.3246, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08024759590625763, + "rewards/margins": 1.9774391651153564, + "rewards/rejected": -2.0576870441436768, + "step": 8222 + }, + { + "epoch": 0.95, + "learning_rate": 1.5837527800538453e-08, + "logits/chosen": -2.2370827198028564, + "logits/rejected": -2.388514518737793, + "logps/chosen": -236.5407257080078, + "logps/rejected": -129.404296875, + "loss": 0.2127, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21934986114501953, + "rewards/margins": 2.1705026626586914, + "rewards/rejected": -1.9511528015136719, + "step": 8223 + }, + { + "epoch": 0.95, + "learning_rate": 1.58024113309142e-08, + "logits/chosen": -2.834913730621338, + "logits/rejected": -2.779510259628296, + "logps/chosen": -371.0307922363281, + "logps/rejected": -344.0577087402344, + "loss": 0.1636, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30353468656539917, + "rewards/margins": 2.19862699508667, + "rewards/rejected": -1.895092248916626, + "step": 8224 + }, + { + "epoch": 0.95, + "learning_rate": 1.5767294861289944e-08, + "logits/chosen": -2.446337938308716, + "logits/rejected": -2.3566110134124756, + "logps/chosen": -201.25814819335938, + "logps/rejected": -187.96548461914062, + "loss": 0.3248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12175148725509644, + "rewards/margins": 1.5445765256881714, + "rewards/rejected": -1.6663278341293335, + "step": 8225 + }, + { + "epoch": 0.95, + "learning_rate": 1.573217839166569e-08, + "logits/chosen": -3.3017666339874268, + "logits/rejected": -3.2026782035827637, + "logps/chosen": -283.6519775390625, + "logps/rejected": -380.90179443359375, + "loss": 0.1236, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2574186325073242, + "rewards/margins": 3.3695621490478516, + "rewards/rejected": -3.1121435165405273, + "step": 8226 + }, + { + "epoch": 0.95, + "learning_rate": 1.5697061922041438e-08, + "logits/chosen": -2.8463644981384277, + "logits/rejected": -2.6944236755371094, + "logps/chosen": -507.3891906738281, + "logps/rejected": -300.4443054199219, + "loss": 0.5968, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3947167694568634, + "rewards/margins": 1.0157959461212158, + "rewards/rejected": -1.4105126857757568, + "step": 8227 + }, + { + "epoch": 0.95, + "learning_rate": 1.5661945452417182e-08, + "logits/chosen": -2.275399684906006, + "logits/rejected": -2.474114418029785, + "logps/chosen": -360.2444763183594, + "logps/rejected": -243.35043334960938, + "loss": 0.6993, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5487843751907349, + "rewards/margins": 0.24798502027988434, + "rewards/rejected": -0.796769380569458, + "step": 8228 + }, + { + "epoch": 0.95, + "learning_rate": 1.562682898279293e-08, + "logits/chosen": -3.0128846168518066, + "logits/rejected": -2.742933511734009, + "logps/chosen": -220.90921020507812, + "logps/rejected": -184.084228515625, + "loss": 0.2275, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2848397493362427, + "rewards/margins": 2.683603048324585, + "rewards/rejected": -2.968442678451538, + "step": 8229 + }, + { + "epoch": 0.95, + "learning_rate": 1.5591712513168677e-08, + "logits/chosen": -3.1469240188598633, + "logits/rejected": -3.0178396701812744, + "logps/chosen": -303.40191650390625, + "logps/rejected": -184.71502685546875, + "loss": 0.2653, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10997083783149719, + "rewards/margins": 2.4358434677124023, + "rewards/rejected": -2.3258726596832275, + "step": 8230 + }, + { + "epoch": 0.95, + "learning_rate": 1.5556596043544424e-08, + "logits/chosen": -2.72654390335083, + "logits/rejected": -2.601412534713745, + "logps/chosen": -278.7293701171875, + "logps/rejected": -315.77655029296875, + "loss": 0.5713, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03932797908782959, + "rewards/margins": 1.9768569469451904, + "rewards/rejected": -2.0161850452423096, + "step": 8231 + }, + { + "epoch": 0.95, + "learning_rate": 1.5521479573920168e-08, + "logits/chosen": -3.7213213443756104, + "logits/rejected": -3.971909999847412, + "logps/chosen": -178.84634399414062, + "logps/rejected": -198.11172485351562, + "loss": 0.5061, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5730245113372803, + "rewards/margins": 1.2351694107055664, + "rewards/rejected": -1.8081941604614258, + "step": 8232 + }, + { + "epoch": 0.95, + "learning_rate": 1.5486363104295912e-08, + "logits/chosen": -3.7951862812042236, + "logits/rejected": -3.7009530067443848, + "logps/chosen": -242.45762634277344, + "logps/rejected": -275.89495849609375, + "loss": 0.4082, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.259093701839447, + "rewards/margins": 1.7944221496582031, + "rewards/rejected": -2.053515672683716, + "step": 8233 + }, + { + "epoch": 0.95, + "learning_rate": 1.545124663467166e-08, + "logits/chosen": -2.8726401329040527, + "logits/rejected": -2.84236216545105, + "logps/chosen": -296.1202392578125, + "logps/rejected": -271.3155822753906, + "loss": 0.574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26624467968940735, + "rewards/margins": 2.0868430137634277, + "rewards/rejected": -2.3530876636505127, + "step": 8234 + }, + { + "epoch": 0.95, + "learning_rate": 1.5416130165047406e-08, + "logits/chosen": -2.4935855865478516, + "logits/rejected": -3.1072025299072266, + "logps/chosen": -304.6036376953125, + "logps/rejected": -353.6661376953125, + "loss": 0.3499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.034298986196517944, + "rewards/margins": 2.2546679973602295, + "rewards/rejected": -2.2889671325683594, + "step": 8235 + }, + { + "epoch": 0.95, + "learning_rate": 1.5381013695423153e-08, + "logits/chosen": -2.435995578765869, + "logits/rejected": -2.6241252422332764, + "logps/chosen": -205.0628204345703, + "logps/rejected": -156.2589111328125, + "loss": 0.2906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.029611259698867798, + "rewards/margins": 2.0836033821105957, + "rewards/rejected": -2.1132144927978516, + "step": 8236 + }, + { + "epoch": 0.95, + "learning_rate": 1.5345897225798897e-08, + "logits/chosen": -2.708845376968384, + "logits/rejected": -2.888165235519409, + "logps/chosen": -348.1418151855469, + "logps/rejected": -158.44781494140625, + "loss": 1.1795, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1869009733200073, + "rewards/margins": 0.34230929613113403, + "rewards/rejected": -1.5292103290557861, + "step": 8237 + }, + { + "epoch": 0.95, + "learning_rate": 1.5310780756174645e-08, + "logits/chosen": -3.2474470138549805, + "logits/rejected": -3.3787484169006348, + "logps/chosen": -142.2418212890625, + "logps/rejected": -164.19589233398438, + "loss": 0.4073, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28977325558662415, + "rewards/margins": 1.8503177165985107, + "rewards/rejected": -1.5605443716049194, + "step": 8238 + }, + { + "epoch": 0.95, + "learning_rate": 1.5275664286550392e-08, + "logits/chosen": -3.128875970840454, + "logits/rejected": -3.309847354888916, + "logps/chosen": -258.1601867675781, + "logps/rejected": -110.73875427246094, + "loss": 0.4387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3814738988876343, + "rewards/margins": 0.7139061689376831, + "rewards/rejected": -1.0953800678253174, + "step": 8239 + }, + { + "epoch": 0.95, + "learning_rate": 1.5240547816926136e-08, + "logits/chosen": -2.668130397796631, + "logits/rejected": -2.4263410568237305, + "logps/chosen": -267.9188537597656, + "logps/rejected": -339.1885986328125, + "loss": 0.4191, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01929982751607895, + "rewards/margins": 1.0208733081817627, + "rewards/rejected": -1.0401731729507446, + "step": 8240 + }, + { + "epoch": 0.95, + "learning_rate": 1.5205431347301883e-08, + "logits/chosen": -2.8403642177581787, + "logits/rejected": -2.65486741065979, + "logps/chosen": -219.2029266357422, + "logps/rejected": -122.58714294433594, + "loss": 0.5096, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18359404802322388, + "rewards/margins": 1.0307050943374634, + "rewards/rejected": -0.8471111059188843, + "step": 8241 + }, + { + "epoch": 0.95, + "learning_rate": 1.517031487767763e-08, + "logits/chosen": -2.969252109527588, + "logits/rejected": -2.7769699096679688, + "logps/chosen": -253.57852172851562, + "logps/rejected": -266.30499267578125, + "loss": 0.2753, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17603209614753723, + "rewards/margins": 2.0415267944335938, + "rewards/rejected": -2.2175586223602295, + "step": 8242 + }, + { + "epoch": 0.95, + "learning_rate": 1.5135198408053377e-08, + "logits/chosen": -3.2662341594696045, + "logits/rejected": -3.611793041229248, + "logps/chosen": -186.13198852539062, + "logps/rejected": -200.3729248046875, + "loss": 0.3665, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11456766724586487, + "rewards/margins": 1.9970074892044067, + "rewards/rejected": -2.111575126647949, + "step": 8243 + }, + { + "epoch": 0.95, + "learning_rate": 1.510008193842912e-08, + "logits/chosen": -2.9553093910217285, + "logits/rejected": -2.90592360496521, + "logps/chosen": -293.00225830078125, + "logps/rejected": -285.9473876953125, + "loss": 0.5508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6524166464805603, + "rewards/margins": 1.545724630355835, + "rewards/rejected": -2.198141574859619, + "step": 8244 + }, + { + "epoch": 0.95, + "learning_rate": 1.506496546880487e-08, + "logits/chosen": -2.9966237545013428, + "logits/rejected": -2.892843008041382, + "logps/chosen": -197.93243408203125, + "logps/rejected": -277.37359619140625, + "loss": 0.3205, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1410657912492752, + "rewards/margins": 3.0153555870056152, + "rewards/rejected": -2.8742895126342773, + "step": 8245 + }, + { + "epoch": 0.95, + "learning_rate": 1.5029848999180616e-08, + "logits/chosen": -3.777984380722046, + "logits/rejected": -3.8629801273345947, + "logps/chosen": -241.86729431152344, + "logps/rejected": -167.6749267578125, + "loss": 0.2298, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.862123966217041, + "rewards/margins": 2.0270814895629883, + "rewards/rejected": -1.1649575233459473, + "step": 8246 + }, + { + "epoch": 0.95, + "learning_rate": 1.4994732529556363e-08, + "logits/chosen": -2.833996534347534, + "logits/rejected": -2.5663673877716064, + "logps/chosen": -145.63504028320312, + "logps/rejected": -269.19964599609375, + "loss": 0.1439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09467703104019165, + "rewards/margins": 3.088162422180176, + "rewards/rejected": -2.993485450744629, + "step": 8247 + }, + { + "epoch": 0.95, + "learning_rate": 1.4959616059932107e-08, + "logits/chosen": -3.387742042541504, + "logits/rejected": -3.565957546234131, + "logps/chosen": -184.724365234375, + "logps/rejected": -293.4902648925781, + "loss": 0.2867, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07957261055707932, + "rewards/margins": 2.0599169731140137, + "rewards/rejected": -1.9803444147109985, + "step": 8248 + }, + { + "epoch": 0.95, + "learning_rate": 1.4924499590307854e-08, + "logits/chosen": -3.854768991470337, + "logits/rejected": -3.729292392730713, + "logps/chosen": -189.33538818359375, + "logps/rejected": -227.29953002929688, + "loss": 0.6, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4908967614173889, + "rewards/margins": 1.4630119800567627, + "rewards/rejected": -1.9539086818695068, + "step": 8249 + }, + { + "epoch": 0.95, + "learning_rate": 1.48893831206836e-08, + "logits/chosen": -3.515562057495117, + "logits/rejected": -2.7844552993774414, + "logps/chosen": -224.0501708984375, + "logps/rejected": -258.92803955078125, + "loss": 0.4064, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6445354223251343, + "rewards/margins": 1.1457926034927368, + "rewards/rejected": -1.790328025817871, + "step": 8250 + }, + { + "epoch": 0.95, + "learning_rate": 1.4854266651059347e-08, + "logits/chosen": -3.609443426132202, + "logits/rejected": -3.56705904006958, + "logps/chosen": -140.5050048828125, + "logps/rejected": -265.04144287109375, + "loss": 0.185, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40054553747177124, + "rewards/margins": 3.872006416320801, + "rewards/rejected": -3.4714608192443848, + "step": 8251 + }, + { + "epoch": 0.95, + "learning_rate": 1.4819150181435093e-08, + "logits/chosen": -3.971944570541382, + "logits/rejected": -3.5025181770324707, + "logps/chosen": -244.74334716796875, + "logps/rejected": -254.66751098632812, + "loss": 0.5662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4298558533191681, + "rewards/margins": 1.6099966764450073, + "rewards/rejected": -2.0398526191711426, + "step": 8252 + }, + { + "epoch": 0.95, + "learning_rate": 1.478403371181084e-08, + "logits/chosen": -3.149454355239868, + "logits/rejected": -2.756873607635498, + "logps/chosen": -365.79693603515625, + "logps/rejected": -260.39788818359375, + "loss": 0.2832, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4141916334629059, + "rewards/margins": 1.5371320247650146, + "rewards/rejected": -1.1229404211044312, + "step": 8253 + }, + { + "epoch": 0.95, + "learning_rate": 1.4748917242186585e-08, + "logits/chosen": -2.5218136310577393, + "logits/rejected": -2.6424124240875244, + "logps/chosen": -379.86309814453125, + "logps/rejected": -305.38531494140625, + "loss": 0.5845, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5088021755218506, + "rewards/margins": 1.1483252048492432, + "rewards/rejected": -1.6571276187896729, + "step": 8254 + }, + { + "epoch": 0.95, + "learning_rate": 1.4713800772562333e-08, + "logits/chosen": -3.7103357315063477, + "logits/rejected": -3.9747860431671143, + "logps/chosen": -156.41409301757812, + "logps/rejected": -281.7643737792969, + "loss": 0.28, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007502416148781776, + "rewards/margins": 1.765620231628418, + "rewards/rejected": -1.7581177949905396, + "step": 8255 + }, + { + "epoch": 0.95, + "learning_rate": 1.4678684302938078e-08, + "logits/chosen": -3.572471857070923, + "logits/rejected": -3.675083637237549, + "logps/chosen": -308.5960388183594, + "logps/rejected": -331.8685302734375, + "loss": 0.3326, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03995952010154724, + "rewards/margins": 2.750368595123291, + "rewards/rejected": -2.710409164428711, + "step": 8256 + }, + { + "epoch": 0.95, + "learning_rate": 1.4643567833313822e-08, + "logits/chosen": -3.8916492462158203, + "logits/rejected": -3.590463638305664, + "logps/chosen": -421.23779296875, + "logps/rejected": -418.8205871582031, + "loss": 0.2585, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07219311594963074, + "rewards/margins": 3.73925518989563, + "rewards/rejected": -3.811448335647583, + "step": 8257 + }, + { + "epoch": 0.95, + "learning_rate": 1.460845136368957e-08, + "logits/chosen": -3.0521113872528076, + "logits/rejected": -2.9243834018707275, + "logps/chosen": -192.96823120117188, + "logps/rejected": -208.53640747070312, + "loss": 0.3646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40024542808532715, + "rewards/margins": 1.7781167030334473, + "rewards/rejected": -2.1783618927001953, + "step": 8258 + }, + { + "epoch": 0.95, + "learning_rate": 1.4573334894065315e-08, + "logits/chosen": -3.2460811138153076, + "logits/rejected": -3.2191967964172363, + "logps/chosen": -209.1927490234375, + "logps/rejected": -147.4943389892578, + "loss": 0.8816, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7279817461967468, + "rewards/margins": 0.7797703742980957, + "rewards/rejected": -1.5077519416809082, + "step": 8259 + }, + { + "epoch": 0.95, + "learning_rate": 1.4538218424441062e-08, + "logits/chosen": -2.7057747840881348, + "logits/rejected": -2.4894022941589355, + "logps/chosen": -275.1524963378906, + "logps/rejected": -209.80001831054688, + "loss": 0.3148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08709488809108734, + "rewards/margins": 1.449735164642334, + "rewards/rejected": -1.536830186843872, + "step": 8260 + }, + { + "epoch": 0.95, + "learning_rate": 1.4503101954816808e-08, + "logits/chosen": -2.748729705810547, + "logits/rejected": -2.8736796379089355, + "logps/chosen": -210.43716430664062, + "logps/rejected": -296.22357177734375, + "loss": 0.1464, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5601373314857483, + "rewards/margins": 3.757070302963257, + "rewards/rejected": -3.1969330310821533, + "step": 8261 + }, + { + "epoch": 0.95, + "learning_rate": 1.4467985485192553e-08, + "logits/chosen": -3.3517367839813232, + "logits/rejected": -3.4727675914764404, + "logps/chosen": -305.001953125, + "logps/rejected": -287.24066162109375, + "loss": 0.4119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3569161295890808, + "rewards/margins": 1.2706716060638428, + "rewards/rejected": -1.6275877952575684, + "step": 8262 + }, + { + "epoch": 0.95, + "learning_rate": 1.44328690155683e-08, + "logits/chosen": -2.910613536834717, + "logits/rejected": -3.02905011177063, + "logps/chosen": -169.02452087402344, + "logps/rejected": -266.8596496582031, + "loss": 0.3417, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.008430056273937225, + "rewards/margins": 1.4330143928527832, + "rewards/rejected": -1.4414445161819458, + "step": 8263 + }, + { + "epoch": 0.95, + "learning_rate": 1.4397752545944046e-08, + "logits/chosen": -3.5216925144195557, + "logits/rejected": -3.5592074394226074, + "logps/chosen": -128.70835876464844, + "logps/rejected": -168.03363037109375, + "loss": 0.8629, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1049141138792038, + "rewards/margins": 0.3693433403968811, + "rewards/rejected": -0.2644291818141937, + "step": 8264 + }, + { + "epoch": 0.95, + "learning_rate": 1.4362636076319793e-08, + "logits/chosen": -3.4989237785339355, + "logits/rejected": -3.4006950855255127, + "logps/chosen": -333.2518005371094, + "logps/rejected": -228.68106079101562, + "loss": 0.3657, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6153453588485718, + "rewards/margins": 2.057711601257324, + "rewards/rejected": -1.442366123199463, + "step": 8265 + }, + { + "epoch": 0.95, + "learning_rate": 1.4327519606695539e-08, + "logits/chosen": -3.5452423095703125, + "logits/rejected": -3.2625255584716797, + "logps/chosen": -268.45330810546875, + "logps/rejected": -294.48663330078125, + "loss": 0.2059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4794980585575104, + "rewards/margins": 3.1246089935302734, + "rewards/rejected": -2.645111083984375, + "step": 8266 + }, + { + "epoch": 0.95, + "learning_rate": 1.4292403137071286e-08, + "logits/chosen": -3.2553274631500244, + "logits/rejected": -3.2313218116760254, + "logps/chosen": -315.8193054199219, + "logps/rejected": -218.30810546875, + "loss": 0.3406, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2441224902868271, + "rewards/margins": 2.1272685527801514, + "rewards/rejected": -1.8831461668014526, + "step": 8267 + }, + { + "epoch": 0.95, + "learning_rate": 1.4257286667447032e-08, + "logits/chosen": -3.1642954349517822, + "logits/rejected": -3.3857221603393555, + "logps/chosen": -260.1934814453125, + "logps/rejected": -357.06890869140625, + "loss": 0.412, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41890451312065125, + "rewards/margins": 2.520261287689209, + "rewards/rejected": -2.9391658306121826, + "step": 8268 + }, + { + "epoch": 0.95, + "learning_rate": 1.4222170197822779e-08, + "logits/chosen": -2.790724277496338, + "logits/rejected": -3.0167055130004883, + "logps/chosen": -284.969482421875, + "logps/rejected": -282.53106689453125, + "loss": 0.2764, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5384602546691895, + "rewards/margins": 2.9212517738342285, + "rewards/rejected": -2.382791519165039, + "step": 8269 + }, + { + "epoch": 0.95, + "learning_rate": 1.4187053728198525e-08, + "logits/chosen": -3.781431198120117, + "logits/rejected": -4.054003715515137, + "logps/chosen": -115.28715515136719, + "logps/rejected": -165.31942749023438, + "loss": 0.253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6309633851051331, + "rewards/margins": 2.374691963195801, + "rewards/rejected": -1.743728518486023, + "step": 8270 + }, + { + "epoch": 0.95, + "learning_rate": 1.415193725857427e-08, + "logits/chosen": -3.2740638256073, + "logits/rejected": -3.4012904167175293, + "logps/chosen": -206.34042358398438, + "logps/rejected": -292.7893371582031, + "loss": 0.446, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6463313698768616, + "rewards/margins": 1.5236260890960693, + "rewards/rejected": -2.1699576377868652, + "step": 8271 + }, + { + "epoch": 0.95, + "learning_rate": 1.4116820788950018e-08, + "logits/chosen": -3.063417434692383, + "logits/rejected": -3.140129327774048, + "logps/chosen": -143.380615234375, + "logps/rejected": -207.98114013671875, + "loss": 0.3841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38790908455848694, + "rewards/margins": 1.7792935371398926, + "rewards/rejected": -2.1672024726867676, + "step": 8272 + }, + { + "epoch": 0.95, + "learning_rate": 1.4081704319325763e-08, + "logits/chosen": -2.428615093231201, + "logits/rejected": -2.602424144744873, + "logps/chosen": -485.16888427734375, + "logps/rejected": -377.37774658203125, + "loss": 0.3829, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3134302496910095, + "rewards/margins": 2.264782190322876, + "rewards/rejected": -1.9513520002365112, + "step": 8273 + }, + { + "epoch": 0.95, + "learning_rate": 1.404658784970151e-08, + "logits/chosen": -2.636791467666626, + "logits/rejected": -2.4307098388671875, + "logps/chosen": -153.61888122558594, + "logps/rejected": -244.98312377929688, + "loss": 0.4869, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005350708961486816, + "rewards/margins": 0.7849456667900085, + "rewards/rejected": -0.7795950174331665, + "step": 8274 + }, + { + "epoch": 0.95, + "learning_rate": 1.4011471380077256e-08, + "logits/chosen": -2.2900447845458984, + "logits/rejected": -2.266974925994873, + "logps/chosen": -125.2791519165039, + "logps/rejected": -223.76040649414062, + "loss": 0.4536, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07715275883674622, + "rewards/margins": 1.082786202430725, + "rewards/rejected": -1.159938931465149, + "step": 8275 + }, + { + "epoch": 0.95, + "learning_rate": 1.3976354910453003e-08, + "logits/chosen": -3.1984896659851074, + "logits/rejected": -3.1118476390838623, + "logps/chosen": -140.6183624267578, + "logps/rejected": -118.84224700927734, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.54518723487854, + "rewards/margins": 1.178221583366394, + "rewards/rejected": -0.633034348487854, + "step": 8276 + }, + { + "epoch": 0.95, + "learning_rate": 1.3941238440828749e-08, + "logits/chosen": -3.314821481704712, + "logits/rejected": -2.9306371212005615, + "logps/chosen": -212.38771057128906, + "logps/rejected": -163.811767578125, + "loss": 0.7723, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.583911120891571, + "rewards/margins": 0.9184215068817139, + "rewards/rejected": -1.5023326873779297, + "step": 8277 + }, + { + "epoch": 0.95, + "learning_rate": 1.3906121971204494e-08, + "logits/chosen": -2.8416147232055664, + "logits/rejected": -2.804090976715088, + "logps/chosen": -290.3372802734375, + "logps/rejected": -122.77336120605469, + "loss": 0.783, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23467686772346497, + "rewards/margins": 0.8077397346496582, + "rewards/rejected": -1.0424166917800903, + "step": 8278 + }, + { + "epoch": 0.95, + "learning_rate": 1.3871005501580242e-08, + "logits/chosen": -2.028742551803589, + "logits/rejected": -1.8264014720916748, + "logps/chosen": -275.3419494628906, + "logps/rejected": -289.8919982910156, + "loss": 0.2094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23118162155151367, + "rewards/margins": 2.138932466506958, + "rewards/rejected": -2.3701140880584717, + "step": 8279 + }, + { + "epoch": 0.95, + "learning_rate": 1.3835889031955987e-08, + "logits/chosen": -2.86384916305542, + "logits/rejected": -2.870595932006836, + "logps/chosen": -239.38780212402344, + "logps/rejected": -259.69989013671875, + "loss": 0.4389, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2971067428588867, + "rewards/margins": 1.757723331451416, + "rewards/rejected": -2.0548300743103027, + "step": 8280 + }, + { + "epoch": 0.95, + "learning_rate": 1.3800772562331733e-08, + "logits/chosen": -3.0230178833007812, + "logits/rejected": -2.7238361835479736, + "logps/chosen": -347.1412048339844, + "logps/rejected": -359.8379821777344, + "loss": 0.5643, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1111907958984375, + "rewards/margins": 0.7488512396812439, + "rewards/rejected": -0.6376605033874512, + "step": 8281 + }, + { + "epoch": 0.95, + "learning_rate": 1.3765656092707478e-08, + "logits/chosen": -2.8979930877685547, + "logits/rejected": -2.738325834274292, + "logps/chosen": -185.16453552246094, + "logps/rejected": -247.8653564453125, + "loss": 0.4805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1412678360939026, + "rewards/margins": 2.1107494831085205, + "rewards/rejected": -2.2520174980163574, + "step": 8282 + }, + { + "epoch": 0.95, + "learning_rate": 1.3730539623083224e-08, + "logits/chosen": -3.6049342155456543, + "logits/rejected": -3.4563145637512207, + "logps/chosen": -228.02239990234375, + "logps/rejected": -198.47314453125, + "loss": 0.3996, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.237548828125, + "rewards/margins": 1.7041387557983398, + "rewards/rejected": -1.9416875839233398, + "step": 8283 + }, + { + "epoch": 0.95, + "learning_rate": 1.3695423153458971e-08, + "logits/chosen": -3.3590359687805176, + "logits/rejected": -3.6160755157470703, + "logps/chosen": -180.7003936767578, + "logps/rejected": -231.48350524902344, + "loss": 0.3915, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018077760934829712, + "rewards/margins": 2.9970061779022217, + "rewards/rejected": -2.978928565979004, + "step": 8284 + }, + { + "epoch": 0.96, + "learning_rate": 1.3660306683834717e-08, + "logits/chosen": -3.661324977874756, + "logits/rejected": -3.793510675430298, + "logps/chosen": -207.0034942626953, + "logps/rejected": -245.124755859375, + "loss": 0.4738, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8031078577041626, + "rewards/margins": 1.8221588134765625, + "rewards/rejected": -2.6252665519714355, + "step": 8285 + }, + { + "epoch": 0.96, + "learning_rate": 1.3625190214210464e-08, + "logits/chosen": -3.059300422668457, + "logits/rejected": -3.1600863933563232, + "logps/chosen": -291.9253845214844, + "logps/rejected": -336.7073669433594, + "loss": 0.2281, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42295241355895996, + "rewards/margins": 2.7965152263641357, + "rewards/rejected": -2.373562812805176, + "step": 8286 + }, + { + "epoch": 0.96, + "learning_rate": 1.359007374458621e-08, + "logits/chosen": -3.0627851486206055, + "logits/rejected": -3.184109687805176, + "logps/chosen": -239.51913452148438, + "logps/rejected": -265.8860168457031, + "loss": 0.4993, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5561214685440063, + "rewards/margins": 2.0229475498199463, + "rewards/rejected": -1.466826319694519, + "step": 8287 + }, + { + "epoch": 0.96, + "learning_rate": 1.3554957274961957e-08, + "logits/chosen": -3.5015735626220703, + "logits/rejected": -3.482973098754883, + "logps/chosen": -356.20782470703125, + "logps/rejected": -387.2317810058594, + "loss": 0.2822, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10837945342063904, + "rewards/margins": 2.7391717433929443, + "rewards/rejected": -2.847551107406616, + "step": 8288 + }, + { + "epoch": 0.96, + "learning_rate": 1.3519840805337702e-08, + "logits/chosen": -3.9006190299987793, + "logits/rejected": -3.545283079147339, + "logps/chosen": -137.8027801513672, + "logps/rejected": -112.09077453613281, + "loss": 0.3349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10106566548347473, + "rewards/margins": 1.4865074157714844, + "rewards/rejected": -1.5875732898712158, + "step": 8289 + }, + { + "epoch": 0.96, + "learning_rate": 1.348472433571345e-08, + "logits/chosen": -3.612732410430908, + "logits/rejected": -3.5788369178771973, + "logps/chosen": -409.31854248046875, + "logps/rejected": -362.3607177734375, + "loss": 0.3418, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4367539882659912, + "rewards/margins": 2.784336805343628, + "rewards/rejected": -2.3475828170776367, + "step": 8290 + }, + { + "epoch": 0.96, + "learning_rate": 1.3449607866089195e-08, + "logits/chosen": -3.499504566192627, + "logits/rejected": -3.363041877746582, + "logps/chosen": -197.84814453125, + "logps/rejected": -271.4588623046875, + "loss": 0.3097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20790208876132965, + "rewards/margins": 2.007131338119507, + "rewards/rejected": -2.215033531188965, + "step": 8291 + }, + { + "epoch": 0.96, + "learning_rate": 1.341449139646494e-08, + "logits/chosen": -3.470132827758789, + "logits/rejected": -3.461972236633301, + "logps/chosen": -272.3686828613281, + "logps/rejected": -290.0356140136719, + "loss": 0.2732, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13070176541805267, + "rewards/margins": 1.9671165943145752, + "rewards/rejected": -1.8364146947860718, + "step": 8292 + }, + { + "epoch": 0.96, + "learning_rate": 1.3379374926840688e-08, + "logits/chosen": -3.9186582565307617, + "logits/rejected": -3.721738576889038, + "logps/chosen": -261.04840087890625, + "logps/rejected": -218.58486938476562, + "loss": 0.5936, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3003842532634735, + "rewards/margins": 1.6091973781585693, + "rewards/rejected": -1.9095815420150757, + "step": 8293 + }, + { + "epoch": 0.96, + "learning_rate": 1.3344258457216434e-08, + "logits/chosen": -3.291627883911133, + "logits/rejected": -3.216060161590576, + "logps/chosen": -293.1528015136719, + "logps/rejected": -197.66122436523438, + "loss": 0.7025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1359265297651291, + "rewards/margins": 1.0045456886291504, + "rewards/rejected": -1.1404720544815063, + "step": 8294 + }, + { + "epoch": 0.96, + "learning_rate": 1.330914198759218e-08, + "logits/chosen": -2.7195615768432617, + "logits/rejected": -2.7487967014312744, + "logps/chosen": -565.5466918945312, + "logps/rejected": -322.7362976074219, + "loss": 0.4274, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09694815427064896, + "rewards/margins": 0.8577335476875305, + "rewards/rejected": -0.9546816349029541, + "step": 8295 + }, + { + "epoch": 0.96, + "learning_rate": 1.3274025517967926e-08, + "logits/chosen": -3.1486330032348633, + "logits/rejected": -2.9082202911376953, + "logps/chosen": -167.2576904296875, + "logps/rejected": -240.68621826171875, + "loss": 0.8943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8826574087142944, + "rewards/margins": 0.7086774110794067, + "rewards/rejected": -1.5913348197937012, + "step": 8296 + }, + { + "epoch": 0.96, + "learning_rate": 1.3238909048343674e-08, + "logits/chosen": -3.296518325805664, + "logits/rejected": -3.0827856063842773, + "logps/chosen": -263.598876953125, + "logps/rejected": -175.9322052001953, + "loss": 0.2601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4610040783882141, + "rewards/margins": 1.9768823385238647, + "rewards/rejected": -1.5158780813217163, + "step": 8297 + }, + { + "epoch": 0.96, + "learning_rate": 1.320379257871942e-08, + "logits/chosen": -2.710878849029541, + "logits/rejected": -2.6290817260742188, + "logps/chosen": -426.05328369140625, + "logps/rejected": -350.56524658203125, + "loss": 0.5538, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14546924829483032, + "rewards/margins": 1.0578967332839966, + "rewards/rejected": -0.9124274849891663, + "step": 8298 + }, + { + "epoch": 0.96, + "learning_rate": 1.3168676109095165e-08, + "logits/chosen": -2.803089141845703, + "logits/rejected": -3.2952938079833984, + "logps/chosen": -103.19120788574219, + "logps/rejected": -247.6085205078125, + "loss": 0.277, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28170379996299744, + "rewards/margins": 2.675689697265625, + "rewards/rejected": -2.3939859867095947, + "step": 8299 + }, + { + "epoch": 0.96, + "learning_rate": 1.3133559639470912e-08, + "logits/chosen": -3.584475040435791, + "logits/rejected": -3.5499305725097656, + "logps/chosen": -400.31414794921875, + "logps/rejected": -341.586669921875, + "loss": 0.2138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13942933082580566, + "rewards/margins": 2.5025081634521484, + "rewards/rejected": -2.641937494277954, + "step": 8300 + }, + { + "epoch": 0.96, + "learning_rate": 1.3098443169846658e-08, + "logits/chosen": -2.8413326740264893, + "logits/rejected": -2.682149887084961, + "logps/chosen": -216.79803466796875, + "logps/rejected": -309.5220642089844, + "loss": 0.2351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0944717526435852, + "rewards/margins": 2.5069546699523926, + "rewards/rejected": -2.601426601409912, + "step": 8301 + }, + { + "epoch": 0.96, + "learning_rate": 1.3063326700222405e-08, + "logits/chosen": -3.6985836029052734, + "logits/rejected": -3.6557083129882812, + "logps/chosen": -69.42398834228516, + "logps/rejected": -100.91555786132812, + "loss": 0.4295, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07189862430095673, + "rewards/margins": 0.9814072847366333, + "rewards/rejected": -0.9095085859298706, + "step": 8302 + }, + { + "epoch": 0.96, + "learning_rate": 1.302821023059815e-08, + "logits/chosen": -3.2567996978759766, + "logits/rejected": -3.0298914909362793, + "logps/chosen": -191.20367431640625, + "logps/rejected": -272.88104248046875, + "loss": 0.1055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4948158264160156, + "rewards/margins": 3.167259693145752, + "rewards/rejected": -3.6620755195617676, + "step": 8303 + }, + { + "epoch": 0.96, + "learning_rate": 1.2993093760973898e-08, + "logits/chosen": -3.045347213745117, + "logits/rejected": -3.3705902099609375, + "logps/chosen": -146.20303344726562, + "logps/rejected": -318.67999267578125, + "loss": 0.2656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06272351741790771, + "rewards/margins": 2.9151148796081543, + "rewards/rejected": -2.9778380393981934, + "step": 8304 + }, + { + "epoch": 0.96, + "learning_rate": 1.2957977291349642e-08, + "logits/chosen": -3.063415050506592, + "logits/rejected": -3.241237163543701, + "logps/chosen": -277.8468933105469, + "logps/rejected": -174.09793090820312, + "loss": 0.5373, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5477906465530396, + "rewards/margins": 0.6145612001419067, + "rewards/rejected": -1.1623518466949463, + "step": 8305 + }, + { + "epoch": 0.96, + "learning_rate": 1.2922860821725387e-08, + "logits/chosen": -3.0128707885742188, + "logits/rejected": -3.206698179244995, + "logps/chosen": -138.36163330078125, + "logps/rejected": -272.8291015625, + "loss": 0.4451, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18411535024642944, + "rewards/margins": 2.2222652435302734, + "rewards/rejected": -2.4063804149627686, + "step": 8306 + }, + { + "epoch": 0.96, + "learning_rate": 1.2887744352101134e-08, + "logits/chosen": -3.059342384338379, + "logits/rejected": -3.111412286758423, + "logps/chosen": -124.45681762695312, + "logps/rejected": -197.85507202148438, + "loss": 0.3832, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37650084495544434, + "rewards/margins": 1.5740798711776733, + "rewards/rejected": -1.9505807161331177, + "step": 8307 + }, + { + "epoch": 0.96, + "learning_rate": 1.285262788247688e-08, + "logits/chosen": -3.0233023166656494, + "logits/rejected": -2.9006476402282715, + "logps/chosen": -239.59161376953125, + "logps/rejected": -383.79150390625, + "loss": 0.2248, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2367362231016159, + "rewards/margins": 3.467280626296997, + "rewards/rejected": -3.230544328689575, + "step": 8308 + }, + { + "epoch": 0.96, + "learning_rate": 1.2817511412852627e-08, + "logits/chosen": -3.470857620239258, + "logits/rejected": -3.424672842025757, + "logps/chosen": -210.60536193847656, + "logps/rejected": -171.46722412109375, + "loss": 0.331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15453150868415833, + "rewards/margins": 1.0666377544403076, + "rewards/rejected": -1.2211692333221436, + "step": 8309 + }, + { + "epoch": 0.96, + "learning_rate": 1.2782394943228373e-08, + "logits/chosen": -2.7635385990142822, + "logits/rejected": -2.736882448196411, + "logps/chosen": -296.8058776855469, + "logps/rejected": -227.1547088623047, + "loss": 0.2848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2156868875026703, + "rewards/margins": 1.8027162551879883, + "rewards/rejected": -2.0184030532836914, + "step": 8310 + }, + { + "epoch": 0.96, + "learning_rate": 1.274727847360412e-08, + "logits/chosen": -2.369112730026245, + "logits/rejected": -2.338441848754883, + "logps/chosen": -425.47381591796875, + "logps/rejected": -286.86572265625, + "loss": 0.4302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0032943710684776306, + "rewards/margins": 1.1478815078735352, + "rewards/rejected": -1.1511757373809814, + "step": 8311 + }, + { + "epoch": 0.96, + "learning_rate": 1.2712162003979866e-08, + "logits/chosen": -3.441132068634033, + "logits/rejected": -3.3137896060943604, + "logps/chosen": -318.3377380371094, + "logps/rejected": -438.000732421875, + "loss": 0.1665, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08224048465490341, + "rewards/margins": 3.488095283508301, + "rewards/rejected": -3.4058547019958496, + "step": 8312 + }, + { + "epoch": 0.96, + "learning_rate": 1.2677045534355611e-08, + "logits/chosen": -4.161600112915039, + "logits/rejected": -3.9250497817993164, + "logps/chosen": -161.736572265625, + "logps/rejected": -188.98956298828125, + "loss": 0.1127, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3939509689807892, + "rewards/margins": 3.1528501510620117, + "rewards/rejected": -2.758898973464966, + "step": 8313 + }, + { + "epoch": 0.96, + "learning_rate": 1.2641929064731358e-08, + "logits/chosen": -3.1986632347106934, + "logits/rejected": -2.898909330368042, + "logps/chosen": -221.6466064453125, + "logps/rejected": -174.53518676757812, + "loss": 0.3667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3208311200141907, + "rewards/margins": 1.4786393642425537, + "rewards/rejected": -1.7994705438613892, + "step": 8314 + }, + { + "epoch": 0.96, + "learning_rate": 1.2606812595107104e-08, + "logits/chosen": -3.0905911922454834, + "logits/rejected": -3.028481960296631, + "logps/chosen": -213.78160095214844, + "logps/rejected": -174.30029296875, + "loss": 0.5756, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017976611852645874, + "rewards/margins": 0.6575988531112671, + "rewards/rejected": -0.6396222710609436, + "step": 8315 + }, + { + "epoch": 0.96, + "learning_rate": 1.2571696125482851e-08, + "logits/chosen": -3.1392083168029785, + "logits/rejected": -2.5463080406188965, + "logps/chosen": -284.0442199707031, + "logps/rejected": -200.0133056640625, + "loss": 0.3426, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.029979363083839417, + "rewards/margins": 2.201843500137329, + "rewards/rejected": -2.231822967529297, + "step": 8316 + }, + { + "epoch": 0.96, + "learning_rate": 1.2536579655858597e-08, + "logits/chosen": -3.2021946907043457, + "logits/rejected": -3.441253662109375, + "logps/chosen": -319.3117980957031, + "logps/rejected": -217.3712921142578, + "loss": 0.317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35259026288986206, + "rewards/margins": 1.7494137287139893, + "rewards/rejected": -2.102004051208496, + "step": 8317 + }, + { + "epoch": 0.96, + "learning_rate": 1.2501463186234344e-08, + "logits/chosen": -3.041901111602783, + "logits/rejected": -3.025028944015503, + "logps/chosen": -350.5223693847656, + "logps/rejected": -336.04461669921875, + "loss": 0.3742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4548894166946411, + "rewards/margins": 1.636356234550476, + "rewards/rejected": -2.091245651245117, + "step": 8318 + }, + { + "epoch": 0.96, + "learning_rate": 1.246634671661009e-08, + "logits/chosen": -2.7568929195404053, + "logits/rejected": -2.3521616458892822, + "logps/chosen": -406.0611267089844, + "logps/rejected": -367.01959228515625, + "loss": 0.273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9415393471717834, + "rewards/margins": 1.5403416156768799, + "rewards/rejected": -2.4818809032440186, + "step": 8319 + }, + { + "epoch": 0.96, + "learning_rate": 1.2431230246985837e-08, + "logits/chosen": -3.176500082015991, + "logits/rejected": -2.8431355953216553, + "logps/chosen": -277.8806457519531, + "logps/rejected": -300.62255859375, + "loss": 0.2113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9580151438713074, + "rewards/margins": 1.8921281099319458, + "rewards/rejected": -0.9341130256652832, + "step": 8320 + }, + { + "epoch": 0.96, + "learning_rate": 1.2396113777361583e-08, + "logits/chosen": -2.654128074645996, + "logits/rejected": -2.975630044937134, + "logps/chosen": -203.5262451171875, + "logps/rejected": -349.23101806640625, + "loss": 0.3084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13941937685012817, + "rewards/margins": 1.5806442499160767, + "rewards/rejected": -1.7200636863708496, + "step": 8321 + }, + { + "epoch": 0.96, + "learning_rate": 1.2360997307737328e-08, + "logits/chosen": -3.2189950942993164, + "logits/rejected": -3.1958065032958984, + "logps/chosen": -440.855224609375, + "logps/rejected": -236.7337646484375, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12371267378330231, + "rewards/margins": 1.5814650058746338, + "rewards/rejected": -1.705177664756775, + "step": 8322 + }, + { + "epoch": 0.96, + "learning_rate": 1.2325880838113075e-08, + "logits/chosen": -3.2147018909454346, + "logits/rejected": -3.2391600608825684, + "logps/chosen": -179.22091674804688, + "logps/rejected": -287.8453674316406, + "loss": 0.2351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.134456068277359, + "rewards/margins": 2.670729637145996, + "rewards/rejected": -2.805185556411743, + "step": 8323 + }, + { + "epoch": 0.96, + "learning_rate": 1.2290764368488821e-08, + "logits/chosen": -3.6570916175842285, + "logits/rejected": -3.3298940658569336, + "logps/chosen": -447.36474609375, + "logps/rejected": -247.44729614257812, + "loss": 0.824, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.151455283164978, + "rewards/margins": 0.45083773136138916, + "rewards/rejected": -1.6022930145263672, + "step": 8324 + }, + { + "epoch": 0.96, + "learning_rate": 1.2255647898864568e-08, + "logits/chosen": -2.811570644378662, + "logits/rejected": -2.5945653915405273, + "logps/chosen": -232.38682556152344, + "logps/rejected": -239.2474365234375, + "loss": 0.3294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06838984787464142, + "rewards/margins": 1.4631593227386475, + "rewards/rejected": -1.531549096107483, + "step": 8325 + }, + { + "epoch": 0.96, + "learning_rate": 1.2220531429240314e-08, + "logits/chosen": -2.777160167694092, + "logits/rejected": -3.104778289794922, + "logps/chosen": -231.21397399902344, + "logps/rejected": -179.45834350585938, + "loss": 0.4056, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22073610126972198, + "rewards/margins": 1.982387900352478, + "rewards/rejected": -2.2031240463256836, + "step": 8326 + }, + { + "epoch": 0.96, + "learning_rate": 1.2185414959616061e-08, + "logits/chosen": -3.577101230621338, + "logits/rejected": -3.4305624961853027, + "logps/chosen": -349.3119201660156, + "logps/rejected": -312.13336181640625, + "loss": 0.2981, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19576379656791687, + "rewards/margins": 2.8408281803131104, + "rewards/rejected": -2.64506459236145, + "step": 8327 + }, + { + "epoch": 0.96, + "learning_rate": 1.2150298489991805e-08, + "logits/chosen": -2.835967540740967, + "logits/rejected": -2.773076057434082, + "logps/chosen": -263.3528747558594, + "logps/rejected": -239.9281005859375, + "loss": 0.4249, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3023580014705658, + "rewards/margins": 1.5369768142700195, + "rewards/rejected": -1.8393347263336182, + "step": 8328 + }, + { + "epoch": 0.96, + "learning_rate": 1.211518202036755e-08, + "logits/chosen": -2.464341640472412, + "logits/rejected": -2.4652998447418213, + "logps/chosen": -247.4571533203125, + "logps/rejected": -213.1072998046875, + "loss": 0.3355, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08892417699098587, + "rewards/margins": 1.4855237007141113, + "rewards/rejected": -1.3965994119644165, + "step": 8329 + }, + { + "epoch": 0.96, + "learning_rate": 1.2080065550743298e-08, + "logits/chosen": -2.708432912826538, + "logits/rejected": -2.8787789344787598, + "logps/chosen": -282.4986877441406, + "logps/rejected": -307.5788879394531, + "loss": 0.2318, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16228751838207245, + "rewards/margins": 2.3172192573547363, + "rewards/rejected": -2.1549320220947266, + "step": 8330 + }, + { + "epoch": 0.96, + "learning_rate": 1.2044949081119043e-08, + "logits/chosen": -2.7748122215270996, + "logits/rejected": -2.8240559101104736, + "logps/chosen": -340.56878662109375, + "logps/rejected": -259.9059753417969, + "loss": 0.2616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26199257373809814, + "rewards/margins": 2.4334537982940674, + "rewards/rejected": -2.695446252822876, + "step": 8331 + }, + { + "epoch": 0.96, + "learning_rate": 1.200983261149479e-08, + "logits/chosen": -3.7430453300476074, + "logits/rejected": -3.0116302967071533, + "logps/chosen": -278.8753967285156, + "logps/rejected": -256.9015197753906, + "loss": 0.081, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4746891260147095, + "rewards/margins": 3.7006633281707764, + "rewards/rejected": -3.2259743213653564, + "step": 8332 + }, + { + "epoch": 0.96, + "learning_rate": 1.1974716141870536e-08, + "logits/chosen": -2.8136937618255615, + "logits/rejected": -2.932187557220459, + "logps/chosen": -193.53054809570312, + "logps/rejected": -277.6652526855469, + "loss": 0.8477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5571325421333313, + "rewards/margins": 0.7924224734306335, + "rewards/rejected": -1.3495550155639648, + "step": 8333 + }, + { + "epoch": 0.96, + "learning_rate": 1.1939599672246282e-08, + "logits/chosen": -3.102844715118408, + "logits/rejected": -2.9656200408935547, + "logps/chosen": -176.94561767578125, + "logps/rejected": -175.3705596923828, + "loss": 0.5596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.001324981451034546, + "rewards/margins": 1.0902819633483887, + "rewards/rejected": -1.091606855392456, + "step": 8334 + }, + { + "epoch": 0.96, + "learning_rate": 1.1904483202622029e-08, + "logits/chosen": -3.206015110015869, + "logits/rejected": -3.195622444152832, + "logps/chosen": -317.5715637207031, + "logps/rejected": -315.10809326171875, + "loss": 0.4295, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36299842596054077, + "rewards/margins": 3.121387481689453, + "rewards/rejected": -2.7583889961242676, + "step": 8335 + }, + { + "epoch": 0.96, + "learning_rate": 1.1869366732997775e-08, + "logits/chosen": -2.3920087814331055, + "logits/rejected": -2.4658753871917725, + "logps/chosen": -363.11676025390625, + "logps/rejected": -336.82537841796875, + "loss": 0.4258, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35463541746139526, + "rewards/margins": 0.9706010818481445, + "rewards/rejected": -1.325236439704895, + "step": 8336 + }, + { + "epoch": 0.96, + "learning_rate": 1.1834250263373522e-08, + "logits/chosen": -3.0141780376434326, + "logits/rejected": -3.168233633041382, + "logps/chosen": -174.62091064453125, + "logps/rejected": -146.85549926757812, + "loss": 0.3996, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13578039407730103, + "rewards/margins": 1.2334873676300049, + "rewards/rejected": -1.3692677021026611, + "step": 8337 + }, + { + "epoch": 0.96, + "learning_rate": 1.1799133793749267e-08, + "logits/chosen": -3.2313952445983887, + "logits/rejected": -2.9742326736450195, + "logps/chosen": -258.1947021484375, + "logps/rejected": -216.05862426757812, + "loss": 0.308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17683231830596924, + "rewards/margins": 1.7335424423217773, + "rewards/rejected": -1.9103747606277466, + "step": 8338 + }, + { + "epoch": 0.96, + "learning_rate": 1.1764017324125015e-08, + "logits/chosen": -2.680307388305664, + "logits/rejected": -2.6267452239990234, + "logps/chosen": -322.4468688964844, + "logps/rejected": -247.83578491210938, + "loss": 0.6144, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34582751989364624, + "rewards/margins": 1.000154972076416, + "rewards/rejected": -1.345982551574707, + "step": 8339 + }, + { + "epoch": 0.96, + "learning_rate": 1.172890085450076e-08, + "logits/chosen": -2.6219286918640137, + "logits/rejected": -2.457559585571289, + "logps/chosen": -348.919921875, + "logps/rejected": -393.7067565917969, + "loss": 0.4312, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3196490406990051, + "rewards/margins": 1.2903155088424683, + "rewards/rejected": -0.9706665277481079, + "step": 8340 + }, + { + "epoch": 0.96, + "learning_rate": 1.1693784384876507e-08, + "logits/chosen": -3.399691104888916, + "logits/rejected": -3.024003028869629, + "logps/chosen": -279.5122985839844, + "logps/rejected": -320.81689453125, + "loss": 0.7154, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9422402381896973, + "rewards/margins": 0.8962936997413635, + "rewards/rejected": -1.8385341167449951, + "step": 8341 + }, + { + "epoch": 0.96, + "learning_rate": 1.1658667915252253e-08, + "logits/chosen": -3.774261474609375, + "logits/rejected": -3.656068801879883, + "logps/chosen": -306.2177734375, + "logps/rejected": -232.972412109375, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10800585895776749, + "rewards/margins": 1.2873046398162842, + "rewards/rejected": -1.395310401916504, + "step": 8342 + }, + { + "epoch": 0.96, + "learning_rate": 1.1623551445627999e-08, + "logits/chosen": -3.7829084396362305, + "logits/rejected": -3.6244254112243652, + "logps/chosen": -315.57373046875, + "logps/rejected": -192.29306030273438, + "loss": 0.1364, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6309095025062561, + "rewards/margins": 2.854215145111084, + "rewards/rejected": -2.2233057022094727, + "step": 8343 + }, + { + "epoch": 0.96, + "learning_rate": 1.1588434976003746e-08, + "logits/chosen": -3.070521831512451, + "logits/rejected": -3.0306851863861084, + "logps/chosen": -246.73489379882812, + "logps/rejected": -369.80255126953125, + "loss": 0.2053, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4843379259109497, + "rewards/margins": 3.3994319438934326, + "rewards/rejected": -2.9150938987731934, + "step": 8344 + }, + { + "epoch": 0.96, + "learning_rate": 1.1553318506379491e-08, + "logits/chosen": -3.410555362701416, + "logits/rejected": -3.197970390319824, + "logps/chosen": -193.8603973388672, + "logps/rejected": -261.99444580078125, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2860303223133087, + "rewards/margins": 2.784748077392578, + "rewards/rejected": -2.498717784881592, + "step": 8345 + }, + { + "epoch": 0.96, + "learning_rate": 1.1518202036755239e-08, + "logits/chosen": -3.625643253326416, + "logits/rejected": -3.4338743686676025, + "logps/chosen": -353.9469299316406, + "logps/rejected": -293.3583068847656, + "loss": 0.4205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3488956093788147, + "rewards/margins": 1.209123969078064, + "rewards/rejected": -1.5580196380615234, + "step": 8346 + }, + { + "epoch": 0.96, + "learning_rate": 1.1483085567130984e-08, + "logits/chosen": -2.511627674102783, + "logits/rejected": -2.481801986694336, + "logps/chosen": -365.9727478027344, + "logps/rejected": -220.04505920410156, + "loss": 0.337, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.231556698679924, + "rewards/margins": 1.361771583557129, + "rewards/rejected": -1.1302149295806885, + "step": 8347 + }, + { + "epoch": 0.96, + "learning_rate": 1.1447969097506731e-08, + "logits/chosen": -4.0744781494140625, + "logits/rejected": -3.7220377922058105, + "logps/chosen": -269.595947265625, + "logps/rejected": -299.44085693359375, + "loss": 0.1262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20033805072307587, + "rewards/margins": 2.318641424179077, + "rewards/rejected": -2.1183032989501953, + "step": 8348 + }, + { + "epoch": 0.96, + "learning_rate": 1.1412852627882477e-08, + "logits/chosen": -3.3912301063537598, + "logits/rejected": -3.3960955142974854, + "logps/chosen": -148.5876922607422, + "logps/rejected": -186.01873779296875, + "loss": 0.3327, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19692040979862213, + "rewards/margins": 2.4906721115112305, + "rewards/rejected": -2.2937517166137695, + "step": 8349 + }, + { + "epoch": 0.96, + "learning_rate": 1.1377736158258224e-08, + "logits/chosen": -3.225585699081421, + "logits/rejected": -2.8741447925567627, + "logps/chosen": -248.22134399414062, + "logps/rejected": -305.0395812988281, + "loss": 0.1644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1085050106048584, + "rewards/margins": 2.3262534141540527, + "rewards/rejected": -2.434758424758911, + "step": 8350 + }, + { + "epoch": 0.96, + "learning_rate": 1.134261968863397e-08, + "logits/chosen": -3.4938855171203613, + "logits/rejected": -3.4918696880340576, + "logps/chosen": -244.40086364746094, + "logps/rejected": -293.53936767578125, + "loss": 0.2739, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8410211801528931, + "rewards/margins": 2.875941514968872, + "rewards/rejected": -2.0349202156066895, + "step": 8351 + }, + { + "epoch": 0.96, + "learning_rate": 1.1307503219009714e-08, + "logits/chosen": -3.9203619956970215, + "logits/rejected": -3.691798210144043, + "logps/chosen": -342.06585693359375, + "logps/rejected": -302.5403747558594, + "loss": 0.327, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32267117500305176, + "rewards/margins": 1.7062797546386719, + "rewards/rejected": -1.3836085796356201, + "step": 8352 + }, + { + "epoch": 0.96, + "learning_rate": 1.1272386749385461e-08, + "logits/chosen": -3.4015979766845703, + "logits/rejected": -3.3749566078186035, + "logps/chosen": -235.3915252685547, + "logps/rejected": -237.5419464111328, + "loss": 0.7968, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13905665278434753, + "rewards/margins": 0.7795342803001404, + "rewards/rejected": -0.6404776573181152, + "step": 8353 + }, + { + "epoch": 0.96, + "learning_rate": 1.1237270279761207e-08, + "logits/chosen": -3.5383455753326416, + "logits/rejected": -3.5388622283935547, + "logps/chosen": -347.6019287109375, + "logps/rejected": -255.0184783935547, + "loss": 0.4122, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.306110680103302, + "rewards/margins": 1.0887365341186523, + "rewards/rejected": -1.3948472738265991, + "step": 8354 + }, + { + "epoch": 0.96, + "learning_rate": 1.1202153810136952e-08, + "logits/chosen": -3.7726616859436035, + "logits/rejected": -3.780094623565674, + "logps/chosen": -336.2822265625, + "logps/rejected": -294.748046875, + "loss": 0.2887, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22648678719997406, + "rewards/margins": 2.163719892501831, + "rewards/rejected": -1.9372328519821167, + "step": 8355 + }, + { + "epoch": 0.96, + "learning_rate": 1.11670373405127e-08, + "logits/chosen": -3.0075392723083496, + "logits/rejected": -2.889493703842163, + "logps/chosen": -204.1392822265625, + "logps/rejected": -262.16455078125, + "loss": 0.264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10321972519159317, + "rewards/margins": 1.9211444854736328, + "rewards/rejected": -2.0243642330169678, + "step": 8356 + }, + { + "epoch": 0.96, + "learning_rate": 1.1131920870888445e-08, + "logits/chosen": -3.42621111869812, + "logits/rejected": -3.34328556060791, + "logps/chosen": -259.737548828125, + "logps/rejected": -293.0054016113281, + "loss": 0.1706, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3775869309902191, + "rewards/margins": 3.2843382358551025, + "rewards/rejected": -2.9067511558532715, + "step": 8357 + }, + { + "epoch": 0.96, + "learning_rate": 1.1096804401264192e-08, + "logits/chosen": -3.051287889480591, + "logits/rejected": -2.8661699295043945, + "logps/chosen": -206.09030151367188, + "logps/rejected": -262.78558349609375, + "loss": 0.5994, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2342820018529892, + "rewards/margins": 0.9157870411872864, + "rewards/rejected": -1.150068998336792, + "step": 8358 + }, + { + "epoch": 0.96, + "learning_rate": 1.1061687931639938e-08, + "logits/chosen": -2.573491096496582, + "logits/rejected": -2.6437511444091797, + "logps/chosen": -397.5391845703125, + "logps/rejected": -514.5775756835938, + "loss": 0.6022, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8454912900924683, + "rewards/margins": 1.334378719329834, + "rewards/rejected": -2.179870128631592, + "step": 8359 + }, + { + "epoch": 0.96, + "learning_rate": 1.1026571462015685e-08, + "logits/chosen": -3.6013875007629395, + "logits/rejected": -3.5567262172698975, + "logps/chosen": -368.9732666015625, + "logps/rejected": -291.4639892578125, + "loss": 0.6285, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5808574557304382, + "rewards/margins": 1.3060407638549805, + "rewards/rejected": -1.8868982791900635, + "step": 8360 + }, + { + "epoch": 0.96, + "learning_rate": 1.099145499239143e-08, + "logits/chosen": -3.1787893772125244, + "logits/rejected": -3.1560988426208496, + "logps/chosen": -130.888427734375, + "logps/rejected": -112.83800506591797, + "loss": 0.516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4592573046684265, + "rewards/margins": 0.6163191199302673, + "rewards/rejected": -1.0755764245986938, + "step": 8361 + }, + { + "epoch": 0.96, + "learning_rate": 1.0956338522767178e-08, + "logits/chosen": -3.6684792041778564, + "logits/rejected": -3.139749526977539, + "logps/chosen": -264.620361328125, + "logps/rejected": -331.8872985839844, + "loss": 0.4216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11644697189331055, + "rewards/margins": 1.5230236053466797, + "rewards/rejected": -1.6394708156585693, + "step": 8362 + }, + { + "epoch": 0.96, + "learning_rate": 1.0921222053142923e-08, + "logits/chosen": -2.487468719482422, + "logits/rejected": -2.2279136180877686, + "logps/chosen": -245.96463012695312, + "logps/rejected": -182.60719299316406, + "loss": 0.3208, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5240422487258911, + "rewards/margins": 1.7499158382415771, + "rewards/rejected": -1.2258737087249756, + "step": 8363 + }, + { + "epoch": 0.96, + "learning_rate": 1.0886105583518669e-08, + "logits/chosen": -3.7783703804016113, + "logits/rejected": -3.609898567199707, + "logps/chosen": -172.27752685546875, + "logps/rejected": -235.08804321289062, + "loss": 0.3084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15288373827934265, + "rewards/margins": 3.59371018409729, + "rewards/rejected": -3.746593713760376, + "step": 8364 + }, + { + "epoch": 0.96, + "learning_rate": 1.0850989113894416e-08, + "logits/chosen": -3.3761239051818848, + "logits/rejected": -3.313227653503418, + "logps/chosen": -193.20167541503906, + "logps/rejected": -186.2373046875, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11200673878192902, + "rewards/margins": 2.6004879474639893, + "rewards/rejected": -2.7124948501586914, + "step": 8365 + }, + { + "epoch": 0.96, + "learning_rate": 1.0815872644270162e-08, + "logits/chosen": -3.0463600158691406, + "logits/rejected": -3.4370076656341553, + "logps/chosen": -337.37091064453125, + "logps/rejected": -263.44781494140625, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2181866466999054, + "rewards/margins": 2.9932994842529297, + "rewards/rejected": -2.775113105773926, + "step": 8366 + }, + { + "epoch": 0.96, + "learning_rate": 1.0780756174645909e-08, + "logits/chosen": -3.1007347106933594, + "logits/rejected": -3.3043954372406006, + "logps/chosen": -233.0471954345703, + "logps/rejected": -326.2756042480469, + "loss": 0.4307, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47641295194625854, + "rewards/margins": 2.3825886249542236, + "rewards/rejected": -2.859001398086548, + "step": 8367 + }, + { + "epoch": 0.96, + "learning_rate": 1.0745639705021655e-08, + "logits/chosen": -3.105009078979492, + "logits/rejected": -3.1570119857788086, + "logps/chosen": -180.45680236816406, + "logps/rejected": -175.01531982421875, + "loss": 0.3782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33033281564712524, + "rewards/margins": 1.8355581760406494, + "rewards/rejected": -2.16589093208313, + "step": 8368 + }, + { + "epoch": 0.96, + "learning_rate": 1.0710523235397402e-08, + "logits/chosen": -3.549992561340332, + "logits/rejected": -3.5817532539367676, + "logps/chosen": -392.5830383300781, + "logps/rejected": -302.9142150878906, + "loss": 0.4062, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6391738653182983, + "rewards/margins": 1.4591518640518188, + "rewards/rejected": -2.098325729370117, + "step": 8369 + }, + { + "epoch": 0.96, + "learning_rate": 1.0675406765773148e-08, + "logits/chosen": -3.0783042907714844, + "logits/rejected": -2.7937545776367188, + "logps/chosen": -311.4524230957031, + "logps/rejected": -205.97613525390625, + "loss": 0.351, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34792596101760864, + "rewards/margins": 1.4021936655044556, + "rewards/rejected": -1.0542676448822021, + "step": 8370 + }, + { + "epoch": 0.97, + "learning_rate": 1.0640290296148895e-08, + "logits/chosen": -2.605991840362549, + "logits/rejected": -2.6813442707061768, + "logps/chosen": -238.73785400390625, + "logps/rejected": -432.681884765625, + "loss": 0.3365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24159081280231476, + "rewards/margins": 1.2608883380889893, + "rewards/rejected": -1.5024791955947876, + "step": 8371 + }, + { + "epoch": 0.97, + "learning_rate": 1.060517382652464e-08, + "logits/chosen": -2.108330249786377, + "logits/rejected": -2.2844552993774414, + "logps/chosen": -233.19692993164062, + "logps/rejected": -173.91851806640625, + "loss": 0.5369, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5502626299858093, + "rewards/margins": 1.0513514280319214, + "rewards/rejected": -1.6016141176223755, + "step": 8372 + }, + { + "epoch": 0.97, + "learning_rate": 1.0570057356900386e-08, + "logits/chosen": -3.068117618560791, + "logits/rejected": -3.18377947807312, + "logps/chosen": -229.62010192871094, + "logps/rejected": -221.99391174316406, + "loss": 0.4541, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0775289461016655, + "rewards/margins": 1.1989206075668335, + "rewards/rejected": -1.1213915348052979, + "step": 8373 + }, + { + "epoch": 0.97, + "learning_rate": 1.0534940887276133e-08, + "logits/chosen": -3.1026909351348877, + "logits/rejected": -2.8794331550598145, + "logps/chosen": -247.34310913085938, + "logps/rejected": -196.1335906982422, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20142866671085358, + "rewards/margins": 1.7865043878555298, + "rewards/rejected": -1.9879331588745117, + "step": 8374 + }, + { + "epoch": 0.97, + "learning_rate": 1.0499824417651879e-08, + "logits/chosen": -3.6565980911254883, + "logits/rejected": -3.838531970977783, + "logps/chosen": -185.8131103515625, + "logps/rejected": -214.59463500976562, + "loss": 0.2687, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14594735205173492, + "rewards/margins": 2.695552349090576, + "rewards/rejected": -2.8414995670318604, + "step": 8375 + }, + { + "epoch": 0.97, + "learning_rate": 1.0464707948027623e-08, + "logits/chosen": -2.5613412857055664, + "logits/rejected": -2.4179847240448, + "logps/chosen": -243.0853271484375, + "logps/rejected": -305.832275390625, + "loss": 0.4106, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0027925558388233185, + "rewards/margins": 1.4293853044509888, + "rewards/rejected": -1.4265927076339722, + "step": 8376 + }, + { + "epoch": 0.97, + "learning_rate": 1.042959147840337e-08, + "logits/chosen": -3.6622071266174316, + "logits/rejected": -3.0762760639190674, + "logps/chosen": -273.20440673828125, + "logps/rejected": -298.4071960449219, + "loss": 0.2073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29549628496170044, + "rewards/margins": 1.886146068572998, + "rewards/rejected": -1.5906498432159424, + "step": 8377 + }, + { + "epoch": 0.97, + "learning_rate": 1.0394475008779116e-08, + "logits/chosen": -3.62972354888916, + "logits/rejected": -3.817674160003662, + "logps/chosen": -198.19970703125, + "logps/rejected": -224.87962341308594, + "loss": 0.4837, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22127294540405273, + "rewards/margins": 0.6773828864097595, + "rewards/rejected": -0.8986558318138123, + "step": 8378 + }, + { + "epoch": 0.97, + "learning_rate": 1.0359358539154863e-08, + "logits/chosen": -2.6171622276306152, + "logits/rejected": -2.8154525756835938, + "logps/chosen": -308.27154541015625, + "logps/rejected": -436.47216796875, + "loss": 0.6863, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27204954624176025, + "rewards/margins": 1.6139622926712036, + "rewards/rejected": -1.8860119581222534, + "step": 8379 + }, + { + "epoch": 0.97, + "learning_rate": 1.0324242069530608e-08, + "logits/chosen": -2.565946340560913, + "logits/rejected": -2.5145270824432373, + "logps/chosen": -326.9057922363281, + "logps/rejected": -359.2528381347656, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6061733961105347, + "rewards/margins": 1.484626293182373, + "rewards/rejected": -0.8784528970718384, + "step": 8380 + }, + { + "epoch": 0.97, + "learning_rate": 1.0289125599906356e-08, + "logits/chosen": -3.339923620223999, + "logits/rejected": -3.753171443939209, + "logps/chosen": -254.01025390625, + "logps/rejected": -231.7086639404297, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16549429297447205, + "rewards/margins": 1.8804755210876465, + "rewards/rejected": -1.714981198310852, + "step": 8381 + }, + { + "epoch": 0.97, + "learning_rate": 1.0254009130282101e-08, + "logits/chosen": -2.5662381649017334, + "logits/rejected": -2.845761299133301, + "logps/chosen": -504.08160400390625, + "logps/rejected": -428.8846435546875, + "loss": 0.1814, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3760469853878021, + "rewards/margins": 4.12640380859375, + "rewards/rejected": -3.750357151031494, + "step": 8382 + }, + { + "epoch": 0.97, + "learning_rate": 1.0218892660657848e-08, + "logits/chosen": -3.310889482498169, + "logits/rejected": -3.2768239974975586, + "logps/chosen": -116.40216827392578, + "logps/rejected": -250.23040771484375, + "loss": 0.21, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1205882728099823, + "rewards/margins": 3.0165646076202393, + "rewards/rejected": -3.137152910232544, + "step": 8383 + }, + { + "epoch": 0.97, + "learning_rate": 1.0183776191033594e-08, + "logits/chosen": -3.018362283706665, + "logits/rejected": -3.2070484161376953, + "logps/chosen": -318.85302734375, + "logps/rejected": -212.66879272460938, + "loss": 0.5901, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.039605528116226196, + "rewards/margins": 0.7976378202438354, + "rewards/rejected": -0.7580322623252869, + "step": 8384 + }, + { + "epoch": 0.97, + "learning_rate": 1.014865972140934e-08, + "logits/chosen": -3.3908605575561523, + "logits/rejected": -3.7795255184173584, + "logps/chosen": -236.3668212890625, + "logps/rejected": -167.8939208984375, + "loss": 0.3111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45229461789131165, + "rewards/margins": 1.2911782264709473, + "rewards/rejected": -1.7434728145599365, + "step": 8385 + }, + { + "epoch": 0.97, + "learning_rate": 1.0113543251785087e-08, + "logits/chosen": -3.527090072631836, + "logits/rejected": -3.62808895111084, + "logps/chosen": -273.10345458984375, + "logps/rejected": -232.60101318359375, + "loss": 0.1276, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10846251249313354, + "rewards/margins": 2.9995741844177246, + "rewards/rejected": -3.108036756515503, + "step": 8386 + }, + { + "epoch": 0.97, + "learning_rate": 1.0078426782160832e-08, + "logits/chosen": -2.821237802505493, + "logits/rejected": -2.914905309677124, + "logps/chosen": -161.9700164794922, + "logps/rejected": -293.0438232421875, + "loss": 0.3218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5734564065933228, + "rewards/margins": 2.190204381942749, + "rewards/rejected": -2.7636609077453613, + "step": 8387 + }, + { + "epoch": 0.97, + "learning_rate": 1.004331031253658e-08, + "logits/chosen": -2.2285447120666504, + "logits/rejected": -2.7544503211975098, + "logps/chosen": -330.267578125, + "logps/rejected": -201.34249877929688, + "loss": 0.2548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16118726134300232, + "rewards/margins": 1.6354562044143677, + "rewards/rejected": -1.7966433763504028, + "step": 8388 + }, + { + "epoch": 0.97, + "learning_rate": 1.0008193842912325e-08, + "logits/chosen": -3.134453296661377, + "logits/rejected": -3.1870687007904053, + "logps/chosen": -314.43218994140625, + "logps/rejected": -210.53684997558594, + "loss": 0.3773, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5068659782409668, + "rewards/margins": 1.5577776432037354, + "rewards/rejected": -2.064643621444702, + "step": 8389 + }, + { + "epoch": 0.97, + "learning_rate": 9.973077373288072e-09, + "logits/chosen": -2.7010555267333984, + "logits/rejected": -2.987222671508789, + "logps/chosen": -194.25979614257812, + "logps/rejected": -286.1780090332031, + "loss": 0.3275, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11472973227500916, + "rewards/margins": 3.7130191326141357, + "rewards/rejected": -3.5982892513275146, + "step": 8390 + }, + { + "epoch": 0.97, + "learning_rate": 9.937960903663818e-09, + "logits/chosen": -2.8756279945373535, + "logits/rejected": -3.021961212158203, + "logps/chosen": -377.89208984375, + "logps/rejected": -268.6658020019531, + "loss": 1.0754, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9096782803535461, + "rewards/margins": 0.09256502985954285, + "rewards/rejected": -1.0022432804107666, + "step": 8391 + }, + { + "epoch": 0.97, + "learning_rate": 9.902844434039565e-09, + "logits/chosen": -2.948106288909912, + "logits/rejected": -2.754967212677002, + "logps/chosen": -218.20574951171875, + "logps/rejected": -273.3581848144531, + "loss": 0.4206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5688002705574036, + "rewards/margins": 1.4077889919281006, + "rewards/rejected": -1.9765892028808594, + "step": 8392 + }, + { + "epoch": 0.97, + "learning_rate": 9.867727964415311e-09, + "logits/chosen": -3.4410223960876465, + "logits/rejected": -3.3055577278137207, + "logps/chosen": -180.4617462158203, + "logps/rejected": -159.66038513183594, + "loss": 0.2069, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4931867718696594, + "rewards/margins": 2.150625467300415, + "rewards/rejected": -1.6574387550354004, + "step": 8393 + }, + { + "epoch": 0.97, + "learning_rate": 9.832611494791056e-09, + "logits/chosen": -3.9050979614257812, + "logits/rejected": -3.380505084991455, + "logps/chosen": -276.4307556152344, + "logps/rejected": -315.4677734375, + "loss": 0.4282, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09048400819301605, + "rewards/margins": 1.5715667009353638, + "rewards/rejected": -1.4810826778411865, + "step": 8394 + }, + { + "epoch": 0.97, + "learning_rate": 9.797495025166804e-09, + "logits/chosen": -2.486135721206665, + "logits/rejected": -2.662494659423828, + "logps/chosen": -311.4579772949219, + "logps/rejected": -297.58282470703125, + "loss": 0.269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15604937076568604, + "rewards/margins": 2.055245876312256, + "rewards/rejected": -2.2112953662872314, + "step": 8395 + }, + { + "epoch": 0.97, + "learning_rate": 9.76237855554255e-09, + "logits/chosen": -3.2524023056030273, + "logits/rejected": -3.174048900604248, + "logps/chosen": -208.98583984375, + "logps/rejected": -204.2353515625, + "loss": 0.3854, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14812420308589935, + "rewards/margins": 1.4177249670028687, + "rewards/rejected": -1.5658491849899292, + "step": 8396 + }, + { + "epoch": 0.97, + "learning_rate": 9.727262085918296e-09, + "logits/chosen": -3.3746867179870605, + "logits/rejected": -3.54532790184021, + "logps/chosen": -327.8814392089844, + "logps/rejected": -311.01055908203125, + "loss": 0.3278, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3664657175540924, + "rewards/margins": 2.320497512817383, + "rewards/rejected": -1.9540317058563232, + "step": 8397 + }, + { + "epoch": 0.97, + "learning_rate": 9.692145616294042e-09, + "logits/chosen": -3.7393250465393066, + "logits/rejected": -3.021017074584961, + "logps/chosen": -347.60504150390625, + "logps/rejected": -177.91415405273438, + "loss": 0.2521, + "rewards/accuracies": 0.875, + "rewards/chosen": 5.211681127548218e-05, + "rewards/margins": 2.0167319774627686, + "rewards/rejected": -2.0166797637939453, + "step": 8398 + }, + { + "epoch": 0.97, + "learning_rate": 9.65702914666979e-09, + "logits/chosen": -2.635904550552368, + "logits/rejected": -2.4802560806274414, + "logps/chosen": -292.46533203125, + "logps/rejected": -302.8505554199219, + "loss": 1.06, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0918371677398682, + "rewards/margins": -0.45846885442733765, + "rewards/rejected": -0.6333682537078857, + "step": 8399 + }, + { + "epoch": 0.97, + "learning_rate": 9.621912677045533e-09, + "logits/chosen": -2.6441781520843506, + "logits/rejected": -2.659958839416504, + "logps/chosen": -166.8682098388672, + "logps/rejected": -215.74281311035156, + "loss": 0.3947, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2554742693901062, + "rewards/margins": 2.5519919395446777, + "rewards/rejected": -2.2965176105499268, + "step": 8400 + }, + { + "epoch": 0.97, + "learning_rate": 9.586796207421279e-09, + "logits/chosen": -2.9591426849365234, + "logits/rejected": -2.8547120094299316, + "logps/chosen": -318.00146484375, + "logps/rejected": -169.09852600097656, + "loss": 0.5803, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.013365454971790314, + "rewards/margins": 1.258200764656067, + "rewards/rejected": -1.271566390991211, + "step": 8401 + }, + { + "epoch": 0.97, + "learning_rate": 9.551679737797026e-09, + "logits/chosen": -2.523224115371704, + "logits/rejected": -2.5810837745666504, + "logps/chosen": -289.3690185546875, + "logps/rejected": -243.92283630371094, + "loss": 0.4409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5940326452255249, + "rewards/margins": 1.1500616073608398, + "rewards/rejected": -1.7440943717956543, + "step": 8402 + }, + { + "epoch": 0.97, + "learning_rate": 9.516563268172772e-09, + "logits/chosen": -3.3571534156799316, + "logits/rejected": -3.4374966621398926, + "logps/chosen": -407.5580749511719, + "logps/rejected": -347.18701171875, + "loss": 0.1713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3963181674480438, + "rewards/margins": 3.069068431854248, + "rewards/rejected": -2.672750234603882, + "step": 8403 + }, + { + "epoch": 0.97, + "learning_rate": 9.481446798548519e-09, + "logits/chosen": -3.0084073543548584, + "logits/rejected": -3.371002197265625, + "logps/chosen": -140.05503845214844, + "logps/rejected": -211.66207885742188, + "loss": 0.2052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4996350407600403, + "rewards/margins": 2.620623826980591, + "rewards/rejected": -2.1209888458251953, + "step": 8404 + }, + { + "epoch": 0.97, + "learning_rate": 9.446330328924264e-09, + "logits/chosen": -3.064945697784424, + "logits/rejected": -3.360159158706665, + "logps/chosen": -154.15371704101562, + "logps/rejected": -260.14703369140625, + "loss": 0.339, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.032532498240470886, + "rewards/margins": 2.2552030086517334, + "rewards/rejected": -2.222670555114746, + "step": 8405 + }, + { + "epoch": 0.97, + "learning_rate": 9.41121385930001e-09, + "logits/chosen": -3.442075252532959, + "logits/rejected": -3.4038009643554688, + "logps/chosen": -281.39080810546875, + "logps/rejected": -168.22610473632812, + "loss": 0.3338, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4891003668308258, + "rewards/margins": 1.7336540222167969, + "rewards/rejected": -1.244553804397583, + "step": 8406 + }, + { + "epoch": 0.97, + "learning_rate": 9.376097389675757e-09, + "logits/chosen": -2.974752902984619, + "logits/rejected": -2.8256354331970215, + "logps/chosen": -295.47662353515625, + "logps/rejected": -350.033935546875, + "loss": 0.4056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4749956727027893, + "rewards/margins": 1.2463704347610474, + "rewards/rejected": -1.721366047859192, + "step": 8407 + }, + { + "epoch": 0.97, + "learning_rate": 9.340980920051503e-09, + "logits/chosen": -3.551370143890381, + "logits/rejected": -3.638317823410034, + "logps/chosen": -275.5017395019531, + "logps/rejected": -251.56695556640625, + "loss": 0.3746, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04572046920657158, + "rewards/margins": 1.3908506631851196, + "rewards/rejected": -1.3451303243637085, + "step": 8408 + }, + { + "epoch": 0.97, + "learning_rate": 9.30586445042725e-09, + "logits/chosen": -3.1304540634155273, + "logits/rejected": -3.0111711025238037, + "logps/chosen": -158.97024536132812, + "logps/rejected": -252.0541534423828, + "loss": 0.6097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9647552967071533, + "rewards/margins": 0.9240675568580627, + "rewards/rejected": -1.8888227939605713, + "step": 8409 + }, + { + "epoch": 0.97, + "learning_rate": 9.270747980802996e-09, + "logits/chosen": -3.0113649368286133, + "logits/rejected": -3.0797882080078125, + "logps/chosen": -269.68304443359375, + "logps/rejected": -281.2015380859375, + "loss": 0.2866, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06046495586633682, + "rewards/margins": 1.7535547018051147, + "rewards/rejected": -1.8140195608139038, + "step": 8410 + }, + { + "epoch": 0.97, + "learning_rate": 9.235631511178743e-09, + "logits/chosen": -3.62265682220459, + "logits/rejected": -3.701540946960449, + "logps/chosen": -341.08233642578125, + "logps/rejected": -313.3612976074219, + "loss": 0.1735, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3652455508708954, + "rewards/margins": 2.9969892501831055, + "rewards/rejected": -2.631743907928467, + "step": 8411 + }, + { + "epoch": 0.97, + "learning_rate": 9.200515041554488e-09, + "logits/chosen": -1.7785189151763916, + "logits/rejected": -2.0118889808654785, + "logps/chosen": -262.4716796875, + "logps/rejected": -277.7813720703125, + "loss": 0.4971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2938191294670105, + "rewards/margins": 0.9168039560317993, + "rewards/rejected": -1.2106231451034546, + "step": 8412 + }, + { + "epoch": 0.97, + "learning_rate": 9.165398571930236e-09, + "logits/chosen": -3.9835095405578613, + "logits/rejected": -4.1817097663879395, + "logps/chosen": -284.0061950683594, + "logps/rejected": -239.89694213867188, + "loss": 0.2847, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5119163990020752, + "rewards/margins": 1.835291862487793, + "rewards/rejected": -2.3472084999084473, + "step": 8413 + }, + { + "epoch": 0.97, + "learning_rate": 9.130282102305981e-09, + "logits/chosen": -2.325190782546997, + "logits/rejected": -1.9487972259521484, + "logps/chosen": -489.46270751953125, + "logps/rejected": -327.79412841796875, + "loss": 0.4688, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30379363894462585, + "rewards/margins": 2.114424705505371, + "rewards/rejected": -2.4182186126708984, + "step": 8414 + }, + { + "epoch": 0.97, + "learning_rate": 9.095165632681727e-09, + "logits/chosen": -3.176107406616211, + "logits/rejected": -3.487386465072632, + "logps/chosen": -135.98165893554688, + "logps/rejected": -230.57981872558594, + "loss": 0.457, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7797970771789551, + "rewards/margins": 0.8574131727218628, + "rewards/rejected": -1.6372102499008179, + "step": 8415 + }, + { + "epoch": 0.97, + "learning_rate": 9.060049163057474e-09, + "logits/chosen": -3.7591748237609863, + "logits/rejected": -3.552121639251709, + "logps/chosen": -255.68797302246094, + "logps/rejected": -258.18408203125, + "loss": 0.3724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25968602299690247, + "rewards/margins": 0.9982882738113403, + "rewards/rejected": -1.25797438621521, + "step": 8416 + }, + { + "epoch": 0.97, + "learning_rate": 9.02493269343322e-09, + "logits/chosen": -2.9031543731689453, + "logits/rejected": -2.9792163372039795, + "logps/chosen": -148.63812255859375, + "logps/rejected": -240.86782836914062, + "loss": 0.4043, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38928067684173584, + "rewards/margins": 1.7886607646942139, + "rewards/rejected": -1.399380087852478, + "step": 8417 + }, + { + "epoch": 0.97, + "learning_rate": 8.989816223808965e-09, + "logits/chosen": -2.9580183029174805, + "logits/rejected": -3.0599515438079834, + "logps/chosen": -340.7004089355469, + "logps/rejected": -224.855224609375, + "loss": 0.4378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5318108797073364, + "rewards/margins": 1.2630248069763184, + "rewards/rejected": -1.7948358058929443, + "step": 8418 + }, + { + "epoch": 0.97, + "learning_rate": 8.954699754184713e-09, + "logits/chosen": -2.60532546043396, + "logits/rejected": -2.6548120975494385, + "logps/chosen": -206.2239227294922, + "logps/rejected": -362.374755859375, + "loss": 0.1975, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0584198459982872, + "rewards/margins": 2.5225110054016113, + "rewards/rejected": -2.4640913009643555, + "step": 8419 + }, + { + "epoch": 0.97, + "learning_rate": 8.919583284560458e-09, + "logits/chosen": -3.589961528778076, + "logits/rejected": -3.813948392868042, + "logps/chosen": -75.11849975585938, + "logps/rejected": -268.4548034667969, + "loss": 0.2946, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05391992628574371, + "rewards/margins": 3.491680860519409, + "rewards/rejected": -3.437760829925537, + "step": 8420 + }, + { + "epoch": 0.97, + "learning_rate": 8.884466814936204e-09, + "logits/chosen": -2.8860201835632324, + "logits/rejected": -3.0575625896453857, + "logps/chosen": -190.44007873535156, + "logps/rejected": -325.57757568359375, + "loss": 0.1628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1732223629951477, + "rewards/margins": 2.3098130226135254, + "rewards/rejected": -2.4830353260040283, + "step": 8421 + }, + { + "epoch": 0.97, + "learning_rate": 8.849350345311951e-09, + "logits/chosen": -3.5799875259399414, + "logits/rejected": -3.436499834060669, + "logps/chosen": -357.90582275390625, + "logps/rejected": -215.13235473632812, + "loss": 0.2429, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2094860076904297, + "rewards/margins": 2.6277050971984863, + "rewards/rejected": -2.837191104888916, + "step": 8422 + }, + { + "epoch": 0.97, + "learning_rate": 8.814233875687697e-09, + "logits/chosen": -3.130901336669922, + "logits/rejected": -3.100586414337158, + "logps/chosen": -250.59014892578125, + "logps/rejected": -249.2540740966797, + "loss": 0.5549, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2820245623588562, + "rewards/margins": 1.9295485019683838, + "rewards/rejected": -1.6475238800048828, + "step": 8423 + }, + { + "epoch": 0.97, + "learning_rate": 8.779117406063444e-09, + "logits/chosen": -2.908703327178955, + "logits/rejected": -2.729710102081299, + "logps/chosen": -170.94032287597656, + "logps/rejected": -181.90565490722656, + "loss": 0.8448, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7636530995368958, + "rewards/margins": 0.361050546169281, + "rewards/rejected": -1.1247036457061768, + "step": 8424 + }, + { + "epoch": 0.97, + "learning_rate": 8.74400093643919e-09, + "logits/chosen": -3.1324214935302734, + "logits/rejected": -3.304983139038086, + "logps/chosen": -183.11270141601562, + "logps/rejected": -142.7198486328125, + "loss": 0.5273, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011617705225944519, + "rewards/margins": 0.9910141229629517, + "rewards/rejected": -1.0026317834854126, + "step": 8425 + }, + { + "epoch": 0.97, + "learning_rate": 8.708884466814937e-09, + "logits/chosen": -3.3177943229675293, + "logits/rejected": -3.577139139175415, + "logps/chosen": -202.10000610351562, + "logps/rejected": -262.4162292480469, + "loss": 0.2588, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3413122892379761, + "rewards/margins": 1.8321030139923096, + "rewards/rejected": -1.490790605545044, + "step": 8426 + }, + { + "epoch": 0.97, + "learning_rate": 8.673767997190682e-09, + "logits/chosen": -3.231203556060791, + "logits/rejected": -2.677748680114746, + "logps/chosen": -658.9725341796875, + "logps/rejected": -233.56832885742188, + "loss": 0.2511, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40233314037323, + "rewards/margins": 2.0843820571899414, + "rewards/rejected": -2.486715316772461, + "step": 8427 + }, + { + "epoch": 0.97, + "learning_rate": 8.63865152756643e-09, + "logits/chosen": -3.0973427295684814, + "logits/rejected": -3.339567184448242, + "logps/chosen": -273.0887145996094, + "logps/rejected": -286.77691650390625, + "loss": 0.4199, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3005458414554596, + "rewards/margins": 1.4008681774139404, + "rewards/rejected": -1.7014141082763672, + "step": 8428 + }, + { + "epoch": 0.97, + "learning_rate": 8.603535057942175e-09, + "logits/chosen": -3.2679266929626465, + "logits/rejected": -3.3864011764526367, + "logps/chosen": -214.06427001953125, + "logps/rejected": -233.89022827148438, + "loss": 0.4141, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00703786313533783, + "rewards/margins": 1.214803695678711, + "rewards/rejected": -1.207765817642212, + "step": 8429 + }, + { + "epoch": 0.97, + "learning_rate": 8.56841858831792e-09, + "logits/chosen": -2.726724624633789, + "logits/rejected": -3.064340353012085, + "logps/chosen": -219.16258239746094, + "logps/rejected": -209.36106872558594, + "loss": 0.5378, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.177345871925354, + "rewards/margins": 1.3730950355529785, + "rewards/rejected": -1.550440788269043, + "step": 8430 + }, + { + "epoch": 0.97, + "learning_rate": 8.533302118693666e-09, + "logits/chosen": -2.5691819190979004, + "logits/rejected": -2.700509548187256, + "logps/chosen": -228.4883270263672, + "logps/rejected": -412.16900634765625, + "loss": 0.4476, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07041977345943451, + "rewards/margins": 2.6801564693450928, + "rewards/rejected": -2.609736680984497, + "step": 8431 + }, + { + "epoch": 0.97, + "learning_rate": 8.498185649069413e-09, + "logits/chosen": -2.973529815673828, + "logits/rejected": -3.201263904571533, + "logps/chosen": -225.58535766601562, + "logps/rejected": -221.8105926513672, + "loss": 0.3903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6797542572021484, + "rewards/margins": 1.602910041809082, + "rewards/rejected": -2.2826645374298096, + "step": 8432 + }, + { + "epoch": 0.97, + "learning_rate": 8.463069179445159e-09, + "logits/chosen": -2.921501636505127, + "logits/rejected": -2.9915084838867188, + "logps/chosen": -264.0403137207031, + "logps/rejected": -285.39642333984375, + "loss": 0.435, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1676577627658844, + "rewards/margins": 1.2559481859207153, + "rewards/rejected": -1.4236059188842773, + "step": 8433 + }, + { + "epoch": 0.97, + "learning_rate": 8.427952709820906e-09, + "logits/chosen": -2.9410154819488525, + "logits/rejected": -3.211538314819336, + "logps/chosen": -238.24871826171875, + "logps/rejected": -208.40740966796875, + "loss": 0.579, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22446183860301971, + "rewards/margins": 1.5096815824508667, + "rewards/rejected": -1.285219669342041, + "step": 8434 + }, + { + "epoch": 0.97, + "learning_rate": 8.392836240196652e-09, + "logits/chosen": -3.12337589263916, + "logits/rejected": -3.382150173187256, + "logps/chosen": -326.0976257324219, + "logps/rejected": -223.6885986328125, + "loss": 0.5221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5976174473762512, + "rewards/margins": 1.4886724948883057, + "rewards/rejected": -2.086289882659912, + "step": 8435 + }, + { + "epoch": 0.97, + "learning_rate": 8.357719770572397e-09, + "logits/chosen": -3.312103271484375, + "logits/rejected": -3.4446029663085938, + "logps/chosen": -247.78341674804688, + "logps/rejected": -229.12933349609375, + "loss": 0.2469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35446399450302124, + "rewards/margins": 2.385605812072754, + "rewards/rejected": -2.740069627761841, + "step": 8436 + }, + { + "epoch": 0.97, + "learning_rate": 8.322603300948145e-09, + "logits/chosen": -3.286302328109741, + "logits/rejected": -3.2838504314422607, + "logps/chosen": -275.45904541015625, + "logps/rejected": -242.29273986816406, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5535131692886353, + "rewards/margins": 1.8610289096832275, + "rewards/rejected": -2.4145419597625732, + "step": 8437 + }, + { + "epoch": 0.97, + "learning_rate": 8.28748683132389e-09, + "logits/chosen": -3.3749213218688965, + "logits/rejected": -3.7651848793029785, + "logps/chosen": -190.9357147216797, + "logps/rejected": -200.5861053466797, + "loss": 0.4979, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05998539179563522, + "rewards/margins": 1.9054255485534668, + "rewards/rejected": -1.845440149307251, + "step": 8438 + }, + { + "epoch": 0.97, + "learning_rate": 8.252370361699637e-09, + "logits/chosen": -2.337521553039551, + "logits/rejected": -2.308581829071045, + "logps/chosen": -168.5311737060547, + "logps/rejected": -261.06707763671875, + "loss": 0.1753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24522832036018372, + "rewards/margins": 2.434825897216797, + "rewards/rejected": -2.1895976066589355, + "step": 8439 + }, + { + "epoch": 0.97, + "learning_rate": 8.217253892075383e-09, + "logits/chosen": -3.796539545059204, + "logits/rejected": -3.5094943046569824, + "logps/chosen": -326.50506591796875, + "logps/rejected": -257.970703125, + "loss": 0.2766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.215492382645607, + "rewards/margins": 3.1530280113220215, + "rewards/rejected": -3.3685200214385986, + "step": 8440 + }, + { + "epoch": 0.97, + "learning_rate": 8.18213742245113e-09, + "logits/chosen": -3.0618038177490234, + "logits/rejected": -2.9919629096984863, + "logps/chosen": -177.88943481445312, + "logps/rejected": -279.259765625, + "loss": 0.4286, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07048708200454712, + "rewards/margins": 1.0877712965011597, + "rewards/rejected": -1.1582584381103516, + "step": 8441 + }, + { + "epoch": 0.97, + "learning_rate": 8.147020952826874e-09, + "logits/chosen": -3.554231643676758, + "logits/rejected": -3.4620437622070312, + "logps/chosen": -208.33322143554688, + "logps/rejected": -159.91249084472656, + "loss": 0.3791, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.001927502453327179, + "rewards/margins": 1.890863299369812, + "rewards/rejected": -1.8889358043670654, + "step": 8442 + }, + { + "epoch": 0.97, + "learning_rate": 8.111904483202621e-09, + "logits/chosen": -3.2849061489105225, + "logits/rejected": -3.3064053058624268, + "logps/chosen": -207.39105224609375, + "logps/rejected": -266.3307800292969, + "loss": 0.557, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.773143470287323, + "rewards/margins": 0.8364297151565552, + "rewards/rejected": -1.6095731258392334, + "step": 8443 + }, + { + "epoch": 0.97, + "learning_rate": 8.076788013578367e-09, + "logits/chosen": -3.5888831615448, + "logits/rejected": -3.9313549995422363, + "logps/chosen": -455.7917785644531, + "logps/rejected": -458.7384338378906, + "loss": 0.2775, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1357952058315277, + "rewards/margins": 3.3609485626220703, + "rewards/rejected": -3.225153684616089, + "step": 8444 + }, + { + "epoch": 0.97, + "learning_rate": 8.041671543954114e-09, + "logits/chosen": -3.183074474334717, + "logits/rejected": -2.9479448795318604, + "logps/chosen": -317.97344970703125, + "logps/rejected": -228.6673583984375, + "loss": 0.5455, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06765228509902954, + "rewards/margins": 1.0499480962753296, + "rewards/rejected": -1.117600440979004, + "step": 8445 + }, + { + "epoch": 0.97, + "learning_rate": 8.00655507432986e-09, + "logits/chosen": -2.9984371662139893, + "logits/rejected": -2.847534418106079, + "logps/chosen": -164.04083251953125, + "logps/rejected": -257.59918212890625, + "loss": 0.4088, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5056501030921936, + "rewards/margins": 2.45600962638855, + "rewards/rejected": -2.9616596698760986, + "step": 8446 + }, + { + "epoch": 0.97, + "learning_rate": 7.971438604705607e-09, + "logits/chosen": -2.8934905529022217, + "logits/rejected": -2.8812272548675537, + "logps/chosen": -321.766357421875, + "logps/rejected": -383.79193115234375, + "loss": 0.5697, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08661418408155441, + "rewards/margins": 1.2753524780273438, + "rewards/rejected": -1.1887381076812744, + "step": 8447 + }, + { + "epoch": 0.97, + "learning_rate": 7.936322135081353e-09, + "logits/chosen": -3.2338428497314453, + "logits/rejected": -3.505424976348877, + "logps/chosen": -286.4169006347656, + "logps/rejected": -387.6412658691406, + "loss": 0.2951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08707311749458313, + "rewards/margins": 1.4228544235229492, + "rewards/rejected": -1.5099276304244995, + "step": 8448 + }, + { + "epoch": 0.97, + "learning_rate": 7.9012056654571e-09, + "logits/chosen": -3.4245548248291016, + "logits/rejected": -3.18292498588562, + "logps/chosen": -176.4376983642578, + "logps/rejected": -280.36431884765625, + "loss": 0.4635, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005509167909622192, + "rewards/margins": 1.3783385753631592, + "rewards/rejected": -1.3728294372558594, + "step": 8449 + }, + { + "epoch": 0.97, + "learning_rate": 7.866089195832845e-09, + "logits/chosen": -2.9239397048950195, + "logits/rejected": -2.795772075653076, + "logps/chosen": -475.38507080078125, + "logps/rejected": -334.826904296875, + "loss": 0.1453, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3018825650215149, + "rewards/margins": 3.2029008865356445, + "rewards/rejected": -2.9010181427001953, + "step": 8450 + }, + { + "epoch": 0.97, + "learning_rate": 7.830972726208591e-09, + "logits/chosen": -2.3931281566619873, + "logits/rejected": -2.514948606491089, + "logps/chosen": -253.69070434570312, + "logps/rejected": -263.8158264160156, + "loss": 0.4294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1243424266576767, + "rewards/margins": 1.6393765211105347, + "rewards/rejected": -1.763719081878662, + "step": 8451 + }, + { + "epoch": 0.97, + "learning_rate": 7.795856256584338e-09, + "logits/chosen": -3.198887348175049, + "logits/rejected": -3.1819355487823486, + "logps/chosen": -218.28741455078125, + "logps/rejected": -269.8429870605469, + "loss": 0.2512, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3657569885253906, + "rewards/margins": 2.493149995803833, + "rewards/rejected": -2.1273932456970215, + "step": 8452 + }, + { + "epoch": 0.97, + "learning_rate": 7.760739786960084e-09, + "logits/chosen": -2.5978691577911377, + "logits/rejected": -2.7538676261901855, + "logps/chosen": -541.8552856445312, + "logps/rejected": -390.70538330078125, + "loss": 0.2592, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8011832237243652, + "rewards/margins": 1.7862989902496338, + "rewards/rejected": -0.9851157665252686, + "step": 8453 + }, + { + "epoch": 0.97, + "learning_rate": 7.72562331733583e-09, + "logits/chosen": -2.998096227645874, + "logits/rejected": -3.3441169261932373, + "logps/chosen": -189.32965087890625, + "logps/rejected": -155.8365478515625, + "loss": 0.7261, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3956030309200287, + "rewards/margins": 0.17512786388397217, + "rewards/rejected": -0.5707309246063232, + "step": 8454 + }, + { + "epoch": 0.97, + "learning_rate": 7.690506847711577e-09, + "logits/chosen": -2.8574888706207275, + "logits/rejected": -2.992537021636963, + "logps/chosen": -239.76779174804688, + "logps/rejected": -210.01885986328125, + "loss": 0.491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5048083066940308, + "rewards/margins": 1.7492955923080444, + "rewards/rejected": -2.254103899002075, + "step": 8455 + }, + { + "epoch": 0.97, + "learning_rate": 7.655390378087322e-09, + "logits/chosen": -2.875180721282959, + "logits/rejected": -2.890360116958618, + "logps/chosen": -143.51181030273438, + "logps/rejected": -299.0469055175781, + "loss": 0.4173, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2668341100215912, + "rewards/margins": 1.5020010471343994, + "rewards/rejected": -1.2351669073104858, + "step": 8456 + }, + { + "epoch": 0.97, + "learning_rate": 7.620273908463068e-09, + "logits/chosen": -3.2421882152557373, + "logits/rejected": -3.2816162109375, + "logps/chosen": -306.53521728515625, + "logps/rejected": -358.46221923828125, + "loss": 0.6347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8493460416793823, + "rewards/margins": 1.554395079612732, + "rewards/rejected": -2.4037411212921143, + "step": 8457 + }, + { + "epoch": 0.98, + "learning_rate": 7.585157438838815e-09, + "logits/chosen": -3.064868450164795, + "logits/rejected": -3.13398814201355, + "logps/chosen": -304.1175537109375, + "logps/rejected": -261.4644775390625, + "loss": 0.7485, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3816271126270294, + "rewards/margins": 0.6329077482223511, + "rewards/rejected": -1.014534831047058, + "step": 8458 + }, + { + "epoch": 0.98, + "learning_rate": 7.55004096921456e-09, + "logits/chosen": -4.058652400970459, + "logits/rejected": -3.949221134185791, + "logps/chosen": -86.2927474975586, + "logps/rejected": -95.63933563232422, + "loss": 0.2926, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.479600191116333, + "rewards/margins": 1.5495434999465942, + "rewards/rejected": -1.0699434280395508, + "step": 8459 + }, + { + "epoch": 0.98, + "learning_rate": 7.514924499590308e-09, + "logits/chosen": -3.4089598655700684, + "logits/rejected": -3.026139497756958, + "logps/chosen": -326.42108154296875, + "logps/rejected": -241.8504180908203, + "loss": 1.0241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7534061670303345, + "rewards/margins": -0.11472529172897339, + "rewards/rejected": -0.6386807560920715, + "step": 8460 + }, + { + "epoch": 0.98, + "learning_rate": 7.479808029966053e-09, + "logits/chosen": -3.539700984954834, + "logits/rejected": -3.4401373863220215, + "logps/chosen": -120.50401306152344, + "logps/rejected": -142.69139099121094, + "loss": 0.3613, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3258820176124573, + "rewards/margins": 1.8816559314727783, + "rewards/rejected": -1.5557738542556763, + "step": 8461 + }, + { + "epoch": 0.98, + "learning_rate": 7.4446915603418e-09, + "logits/chosen": -3.336719512939453, + "logits/rejected": -3.554978370666504, + "logps/chosen": -287.3680725097656, + "logps/rejected": -237.15216064453125, + "loss": 0.5642, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6230790019035339, + "rewards/margins": 0.6419951319694519, + "rewards/rejected": -1.2650740146636963, + "step": 8462 + }, + { + "epoch": 0.98, + "learning_rate": 7.409575090717546e-09, + "logits/chosen": -2.5466864109039307, + "logits/rejected": -2.408594846725464, + "logps/chosen": -337.4132995605469, + "logps/rejected": -388.8101501464844, + "loss": 0.302, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6088428497314453, + "rewards/margins": 2.242417812347412, + "rewards/rejected": -1.6335749626159668, + "step": 8463 + }, + { + "epoch": 0.98, + "learning_rate": 7.374458621093293e-09, + "logits/chosen": -3.092594623565674, + "logits/rejected": -3.2774574756622314, + "logps/chosen": -153.77410888671875, + "logps/rejected": -231.7120361328125, + "loss": 0.2599, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3202836513519287, + "rewards/margins": 1.8681021928787231, + "rewards/rejected": -2.1883859634399414, + "step": 8464 + }, + { + "epoch": 0.98, + "learning_rate": 7.339342151469039e-09, + "logits/chosen": -3.6672587394714355, + "logits/rejected": -3.6846890449523926, + "logps/chosen": -164.27532958984375, + "logps/rejected": -221.946044921875, + "loss": 0.1642, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36008062958717346, + "rewards/margins": 2.4470369815826416, + "rewards/rejected": -2.086956262588501, + "step": 8465 + }, + { + "epoch": 0.98, + "learning_rate": 7.304225681844785e-09, + "logits/chosen": -3.2541215419769287, + "logits/rejected": -3.153207778930664, + "logps/chosen": -208.61163330078125, + "logps/rejected": -209.39495849609375, + "loss": 0.2716, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24462464451789856, + "rewards/margins": 2.1284637451171875, + "rewards/rejected": -2.3730883598327637, + "step": 8466 + }, + { + "epoch": 0.98, + "learning_rate": 7.269109212220531e-09, + "logits/chosen": -3.8739960193634033, + "logits/rejected": -3.6803321838378906, + "logps/chosen": -323.9363098144531, + "logps/rejected": -248.87367248535156, + "loss": 0.3554, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35727477073669434, + "rewards/margins": 2.4737980365753174, + "rewards/rejected": -2.116523265838623, + "step": 8467 + }, + { + "epoch": 0.98, + "learning_rate": 7.233992742596277e-09, + "logits/chosen": -2.7860612869262695, + "logits/rejected": -2.8490891456604004, + "logps/chosen": -497.5242614746094, + "logps/rejected": -217.41055297851562, + "loss": 0.3788, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14588817954063416, + "rewards/margins": 1.030962586402893, + "rewards/rejected": -0.8850744962692261, + "step": 8468 + }, + { + "epoch": 0.98, + "learning_rate": 7.198876272972023e-09, + "logits/chosen": -3.260657787322998, + "logits/rejected": -3.2256827354431152, + "logps/chosen": -221.76560974121094, + "logps/rejected": -147.77293395996094, + "loss": 0.3322, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08930650353431702, + "rewards/margins": 1.5668606758117676, + "rewards/rejected": -1.477554202079773, + "step": 8469 + }, + { + "epoch": 0.98, + "learning_rate": 7.1637598033477695e-09, + "logits/chosen": -2.954310894012451, + "logits/rejected": -2.848630428314209, + "logps/chosen": -124.78446960449219, + "logps/rejected": -193.85659790039062, + "loss": 0.129, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2611718773841858, + "rewards/margins": 2.866994857788086, + "rewards/rejected": -2.605823040008545, + "step": 8470 + }, + { + "epoch": 0.98, + "learning_rate": 7.128643333723516e-09, + "logits/chosen": -3.824342966079712, + "logits/rejected": -3.4010262489318848, + "logps/chosen": -320.6964111328125, + "logps/rejected": -244.51654052734375, + "loss": 0.5234, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02732469141483307, + "rewards/margins": 1.2796918153762817, + "rewards/rejected": -1.3070164918899536, + "step": 8471 + }, + { + "epoch": 0.98, + "learning_rate": 7.093526864099262e-09, + "logits/chosen": -3.0326199531555176, + "logits/rejected": -3.2329461574554443, + "logps/chosen": -160.24642944335938, + "logps/rejected": -191.69921875, + "loss": 0.3237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21656447649002075, + "rewards/margins": 2.095737934112549, + "rewards/rejected": -2.312302589416504, + "step": 8472 + }, + { + "epoch": 0.98, + "learning_rate": 7.058410394475009e-09, + "logits/chosen": -3.5692758560180664, + "logits/rejected": -3.743993043899536, + "logps/chosen": -86.12794494628906, + "logps/rejected": -214.58734130859375, + "loss": 0.4124, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24406638741493225, + "rewards/margins": 3.570068836212158, + "rewards/rejected": -3.8141355514526367, + "step": 8473 + }, + { + "epoch": 0.98, + "learning_rate": 7.023293924850755e-09, + "logits/chosen": -3.36289381980896, + "logits/rejected": -3.529120445251465, + "logps/chosen": -241.98193359375, + "logps/rejected": -250.06793212890625, + "loss": 0.4929, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014819830656051636, + "rewards/margins": 1.5112448930740356, + "rewards/rejected": -1.4964251518249512, + "step": 8474 + }, + { + "epoch": 0.98, + "learning_rate": 6.9881774552265016e-09, + "logits/chosen": -3.2262582778930664, + "logits/rejected": -3.4128856658935547, + "logps/chosen": -307.86407470703125, + "logps/rejected": -314.9330749511719, + "loss": 0.4762, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16166181862354279, + "rewards/margins": 1.8200843334197998, + "rewards/rejected": -1.9817461967468262, + "step": 8475 + }, + { + "epoch": 0.98, + "learning_rate": 6.953060985602247e-09, + "logits/chosen": -3.9772093296051025, + "logits/rejected": -3.6806013584136963, + "logps/chosen": -388.94378662109375, + "logps/rejected": -288.0254821777344, + "loss": 0.2267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.003414541482925415, + "rewards/margins": 2.201543092727661, + "rewards/rejected": -2.204957962036133, + "step": 8476 + }, + { + "epoch": 0.98, + "learning_rate": 6.9179445159779936e-09, + "logits/chosen": -3.043762683868408, + "logits/rejected": -3.070531129837036, + "logps/chosen": -578.2501831054688, + "logps/rejected": -404.6870422363281, + "loss": 0.8866, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.162374496459961, + "rewards/margins": -0.010277248919010162, + "rewards/rejected": -1.1520973443984985, + "step": 8477 + }, + { + "epoch": 0.98, + "learning_rate": 6.882828046353739e-09, + "logits/chosen": -3.502081871032715, + "logits/rejected": -3.3114190101623535, + "logps/chosen": -226.29159545898438, + "logps/rejected": -257.4394836425781, + "loss": 0.1727, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10398821532726288, + "rewards/margins": 3.822413921356201, + "rewards/rejected": -3.7184255123138428, + "step": 8478 + }, + { + "epoch": 0.98, + "learning_rate": 6.8477115767294856e-09, + "logits/chosen": -3.4955384731292725, + "logits/rejected": -3.5288140773773193, + "logps/chosen": -267.10162353515625, + "logps/rejected": -265.1327209472656, + "loss": 0.212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.058431342244148254, + "rewards/margins": 2.580204725265503, + "rewards/rejected": -2.6386358737945557, + "step": 8479 + }, + { + "epoch": 0.98, + "learning_rate": 6.812595107105232e-09, + "logits/chosen": -3.0712790489196777, + "logits/rejected": -2.7467620372772217, + "logps/chosen": -205.8949432373047, + "logps/rejected": -279.7840881347656, + "loss": 0.3864, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16965949535369873, + "rewards/margins": 2.198101043701172, + "rewards/rejected": -2.367760419845581, + "step": 8480 + }, + { + "epoch": 0.98, + "learning_rate": 6.777478637480978e-09, + "logits/chosen": -3.021322250366211, + "logits/rejected": -3.015963554382324, + "logps/chosen": -307.05279541015625, + "logps/rejected": -321.27960205078125, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09905458986759186, + "rewards/margins": 0.3263217508792877, + "rewards/rejected": -0.22726716101169586, + "step": 8481 + }, + { + "epoch": 0.98, + "learning_rate": 6.742362167856725e-09, + "logits/chosen": -3.4139211177825928, + "logits/rejected": -2.736502170562744, + "logps/chosen": -338.7869567871094, + "logps/rejected": -193.4683074951172, + "loss": 0.2982, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0066890716552734375, + "rewards/margins": 1.8478224277496338, + "rewards/rejected": -1.8411333560943604, + "step": 8482 + }, + { + "epoch": 0.98, + "learning_rate": 6.70724569823247e-09, + "logits/chosen": -3.256650686264038, + "logits/rejected": -3.445671558380127, + "logps/chosen": -146.2180633544922, + "logps/rejected": -194.78274536132812, + "loss": 0.6388, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37593984603881836, + "rewards/margins": 1.6026803255081177, + "rewards/rejected": -1.9786202907562256, + "step": 8483 + }, + { + "epoch": 0.98, + "learning_rate": 6.672129228608217e-09, + "logits/chosen": -3.311394214630127, + "logits/rejected": -3.544567584991455, + "logps/chosen": -254.2255096435547, + "logps/rejected": -222.4074249267578, + "loss": 0.5799, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0866827964782715, + "rewards/margins": 1.1077947616577148, + "rewards/rejected": -2.1944775581359863, + "step": 8484 + }, + { + "epoch": 0.98, + "learning_rate": 6.637012758983963e-09, + "logits/chosen": -2.837104320526123, + "logits/rejected": -2.44183349609375, + "logps/chosen": -400.37689208984375, + "logps/rejected": -481.4322814941406, + "loss": 0.2455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.351169615983963, + "rewards/margins": 1.9379820823669434, + "rewards/rejected": -2.289151668548584, + "step": 8485 + }, + { + "epoch": 0.98, + "learning_rate": 6.60189628935971e-09, + "logits/chosen": -2.8657331466674805, + "logits/rejected": -2.897822856903076, + "logps/chosen": -310.7184753417969, + "logps/rejected": -251.59860229492188, + "loss": 0.5025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15521568059921265, + "rewards/margins": 1.4926784038543701, + "rewards/rejected": -1.6478941440582275, + "step": 8486 + }, + { + "epoch": 0.98, + "learning_rate": 6.566779819735456e-09, + "logits/chosen": -3.03426456451416, + "logits/rejected": -2.9944827556610107, + "logps/chosen": -253.21038818359375, + "logps/rejected": -169.9788055419922, + "loss": 0.3845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.032130323350429535, + "rewards/margins": 1.5077852010726929, + "rewards/rejected": -1.5399155616760254, + "step": 8487 + }, + { + "epoch": 0.98, + "learning_rate": 6.5316633501112024e-09, + "logits/chosen": -2.1996219158172607, + "logits/rejected": -2.5172324180603027, + "logps/chosen": -405.1337890625, + "logps/rejected": -292.83935546875, + "loss": 0.2973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.006255242973566055, + "rewards/margins": 1.2822327613830566, + "rewards/rejected": -1.2884879112243652, + "step": 8488 + }, + { + "epoch": 0.98, + "learning_rate": 6.496546880486949e-09, + "logits/chosen": -3.8786821365356445, + "logits/rejected": -3.766284704208374, + "logps/chosen": -295.08123779296875, + "logps/rejected": -263.99896240234375, + "loss": 0.2219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6233601570129395, + "rewards/margins": 2.8805785179138184, + "rewards/rejected": -3.503938674926758, + "step": 8489 + }, + { + "epoch": 0.98, + "learning_rate": 6.461430410862694e-09, + "logits/chosen": -3.084059715270996, + "logits/rejected": -3.265758514404297, + "logps/chosen": -191.93821716308594, + "logps/rejected": -339.48046875, + "loss": 0.2274, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5367034673690796, + "rewards/margins": 4.049424648284912, + "rewards/rejected": -3.512721538543701, + "step": 8490 + }, + { + "epoch": 0.98, + "learning_rate": 6.42631394123844e-09, + "logits/chosen": -3.5621612071990967, + "logits/rejected": -3.470043420791626, + "logps/chosen": -312.8921813964844, + "logps/rejected": -308.21453857421875, + "loss": 0.5788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.032635778188705444, + "rewards/margins": 1.9471313953399658, + "rewards/rejected": -1.979767084121704, + "step": 8491 + }, + { + "epoch": 0.98, + "learning_rate": 6.3911974716141864e-09, + "logits/chosen": -3.699871063232422, + "logits/rejected": -3.568979263305664, + "logps/chosen": -184.05386352539062, + "logps/rejected": -222.36697387695312, + "loss": 0.3516, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13250714540481567, + "rewards/margins": 2.4029572010040283, + "rewards/rejected": -2.2704498767852783, + "step": 8492 + }, + { + "epoch": 0.98, + "learning_rate": 6.356081001989933e-09, + "logits/chosen": -3.044790029525757, + "logits/rejected": -2.9673855304718018, + "logps/chosen": -375.24090576171875, + "logps/rejected": -250.930908203125, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012716822326183319, + "rewards/margins": 1.8871909379959106, + "rewards/rejected": -1.8999075889587402, + "step": 8493 + }, + { + "epoch": 0.98, + "learning_rate": 6.320964532365679e-09, + "logits/chosen": -3.293546676635742, + "logits/rejected": -3.1234829425811768, + "logps/chosen": -462.0926513671875, + "logps/rejected": -400.81756591796875, + "loss": 0.1975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4342710077762604, + "rewards/margins": 2.308196783065796, + "rewards/rejected": -2.7424678802490234, + "step": 8494 + }, + { + "epoch": 0.98, + "learning_rate": 6.285848062741426e-09, + "logits/chosen": -3.0001041889190674, + "logits/rejected": -3.493598222732544, + "logps/chosen": -205.2208251953125, + "logps/rejected": -309.0013732910156, + "loss": 0.3268, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13551993668079376, + "rewards/margins": 2.311723470687866, + "rewards/rejected": -2.4472434520721436, + "step": 8495 + }, + { + "epoch": 0.98, + "learning_rate": 6.250731593117172e-09, + "logits/chosen": -2.2359471321105957, + "logits/rejected": -2.4334816932678223, + "logps/chosen": -203.56834411621094, + "logps/rejected": -299.93548583984375, + "loss": 0.433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4051065444946289, + "rewards/margins": 1.6684437990188599, + "rewards/rejected": -2.07354998588562, + "step": 8496 + }, + { + "epoch": 0.98, + "learning_rate": 6.2156151234929185e-09, + "logits/chosen": -3.732530117034912, + "logits/rejected": -3.6657090187072754, + "logps/chosen": -192.59141540527344, + "logps/rejected": -245.81906127929688, + "loss": 0.1651, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2982720732688904, + "rewards/margins": 2.7652266025543213, + "rewards/rejected": -2.466954469680786, + "step": 8497 + }, + { + "epoch": 0.98, + "learning_rate": 6.180498653868664e-09, + "logits/chosen": -3.031069278717041, + "logits/rejected": -3.448622703552246, + "logps/chosen": -195.2930450439453, + "logps/rejected": -226.67454528808594, + "loss": 0.3733, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04790334403514862, + "rewards/margins": 2.161766290664673, + "rewards/rejected": -2.209669589996338, + "step": 8498 + }, + { + "epoch": 0.98, + "learning_rate": 6.1453821842444105e-09, + "logits/chosen": -3.546642303466797, + "logits/rejected": -3.4998908042907715, + "logps/chosen": -249.80780029296875, + "logps/rejected": -179.9160614013672, + "loss": 0.3124, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24191740155220032, + "rewards/margins": 1.7192165851593018, + "rewards/rejected": -1.4772990942001343, + "step": 8499 + }, + { + "epoch": 0.98, + "learning_rate": 6.110265714620157e-09, + "logits/chosen": -3.4534051418304443, + "logits/rejected": -3.1558313369750977, + "logps/chosen": -163.5133514404297, + "logps/rejected": -195.2133331298828, + "loss": 1.0196, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6240538954734802, + "rewards/margins": 0.6159093379974365, + "rewards/rejected": -1.2399632930755615, + "step": 8500 + } + ], + "logging_steps": 1, + "max_steps": 8674, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}